Code example #1
File: bias_add_ad.py Project: zhuyawen/akg
def bias_add_ad(head, input_shape, data_format):
    """
    Compute the gradient of the bias_add operator using automatic differentiation.

    Args:
        head (tvm.tensor.Tensor): Input tensor.
        input_shape (Union[list, tuple]): Input shape of head.
        data_format (str): Data format of input tensors.

    Returns:
        tvm.tensor.Tensor, gradient with respect to the bias, together with a dict
        of build attributes.
    """

    check_list = ["NHWC", "NC1HWC0", "DefaultFormat"]
    if data_format not in check_list:
        raise RuntimeError("bias_add_grad only support %s while dataformat is %s" % (",".join(check_list), data_format))
    vc_util.check_shape(head.shape)
    shape1 = [x.value for x in head.shape]
    vc_util.davinci_format_check(shape1, data_format)
    a = akg.tvm.placeholder(head.shape, head.dtype, "A")
    if data_format == "NC1HWC0":
        bias_shape = (1, head.shape[1], 1, 1, head.shape[4])
        b = akg.tvm.placeholder(bias_shape, head.dtype, "B")
    elif data_format == "NHWC":
        bias_shape = (input_shape[-1],)
        b = akg.tvm.placeholder(bias_shape, head.dtype, "B")
    else:
        bias_shape = (input_shape[1],)
        b = akg.tvm.placeholder(bias_shape, head.dtype, "B")
    c = bias_add.bias_add(a, b, data_format)

    jacs = list(akg.differentiate(c, [b], head))
    attrs = {}
    return jacs[0], attrs
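
A minimal usage sketch for bias_add_ad (hypothetical shapes and dtype; assumes akg is installed and the function above is importable):

import akg

# gradient flowing into bias_add for a 2D (N, C) DefaultFormat tensor (hypothetical shape)
head = akg.tvm.placeholder((32, 1000), "float16", "head")
dbias, _ = bias_add_ad(head, [32, 1000], "DefaultFormat")
# dbias is expected to have the bias shape, i.e. (1000,)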
Code example #2
File: four2five.py Project: zhuyawen/akg
def four2five(data, format_, dst_dtype='float16', need_custom_tiling=True):
    """
    Convert 4-dims "data" to 5-dims,the format of "data" is defined in "format_"

    Args:
        data (tvm.tensor.Tensor): 4-dim tensor of type float16 or float32.
        format_ (str): a string defining the format of "data", either 'NCHW' or 'NHWC'.
        dst_dtype (str): data type of the output, either float16 or float32.
        need_custom_tiling (bool): whether to attach a custom tiling strategy to the
            returned attrs. Default: True.

    Returns:
        5-dim tvm.tensor.Tensor whose type is defined by dst_dtype and whose shape is
        [N, ceil(C / 16), H, W, 16], together with a dict of tiling attributes.

    Raises:
        ValueError: If the type of format_ is invalid.

    """
    # Check dtype
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    # Check shape
    shape = get_shape(data)
    vc_util.davinci_format_check(shape, format_, dim=4)

    # Check format
    if format_ not in ['NCHW', 'NHWC']:
        raise ValueError(
            "{} format is not supported, four2five only supports NCHW and NHWC input"
            .format(format_))
    last_channel = 16
    if format_ == "NCHW":
        bs, c, h, w = get_shape(data)
    else:
        bs, h, w, c = get_shape(data)
    pad_c = c
    if c % last_channel != 0:
        pad_c = (c + 15) // last_channel * last_channel
    c1 = pad_c // last_channel
    c0 = last_channel
    is_dynamic = ds.shape_is_dynamic(data)
    if not is_dynamic:
        attrs = get_attrs()
    else:
        attrs = get_dynamic_attrs()
    # Check size c when casting happens
    if data.dtype != dst_dtype and c0 * c1 >= C_LIMIT_FOR_CAST:
        raise ValueError(
            "When the input and output data types differ, the size of the 'c' axis "
            "should not exceed {}, but it is currently {}".format(
                C_LIMIT_FOR_CAST, c0 * c1))

    @script(capture=locals())
    def nchw_to_nc1hwc0_step(inputs, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, c0, w), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i, h_i, c_i0,
                                   w_i] = inputs[n_i,
                                                 c_i * last_channel + c_i0,
                                                 h_i, w_i]
        output1 = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output1[n_i, c_i, h_i, w_i,
                                    c_i0] = output[n_i, c_i, h_i, c_i0, w_i]
        return output1

    @script(capture=locals())
    def nchw_to_nc1hwc0(inputs, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i, h_i, w_i,
                                   c_i0] = inputs[n_i,
                                                  c_i * last_channel + c_i0,
                                                  h_i, w_i]
        return output

    @script(capture=locals())
    def nhwc_to_nc1hwc0(inputs, zero, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            if c_i * last_channel + c_i0 < c:
                                output[n_i, c_i, h_i, w_i,
                                       c_i0] = inputs[n_i, h_i, w_i,
                                                      c_i * last_channel +
                                                      c_i0]
                            else:
                                output[n_i, c_i, h_i, w_i, c_i0] = zero

        return output

    cast_data = data
    need_cast = data.dtype == 'float32' and dst_dtype == 'float16'
    if c % last_channel != 0 or need_cast:
        expansion = int(ct_util.BLOCK_SIZE / get_bytes(data.dtype))
    else:
        expansion = None
    # float32 -> float16, need to cast before transform
    if need_cast:
        cast_data = akg.lang.cce.cast_to(data, dst_dtype)

    zero_ = akg.tvm.const(0.0, cast_data.dtype)
    if format_ == "NCHW":
        if c % last_channel != 0:
            pad_shape = [bs, pad_c, h, w]
            if h == 1 and w == 1:
                # if h and w both are 1, it is pad last dim case
                output_shape = [bs, pad_c // last_channel, h, w, last_channel]

                output = akg.tvm.compute(
                    output_shape,
                    lambda i, c1, k, l, c0: akg.tvm.expr.Select(
                        c0 < c - c1 * last_channel, cast_data[
                            i, c1 * last_channel + c0, k, l],
                        akg.tvm.const(0, cast_data.dtype)),
                    name="output")
            else:
                # if need to pad c dim, separate transpose to two steps
                # first is nchw -> nc1hc0w, second is nc1hc0w -> nc1hwc0
                pad_data = akg.tvm.compute(
                    pad_shape,
                    lambda i, j, k, l: akg.tvm.expr.Select(
                        j < c, cast_data[i, j, k, l], zero_),
                    name="pad_data")
                output = nchw_to_nc1hwc0_step(pad_data, to_tvm_const(bs),
                                              to_tvm_const(c1),
                                              to_tvm_const(h), to_tvm_const(w),
                                              to_tvm_const(c0))

        else:
            if not is_dynamic and data.dtype == "float16" and h * w % last_channel == 0 and h * w < 3600:
                output_shape = [bs, c1, h, w, c0]
                output = akg.tvm.compute(
                    output_shape,
                    lambda n, c1, h, w, c0: akg.lang.cce.four2five_nchw(
                        cast_data[n, c1 * last_channel + c0, h, w]),
                    name="output")

            else:
                output = nchw_to_nc1hwc0(cast_data, to_tvm_const(bs),
                                         to_tvm_const(c1), to_tvm_const(h),
                                         to_tvm_const(w), to_tvm_const(c0))

    else:
        if not is_dynamic and c < last_channel:
            rank = 5  # (n, c1, h, w, c0)
            pad_before = []
            pad_after = []
            for _ in range(rank):
                pad_before.append(0)
                pad_after.append(0)
            pad_after[-1] = last_channel - c
            # As c < last_channel, c1 is 1
            output = akg.tvm.compute(
                (bs, c1, h, w, c),
                lambda bs_i, _, h_i, w_i, c_i: cast_data[bs_i, h_i, w_i, c_i],
                name="output")
            output = tvm_pad(output,
                             pad_before,
                             pad_after=pad_after,
                             name='pad_output')
        else:
            output = nhwc_to_nc1hwc0(cast_data, zero_, to_tvm_const(bs),
                                     to_tvm_const(c1), to_tvm_const(h),
                                     to_tvm_const(w), to_tvm_const(c0))

    # float16 -> float32, need to cast after transform
    if data.dtype == 'float16' and dst_dtype == 'float32':
        output = akg.lang.cce.cast_to(output, dst_dtype)

    vc_util.davinci_format_check(output.shape, "NC1HWC0", dim=5)

    if not is_dynamic:
        dim_info, _ = four2five_set_dim_func(data, format_, dst_dtype)
        if dim_info != "":
            attrs["dim"] = dim_info
        if need_custom_tiling:
            attrs["custom_tiling"] = four2five_tiling_strategy(
                output, format_, expansion)
    elif need_custom_tiling:
        attrs["custom_tiling"] = four2five_tiling_strategy_dynamic(
            output, format_)

    if is_dynamic:
        attrs["enable_feature_library_pre_poly"] = True
    return output, attrs
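
A usage sketch for four2five (hypothetical shapes; assumes akg is installed and the function above is importable):

import akg

# C = 37 is not a multiple of 16, so the C axis is padded to 48 before packing (hypothetical shape)
data = akg.tvm.placeholder((1, 37, 7, 7), "float32", "data")
out, attrs = four2five(data, "NCHW", dst_dtype="float32")
# out is laid out as [N, ceil(C / 16), H, W, 16] = (1, 3, 7, 7, 16)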
Code example #3
def bias_add(data1, data2, data_format):
    """
    Adds bias data2 to input tensor data1.

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32.
        data2 (tvm.tensor.Tensor): The bias tensor, should be of same type as data1.
                                   If shape(data2) != shape(data1), broadcast will happen.
        data_format (str): Data format of input tensors, could be NC1HWC0, NHWC or DefaultFormat.

    Returns:
        tvm.tensor.Tensor of same shape and type as data1.
    """
    vc_util.check_shape(data1.shape)
    vc_util.check_shape(data2.shape)
    shape1 = get_shape(data1)
    shape2 = get_shape(data2)
    vc_util.davinci_format_check(shape1, data_format)
    vc_util.ops_dtype_check([data1.dtype, data2.dtype],
                            vc_util.DtypeForDavinci.ALL_FLOAT)

    if data_format == 'NC1HWC0':
        data2_new = akg.lang.cce.broadcast(data2, shape1)
        res = akg.lang.cce.vadd(data1, data2_new)
    else:
        if len(shape2) != 1:
            raise RuntimeError("data2 should be a 1D Tensor!")

        if data_format == "NHWC":
            if len(shape1) != 4:
                raise RuntimeError(
                    "bias_add only support 4D shape when data format is NHWC!")
            c_dim_len = shape1[3]
            if c_dim_len != shape2[0]:
                raise ValueError(
                    "The size of bias should be equal to the channel dimension, "
                    " while the size of bias is {0} and the channel dimension is "
                    "{1}".format(shape2[0], c_dim_len))
            data2_reshaped, _ = reshape(data2, [1, 1, 1, shape2[0]])
        elif data_format == "DefaultFormat":
            if len(shape1) != 2 and len(shape1) != 4:
                raise RuntimeError(
                    "bias_add only support 2D and 4D shape when data format is DefaultFormat!"
                )
            c_dim_len = shape1[1]
            if c_dim_len != shape2[0]:
                raise ValueError(
                    "The size of bias should be equal to the channel dimension, "
                    " while the size of bias is {0} and the channel dimension is "
                    "{1}".format(shape2[0], c_dim_len))
            if len(shape1) == 2:
                data2_reshaped, _ = reshape(data2, [1, shape2[0]])
            else:
                # NCHW
                data2_reshaped, _ = reshape(data2, [1, shape2[0], 1, 1])

        data2_new = akg.lang.cce.broadcast(data2_reshaped, shape1)
        res = akg.lang.cce.vadd(data1, data2_new)

        akg.register_variables("reshape_diff", [data2], data2_reshaped)

    return res
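
A usage sketch for bias_add (hypothetical shapes; assumes akg is installed and the function above is importable):

import akg

x = akg.tvm.placeholder((8, 3, 224, 224), "float16", "x")  # 4D DefaultFormat input (hypothetical shape)
b = akg.tvm.placeholder((3,), "float16", "b")
y = bias_add(x, b, "DefaultFormat")
# b is reshaped to (1, 3, 1, 1) and broadcast to x's shape before the add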
Code example #4
File: five2four.py Project: zhuyawen/akg
def five2four(data, shape4d, dst_type, format_):
    """
    Convert 5-dims "data" to 4-dims,the format of "data" is defined in "format_"

    Args:
        data (tvm.tensor.Tensor): 5-dim tensor of type float16 or float32.
        shape4d (Union[list, tuple]): a list or tuple of 4 integers, the shape of the output tensor.
        dst_type (str): data type of the output tensor, float16 or float32.
        format_ (str): a string defining the format of the output; NCHW and NHWC are supported.

    Returns:
        4-dim tvm.tensor.Tensor, together with a dict of build attributes.

    """
    vc_util.ops_dtype_check([data.dtype, dst_type],
                            vc_util.DtypeForDavinci.ALL_FLOAT)
    shape5d = get_shape(data)
    if not shape_is_dynamic(data):
        if len(shape5d) != 5 or shape5d[-1] != 16:
            raise ValueError(
                "five2four_cce only supports 5-dim data, and the last dim should be 16"
            )

    bs, c1, h, w, c0 = shape5d
    if not shape_is_dynamic(data):
        vc_util.davinci_format_check(shape5d, "NC1HWC0", dim=5)
    # Check format
    if format_ not in ['NCHW', 'NHWC']:
        raise ValueError(
            "{} format is not supported, five2four only supports NCHW and NHWC input"
            .format(format_))
    if format_ == "NCHW":
        if shape_is_dynamic(data):
            shape4d = [bs, c1 * c0, h, w]
        _, c, h_4d, w_4d = shape4d
    else:
        if shape_is_dynamic(data):
            shape4d = [bs, h, w, c1 * c0]
        _, h_4d, w_4d, c = shape4d
    vc_util.davinci_format_check(shape4d, format_, dim=4)

    # Check whether shape4d and shape5d match
    if False not in [
            isinstance(s, (int, akg.tvm.expr.IntImm)) for s in shape5d
    ]:
        if h_4d != h or w_4d != w:
            raise ValueError(
                "five2four_cce's shape4d h and w should be equal to the data shape's h and w"
            )
        if c > c1 * c0 or c <= (c1 - 1) * c0:
            raise ValueError(
                "five2four_cce's shape4d c should be in the range ((c1 - 1) * c0, c1 * c0]"
            )

    # Check size c when casting happens
    if not shape_is_dynamic(data):
        if data.dtype != dst_type and c >= C_LIMIT_FOR_CAST:
            raise ValueError(
                "When the input and output data types differ, the size of the 'c' axis "
                "should not exceed {}, but it is currently {}".format(C_LIMIT_FOR_CAST, c))

    @script(capture=locals())
    def nc1hwc0_to_nhwc(inputs, bs, h, w, c, c1, c0):
        output = allocate((bs, h, w, c), inputs.dtype, "local")
        for n_i in range(bs):
            for h_i in range(h):
                for w_i in range(w):
                    for c_i in range(c1):
                        for c_i0 in range(c0):
                            output[n_i, h_i, w_i,
                                   c_i * c0 + c_i0] = inputs[n_i, c_i, h_i,
                                                             w_i, c_i0]
        return output

    @script(capture=locals())
    def nc1hwc0_to_nchw(inputs, bs, h, w, c, c1, c0):
        output = allocate((bs, c, h, w), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i * c0 + c_i0, h_i,
                                   w_i] = inputs[n_i, c_i, h_i, w_i, c_i0]
        return output

    # if c % 16 == 0 and both h and w are 1, five2four is just a reshape operation
    if shape_is_dynamic(data):
        call_reshape = isinstance(h, int) and isinstance(
            w, int) and h == 1 and w == 1
    else:
        call_reshape = h == 1 and w == 1 and c % 16 == 0
    c_value = None
    expansion = None
    if format_ == "NHWC":
        if call_reshape:
            output = akg.topi.reshape(data, (bs, h, w, c))
            if shape_is_dynamic(data):
                output = akg.tvm.compute((bs, h, w, c),
                                         lambda *indice: output(*indice),
                                         name="reshape")
        elif c < c0:
            reshape_output = akg.topi.reshape(data, (bs, h, w, c0))
            output = akg.tvm.compute((bs, h, w, c),
                                     lambda *i: reshape_output(*i),
                                     name='slice_output')
        else:
            output = nc1hwc0_to_nhwc(data, to_tvm_const(bs), to_tvm_const(h),
                                     to_tvm_const(w), to_tvm_const(c),
                                     to_tvm_const(c1), to_tvm_const(c0))

    else:
        if call_reshape:
            output = akg.topi.reshape(data, (bs, c, h, w))
            if shape_is_dynamic(data):
                output = akg.tvm.compute((bs, c, h, w),
                                         lambda *indice: output(*indice),
                                         name="reshape")
        else:
            output = nc1hwc0_to_nchw(data, to_tvm_const(bs), to_tvm_const(h),
                                     to_tvm_const(w), to_tvm_const(c),
                                     to_tvm_const(c1), to_tvm_const(c0))

    # two special cases for tiling strategy
    if not shape_is_dynamic(data):
        if c < c0 or output.dtype != dst_type:
            c_value = c
        if c % c0 != 0 and output.dtype != dst_type:
            expansion = int(ct_util.BLOCK_SIZE / get_bytes(data.dtype))
    attrs = get_attrs()
    if not call_reshape:
        attrs["custom_tiling"] = five2four_tiling_strategy(
            data, c_value, expansion)

    if output.dtype != dst_type:
        output = akg.topi.cast(output, dst_type)
    return output, attrs
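
A usage sketch for five2four (hypothetical shapes; assumes akg is installed and the function above is importable):

import akg

data = akg.tvm.placeholder((1, 3, 7, 7, 16), "float16", "data")  # NC1HWC0 input (hypothetical shape)
out, attrs = five2four(data, [1, 48, 7, 7], "float16", "NCHW")
# out has shape (1, 48, 7, 7); since c == c1 * c0 here, no channels are dropped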
Code example #5
def maxpool_with_argmax_dynamic(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data.

    Note:
        Only supports the 5D format (NC1HWC0); pooling works on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding strategy, either 'VALID', 'SAME' or
            a list of four int numbers (the 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, max pooling result, plus a mask tensor and a dict of build attributes.
    """
    attrs = get_dynamic_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride,
                                                strategy)[0]
    for k, v in attr_map_v2.items():
        attrs[k] = v
    if dim_info != "":
        attrs['dim'] = dim_info
    # attrs["custom_tiling"] = maxpool_with_argmax_custom_tiling_strategy(data)
    attrs["enable_feature_library"] = True
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')

    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16' else
                              -340282346638528859811704183484516925440.0,
                              dtype=dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w,
                             in_c0)

    fmap_img2col_ub = img2col(data,
                              fmap_img2col_shape_ub,
                              kernel_h,
                              kernel_w,
                              pad,
                              stride,
                              min_value,
                              tag='')

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.max(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_max")

    zero = akg.tvm.const(0.0, dtype=dtype)
    mask_first_max_shape = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w,
                            in_c0)
    mask_first_max = akg.tvm.compute(mask_first_max_shape,
                                     lambda *indice: zero,
                                     name="mask_first_max")

    attrs["custom_tiling"] = maxpool_with_argmax_dynamic_tensor_strategy(
        data, fmap_img2col_ub, mask_first_max)
    attrs["dynamic_shape"] = ds.set_dynamic_shape_limit_for_tensor(
        output, [64, 64], [2, 3])
    return output, mask_first_max, attrs
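
A usage sketch for maxpool_with_argmax_dynamic (hypothetical shapes; assumes akg is installed and the function above is importable):

import akg

data = akg.tvm.placeholder((1, 4, 112, 112, 16), "float16", "data")  # hypothetical NC1HWC0 shape
out, mask, attrs = maxpool_with_argmax_dynamic(data, (3, 3), (2, 2), "SAME")
# out: pooled NC1HWC0 tensor; mask: the all-zero first-max mask built above; attrs: tiling and build hints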
Code example #6
def maxpool_with_argmax(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data.

    Note:
        Only supports the 5D format (NC1HWC0); pooling works on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding strategy, either 'VALID', 'SAME' or
            a list of four int numbers (the 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, max pooling result, plus a mask marking the first maximum in each
        pooling window and a dict of build attributes.
    """
    attrs = get_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride,
                                                strategy)[0]
    for k, v in attr_map_v2.items():
        attrs[k] = v
    if dim_info != "":
        attrs['dim'] = dim_info
    attrs["custom_tiling"] = maxpool_with_argmax_tiling_strategy(
        data, kernel, stride, strategy)
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')

    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16' else
                              -340282346638528859811704183484516925440.0,
                              dtype=dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w,
                             in_c0)

    fmap_img2col_ub = img2col(data,
                              fmap_img2col_shape_ub,
                              kernel_h,
                              kernel_w,
                              pad,
                              stride,
                              min_value,
                              tag='')

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.max(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_max")

    pooling_mask = akg.tvm.compute(
        fmap_img2col_shape_ub,
        lambda n, c1, kh, kw, oh, ow, c0: akg.tvm.if_then_else(
            fmap_img2col_ub[n, c1, kh, kw, oh, ow, c0] < output[
                n, c1, oh, ow, c0], zero, one),
        name="pooling_mask")

    mask_flag = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_flag")

    mask_init = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_init")

    # spec 2
    @script(capture=locals())
    def hybrid_first_max(mask_, flag_, flag2_, zero_, one_):
        output_ = allocate(
            (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0),
            mask_.dtype, 'local')
        for n_i in range(in_n):
            for c1_i in range(in_c1):
                for oh_i in range(out_h):
                    for ow_i in range(out_w):
                        for c0_i in range(in_c0):
                            output_[n_i, c1_i, 0, 0, oh_i, ow_i,
                                    c0_i] = flag2_[n_i, c1_i, oh_i, ow_i, c0_i]
                for kh_i in range(kernel_h):
                    for kw_i in range(kernel_w):
                        for oh_i in range(out_h):
                            for ow_i in range(out_w):
                                for c0_i in range(in_c0):
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        mask_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] -\
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i]
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        max(output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i], zero_)
                                    flag_[n_i, c1_i, oh_i, ow_i, c0_i] =\
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i] +\
                                        output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i]
        return output_

    mask_first_max = hybrid_first_max(pooling_mask, mask_flag, mask_init, zero,
                                      one)
    return output, mask_first_max, attrs
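
A usage sketch for maxpool_with_argmax (hypothetical shapes; assumes akg is installed and the function above is importable):

import akg

data = akg.tvm.placeholder((1, 4, 56, 56, 16), "float16", "data")  # hypothetical NC1HWC0 shape
out, mask_first_max, attrs = maxpool_with_argmax(data, (2, 2), (2, 2), "VALID")
# out: pooled NC1HWC0 tensor; mask_first_max: 1.0 at the first maximum of each window, 0.0 elsewhere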
Code example #7
def maxpool(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data.

    Note:
        Only supports the 5D format (NC1HWC0); pooling works on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding strategy, either 'VALID',
            'SAME' or a list of four int numbers (the 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor as the max pooling result, together with a dict of build attributes.
    """
    attrs = attr_map
    attrs['dim'] = maxpool_set_dim_func(data, kernel, stride, strategy)[0]

    shape = get_shape(data)
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.check_shape(kernel, 2, "Kernel")
    vc_util.check_shape(stride, 2, "Stride")

    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    in_n, in_c1, in_h, in_w, in_c0 = shape

    [ph_h, _, pw_h, _], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    if attrs.get("dynamic") is True:
        # dynamic shape: although we can represent out_h and out_w using input shapes, they are too complicated
        out_h = akg.tvm.var("OUT_H")
        out_w = akg.tvm.var("OUT_W")

    @script(capture=locals())
    def dynamic_max_pool_hybrid_0(zero_, one_, min_value_, x_, in_n, in_c1,
                                  in_h, in_w, in_c0, out_h, out_w):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)

        for n in range(in_n):
            for c1 in range(in_c1):
                # Head
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                if ph_h <= kh <= in_h + ph_h - 1 and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h, ow * stride_w + kw - pw_h, c0])
                # Tail
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1\
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(
                                            output[n, c1, oh + 1, ow, c0],
                                            x_[n, c1,
                                               (oh + 1) * stride_h + kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])

        return output

    # static shape's hybrid
    @script(capture=locals())
    def static_max_pool_hybrid_0(zero_, one_, min_value_, x_):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)

        for n in range(in_n):
            for c1 in range(in_c1):
                # Head
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                if ph_h <= kh <= in_h + ph_h - 1 and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h, ow * stride_w + kw - pw_h, c0])
                # Tail
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1 \
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(
                                            output[n, c1, oh + 1, ow, c0],
                                            x_[n, c1,
                                               (oh + 1) * stride_h + kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])

        return output

    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16' else
                              -340282346638528859811704183484516925440.0,
                              dtype=dtype)
    if attrs.get("dynamic") is True:
        output = dynamic_max_pool_hybrid_0(zero, one, min_value, data, in_n,
                                           in_c1, in_h, in_w, in_c0, out_h,
                                           out_w)
    else:
        output = static_max_pool_hybrid_0(zero, one, min_value, data)

    return output, attrs
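
A usage sketch for maxpool (hypothetical shapes; assumes akg is installed and the function above is importable):

import akg

data = akg.tvm.placeholder((1, 4, 56, 56, 16), "float16", "data")  # hypothetical NC1HWC0 shape
out, attrs = maxpool(data, (3, 3), (2, 2), "SAME")
# out has shape (1, 4, 28, 28, 16): SAME keeps ceil(in / stride) for H and W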
Code example #8
def maxpool_manual_schedule(shape,
                            kernel,
                            stride,
                            padding,
                            dtype,
                            attrs=None,
                            polyhedral=False):
    """maxpool with manual schedule"""
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)

    maxpool_param_check(kernel, stride, padding)

    data = akg.tvm.placeholder(shape, dtype, name="input_data")
    batch_size, in_c1, input_h, input_w, in_c0 = data.shape

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    if len(padding) == 2:
        pad_h, pad_w = padding
    elif len(padding) == 4:
        pad_h, pad_w = padding[0], padding[2]

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1

    # padding operation
    if pad_h != 0 or pad_w != 0:
        pad_shape = (batch_size, in_c1, input_h + 2 * pad_h,
                     input_w + 2 * pad_w, in_c0)

        padded_input = akg.tvm.compute(
            pad_shape,
            lambda n, c1, h, w, c0: akg.tvm.if_then_else(
                akg.tvm.any(
                    h > input_h + pad_h - 1,
                    h < pad_h,
                    w > input_w + pad_w - 1,
                    w < pad_w,
                ),
                akg.tvm.const(0.0, dtype=dtype),
                data[n, c1, h - pad_h, w - pad_w, c0],
            ),
            name="padded_input")
    else:
        padded_input = data

    # reduce iterators
    it_kernel_h = akg.tvm.reduce_axis((0, kernel_h),
                                      name="iterator_reduction_height")
    it_kernel_w = akg.tvm.reduce_axis((0, kernel_w),
                                      name="iterator_reduction_width")

    out_shape = (batch_size, in_c1, out_size_h, out_size_w, in_c0)

    res = akg.tvm.compute(out_shape,
                          lambda n, c1, h, w, c0: akg.tvm.max(
                              padded_input[n, c1, (h * stride_h + it_kernel_h),
                                           (w * stride_w + it_kernel_w), c0],
                              axis=[it_kernel_h, it_kernel_w]),
                          name="maxpool_not_hybrid")

    s = akg.tvm.create_schedule([res.op])

    if pad_w != 0 or pad_h != 0:
        padded_input = res.op.input_tensors[0]
    else:
        padded_input = res

    # cache reads and writes
    # after this cache_write, use res_ub to reference the reduction axes
    res_ub = s.cache_write(res, "local.UB")
    if pad_w != 0 or pad_h != 0:
        data_ub = s.cache_read(data, "local.UB", [padded_input])
    else:
        data_ub = s.cache_read(data, "local.UB", [res_ub])

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    if len(tiling_factors) != len(res.shape):
        raise RuntimeError("tiling factors mismatch out shape")
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[res_ub].split(res_ub.op.axis[index], factor))

    # get iterators
    iterator_b_outer = split_iterators[0][0]
    iterator_b_inner = split_iterators[0][1]
    iterator_c1_outer = split_iterators[1][0]
    iterator_c1_inner = split_iterators[1][1]
    iterator_h_outer = split_iterators[2][0]
    iterator_h_inner = split_iterators[2][1]
    iterator_w_outer = split_iterators[3][0]
    iterator_w_inner = split_iterators[3][1]
    iterator_c0_outer = split_iterators[4][0]
    iterator_c0_inner = split_iterators[4][1]
    # reduction axis
    iterator_reduce_h = res_ub.op.reduce_axis[0]
    iterator_reduce_w = res_ub.op.reduce_axis[1]

    # move caches
    s[res_ub].compute_at(s[res], res.op.axis[0])
    s[data_ub].compute_at(s[res_ub], iterator_c1_outer)

    if pad_w != 0 or pad_h != 0:
        s[padded_input].compute_at(s[res_ub], iterator_c1_outer)
        s[padded_input].set_scope("local.UB")

    # reorder computation
    s[res_ub].reorder(iterator_b_outer, iterator_b_inner, iterator_c1_outer,
                      iterator_c1_inner, iterator_h_outer, iterator_h_inner,
                      iterator_w_outer, iterator_w_inner, iterator_reduce_h,
                      iterator_reduce_w, iterator_c0_outer, iterator_c0_inner)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, res],
                        "cce",
                        name="maxpool_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "maxpool_ad_manual_schedule"
        utils.create_cce(kernel_name, './', source_code)
    return mod
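
A usage sketch for maxpool_manual_schedule (hypothetical values; assumes akg is installed and the function above is importable). The 'tile' attribute must provide one tiling factor per output dimension (5 for NC1HWC0):

attrs = {"tile": [1, 1, 28, 28, 16]}  # hypothetical tiling factors
mod = maxpool_manual_schedule((1, 4, 56, 56, 16), (2, 2), (2, 2), (0, 0),
                              "float16", attrs=attrs, polyhedral=False)
# mod is the built CCE module; its source is also dumped via utils.create_cce above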
Code example #9
def old_maxpool(data, kernel, stride, pad):
    """
    Old implement for maxpool.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16 or float32, \"NC1HWC0\"
                                  format (N: batch, C1: channel, H: height, W:
                                  width, C0: block size)
        kernel (Union[list, tuple]): List or tuple with two int number as
                                     window sizes of H and W.
        stride (Union[list, tuple]): List or tuple with two int number as
                                     stride sizes of H and W.
        pad (Union[list, tuple]): List or tuple with two int number as
                                  pad sizes of H and W.

    Returns:
        tvm.tensor.Tensor, result of maxpool operator.
    """
    shape = get_shape(data)
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)

    maxpool_param_check(kernel, stride, pad)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    if len(pad) == 2:
        pad_height, pad_width = pad
    else:
        pad_height, pad_width = pad[0], pad[2]

    in_n, in_c1, in_h, in_w, in_c0 = shape

    out_h = int(
        math.floor((in_h + 2 * pad_height - kernel_h) / float(stride_h)) + 1)
    out_w = int(
        math.floor((in_w + 2 * pad_width - kernel_w) / float(stride_w)) + 1)

    if pad_height != 0 or pad_width != 0:
        pad_shape = (in_n, in_c1, in_h + 2 * pad_height, in_w + 2 * pad_width,
                     in_c0)

        pad2d = akg.tvm.compute(
            pad_shape,
            lambda n, c1, h, w, c0: akg.tvm.const(0.0, dtype=dtype),
            name="pad2d")
        pad2d = akg.tvm.compute(
            pad_shape,
            lambda n, c1, h, w, c0: akg.tvm.if_then_else(
                akg.tvm.any(h < pad_height, h > in_h + pad_height - 1, w <
                            pad_width, w > in_w + pad_width - 1),
                pad2d[n, c1, h, w, c0],
                data[n, c1, h - pad_height, w - pad_width, c0],
            ),
            name="pad2d")
    else:
        pad2d = data

    axis_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="ah")
    axis_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="aw")

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)

    res_value = akg.tvm.compute(out_shape,
                                lambda n, c1, h, w, c0: akg.tvm.max(
                                    pad2d[n, c1, h * stride_h + axis_kernel_h,
                                          w * stride_w + axis_kernel_w, c0],
                                    axis=[axis_kernel_h, axis_kernel_w]),
                                name="res_value")
    return res_value
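
A usage sketch for old_maxpool (hypothetical shapes; assumes akg is installed and the function above is importable):

import akg

data = akg.tvm.placeholder((1, 4, 28, 28, 16), "float16", "data")  # hypothetical NC1HWC0 shape
res = old_maxpool(data, (2, 2), (2, 2), (0, 0))
# res has shape (1, 4, 14, 14, 16); no padding is applied since pad is (0, 0)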
Code example #10
File: avgpool.py Project: zhuyawen/akg
def avgpool_with_img2col(data, kernel, stride, strategy):
    """
    Performs the avgpool with img2col.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding strategy, either 'VALID', 'SAME' or
            a list of four int numbers (the 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, average pooling result.
    """
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, "Kernel")
    vc_util.check_shape(stride, 2, "Stride")

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad = [ph_h, ph_t, pw_h, pw_t]
    pad_value = zero_const(dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shp_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w,
                           in_c0)
    fmap_img2col_ub = img2col(data,
                              fmap_img2col_shp_ub,
                              kernel_h,
                              kernel_w,
                              pad,
                              stride,
                              pad_value,
                              tag="")

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    res_sum = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.sum(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_avg")

    dividor = akg.tvm.const(kernel_h * kernel_w, dtype)
    output = akg.tvm.compute(out_shape,
                             lambda *i: res_sum(*i) / dividor,
                             name="res_value")
    return output
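
A usage sketch for avgpool_with_img2col (hypothetical shapes; assumes akg is installed and the function above is importable):

import akg

data = akg.tvm.placeholder((1, 4, 28, 28, 16), "float16", "data")  # hypothetical NC1HWC0 shape
out = avgpool_with_img2col(data, (2, 2), (2, 2), "VALID")
# out has shape (1, 4, 14, 14, 16); each element is the window sum divided by kernel_h * kernel_w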
Code example #11
File: avgpool.py Project: zhuyawen/akg
def avgpool(data, kernel, stride, strategy):
    """
    Performs average pooling on the input data.

    Note:
        Only supports the 5D format (NC1HWC0); pooling works on H and W.
        Supported **Strategies**:

        .. hlist::
          * VALID: does not pad, and drops tail elements when pooling.
                   Output shape will be `ceil((pool_shapes[i] - (kernel[i] - 1)) / stride[i])`
            > **example**:
            > params: inputs => 11, kernel width => 5, stride => 4
            > inputs: 1  2  3  4  5  6  7  8  9  10 11
            > 1st window contains: 1 2 3 4 5
            > 2nd window contains: 5 6 7 8 9
            > dropped: 10 11
          * SAME: pads evenly with zeros on each side, adding the extra element to the
                  tail if the total padding amount is odd.
                  Output shape will be `ceil(pool_shapes[i] / stride[i])`
            > **example**:
            > params: inputs => 10, kernel width => 5, stride => 4
            > inputs: 1  2  3  4  5  6  7  8  9  10
            > padded: 0(pad1) | 1  2  3  4  5  6  7  8  9  10 | 0(pad2) 0(pad3)
            > 1st window contains: 0(pad1) 1 2 3 4
            > 2nd window contains: 4 5 6 7 8
            > 3rd window contains: 8 9 10 0(pad2) 0(pad3)
            > dropped: None
          * CONSTANTS: pads with zeros according to the given constants
                       (and also drops tail elements when pooling).
            > **example**:
            > params: inputs => 10, kernel width => 5, stride => 4, pad => (2, 2)
            > inputs: 1  2  3  4  5  6  7  8  9  10
            > padded: 0(pad1) 0(pad2) | 1  2  3  4  5  6  7  8  9  10 | 0(pad3) 0(pad4)
            > 1st window contains: 0(pad1) 0(pad2) 1 2 3
            > 2nd window contains: 3 4 5 6 7
            > 3rd window contains: 7 8 9 10 0(pad3)
            > dropped: 0(pad4)

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): List or tuple of two int numbers for pooling window's size.
        stride (Union[list, tuple]): List or tuple of two int numbers for window's stride.
        strategy (Union[str, list, tuple]): A string or list or tuple for padding strategy,
            should be 'VALID', 'SAME' or instance of list(including four int numbers,
            as 'CONSTANTS' strategy).

    Returns:
        tvm.tensor.Tensor as result for average pooling, together with a dict of build attributes.
    """
    dim_info, _ = avgpool_set_dim_func(data, kernel, stride, strategy)
    attrs = {DIM: dim_info}
    attrs['disable_half_to_float_sum_opt'] = True

    shape = [x.value for x in data.shape]
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')

    if shape[2] > 60 and shape[3] > 60:
        return avg_pool_5d_hybrid(data, kernel, stride, strategy)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    batch_size, c1, in_size_h, in_size_w, c0 = shape

    [pad_height_head, pad_height_tail, pad_width_head, pad_width_tail], [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad_shape = (batch_size, c1, in_size_h + pad_height_head + pad_height_tail,
                 in_size_w + pad_width_head + pad_width_tail, c0)

    pad2d = akg.tvm.compute(
        pad_shape,
        lambda n, c1, h, w, c0: akg.tvm.if_then_else(
            akg.tvm.
            any(h < pad_height_head, h > in_size_h + pad_height_head - 1, w <
                pad_width_head, w > in_size_w + pad_width_head - 1),
            akg.tvm.const(0.0, dtype=dtype),
            data[n, c1, h - pad_height_head, w - pad_width_head, c0],
        ),
        name="pad2d")

    axis_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="axis_kernel_h")
    axis_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="axis_kernel_w")

    out_shape = (batch_size, c1, out_size_h, out_size_w, c0)

    dividor = akg.tvm.const(kernel_h * kernel_w, dtype)
    res = akg.tvm.compute(out_shape,
                          lambda n, c1, h, w, c0: akg.tvm.sum(
                              pad2d[n, c1, h * stride_h + axis_kernel_h, w *
                                    stride_w + axis_kernel_w, c0],
                              axis=[axis_kernel_h, axis_kernel_w]),
                          name="res")
    res_value = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: res[n, c1, h, w, c0] / dividor,
        name="res_value")
    return res_value, attrs
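
A usage sketch for avgpool (hypothetical shapes; assumes akg is installed and the function above is importable):

import akg

data = akg.tvm.placeholder((1, 4, 28, 28, 16), "float16", "data")  # hypothetical NC1HWC0 shape
out, attrs = avgpool(data, (4, 4), (4, 4), "VALID")
# out has shape (1, 4, 7, 7, 16); inputs with H and W above 60 are routed to avg_pool_5d_hybrid instead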