Example #1
0
 def _fallback_schedule(N, F, Y, X):
     # pylint: disable=unused-argument
     # split N (batch dimension)
     if N > 1:
         cfg["tile_n"] = SplitEntity([-1, 1, 1, 4])
     else:
         cfg["tile_n"] = SplitEntity([1, 1, 1, 1])
     # split F (output channel dimension)
     if F > 1:
         cfg["tile_f"] = SplitEntity([-1, 1, 64, 1])
     # split Y (height dimension)
     y_split_factor = 1
     for candidate in range(5, 17):
         if Y % candidate == 0:
             y_split_factor = candidate
             break
     cfg["tile_y"] = SplitEntity([-1, 1, 1, y_split_factor])
     # split X (width dimension)
     x_split_factor = 1
     for candidate in range(5, 17):
         if X % candidate == 0:
             x_split_factor = candidate
             break
     cfg["tile_x"] = SplitEntity([-1, x_split_factor, 1, 1])
     # split RC (input channel dimension, which is a reduction axis)
     cfg["tile_rc"] = SplitEntity([-1, 1, 16])
     # other configurations
     cfg["fuse_yx"] = OtherOptionEntity(False)
     cfg["unroll_explicit"] = OtherOptionEntity(True)
     cfg["auto_unroll_max_step"] = OtherOptionEntity(1500)
Example #2
0
    def schedule_conv(conv):
        conv_data = conv.op.input_tensors[0]
        kernel_data = conv.op.input_tensors[1]
        in_type = conv_data.dtype

        _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape)

        n, w, h, c = conv.op.axis
        r_h, r_w = conv.op.reduce_axis
        ho, hi = cfg["tile_h"].apply(s, conv, h)
        wo, wi = cfg["tile_w"].apply(s, conv, w)
        co, ci = cfg["tile_c"].apply(s, conv, c)

        split_val = cfg["tile_c"].size[-1]
        use_tensorization = ((in_type == "int16") and (split_val == 8)
                             and (IC % split_val == 0)
                             and (channel_multiplier == 1)
                             and is_aarch64_arm())

        data_pad_value = -1
        if conv_data.name == "data_pad":
            assert isinstance(conv_data.op, tvm.te.ComputeOp)
            # Define a strategy for padding computation
            cfg.define_knob("data_pad_strategy", [1, 2, 3])
            if cfg.is_fallback:
                # We cannot inline padding when tensorizing.
                # So, if we can tensorize, let's compute_at the closest axis
                cfg["data_pad_strategy"] = (OtherOptionEntity(2)
                                            if use_tensorization else
                                            OtherOptionEntity(3))
            # Compute padding on the third to last axis of the computation
            if cfg["data_pad_strategy"].val == 1:
                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                s[conv_data].compute_at(s[conv], ho)
            # Compute padding on the second to last axis of the computation
            if cfg["data_pad_strategy"].val == 2:
                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                s[conv_data].compute_at(s[conv], wo)
            # Inline padding during computation
            if cfg["data_pad_strategy"].val == 3:
                s[conv_data].compute_inline()
            data_pad_value = cfg["data_pad_strategy"].val

        if use_tensorization and data_pad_value != 3:
            smlal = smlal_int16_int32()
            s[conv].tensorize(ci, smlal)
        else:
            s[conv].vectorize(ci)

        if cfg["unroll_tile"].val:
            s[conv].unroll(r_h)
            s[conv].unroll(r_w)
            s[conv].unroll(wi)
            s[conv].unroll(hi)

        s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
        fused_n_ho = s[conv].fuse(n, ho)
        return fused_n_ho
Example #3
0
def _fallback_schedule_int8(cfg, wkl):
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1

    oc_bn = 16
    assert wkl.out_filter % oc_bn == 0

    ic_bn = 1
    for bn in range(oc_bn, 0, -4):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break
    assert wkl.in_filter % 4 == 0

    reg_n = 1
    for n in range(31, 0, -1):
        if out_width % n == 0:
            reg_n = n
            break

    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
    cfg["unroll_kw"] = OtherOptionEntity(False)
Example #4
0
def _fallback_schedule_int8(cfg, wkl):
    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
    HSTR, WSTR = wkl.stride_h, wkl.stride_w
    out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1

    oc_bn = 16
    assert wkl.out_filter % oc_bn == 0

    ic_bn = 1
    for bn in range(oc_bn, 0, -4):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break
    assert wkl.in_filter % 4 == 0

    reg_n = 1
    for n in range(31, 0, -1):
        if out_width % n == 0:
            reg_n = n
            break

    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
    cfg["unroll_kw"] = OtherOptionEntity(False)
Example #5
0
def _fallback_schedule(cfg, wkl):
    simd_width = get_simd_32bit_lanes()
    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
    HSTR, WSTR = wkl.stride_h, wkl.stride_w
    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1

    out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1

    oc_bn = 1
    for bn in range(simd_width, 0, -1):
        if wkl.out_filter % bn == 0:
            oc_bn = bn
            break

    ic_bn = 1
    for bn in range(oc_bn, 0, -1):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break

    reg_n = 1
    for n in range(31, 0, -1):
        if out_width % n == 0:
            reg_n = n
            break

    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
    cfg["unroll_kw"] = OtherOptionEntity(False)
Example #6
0
    def schedule_conv(conv):
        conv_data = conv.op.input_tensors[0]

        n, w, h, c = conv.op.axis
        r_h, r_w = conv.op.reduce_axis
        ho, hi = cfg["tile_h"].apply(s, conv, h)
        wo, wi = cfg["tile_w"].apply(s, conv, w)
        co, ci = cfg["tile_c"].apply(s, conv, c)

        if conv_data.name == "data_pad":
            assert isinstance(conv_data.op, tvm.te.ComputeOp)
            # Define a policy for padding computation
            cfg.define_knob("data_pad_inline", [1, 2, 3])
            if cfg.is_fallback:
                cfg["data_pad_inline"] = OtherOptionEntity(3)
            if cfg["data_pad_inline"].val == 1:
                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                s[conv_data].compute_at(s[conv], ho)
            if cfg["data_pad_inline"].val == 2:
                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                s[conv_data].compute_at(s[conv], wo)
            if cfg["data_pad_inline"].val == 3:
                s[conv_data].compute_inline()

        s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
        fused_n_ho = s[conv].fuse(n, ho)
        s[conv].vectorize(ci)
        return fused_n_ho
Example #7
0
def _fallback_schedule(cfg, wkl):
    simd_width = get_fp32_len()
    DPAD, HPAD, WPAD = wkl.dpad, wkl.hpad, wkl.wpad
    DSTR, HSTR, WSTR = wkl.dstride, wkl.hstride, wkl.wstride
    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1

    oc_bn = 1
    for bn in range(simd_width, 0, -1):
        if wkl.out_filter % bn == 0:
            oc_bn = bn
            break

    ic_bn = 1
    for bn in range(oc_bn, 0, -1):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break

    reg_n = 1
    for n in range(7, 0, -1):
        if out_width % n == 0:
            reg_n = n
            break
    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
    cfg["unroll_kw"] = OtherOptionEntity(False)
Example #8
0
def _fallback_schedule(cfg, wkl):
    simd_width = get_fp32_len()
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1

    oc_bn = 1
    for bn in range(simd_width, 0, -1):
        if wkl.out_filter % bn == 0:
            oc_bn = bn
            break

    ic_bn = 1
    for bn in range(oc_bn, 0, -1):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break

    for ow_factor in range(out_width, 0, -1):
        if out_width % ow_factor == 0:
            for oh_factor in range(out_height, 0, -1):
                if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
                    cfg["tile_ic"] = SplitEntity(
                        [wkl.in_filter // ic_bn, ic_bn])
                    cfg["tile_oc"] = SplitEntity(
                        [wkl.out_filter // oc_bn, oc_bn])
                    cfg["tile_oh"] = OtherOptionEntity(oh_factor)
                    cfg["tile_ow"] = SplitEntity(
                        [out_width // ow_factor, ow_factor])
                    return
    raise ValueError(
        "cannot decide default schedule for workload: {}".format(wkl))
Example #9
0
def conv2d_cudnn(cfg,
                 data,
                 kernel,
                 strides,
                 padding,
                 dilation,
                 groups=1,
                 layout="NCHW",
                 out_dtype="float32"):
    """Compute conv2d using CuDNN library"""
    if layout == "NCHW":
        tensor_format = 0  # CUDNN_TENSOR_NCHW
        N, _, H, W = get_const_tuple(data.shape)
    elif layout == "NHWC":
        tensor_format = 1  # CUDNN_TENSOR_NHWC
        N, H, W, _ = get_const_tuple(data.shape)
    else:
        raise ValueError("Unsupported layout %s in cudnn" % layout)
    CO, CI, KH, KW = get_const_tuple(kernel.shape)

    # handle dilation
    stride_h, stride_w = (strides,
                          strides) if isinstance(strides, int) else strides
    dilation_h, dilation_w = (dilation, dilation) if isinstance(
        dilation, int) else dilation

    if (isinstance(padding, (list, tuple)) and len(padding) == 4
            and (padding[0] != padding[2] or padding[1] != padding[3])):
        raise ValueError("Cudnn doesn't support asymmetric padding.")
    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
    OH = (H + pt + pb - KH) // stride_h + 1
    OW = (W + pl + pr - KW) // stride_w + 1
    cfg.add_flop(groups * 2 * N * OH * OW * CO * CI *
                 ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1))

    if data.dtype == "int8" or kernel.dtype == "int8":
        if layout == "NCHW":
            raise ValueError("NCHW layout do not support int8 in cudnn")
        dtype = "int32"
    else:
        dtype = data.dtype

    cfg.define_knob("algo", range(8))
    if cfg.is_fallback:  # Let CUDNN choose the best algo
        cfg["algo"] = OtherOptionEntity(-1)

    return cudnn.conv_forward(
        data,
        kernel,
        [pt, pl],  # cudnn padding pt, pl on both sides of input
        [stride_h, stride_w],
        [dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=cfg["algo"].val,
        conv_dtype=dtype,
        groups=groups,
    )
Example #10
0
def _get_default_config(cfg,
                        data,
                        kernel,
                        strides,
                        padding,
                        out_dtype,
                        is_depthwise=False):
    if is_depthwise:
        raise RuntimeError("Depthwise not supported for intel graphics.")

    batch_size, in_channel, height, width = get_const_tuple(data.shape)
    out_channel, _, hkernel, _ = get_const_tuple(kernel.shape)
    HSTR, _ = strides

    ic_bn = 1
    oc_bn, oc_bn_upper = 16, 16
    for i in range(oc_bn_upper, 0, -1):
        if out_channel % i == 0:
            oc_bn = i
            break

    if HSTR == 2:
        if out_channel + hkernel == 515:
            block_oh = 4
            block_ow = 4
        else:
            block_oh = 4
            block_ow = 5
    elif hkernel == 3:
        if out_channel == 512:
            block_oh = 2
            block_ow = 7
        else:
            block_oh = 2
            block_ow = 14
    else:
        block_oh = 1
        block_ow = 16
    cfg["tile_ic"] = SplitEntity([in_channel // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([out_channel // oc_bn, oc_bn])
    cfg["block_oh"] = OtherOptionEntity(block_oh)
    cfg["block_ow"] = OtherOptionEntity(block_ow)
Example #11
0
def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements):
    """Fallback schedule for 1x1 conv2d int8 on cpu.
    Normally the inner most pattern takes two int8/uint8 tensors
    data[num_int8_elements] and kernel[int32_lanes, num_int8_elements],
    produces a dot product int32/uint32 output[int32_lanes].

    Parameters
    ----------
    int32_lanes : int
        How many numbers of int32/uint32 will be produced using intrinsic.
        This is related to output channel.
    num_int8_elements : int
        How many numbers of input int32/uint32 will be multiplied and reduced.
        This is related to input channel.
    """
    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
    HSTR, WSTR = wkl.stride_h, wkl.stride_w
    out_height = (wkl.height + pt + pb - wkl.kernel_h) // HSTR + 1
    out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1

    assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % (
        wkl.out_filter,
        int32_lanes,
    )
    assert wkl.in_filter % num_int8_elements == 0, "wkl.in_filter=%d, num_int8_elements=%d" % (
        wkl.in_filter,
        num_int8_elements,
    )

    oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements
    ic_bn = 1
    for bn in range(oc_bn, 0, -4):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break

    for ow_factor in range(out_width, 0, -1):
        if out_width % ow_factor == 0:
            for oh_factor in range(out_height, 0, -1):
                if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
                    cfg["tile_ic"] = SplitEntity(
                        [wkl.in_filter // ic_bn, ic_bn])
                    cfg["tile_oc"] = SplitEntity(
                        [wkl.out_filter // oc_bn, oc_bn])
                    cfg["tile_oh"] = OtherOptionEntity(oh_factor)
                    cfg["tile_ow"] = SplitEntity(
                        [out_width // ow_factor, ow_factor])
                    return
    raise ValueError(
        "cannot decide default schedule for workload: {}".format(wkl))
Example #12
0
def configure_knobs(cfg, M, K):
    """ Configure auto-tuning knobs for the interleaved strategy """

    x, y = cfg.axis(M // 4), cfg.axis(K // 16)
    cfg.define_reorder("reorder_gemm", [x, y],
                       policy="candidate",
                       candidate=[[x, y], [y, x]])

    outer_loop, inner_loop = cfg.axis(4), cfg.axis(16)
    cfg.define_annotate("A_interleaved_unroll_vec", [outer_loop, inner_loop],
                        policy="try_unroll_vec")

    # Fallback configuration
    if cfg.is_fallback:
        cfg["reorder_gemm"] = ReorderEntity([0, 1])
        cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"])

    if not is_dotprod_available():
        cfg.define_knob("gemm_quantized_unroll", [True, False])
        cfg.define_knob("gemm_quantized_interleave", [True, False])

        if cfg.is_fallback:
            cfg["gemm_quantized_unroll"] = OtherOptionEntity(False)
            cfg["gemm_quantized_interleave"] = OtherOptionEntity(True)
Example #13
0
def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes,
                                      num_int8_elements):
    """Fallback schedule for conv2d int8 on cpu.
    Normally the inner most pattern takes two int8/uint8 tensors
    data[num_int8_elements] and kernel[int32_lanes, num_int8_elements],
    produces a dot product int32/uint32 output[int32_lanes].

    Parameters
    ----------
    int32_lanes : int
        How many numbers of int32/uint32 will be produced using intrinsic.
        This is related to output channel.
    num_int8_elements : int
        How many numbers of input int32/uint32 will be multiplied and reduced.
        This is related to input channel.
    """
    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
    HSTR, WSTR = wkl.stride_h, wkl.stride_w
    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
    out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1

    assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % (
        wkl.out_filter,
        int32_lanes,
    )
    assert wkl.in_filter % num_int8_elements == 0, "wkl.in_filter=%d, num_int8_elements=%d" % (
        wkl.in_filter,
        num_int8_elements,
    )

    oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements
    ic_bn = 1
    for bn in range(oc_bn, 0, -4):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break

    reg_n = 1
    for n in range(31, 0, -1):
        if out_width % n == 0:
            reg_n = n
            break

    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
    cfg["unroll_kw"] = OtherOptionEntity(False)
Example #14
0
def _fallback_schedule(cfg, wkl):
    """
    Get default schedule for the workload
    Parameters
    ----------
    cfg : tvm.autotvm.task.space.FallbackConfigEntity
        Fallback config to be updated
    wkl : topi.nn.depthwise_conv2d.Workload
        Convolution workload
    """
    simd_width = get_simd_32bit_lanes()

    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
    HSTR, WSTR = wkl.stride_h, wkl.stride_w
    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1

    out_width = (wkl.width - dilated_kernel_w + pl + pr) // WSTR + 1

    oc_bn = 1
    for bn in range(simd_width, 0, -1):
        if wkl.out_filter % bn == 0:
            oc_bn = bn
            break

    ic_bn = 1
    for bn in range(oc_bn, 0, -1):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break

    reg_n = 1
    for n in range(31, 0, -1):
        if out_width % n == 0:
            reg_n = n
            break

    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
    cfg["unroll_kw"] = OtherOptionEntity(False)
Example #15
0
def _fallback_schedule(cfg, wkl):
    simd_width = 4  # assume ARM SIMD Width is 4
    pad_left, pad_right = wkl.padl, wkl.padr
    stride_w = wkl.stride_w
    out_width = (wkl.width + pad_left + pad_right -
                 wkl.kernel_w) // stride_w + 1
    groups = wkl.groups
    kernels_per_group = wkl.out_filter // groups
    kernel_depth = wkl.in_filter // groups

    oc_bn = 1

    oc_bn = 1
    for bn in range(simd_width, 0, -1):
        if kernels_per_group % bn == 0:
            oc_bn = bn
            break
    if oc_bn > kernels_per_group:
        oc_bn = kernels_per_group

    ic_bn = 1
    for bn in range(oc_bn, 0, -1):
        if kernel_depth % bn == 0:
            ic_bn = bn
            break
    if ic_bn > kernel_depth:
        ic_bn = kernel_depth

    reg_n = 1
    for n in range(31, 0, -1):
        if out_width % n == 0:
            reg_n = n
            break

    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
    cfg["unroll_kw"] = OtherOptionEntity(False)
Example #16
0
def _fallback_schedule(cfg, wkl):
    """
    Get default schedule for the workload
    Parameters
    ----------
    cfg : tvm.autotvm.task.space.FallbackConfigEntity
        Fallback config to be updated
    wkl : topi.nn.depthwise_conv2d.Workload
        Convolution workload
    """
    simd_width = get_fp32_len()

    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1

    oc_bn = 1
    for bn in range(simd_width, 0, -1):
        if wkl.out_filter % bn == 0:
            oc_bn = bn
            break

    ic_bn = 1
    for bn in range(oc_bn, 0, -1):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break

    reg_n = 1
    for n in range(31, 0, -1):
        if out_width % n == 0:
            reg_n = n
            break

    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
    cfg["unroll_kw"] = OtherOptionEntity(False)
Example #17
0
def _fallback_schedule(cfg, wkl):
    simd_width = get_simd_32bit_lanes()
    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
    HSTR, WSTR = wkl.stride_h, wkl.stride_w
    dilated_kernel_h = (wkl.kernel_h - 1) * wkl.dilation_h + 1
    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1

    out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1
    out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1

    oc_bn = 1
    for bn in range(simd_width, 0, -1):
        if wkl.out_filter % bn == 0:
            oc_bn = bn
            break

    ic_bn = 1
    for bn in range(oc_bn, 0, -1):
        if wkl.in_filter % bn == 0:
            ic_bn = bn
            break

    for ow_factor in range(out_width, 0, -1):
        if out_width % ow_factor == 0:
            for oh_factor in range(out_height, 0, -1):
                if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
                    cfg["tile_ic"] = SplitEntity(
                        [wkl.in_filter // ic_bn, ic_bn])
                    cfg["tile_oc"] = SplitEntity(
                        [wkl.out_filter // oc_bn, oc_bn])
                    cfg["tile_oh"] = OtherOptionEntity(oh_factor)
                    cfg["tile_ow"] = SplitEntity(
                        [out_width // ow_factor, ow_factor])
                    return
    raise ValueError(
        "cannot decide default schedule for workload: {}".format(wkl))
def schedule_depthwise_conv2d_nhwc(cfg, outs):
    """Create the schedule for depthwise_conv2d_nchw_spatial_pack"""
    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    s = te.create_schedule([x.op for x in outs])
    out = outs[0]

    ##### space definition begin #####
    n, h, w, c = s[out].op.axis
    # Split the number of input/output channels
    cfg.define_split("tile_c", c, num_outputs=2)
    # Split the height of the convolution
    _, hi = cfg.define_split("tile_h", h, num_outputs=2)
    # Split the width of the convolution
    _, wi = cfg.define_split("tile_w", w, num_outputs=2)
    # Additional out (e.g., requantization, bias addition, etc..)
    # 0: locate the output on the second last axis of the main compuation
    # 1: locate the output closest to the main computation
    cfg.define_knob("locate_output", [0, 1])
    # Determine if we should unroll the computation of the inner tile
    cfg.define_knob("unroll_tile", [True, False])

    # fallback support
    if cfg.is_fallback:
        cfg["tile_c"] = SplitEntity([-1, 8])
        cfg["tile_h"] = SplitEntity([-1, 2])
        cfg["tile_w"] = SplitEntity([-1, 2])
        cfg["locate_output"] = OtherOptionEntity(1)
        cfg["unroll_tile"] = OtherOptionEntity(True)
    ##### space definition end #####

    def schedule_conv(conv):
        conv_data = conv.op.input_tensors[0]
        kernel_data = conv.op.input_tensors[1]
        in_type = conv_data.dtype

        _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape)

        n, w, h, c = conv.op.axis
        r_h, r_w = conv.op.reduce_axis
        ho, hi = cfg["tile_h"].apply(s, conv, h)
        wo, wi = cfg["tile_w"].apply(s, conv, w)
        co, ci = cfg["tile_c"].apply(s, conv, c)

        split_val = cfg["tile_c"].size[-1]
        use_tensorization = (
            (in_type == "int16")
            and (split_val == 8)
            and (IC % split_val == 0)
            and (channel_multiplier == 1)
            and is_aarch64_arm()
        )

        data_pad_value = -1
        if conv_data.name == "data_pad":
            assert isinstance(conv_data.op, tvm.te.ComputeOp)
            # Define a strategy for padding computation
            cfg.define_knob("data_pad_strategy", [1, 2, 3])
            if cfg.is_fallback:
                # We cannot inline padding when tensorizing.
                # So, if we can tensorize, let's compute_at the closest axis
                cfg["data_pad_strategy"] = (
                    OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3)
                )
            # Compute padding on the third to last axis of the computation
            if cfg["data_pad_strategy"].val == 1:
                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                s[conv_data].compute_at(s[conv], ho)
            # Compute padding on the second to last axis of the computation
            if cfg["data_pad_strategy"].val == 2:
                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                s[conv_data].compute_at(s[conv], wo)
            # Inline padding during computation
            if cfg["data_pad_strategy"].val == 3:
                s[conv_data].compute_inline()
            data_pad_value = cfg["data_pad_strategy"].val

        if use_tensorization and data_pad_value != 3:
            smlal = smlal_int16_int32()
            s[conv].tensorize(ci, smlal)
        else:
            s[conv].vectorize(ci)

        if cfg["unroll_tile"].val:
            s[conv].unroll(r_h)
            s[conv].unroll(r_w)
            s[conv].unroll(wi)
            s[conv].unroll(hi)

        s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
        fused_n_ho = s[conv].fuse(n, ho)
        return fused_n_ho

    def schedule_conv_out(out):
        n, h, w, c = out.op.axis
        co, ci = cfg["tile_c"].apply(s, out, c)
        wo, wi = cfg["tile_w"].apply(s, out, w)
        ho, hi = cfg["tile_h"].apply(s, out, h)
        s[out].reorder(n, ho, wo, co, hi, wi, ci)
        if cfg["unroll_tile"]:
            s[out].unroll(wi)
            s[out].unroll(hi)

        if out.dtype in ["int8", "uint8"]:
            # In case of quantized convolution further split the channel in batches of 4 elements
            # so that we can use arm intrinsics to run fixed_point_multiplication
            ci_outer, ci_inner = s[out].split(ci, 4)
            s[out].vectorize(ci_inner)
            s[out].unroll(ci_outer)

        fused_n_ho = s[out].fuse(n, ho)
        return hi, wi, fused_n_ho

    def _callback(op):
        if op.name == "depthwise_conv2d_nhwc_output":
            conv = op.output(0)
            if conv != out:
                hi, wi, p_axis = schedule_conv_out(out)
                schedule_conv(conv)
                if cfg["locate_output"].val == 0:
                    s[conv].compute_at(s[out], hi)
                if cfg["locate_output"].val == 1:
                    s[conv].compute_at(s[out], wi)
            else:
                p_axis = schedule_conv(out)

            s[out].parallel(p_axis)

    traverse_inline(s, outs[0].op, _callback)
    return s
Example #19
0
def schedule_depthwise_conv2d_nhwc(cfg, outs):
    """Create the schedule for depthwise_conv2d_nchw_spatial_pack"""
    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    s = te.create_schedule([x.op for x in outs])
    out = outs[0]

    ##### space definition begin #####
    n, h, w, c = s[out].op.axis
    cfg.define_split("tile_c", c, num_outputs=2)
    _, hi = cfg.define_split("tile_h", h, num_outputs=2)
    _, wi = cfg.define_split("tile_w", w, num_outputs=2)
    cfg.define_knob("locate_output", [0, 1])

    # fallback support
    if cfg.is_fallback:
        cfg["tile_c"] = SplitEntity([-1, 8])
        cfg["tile_h"] = SplitEntity([-1, 2])
        cfg["tile_w"] = SplitEntity([-1, 2])
        cfg["locate_output"] = OtherOptionEntity(1)
    ##### space definition end #####

    def schedule_conv(conv):
        conv_data = conv.op.input_tensors[0]

        n, w, h, c = conv.op.axis
        r_h, r_w = conv.op.reduce_axis
        ho, hi = cfg["tile_h"].apply(s, conv, h)
        wo, wi = cfg["tile_w"].apply(s, conv, w)
        co, ci = cfg["tile_c"].apply(s, conv, c)

        if conv_data.name == "data_pad":
            assert isinstance(conv_data.op, tvm.te.ComputeOp)
            # Define a policy for padding computation
            cfg.define_knob("data_pad_inline", [1, 2, 3])
            if cfg.is_fallback:
                cfg["data_pad_inline"] = OtherOptionEntity(3)
            if cfg["data_pad_inline"].val == 1:
                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                s[conv_data].compute_at(s[conv], ho)
            if cfg["data_pad_inline"].val == 2:
                s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                s[conv_data].compute_at(s[conv], wo)
            if cfg["data_pad_inline"].val == 3:
                s[conv_data].compute_inline()

        s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
        fused_n_ho = s[conv].fuse(n, ho)
        s[conv].vectorize(ci)
        return fused_n_ho

    def schedule_conv_out(out):
        n, h, w, c = out.op.axis
        co, ci = cfg["tile_c"].apply(s, out, c)
        wo, wi = cfg["tile_w"].apply(s, out, w)
        ho, hi = cfg["tile_h"].apply(s, out, h)
        s[out].reorder(n, ho, wo, co, hi, wi)

        if out.dtype in ["int8", "uint8"]:
            # In case of quantized convolution further split the channel in batches of 4 elements
            # so that we can use arm intrinsics to run fixed_point_multiplication
            ci_outer, ci_inner = s[out].split(ci, 4)
            s[out].vectorize(ci_inner)

        fused_n_ho = s[out].fuse(n, ho)
        return hi, wi, fused_n_ho

    def _callback(op):
        if op.name == "depthwise_conv2d_nhwc_output":
            conv = op.output(0)
            if conv != out:
                hi, wi, p_axis = schedule_conv_out(out)
                schedule_conv(conv)
                if cfg["locate_output"].val == 0:
                    s[conv].compute_at(s[out], hi)
                if cfg["locate_output"].val == 1:
                    s[conv].compute_at(s[out], wi)
            else:
                p_axis = schedule_conv(out)

            s[out].parallel(p_axis)

    traverse_inline(s, outs[0].op, _callback)
    return s
Example #20
0
    def _schedule(cfg, op):
        C = op.output(0)
        A, B = s[C].op.input_tensors
        if len(B.op.input_tensors) == 1 and B.op.input_tensors[0] == A:
            s[B].compute_inline()
        _, M, N = get_const_tuple(C.shape)
        AA = s.cache_read(A, "shared", [C])
        AL = s.cache_read(AA, "local", [C])
        BB = s.cache_read(B, "shared", [C])
        BL = s.cache_read(BB, "local", [C])
        CC = s.cache_write(C, "local")
        if op not in s.outputs:
            s[C].compute_inline()
            C = s.outputs[0].output(0)

        b, y, x = s[C].op.axis
        (k, ) = s[CC].op.reduce_axis

        cfg.define_split("tile_y", y, num_outputs=3)
        cfg.define_split("tile_x", x, num_outputs=3)
        cfg.define_split("tile_k", k, num_outputs=2)
        cfg.define_knob("auto_unroll_max_step", [8, 16, 32, 64])
        target = tvm.target.Target.current()
        if target.kind.name in ["nvptx", "rocm"]:
            # llvm-based backends cannot do non-explicit unrolling
            cfg.define_knob("unroll_explicit", [1])
        else:
            cfg.define_knob("unroll_explicit", [0, 1])

        if cfg.is_fallback:
            y_bn = get_max_power2_factor(M, 64)
            x_bn = get_max_power2_factor(N, 64)
            y_nthreads = min(y_bn, 8)
            x_nthreads = min(x_bn, 8)
            cfg["tile_x"] = SplitEntity([-1, x_nthreads, x_bn // x_nthreads])
            cfg["tile_y"] = SplitEntity([-1, y_nthreads, y_bn // y_nthreads])
            cfg["tile_k"] = SplitEntity([-1, 8])
            cfg["auto_unroll_max_step"] = OtherOptionEntity(16)

        by, ty, yi = cfg["tile_y"].apply(s, C, y)
        bx, tx, xi = cfg["tile_x"].apply(s, C, x)

        thread_x = te.thread_axis("threadIdx.x")
        thread_y = te.thread_axis("threadIdx.y")

        s[C].reorder(b, by, bx, ty, tx, yi, xi)
        s[C].bind(b, te.thread_axis("blockIdx.z"))
        s[C].bind(by, te.thread_axis("blockIdx.y"))
        s[C].bind(bx, te.thread_axis("blockIdx.x"))
        s[C].bind(ty, thread_y)
        s[C].bind(tx, thread_x)
        s[C].pragma(yi, "auto_unroll_max_step",
                    cfg["auto_unroll_max_step"].val)
        s[C].pragma(yi, "unroll_explicit", cfg["unroll_explicit"].val)

        s[CC].compute_at(s[C], tx)
        _, yi, xi = s[CC].op.axis
        ko, ki = cfg["tile_k"].apply(s, CC, k)
        s[CC].reorder(ko, ki, yi, xi)
        s[CC].pragma(ki, "auto_unroll_max_step",
                     cfg["auto_unroll_max_step"].val)
        s[CC].pragma(ki, "unroll_explicit", cfg["unroll_explicit"].val)

        s[AA].compute_at(s[CC], ko)
        s[AL].compute_at(s[CC], ki)
        s[BB].compute_at(s[CC], ko)
        s[BL].compute_at(s[CC], ki)
        _, y, k = s[AA].op.axis
        ty, yi = s[AA].split(y, nparts=cfg["tile_y"].size[1])
        tx, ki = s[AA].split(k, nparts=cfg["tile_x"].size[1])
        s[AA].reorder(ty, tx, yi, ki)
        s[AA].bind(ty, thread_y)
        s[AA].bind(tx, thread_x)
        s[AA].pragma(yi, "auto_unroll_max_step",
                     cfg["auto_unroll_max_step"].val)
        s[AA].pragma(yi, "unroll_explicit", cfg["unroll_explicit"].val)

        _, x, k = s[BB].op.axis
        ty, xi = s[BB].split(x, nparts=cfg["tile_y"].size[1])
        tx, ki = s[BB].split(k, nparts=cfg["tile_x"].size[1])
        s[BB].bind(ty, thread_y)
        s[BB].bind(tx, thread_x)
        s[BB].reorder(ty, tx, xi, ki)
        s[BB].pragma(xi, "auto_unroll_max_step",
                     cfg["auto_unroll_max_step"].val)
        s[BB].pragma(xi, "unroll_explicit", cfg["unroll_explicit"].val)
Example #21
0
def compute_conv2d_gemm_without_weight_transform(cfg, data, B_interleaved_t,
                                                 strides, padding, dilation,
                                                 out_dtype, kernel_size,
                                                 output_channels):
    """Compute conv2d by transforming the input,
    executing GEMM and transforming the output back"""
    batches, IH, IW, IC = get_const_tuple(data.shape)

    KH, KW = get_const_tuple(kernel_size)
    OC = get_const_int(output_channels)

    K_AREA = KH * KW

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = get_const_tuple(dilation)

    dilated_kernel_h = (KH - 1) * dilation_h + 1
    dilated_kernel_w = (KW - 1) * dilation_w + 1

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)

    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
    if pad_top or pad_left:
        data_pad = nn.pad(data, [0, pad_top, pad_left, 0],
                          [0, pad_down, pad_right, 0],
                          name="data_pad")
    else:
        data_pad = data

    # --- Im2col
    M = OH * OW
    K = IC * K_AREA
    N = OC

    A_shape = (batches, M, K)
    if K_AREA == 1:
        A = te.compute(
            A_shape,
            lambda n, x, y: data_pad[n, HSTR * (x // OW), WSTR * (x % OW), y],
            name="data_flatten",
        )
    else:
        A = te.compute(
            A_shape,
            lambda n, x, y: data_pad[n, HSTR * (x // OW) + dilation_h * (
                (y // IC) // KW), WSTR * (x % OW) + dilation_w *
                                     ((y // IC) % KW), y % IC, ],
            name="data_im2col",
        )
    N_transformed = B_interleaved_t.shape[0]

    # --- Pad if necessary
    idxm = tvm.tir.indexmod

    pad_m = 0
    pad_k = 0

    if M % 4 != 0:
        pad_m = 4 - (M % 4)

    if K % 16 != 0:
        pad_k = 16 - (K % 16)

    M_padded = M + pad_m
    K_padded = K + pad_k

    pad_before = (0, 0, 0)
    pad_after = (0, pad_m, pad_k)

    if pad_m != 0 or pad_k != 0:
        A = nn.pad(A,
                   pad_before=pad_before,
                   pad_after=pad_after,
                   name="A_padded")

    # --- GEMM: A*B'
    k = te.reduce_axis((0, K_padded), "k")

    A_interleaved = te.compute(
        (batches, M_padded // 4, K_padded // 16, 4, 16),
        lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * y],
        name="A_interleaved",
    )

    C_interleaved = te.compute(
        (batches, M_padded // 4, N_transformed, 4, 4),
        lambda b, x, y, w, z: te.sum(
            A_interleaved[b, x, k // 16, w, idxm(k, 16)].astype(out_dtype) *
            B_interleaved_t[y, k // 16, z, idxm(k, 16)].astype(out_dtype),
            axis=k,
        ),
        name="C_interleaved",
    )

    # --- Unpack C
    C = te.compute(
        (batches, M, N),
        lambda b, x, y: C_interleaved[b, x // 4, y // 4,
                                      idxm(x, 4),
                                      idxm(y, 4)],
        name="C",
    )

    # --- Produce the conv output
    out_shape = (batches, OH, OW, OC)
    out = te.compute(out_shape,
                     lambda b, x, y, z: C(b, y + OW * x, z),
                     name="conv2d_gemm_output")

    # Configuration space
    x, y = cfg.axis(M_padded // 4), cfg.axis(K_padded // 16)
    cfg.define_reorder("reorder_gemm", [x, y],
                       policy="candidate",
                       candidate=[[x, y], [y, x]])

    outer_loop, inner_loop = cfg.axis(4), cfg.axis(16)
    cfg.define_annotate("A_interleaved_unroll_vec", [outer_loop, inner_loop],
                        policy="try_unroll_vec")
    cfg.define_knob("gemm_quantized_unroll", [True, False])
    cfg.define_knob("gemm_quantized_interleave", [True, False])

    # Fallback configuration
    if cfg.is_fallback:
        cfg["reorder_gemm"] = ReorderEntity([0, 1])
        cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"])
        cfg["gemm_quantized_unroll"] = OtherOptionEntity(False)
        cfg["gemm_quantized_interleave"] = OtherOptionEntity(True)
    return out
Example #22
0
def conv2d_cudnn(
    cfg, data, kernel, strides, padding, dilation, groups=1, layout="NCHW", out_dtype="float32"
):
    """Compute conv2d using CuDNN library"""
    if layout == "NCHW":
        tensor_format = 0  # CUDNN_TENSOR_NCHW
        N, _, H, W = get_const_tuple(data.shape)
    elif layout == "NHWC":
        tensor_format = 1  # CUDNN_TENSOR_NHWC
        N, H, W, _ = get_const_tuple(data.shape)
    else:
        raise ValueError("Unsupported layout %s in cudnn" % layout)
    CO, CI, KH, KW = get_const_tuple(kernel.shape)

    # handle dilation
    stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
    dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation
    KH_dilated = (KH - 1) * dilation_h + 1
    KW_dilated = (KW - 1) * dilation_h + 1

    pt, pl, pb, pr = get_pad_tuple(padding, (KH_dilated, KW_dilated))
    if (pt != pb) or (pl != pr):
        raise ValueError("Cudnn doesn't support asymmetric padding.")

    OH = (H + pt + pb - KH) // stride_h + 1
    OW = (W + pl + pr - KW) // stride_w + 1

    if isinstance(N, int):
        cfg.add_flop(
            groups
            * 2
            * N
            * OH
            * OW
            * CO
            * CI
            * ((KH - 1) * dilation_h + 1)
            * ((KW - 1) * dilation_w + 1)
        )

    if data.dtype == "int8" or kernel.dtype == "int8":
        if layout == "NCHW":
            raise ValueError("NCHW layout do not support int8 in cudnn")
        dtype = "int32"
    else:
        dtype = data.dtype

    cfg.define_knob("algo", range(cudnn.algo_to_index("fwd", "CUDNN_CONVOLUTION_FWD_ALGO_COUNT")))
    if cfg.is_fallback:
        if cudnn.exists():
            # Let CUDNN choose the best algo, based on benchmarks run
            # on the local machine.  In the future, this should be
            # based on parameters stored in the Target.
            cfg["algo"] = OtherOptionEntity(-1)
        else:
            cfg["algo"] = OtherOptionEntity(0)

    return cudnn.conv_forward(
        data,
        kernel,
        [pt, pl],  # cudnn padding pt, pl on both sides of input
        [stride_h, stride_w],
        [dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=cfg["algo"].val,
        conv_dtype=dtype,
        groups=groups,
    )