Example #1
def matmul_v3(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M),
                    lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k),
                    name='C')
    s = tvm.create_schedule(C.op)
    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]
    ##### define space begin #####
    cfg = autotvm.get_config()
    # use a filter that only accepts split schemes whose innermost tile is no larger than 4
    cfg.define_split('tile_y',
                     y,
                     policy='factors',
                     filter=lambda x: x.size[-1] <= 4)
    cfg.define_split('tile_x',
                     x,
                     policy='factors',
                     filter=lambda x: x.size[-1] <= 4)
    ##### define space end #####
    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)
    s[C].reorder(yo, xo, k, yi, xi)
    return s, [A, B, C]
Example #2
def matmul_v4(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M),
                    lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k),
                    name='C')
    s = tvm.create_schedule(C.op)

    y, x = s[C].op.axis

    k = s[C].op.reduce_axis[0]

    cfg = autotvm.get_config()
    # define search space
    # tiling
    cfg.define_knob("tile_y", [1, 2, 4, 8, 16, 32, 64])
    cfg.define_knob("tile_x", [1, 2, 4, 8, 16, 32, 64])
    yo, yi = s[C].split(y, cfg['tile_y'].val)
    xo, xi = s[C].split(x, cfg['tile_x'].val)
    # cfg.define_split("tile_f", f, num_outputs=4)
    # cfg.define_split("tile_y", y, num_outputs=4)
    # cfg.define_split("tile_x", x, num_outputs=4)
    # cfg.define_split("tile_rc", rc, num_outputs=3)
    # cfg.define_split("tile_ry", ry, num_outputs=3)
    # cfg.define_split("tile_rx", rx, num_outputs=3)
    # reordering
    cfg.define_reorder("ordering", (yo, xo, k, yi, xi),
                       policy="all")  # alternative: policy='interval_all'

    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    cfg.define_knob("unroll_explicit", [0, 1])
    # other optimizations
    s[C].vectorize(yi)
    return s, [A, B, C]
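Note that matmul_v4 defines the "ordering" reorder and the two unroll knobs but never applies them, so they enlarge the search space without affecting the generated schedule. A minimal sketch of wiring them in, borrowing the pragma pattern from Example #36 (the choice of yo as the pragma scope is an assumption):

    # hypothetical: apply the entities matmul_v4 only defines
    cfg["ordering"].apply(s, C, [yo, xo, k, yi, xi])
    s[C].pragma(yo, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)  # assumed scope: yo
    s[C].pragma(yo, 'unroll_explicit', cfg['unroll_explicit'].val)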
Example #3
def matmul(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]
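This is the canonical matmul template from the AutoTVM tutorials. For context, a minimal driver sketch for tuning it, assuming the template above is registered with @autotvm.template and using the pre-0.7 API that matches these examples (sizes, log file name, and trial count are placeholders):

    import tvm
    from tvm import autotvm

    # create a tuning task from the template above
    task = autotvm.task.create(matmul, args=(512, 512, 512, 'float32'),
                               target='llvm')
    measure_option = autotvm.measure_option(
        builder='local',
        runner=autotvm.LocalRunner(number=5))
    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=10,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('matmul.log')])

    # re-instantiate the template with the best config found and build it
    with autotvm.apply_history_best('matmul.log'):
        with tvm.target.create('llvm'):
            s, arg_bufs = matmul(512, 512, 512, 'float32')
            func = tvm.build(s, arg_bufs)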
Example #4
def conv2d_winograd_autotvm(s, ic, oc):
    cfg = autotvm.get_config()
    cfg.define_knob('unroll', [1])
    cfg.define_knob('compute_at', [0])
    cfg.define_knob('vectorize', [1])
    cfg.define_knob('tensorize', [1])
    cfg.define_knob('VK', [6])
    cfg.define_knob('VP', [8])
    for intermediate in ["M", "A_T_dot_M", "input_tile", "B_T_dot_X", "V"]:
        cfg.define_knob("{}_COMPUTE_AT".format(intermediate), [0, 1])
    for intermediate in ["input_tile", "V"]:  # , "B_T_dot_X",
        cfg.define_knob("{}_REORDER_C".format(intermediate), [0, 1])

    cfg.define_knob('data_pad_inline', [0, 1])

    VK = cfg['VK'].val
    VP = cfg['VP'].val
    X = tvm.placeholder(shape=(1, ic, s, s), dtype="float32", name="X")
    W = tvm.placeholder(shape=(oc, ic, 3, 3), dtype="float32", name="W")

    Y, input_tile, U, output = decl_winograd(cfg,
                                             X,
                                             W,
                                             strides=1,
                                             padding=1,
                                             layout="NCHW",
                                             out_dtype="float32",
                                             VK=VK,
                                             VP=VP)
    # use a new name: reusing `s` would shadow the spatial-size parameter
    # and break the add_flop computation below
    sch = schedule_winograd(cfg, Y, VK=VK, VP=VP)
    if cfg.flop == 0:
        cfg.add_flop(2 * ic * oc * s * s * 3 * 3)
    # print(tvm.lower(sch, [X, W, output], simple_mode=True))
    return sch, [input_tile, U, Y]
Example #5
    def test_workload_padding(
        self,
        out_dtype,
        layout,
        input_shape,
        filter_shape,
        target,
        ref_data,
        stride,
        padding,
        dilation,
    ):
        input_np, filter_np, scale_np, shift_np, output_np = ref_data
        if layout == "NCHW":
            _, _, out_height, out_width = output_np.shape
        elif layout == "NHWC":
            _, out_height, out_width, _ = output_np.shape
        elif layout == "NCHWc":
            _, _, out_height, out_width, _ = output_np.shape

        Input = te.placeholder(input_shape, name="Input")
        Filter = te.placeholder(filter_shape, name="Filter")
        wkl = _get_workload(Input, Filter, (stride, stride), padding, dilation,
                            out_dtype, layout)

        # check that the tile_ow candidates are factors of the actual output width.
        with tvm.target.Target(target):
            cfg = autotvm.get_config()
            _fallback_schedule(cfg, wkl)
            ow_tile = np.prod(cfg["tile_ow"].size)

            tvm.testing.assert_allclose(ow_tile, out_width)
Example #6
def matmul(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M),
                    lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k),
                    name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]
Example #7
def pixelcnn_autotvm(N, H, W, CO, CI, KH, KW, mask_type, bias, stride, padding,
                     dilation):
    # assert N == 1, "Only consider batch_size = 1 in this template"

    # data = tvm.te.placeholder((N, CI, H, W), name='data')
    # kernel = tvm.te.placeholder((CO, CI, KH, KW), name='kernel')
    # conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=dilation, out_dtype='float32')
    convop, tensors = pixelcnn(N,
                               H,
                               W,
                               CI,
                               CO,
                               KH,
                               KW,
                               mask_type,
                               bias=bias,
                               stride=stride,
                               padding=padding,
                               dilation=dilation)
    s = tvm.te.create_schedule(convop)

    cfg = autotvm.get_config()

    # the search-space definition and the scheduling both happen inside schedule_direct_cuda
    schedule_direct_cuda(cfg, s, *tensors[-2:])

    return s, [*tensors]
Example #8
def Gemm_tv2_reorder2_3_vec1_para1_unrollv1_config_define(N, K, M, dtype):
    A = tvm.placeholder((N, K), name='A', dtype=dtype)
    B = tvm.placeholder((K, M), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, K), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')

    s = tvm.create_schedule(C.op)
    k = s[C].op.reduce_axis[0]
    y, x = s[C].op.axis

    cfg = autotvm.get_config()
    cfg.define_split("tile_x", x, num_outputs=2)
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_k", k, num_outputs=2)
    # cfg.define_split('tile_x', x, policy='factors', filter=lambda x: x.size[-1] <= 64)
    # cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 64)
    # cfg.define_split('tile_k', k, policy='factors', filter=lambda x: x.size[-1] <= 64)
    xo, xi = cfg["tile_x"].apply(s, C, x)
    yo, yi = cfg["tile_y"].apply(s, C, y)
    ko, ki = cfg["tile_k"].apply(s, C, k)
    # cfg.define_knob("tile_x", [1, 4, 8, 16, 32, 64])
    # cfg.define_knob("tile_y", [1, 4, 8, 16, 32, 64])
    # cfg.define_knob("tile_k", [1, 4, 8, 16, 32, 64])
    # xo, xi = s[C].split(x, cfg['tile_x'].val)
    # yo, yi = s[C].split(y, cfg['tile_y'].val)
    # ko, ki = s[C].split(k, cfg['tile_k'].val)

    s[C].reorder(xo, yo, ko, xi, ki, yi)
    s[C].vectorize(yi)
    s[C].parallel(xo)
    s[C].unroll(ki)
    return s, [A, B, C]
Example #9
def matmul_v1(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M),
                    lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k),
                    name='C')
    s = tvm.create_schedule(C.op)
    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]
    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####
    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)
    s[C].reorder(yo, xo, k, yi, xi)
    # define_annotate and define_reorder require an axes argument, so they are
    # defined here, after the splits have produced the axes they operate on.
    # define_annotate policies: 'unroll' (unroll the axes), 'try_unroll' (try
    # to unroll), 'try_unroll_vec' (try to unroll or vectorize), 'bind_gpu'
    # (bind the first few axes to GPU threads), 'locate_cache' (choose n axes
    # to attach shared/local cache).
    cfg.define_annotate("unroll", [yi, xi], policy="try_unroll")
    cfg.define_annotate("vec", [yi, xi], policy="try_unroll_vec")
    cfg.define_annotate("cache", [yo, xo], policy="locate_cache")
    # define_reorder policies: 'identity' (identity permutation), 'all' (try
    # all permutations), 'interval_all' (all permutations of an interval of
    # axes), 'candidate' (try listed candidates), 'interleave' (interleave
    # chains of spatial axes and chains of reduction axes).
    cfg.define_reorder("reorder", [yo, xo, k, yi, xi], policy="interval_all")
    return s, [A, B, C]
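The annotate and reorder entities above are defined but never applied, so they widen the search space without changing the schedule. A minimal sketch of applying them, following the pattern Example #21 uses (which entities to apply, and to which axes, is an assumption here):

    # hypothetical application of the entities defined in matmul_v1
    cfg["reorder"].apply(s, C, [yo, xo, k, yi, xi])
    cfg["unroll"].apply(s, C, [yi, xi])
    cfg["vec"].apply(s, C, [yi, xi])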
Example #10
    def my_tune(n):
        #tx, ty, tk = 64, 64, 16  # tile sizes

        cfg = autotvm.get_config()
        cfg.define_knob("tile_x", [16, 64, 128])
        cfg.define_knob("tile_y", [16, 64, 128])
        cfg.define_knob("tile_k", [4, 16, 64, 128])

        tx = cfg["tile_x"].val
        ty = cfg["tile_y"].val
        tk = cfg["tile_k"].val

        A, B, C = matmul(n, n, n)
        s = te.create_schedule(C.op)

        # Create a write cache for C
        CachedC = s.cache_write(C, 'local')
        # Same as before, first tile by blocks, and then parallelize the
        # computation of each block
        xo, yo, xi, yi = s[C].tile(*C.op.axis, tx, ty)
        xy = s[C].fuse(xo, yo)
        s[C].parallel(xy)
        # Compute the write cache at the xy axis, i.e., once per block.
        s[CachedC].compute_at(s[C], xy)
        # Same as before to optimize the computation of a block.
        xc, yc = s[CachedC].op.axis
        ko, ki = s[CachedC].split(CachedC.op.reduce_axis[0], factor=tk)
        s[CachedC].reorder(ko, xc, ki, yc)
        s[CachedC].unroll(ki)
        s[CachedC].vectorize(yc)

        return s, [A, B, C]
Example #11
def matmul_v1(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    # 2. get the config object
    cfg = autotvm.get_config()

    # 3. define search space
    cfg.define_knob("tile_y", [1, 2, 4, 8, 16])
    cfg.define_knob("tile_x", [1, 2, 4, 8, 16])

    # 4. schedule according to config
    yo, yi = s[C].split(y, cfg['tile_y'].val)
    xo, xi = s[C].split(x, cfg['tile_x'].val)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]
Example #12
def softmax_naive(a, b, c, d):
    A = tvm.placeholder((a, b, c, d), dtype='float32', name='A')
    B = topi.nn.softmax(A, axis=1)
    s = tvm.create_schedule([B.op])
    Passes.enable_autotune(s, [B], autotvm.get_config(), mode=Passes.NAIVE)

    return s, [A, B]
Example #13
def gemm_v1(M, N, K, dtype):
    A = te.placeholder((M, K), name='A', dtype=dtype)
    B = te.placeholder((K, N), name='B', dtype=dtype)
    # compute
    k = te.reduce_axis((0, K), name='k')
    C = te.compute((M, N),
                   lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
                   name='C')
    # schedule
    s = te.create_schedule(C.op)
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]
    # define search space
    cfg = autotvm.get_config()
    cfg.define_split("tile_x", x, num_outputs=2)
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_k", k, num_outputs=2)
    # apply config
    xo, xi = cfg["tile_x"].apply(s, C, x)
    yo, yi = cfg["tile_y"].apply(s, C, y)
    ko, ki = cfg["tile_k"].apply(s, C, k)
    # define order
    # cfg.define_reorder("reorder", [yo, xo, ko, yi, ki, xi], "all")
    # cfg["reorder"].apply(s, C, [yo, xo, ko, yi, ki, xi])

    # other
    s[C].reorder(yo, xo, ko, yi, ki, xi)
    s[C].vectorize(xi)
    s[C].unroll(ki)
    s[C].parallel(xo)
    return s, [A, B, C]
Example #14
def Gemm_tv2_reorder2_3_vec1_para1_config_define(N, K, M, dtype):
    A = tvm.placeholder((N, K), name='A', dtype=dtype)
    B = tvm.placeholder((K, M), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, K), name='k')
    C = tvm.compute((N, M),
                    lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k),
                    name='C')

    s = tvm.create_schedule(C.op)
    k = s[C].op.reduce_axis[0]
    y, x = s[C].op.axis

    cfg = autotvm.get_config()
    cfg.define_split("tile_x", x, num_outputs=2)
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_k", k, num_outputs=2)
    # cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4, 8], [4, 1, 4, 8]])
    # cfg.define_split('tile_y', y, policy='candidate', candidate=[[1, 4, 4, 8], [4, 1, 4, 8]])
    # cfg.define_split('tile_k', k, policy='candidate', candidate=[[1, 4, 4, 8], [4, 1, 4, 8]])
    xo, xi = cfg["tile_x"].apply(s, C, x)
    yo, yi = cfg["tile_y"].apply(s, C, y)
    ko, ki = cfg["tile_k"].apply(s, C, k)
    s[C].reorder(xo, yo, ko, xi, ki, yi)
    s[C].vectorize(yi)
    s[C].parallel(xo)
    return s, [A, B, C]
Example #15
    def test_workload_padding(
        self,
        target,
        input_shape,
        weight_shape,
        stride,
        padding,
        dilation,
        dtype,
        ref_data,
    ):
        a_np, w_np, b_np, c_np = ref_data
        _, _, out_height, out_width = c_np.shape

        A = te.placeholder(input_shape, name="A", dtype=dtype)
        W = te.placeholder(weight_shape, name="W", dtype=dtype)

        with tvm.target.Target(target):
            wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)

            # check that the tile_ow candidates are factors of the actual output width.
            cfg = autotvm.get_config()
            _fallback_schedule(cfg, wkl)
            ow_tile = np.prod(cfg["tile_ow"].size)

        tvm.testing.assert_allclose(ow_tile, out_width)
Example #16
def matmul(N, L, M, dtype):
    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)

    k = te.reduce_axis((0, L), name="k")
    C = te.compute((N, M),
                   lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
                   name="C")
    s = te.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    # Make sure configurations have a varied number of itervars. Splitting adds
    # new itervars, so conditionally splitting will cause the number of
    # itervars to depend on the tile size.
    if cfg["tile_x"].size[-1] > 1:
        xo, xi = cfg["tile_x"].apply(s, C, x)
        s[C].reorder(yo, xo, k, yi, xi)
    else:
        s[C].reorder(yo, k, yi, x)

    return s, [A, B, C]
Example #17
def output_transform_autotvm(dtype):
    cfg = autotvm.get_config()
    cfg.define_knob('VK', [2, 4, 8, 16])
    cfg.define_knob('VP', [4, 8, 16])
    VK = cfg['VK'].val
    VP = cfg['VP'].val
    X = tvm.placeholder(shape=(1, 64, 56, 56), dtype="float32", name="X")
    W = tvm.placeholder(shape=(64, 64, 56, 56), dtype="float32", name="W")
    N = get_const_int(X.shape[0])
    IH = get_const_int(X.shape[2])
    IW = get_const_int(X.shape[3])
    OH = get_const_int((IH + 2 * HPAD - 3) // HSTR + 1)
    OW = get_const_int((IW + 2 * WPAD - 3) // WSTR + 1)
    nH, nW = get_const_int((OH + m - 1) // m), get_const_int((OW + m - 1) // m)

    def round_up(a, b):
        return ((a + b - 1) // b) * b

    P = round_up(N * nH * nW, VP)
    K = get_const_int(W.shape[0])
    assert K % VK == 0
    assert P % VP == 0

    cfg.define_knob('use_minimal', [1])
    M = tvm.placeholder(shape=(K // VK, P // VP, alpha, alpha, VK, VP),
                        name="M")
    if cfg['use_minimal'].val:
        output = decl_output_transform_minimal(cfg, X, M, VK, VP)
    else:
        output = decl_output_transform(cfg, X, M, VK, VP)
    s = schedule_output_transform(cfg, output)
    #print(tvm.lower(s, [X, M, output], simple_mode=True))
    return s, [X, M, output]
Example #18
def Gemm_tv2_reorder2_3_vec1_para1_config_define(N, K, M, dtype):
    A = tvm.placeholder((N, K), name='A', dtype=dtype)
    B = tvm.placeholder((K, M), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, K), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')

    s = tvm.create_schedule(C.op)
    k = s[C].op.reduce_axis[0]
    y, x = s[C].op.axis

    cfg = autotvm.get_config()
    cfg.define_split("tile_x", x , num_outputs=2)
    cfg.define_split("tile_y", y , num_outputs=2)
    cfg.define_split("tile_k", k , num_outputs=2)
    '''
    >>> # use custom candidates
    >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]])'''
    #>>> # use a filter that only accepts the split scheme whose inner most tile is less then 64
    # cfg.define_split('tile_x', x, policy='factors', filter=lambda x: x.size[-1] <= 64)
    # cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 64)
    # cfg.define_split('tile_k', k, policy='factors', filter=lambda x: x.size[-1] <= 64)

    # cfg.define_knob("tile_x", [1, 4, 8, 16, 32, 64])
    # cfg.define_knob("tile_y", [1, 4, 8, 16, 32, 64])
    # cfg.define_knob("tile_k", [1, 4, 8, 16, 32, 64])
    # xo, xi = s[C].split(x, cfg['tile_x'].val)
    # yo, yi = s[C].split(y, cfg['tile_y'].val)
    # ko, ki = s[C].split(k, cfg['tile_k'].val)
    xo, xi = cfg["tile_x"].apply(s, C, x)
    yo, yi = cfg["tile_y"].apply(s, C, y)
    ko, ki = cfg["tile_k"].apply(s, C, k)
    s[C].reorder(xo, yo, ko, xi, ki, yi)
    s[C].vectorize(yi)
    s[C].parallel(xo)
    return s, [A, B, C]
Example #19
def matmul_v1(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M),
                    lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k),
                    name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    # 2. get the config object
    cfg = autotvm.get_config()

    # 3. define search space
    cfg.define_knob("tile_y", [1, 2, 4, 8, 16])
    cfg.define_knob("tile_x", [1, 2, 4, 8, 16])

    # 4. schedule according to config
    yo, yi = s[C].split(y, cfg['tile_y'].val)
    xo, xi = s[C].split(x, cfg['tile_x'].val)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]
Example #20
def CONVAutoTVM(*args):
    global function
    def getSplit(maxNum):
        splitList = []
        para = 2
        while True:
            if para < maxNum / 4 and para <= 32:
                splitList.append(para)
                para *= 2
            else:
                break
        if len(splitList) == 0:
            splitList.append(1)
        return splitList
    
    ops, bufs = function(*args)
    s = tvm.create_schedule(ops)

    # get bias_tensor, conv_tensor, pad_tensor and their ops relatively
    bias_tensor = None
    conv_tensor = None
    pad_tensor = None
    conv_tensor = bufs[len(bufs) - 1]
    in_tensor2 = conv_tensor.op.input_tensors[1]
    in_tensor1 = conv_tensor.op.input_tensors[0]
    if in_tensor2.op.name == "bias":
        bias_tensor = conv_tensor
        conv_tensor = in_tensor1
    in_tensor1 = conv_tensor.op.input_tensors[0]
    pad_tensor = in_tensor1
    if bias_tensor is not None:
        bias_op = s[bias_tensor]
    conv_op = s[conv_tensor]
    pad_op = s[pad_tensor]


    oc = conv_op.op.axis[1]
    x = conv_op.op.axis[2]
    y = conv_op.op.axis[3]
    ic = conv_op.op.reduce_axis[0]
    kh = conv_op.op.reduce_axis[1]
    kw = conv_op.op.reduce_axis[2]

    cfg = autotvm.get_config()
    cfg.define_knob("split_oc", getSplit(int(oc.dom.extent)))
    cfg.define_knob("split_x", getSplit(int(x.dom.extent)))
    cfg.define_knob("split_y", getSplit(int(y.dom.extent)))
    cfg.define_knob("split_ic", getSplit(int(ic.dom.extent)))

    oco, oci = conv_op.split(oc, cfg["split_oc"].val)
    xo, xi = conv_op.split(x, cfg["split_x"].val)
    yo, yi = conv_op.split(y, cfg["split_y"].val)
    ico, ici = conv_op.split(ic, cfg["split_ic"].val)
    conv_op.reorder(oco, ico, xo, yo, oci, ici, kh, kw, xi, yi)
    cfg.define_annotate("yi_unroll", [yi], policy='try_unroll')
    
    pad_op.compute_inline()

    return s, bufs
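As in Examples #2 and #9, the "yi_unroll" annotation above is defined but never applied; a hedged one-line sketch of applying it, mirroring the apply pattern in Example #21:

    # hypothetical: actually apply the try_unroll annotation to yi
    cfg["yi_unroll"].apply(s, conv_tensor, [yi])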
Example #21
def matmul():
    # Algorithm
    k = te.reduce_axis((0, K), 'k')
    A = te.placeholder((M, K), name='A')
    B = te.placeholder((K, N), name='B')

    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_x", M, num_outputs=3)
    cfg.define_split("tile_y", N, num_outputs=3)
    cfg.define_split("tile_k", K, num_outputs=2)
    ##### define space end #####

    # We have to re-write the algorithm slightly.
    bn = cfg["tile_y"].size[-1]
    packedB = te.compute((N // bn, K, bn),
                         lambda x, y, z: B[y, x * bn + z],
                         name='packedB')
    C = te.compute(
        (M, N),
        lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, y % bn], axis=k),
        name='C')
    s = te.create_schedule(C.op)
    x, y = s[C].op.axis
    k, = s[C].op.reduce_axis

    # schedule according to config
    # Allocate write cache
    CC = s.cache_write(C, 'global')
    xt, xo, xi = cfg["tile_x"].apply(s, C, x)
    yt, yo, yi = cfg["tile_y"].apply(s, C, y)
    s[C].reorder(xt, yt, xo, yo, xi, yi)
    xyt = s[C].fuse(xt, yt)
    # parallel
    s[C].parallel(xyt)
    xyo = s[C].fuse(xo, yo)
    s[C].unroll(xi)
    s[C].vectorize(yi)

    # Write cache is computed at xyo
    s[CC].compute_at(s[C], xyt)

    # New inner axes
    xc, yc = s[CC].op.axis

    k, = s[CC].op.reduce_axis
    ko, ki = cfg["tile_k"].apply(s, CC, k)
    s[CC].reorder(ko, xc)
    # tvm.lower(s, [A, B, C], simple_mode=True)  # debugging aid
    cfg.define_reorder("reorder", [ko, xc, ki, yc], "all")
    cfg["reorder"].apply(s, CC, [ko, xc, ki, yc])
    cfg.define_annotate('ann', [ko, xc, ki, yc], policy='try_unroll_vec')
    cfg['ann'].apply(s, CC, [ko, xc, ki, yc])

    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)

    return s, [A, B, C]
Example #22
def gemm(M, N, K):
    A = tvm.placeholder((M, K), name='A')
    B = tvm.placeholder((K, N), name='B')
    k = tvm.reduce_axis((0, K), 'k')
    C = tvm.compute((M, N),
                    lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k),
                    name='C')
    s = tvm.create_schedule([C.op])
    cfg = autotvm.get_config()

    local_C = s.cache_write(C, "local")

    #schedule C
    h_C, w_C = s[C].op.axis
    cfg.define_split("h_C", h_C, num_outputs=4)
    cfg.define_split("w_C", w_C, num_outputs=4)
    bh, vth, th, h = cfg["h_C"].apply(s, C, h_C)
    bw, vtw, tw, w = cfg["w_C"].apply(s, C, w_C)
    s[C].bind(bh, tvm.thread_axis("blockIdx.x"))
    s[C].bind(bw, tvm.thread_axis("blockIdx.y"))
    s[C].bind(vth, tvm.thread_axis("vthread"))
    s[C].bind(vtw, tvm.thread_axis("vthread"))
    s[C].bind(th, tvm.thread_axis("threadIdx.x"))
    s[C].bind(tw, tvm.thread_axis("threadIdx.y"))
    s[C].reorder(bh, bw, vth, vtw, th, tw, h, w)

    #schedule local_C
    s[local_C].compute_at(s[C], tw)
    hi, wi = s[local_C].op.axis
    rk = s[local_C].op.reduce_axis[0]
    cfg.define_split("rk", rk, num_outputs=2)
    rko, rki = cfg["rk"].apply(s, local_C, rk)

    s[local_C].reorder(rko, rki, hi, wi)

    #schedule share_A and share_B
    share_A = s.cache_read(A, 'shared', local_C)
    s[share_A].compute_at(s[local_C], rko)
    sh_h, sh_w = s[share_A].op.axis
    th, sh_h = s[share_A].split(sh_h, nparts=cfg["h_C"].size[2])
    tw, sh_w = s[share_A].split(sh_w, nparts=cfg["w_C"].size[2])
    s[share_A].bind(th, tvm.thread_axis("threadIdx.x"))
    s[share_A].bind(tw, tvm.thread_axis("threadIdx.y"))

    share_B = s.cache_read(B, "shared", local_C)
    s[share_B].compute_at(s[local_C], rko)
    sh_h, sh_w = s[share_B].op.axis
    th, sh_h = s[share_B].split(sh_h, nparts=cfg["h_C"].size[2])
    tw, sh_w = s[share_B].split(sh_w, nparts=cfg["w_C"].size[2])
    s[share_B].bind(th, tvm.thread_axis("threadIdx.x"))
    s[share_B].bind(tw, tvm.thread_axis("threadIdx.y"))

    #schedule local_A and local_B
    local_A = s.cache_read(share_A, "local", local_C)
    s[local_A].compute_at(s[local_C], rki)
    local_B = s.cache_read(share_B, "local", local_C)
    s[local_B].compute_at(s[local_C], rki)
    return s, [A, B, C]
Example #23
def conv2d_NCHWc_KCRSk(input_shape, filter_shape):
    data = te.placeholder(input_shape, name="data", dtype="float32")
    filt = te.placeholder(filter_shape, name="filter", dtype="float32")
    conv = compute_conv2d_NCHWc_KCRSk(data, filt, [1, 1], [0, 0], [1, 1],
                                      "float32")
    cfg = autotvm.get_config()
    s = te.create_schedule([x.op for x in [conv]])
    schedule_conv2d_NCHWc_KCRSk(cfg, s, conv)
    return s, (data, filt, conv)
Example #24
def depthwise_conv2d_NCHWc_KCRSk_acc32(input_shape, filter_shape):
    data = te.placeholder(input_shape, name="data", dtype="float32")
    filt = te.placeholder(filter_shape, name="filter", dtype="float32")
    output = compute_depthwise_conv2d_NCHWc_KCRSk_acc32(
        data, filt, [1, 1], [0, 0], [1, 1], "float32")
    cfg = autotvm.get_config()
    s = te.create_schedule([x.op for x in [output]])
    schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output)
    return s, (data, filt, output)
Example #25
def bgemm_topi(Y, X, K, dtype="uint64"):
    DB = 1
    WB = 1
    out_dtype = dtype
    data_packed = tvm.placeholder((Y, DB, K), dtype=dtype, name="A")
    weight_packed = tvm.placeholder((X, WB, K), dtype=dtype, name="B")

    oshape = (Y, X)
    k = tvm.reduce_axis((0, K), name='k')
    db = tvm.reduce_axis((0, DB), name='db')
    wb = tvm.reduce_axis((0, WB), name='wb')

    matmul = tvm.compute(
        oshape,
        lambda i, j: tvm.sum(tvm.popcount(weight_packed[
            j, wb, k] & data_packed[i, db, k]).astype(out_dtype) <<
                             (db + wb).astype(out_dtype),
                             axis=[wb, db, k]),
        tag='bitserial_dense')

    s = tvm.create_schedule(matmul.op)
    cfg = autotvm.get_config()

    CC = s.cache_write(matmul, "global")

    y, x = s[matmul].op.axis

    # define_split returns nothing; the split axes come from apply() below
    cfg.define_split("tile_y",
                     y,
                     num_outputs=2,
                     filter=lambda x: x.size[-1] <= 8)
    cfg.define_split("tile_x",
                     x,
                     num_outputs=2,
                     filter=lambda x: x.size[-1] <= 8)

    yo, yi = cfg["tile_y"].apply(s, matmul, y)
    xo, xi = cfg["tile_x"].apply(s, matmul, x)

    s[matmul].reorder(yo, xo, yi, xi)
    cfg.define_knob("compute_at_axis", [0, 1, 2])
    if cfg["compute_at_axis"].val == 0:
        s[CC].compute_at(s[matmul], xo)
    elif cfg["compute_at_axis"].val == 1:
        s[CC].compute_at(s[matmul], yi)
    elif cfg["compute_at_axis"].val == 2:
        s[CC].compute_at(s[matmul], xi)

    yc, xc = s[CC].op.axis
    wb, db, k = s[CC].op.reduce_axis

    cfg.define_reorder("reorder_0", [k, yc, xc], policy="all")
    cfg["reorder_0"].apply(s, CC, [k, yc, xc])

    cfg.add_flop(2 * Y * X * K * int(dtype[4:]))

    return s, [data_packed, weight_packed, matmul]
Example #26
    def verify_workload_padding():
        _, _, out_height, out_width = get_const_tuple(c_np.shape)
        wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)

        # check that the tile_ow candidates are factors of the actual output width.
        cfg = autotvm.get_config()
        _fallback_schedule(cfg, wkl)
        ow_tile = np.prod(cfg["tile_ow"].size)

        tvm.testing.assert_allclose(ow_tile, out_width)
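For reference, a SplitEntity's .size attribute is the list of factors the split chose, so np.prod(cfg["tile_ow"].size) recovers the full extent of the split axis. An illustrative (assumed) example:

    # if _fallback_schedule picked a [7, 8] split for an output width of 56:
    # cfg["tile_ow"].size == [7, 8]  ->  np.prod(cfg["tile_ow"].size) == 56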
Example #27
def conv2d_channel_batch(B,
                         N,
                         M,
                         C,
                         K,
                         L,
                         O,
                         stride=1,
                         padding=0,
                         dtype="float32"):
    A = tvm.placeholder((B, N, M, C), dtype=dtype, name="A")
    W = tvm.placeholder((K, L, C, O), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    Apad = tvm.compute(
        (B, N + 2 * padding, M + 2 * padding, C),
        lambda b, i, j, k: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, i < N + padding, j < M +
                    padding), A[b, i - padding, j - padding, k], 0.0),
        name="Apad")
    rx = tvm.reduce_axis((0, K), name="rx")
    ry = tvm.reduce_axis((0, L), name="ry")
    rc = tvm.reduce_axis((0, C), name="rc")
    Output = tvm.compute(
        (B, N_out, M_out, O),
        lambda b, i, j, k: tvm.sum(Apad[b, i * stride + rx, j * stride + ry, rc
                                        ] * W[rx, ry, rc, k],
                                   axis=[rx, ry, rc]),
        name="Output")

    s = tvm.create_schedule(Output.op)
    s[Apad].compute_inline()
    CL = s.cache_write(Output, "local")

    n, h, w, c = s[Output].op.axis
    out = s[Output].fuse(h, w)
    cfg = autotvm.get_config()
    cfg.define_split("split_n", n, num_outputs=2)
    cfg.define_split("split_c", c, num_outputs=2)
    no, ni = cfg["split_n"].apply(s, Output, n)
    co, ci = cfg["split_c"].apply(s, Output, c)
    s[Output].reorder(no, out, co, ni, ci)
    s[Output].parallel(out)

    # schedule CL
    s[CL].compute_at(s[Output], co)
    ni, hi, wi, ci = s[CL].op.axis
    xi, yi, ki = s[CL].op.reduce_axis
    cfg.define_split("split_k", ki, num_outputs=2)
    ko, ki = cfg["split_k"].apply(s, CL, ki)
    s[CL].reorder(ko, xi, yi, ni, ki, ci)
    s[CL].unroll(ki)
    s[CL].vectorize(ci)

    return s, [A, W, Output]
Example #28
def conv2d_turning(*args):
    global function
    ops, bufs = function(*args)
    data, kernel, conv = bufs
    s = tvm.create_schedule(conv.op)

    ##### space definition begin #####
    n, f, y, x = s[conv].op.axis
    rc, ry, rx = s[conv].op.reduce_axis

    cfg = autotvm.get_config()
    cfg.define_split("tile_f", f, num_outputs=4)
    cfg.define_split("tile_y", y, num_outputs=4)
    cfg.define_split("tile_x", x, num_outputs=4)
    cfg.define_split("tile_rc", rc, num_outputs=3)
    cfg.define_split("tile_ry", ry, num_outputs=3)
    cfg.define_split("tile_rx", rx, num_outputs=3)
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    ##### space definition end #####

    # inline padding
    pad_data = s[conv].op.input_tensors[0]
    s[pad_data].compute_inline()
    data, raw_data = pad_data, data

    output = conv
    OL = s.cache_write(conv, 'global')
    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    kernel_scope = n  # this is the scope to attach global config inside this kernel

    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    fuse = s[output].fuse(yi, xi)
    s[output].vectorize(fuse)
    if (args[0] == 4 and args[1] == 112 and args[2] == 14 and args[4] == 224):
        s[output].unroll(fi)
    s[OL].compute_at(s[output], tx)

    # tile reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, rym, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxm, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(n, rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, f, y, x)

    n, f, y, x = s[conv].op.axis
    s[conv].parallel(n)

    #print(tvm.lower(s, [data, kernel, conv], simple_mode=True))
    return s, [raw_data, kernel, conv]
Example #29
def vectoradd_naive(K, dtype):
    A = tvm.placeholder((K,), name='A', dtype=dtype)
    B = tvm.placeholder((K,), name='B', dtype=dtype)
    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name='C')
    s = tvm.create_schedule(C.op)

    #### ADDED AUTO AUTOTUNING ####
    Passes.enable_autotune(s, [C], autotvm.get_config(), mode=Passes.NAIVE)
    ###############################

    return s, [A, B, C]
Example #30
def block_sparse_template(W_sp_np_data_shape, W_sp_np_indices_shape, W_sp_np_indptr_shape, X_np_shape):
    W_data = te.placeholder(shape=W_sp_np_data_shape, dtype='float32', name='W_data')
    W_indices = te.placeholder(shape=W_sp_np_indices_shape, dtype='int32', name='W_indices')
    W_indptr = te.placeholder(shape=W_sp_np_indptr_shape, dtype='int32', name='W_indptr')
    X = te.placeholder(shape=X_np_shape, dtype='float32', name='X')
    Y = topi.nn.sparse_dense(X, W_data, W_indices, W_indptr)

    cfg = autotvm.get_config()
    cfg.add_flop(W_sp_np_data_shape[0] * X_np_shape[0] * W_sp_np_data_shape[1] * W_sp_np_data_shape[2] * 2)
    s = schedule_sparse_dense_cuda_allreduce_autotune(cfg, [Y])
    return s, [X, W_data, W_indices, W_indptr, Y]
Example #31
def conv2d(data, kernel, *args):
    cfg = autotvm.get_config()
    out = conv2d_NCHWc_int8(cfg, data, kernel, *args)
    s = tvm.create_schedule(out.op)
    s = schedule_conv2d_NCHWc_int8(cfg, s, out)
    fadd = tvm.build(s, [data, kernel, out], 'cuda', name="conv")
    dev_module = fadd.imported_modules[0]
    print("-----GPU code-----")
    print(dev_module.get_source())
    print(s)
    return s, [data, kernel, out]
Example #32
def relu_naive(m, n):
    A = tvm.placeholder((m, n), name='A')
    B = topi.nn.relu(A)
    with tvm.target.create('llvm'):
        s = tvm.create_schedule(B.op)

    cfg = autotvm.get_config()

    Passes.enable_autotune(s, [B], cfg, mode=Passes.NAIVE)

    return s, [A, B]
Example #33
def bad_matmul(N, L, M, dtype):
    if 'bad_device' in tvm.target.current_target().keys:
        A = tvm.placeholder((N, L), name='A', dtype=dtype)
        B = tvm.placeholder((L, M), name='B', dtype=dtype)

        k = tvm.reduce_axis((0, L - 1), name='k')  # deliberately wrong extent: this template is intentionally broken
        C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
        s = tvm.create_schedule(C.op)

        # schedule
        y, x = s[C].op.axis
        cfg = autotvm.get_config()
        cfg.define_split("tile_y", y, num_outputs=2)
        cfg.define_split("tile_x", x, num_outputs=2)
        return s, [A, B, C]

    return matmul(N, L, M, dtype)
Example #34
def conv2d_no_batching(N, H, W, CI, CO, KH, KW):
    """An example template for testing"""
    assert N == 1, "Only consider batch_size = 1 in this template"

    data = tvm.placeholder((N, CI, H, W), name='data')
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')

    rc = tvm.reduce_axis((0, CI), name='rc')
    ry = tvm.reduce_axis((0, KH), name='ry')
    rx = tvm.reduce_axis((0, KW), name='rx')

    conv = tvm.compute(
        (N, CO, H - KH + 1, W - KW + 1),
        lambda nn, ff, yy, xx: tvm.sum(
            data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx],
            axis=[rc, ry, rx]), tag="conv2d_nchw")

    s = tvm.create_schedule([conv.op])

    output = conv
    OL = s.cache_write(conv, 'local')

    # create cache stage
    AA = s.cache_read(data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    AL = s.cache_read(AA, 'local', [OL])
    WL = s.cache_read(WW, 'local', [OL])

    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    cfg = autotvm.get_config()
    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    kernel_scope = n  # this is the scope to attach global config inside this kernel

    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    s[OL].compute_at(s[output], tx)

    # tile and bind reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3)
    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3)
    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, rym, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxm, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x)

    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    s[AL].compute_at(s[OL], rxm)
    s[WL].compute_at(s[OL], rxm)

    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    # tune unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    cfg.define_knob("unroll_explicit", [0, 1])
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    return s, [data, kernel, conv]
Example #35
def simple_template(a, b):
    cfg = autotvm.get_config()
    assert cfg.is_fallback
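When no tuned record matches the current workload, autotvm.get_config() returns a fallback configuration and cfg.is_fallback is True, which is what this test asserts. Templates can install sensible defaults in that case; a hedged sketch using the fallback_split helper (the [-1, 8] constraint is an illustrative choice, where -1 means "infer the remaining factor"):

    if cfg.is_fallback:
        cfg.fallback_split("tile_x", [-1, 8])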
Example #36
def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
    assert N == 1, "Only consider batch_size = 1 in this template"

    data = tvm.placeholder((N, CI, H, W), name='data')
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32')
    s = tvm.create_schedule([conv.op])

    ##### space definition begin #####
    n, f, y, x = s[conv].op.axis
    rc, ry, rx = s[conv].op.reduce_axis

    cfg = autotvm.get_config()
    cfg.define_split("tile_f", f, num_outputs=4)
    cfg.define_split("tile_y", y, num_outputs=4)
    cfg.define_split("tile_x", x, num_outputs=4)
    cfg.define_split("tile_rc", rc, num_outputs=3)
    cfg.define_split("tile_ry", ry, num_outputs=3)
    cfg.define_split("tile_rx", rx, num_outputs=3)
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    cfg.define_knob("unroll_explicit", [0, 1])
    ##### space definition end #####

    # inline padding
    pad_data = s[conv].op.input_tensors[0]
    s[pad_data].compute_inline()
    data, raw_data = pad_data, data

    output = conv
    OL = s.cache_write(conv, 'local')

    # create cache stage
    AA = s.cache_read(data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    AL = s.cache_read(AA, 'local', [OL])
    WL = s.cache_read(WW, 'local', [OL])

    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    kernel_scope = n  # this is the scope to attach global config inside this kernel

    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    s[OL].compute_at(s[output], tx)

    # tile reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, rym, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxm, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x)

    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    s[AL].compute_at(s[OL], rxm)
    s[WL].compute_at(s[OL], rxm)

    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    # tune unroll
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    return s, [raw_data, kernel, conv]