Example #1
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled...")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
    C = tvm.placeholder((n,), name='C', dtype="int32")
    D = tvm.compute((n,),
                    lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D')
    s = tvm.create_schedule(D.op)
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B, C, D], "cuda")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n,))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
    c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
    d = tvm.nd.empty((n,), D.dtype, ctx)
    fun(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), np_d)
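A plausible driver for the helper above, as a sketch: `num_thread` and `have_int8` live in the enclosing test file, so the definitions here are assumptions.

num_thread = 64  # assumed thread-block factor consumed by the split above

def have_int8(compute_version):
    # assumption: __dp4a requires compute capability 6.1 or newer
    major, minor = compute_version.split('.')
    return int(major) * 10 + int(minor) >= 61

check_cuda("int8", 64, 4)  # __dp4a consumes int8x4 vector lanes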
Example #2
def test_matmul_add():
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = cblas.matmul(A, B)
    D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D")
    s = tvm.create_schedule(D.op)

    def verify(target="llvm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.cpu(0)
        f = tvm.build(s, [A, B, D, bias], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        bb = 10.0
        f(a, b, d, bb)
        tvm.testing.assert_allclose(
            d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5)
    verify()
Example #3
def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding):
    in_height = in_width = in_size

    with tvm.target.rasp():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
        B = topi.nn.conv2d(A, W, stride, padding)
        s = topi.generic.schedule_conv2d_nchw([B])

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d.verify_conv2d")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
        return a_np, w_np, b_np

    a_np, w_np, b_np = get_ref_data()

    ctx = tvm.cpu(0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    func = tvm.build(s, [A, W, B], "llvm")
    func(a, w, b)
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
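An illustrative invocation of this helper; the workload numbers below are assumptions modeled on a ResNet-style layer:

# batch=1, 56x56 input, 64 -> 64 channels, 3x3 kernel, stride 1, padding 1
verify_conv2d(1, 56, 64, 64, 3, 1, 1)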
Example #4
def _topi_nn_depthwise_conv2d_NCHWc(*args, **kwargs):
    assert not kwargs, "Do not support kwargs in template function call"
    data, kernel, strides, padding, dilation, dtype = deserialize_args(args)

    batch, in_channel, height, width = get_const_tuple(data.shape)
    filter_channel, channel_multiplier, kh, kw = get_const_tuple(kernel.shape)
    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    out_height = (height - kh + 2 * ph) // sh + 1
    out_width = (width - kw + 2 * pw) // sw + 1
    out_channel = filter_channel * channel_multiplier

    # get config here
    cfg = get_config()
    cfg.define_split("tile_ic", in_channel, num_outputs=2)
    cfg.define_split("tile_oc", out_channel, num_outputs=2)
    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)

    # change shape with the value in config
    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
    new_data_shape = (batch, in_channel // ic_bn, height, width, ic_bn)
    new_kernel_shape = (out_channel // oc_bn, kh, kw, oc_bn)
    new_data = tvm.placeholder(new_data_shape, data.dtype)
    new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype)

    data_layout = "NCHW%dc" % ic_bn
    out_layout = "NCHW%dc" % oc_bn

    C = _depthwise_conv2d_NCHWc_cpu(cfg, new_data, new_kernel, strides, padding, dilation,
                                    data_layout, out_layout, dtype)
    s = schedule_depthwise_conv2d_NCHWc(cfg, [C])
    return s, [new_data, new_kernel, C]
Example #5
def verify_gather_nd(src_shape, indices_src, indices_dtype):
    src_dtype = "float32"
    indices_src = np.array(indices_src, dtype=indices_dtype)
    A = tvm.placeholder(shape=src_shape, dtype=src_dtype, name="A")
    indices = tvm.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices")
    out_tensor = topi.gather_nd(a=A, indices=indices)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_injective(out_tensor)

        func = tvm.build(s, [A, indices, out_tensor], device, name="take")
        shape_size = 1
        for i in range(len(src_shape)):
            shape_size = shape_size * src_shape[i]
        data_npy = np.arange(shape_size, dtype=src_dtype).reshape(src_shape)
        out_npys = topi.testing.gather_nd_python(data_npy, indices_src)

        data_nd = tvm.nd.array(data_npy, ctx)
        indices_nd = tvm.nd.array(indices_src, ctx)
        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
        func(data_nd, indices_nd, out_nd)
        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)

    for device in get_all_backend():
        check_device(device)
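A hedged usage sketch: with data of shape (2, 2), an indices tensor whose first axis matches the data rank selects individual elements, so the call below should gather data[0, 1] and data[1, 0]:

verify_gather_nd((2, 2), [[0, 1], [1, 0]], "int32")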
Example #6
def verify_expand_like(in_shape, out_shape, axis):
    A = tvm.placeholder(shape=in_shape, name="A")
    B = tvm.placeholder(shape=out_shape, name="B")
    C = topi.expand_like(A, B, axis)
    s = tvm.create_schedule([C.op])

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)

        ctx = tvm.context(device, 0)
        f = tvm.build(s, [A, B, C], device, name="expand_like")
        input = np.random.uniform(size=in_shape).astype(A.dtype)
        tvm_input = tvm.nd.array(input, ctx)

        odim = len(out_shape)
        real_axis = [x if x >= 0 else x + odim for x in axis]
        real_axis = sorted(real_axis)
        for x in real_axis:
            input = np.expand_dims(input, x).astype(A.dtype)
        for x in real_axis:
            input = np.concatenate([input]*out_shape[x], axis=x).astype(A.dtype)
        assert input.shape == out_shape

        tvm_shape_like = tvm.nd.array(np.zeros(out_shape).astype(B.dtype), ctx)
        out = tvm.nd.array(np.zeros(out_shape).astype(A.dtype), ctx)
        f(tvm_input, tvm_shape_like, out)
        tvm.testing.assert_allclose(out.asnumpy(), input)

    for device in ["llvm"]:
        check_device(device)
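Illustrative calls (the shapes are assumptions): broadcasting a vector against a (2, 3) target along either axis exercises the axis normalization above:

verify_expand_like((3,), (2, 3), [0])
verify_expand_like((2,), (2, 3), [1])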
Example #7
def verify_bitserial_dense(batch, in_dim, out_dim, activation_bits, weight_bits, unipolar):
    input_dtype = 'uint32'
    out_dtype = 'int16'

    with tvm.target.create('llvm'):
        A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A')
        B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B')
        C = topi.nn.bitserial_dense(A, B, activation_bits, weight_bits, out_dtype=out_dtype,
                                    unipolar=unipolar)
        s = topi.generic.schedule_bitserial_dense([C])

    a_shape = get_const_tuple(A.shape)
    b_shape = get_const_tuple(B.shape)

    @memoize("topi.tests.test_topi_bitserial_dense")
    def get_ref_data():
        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype)
        b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype)
        if unipolar:
            b_ = np.copy(b_np).astype(out_dtype)
            for x in np.nditer(b_, op_flags=['readwrite']):
                x[...] = 1 if x == 1 else -1
            c_np = np.dot(a_np, b_.T)
        else:
            c_np = np.dot(a_np, b_np.T)
        return a_np, b_np, c_np
    a_np, b_np, c_np = get_ref_data()

    ctx = tvm.cpu(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    func = tvm.build(s, [A, B, C], "llvm")
    func(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
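A plausible pair of invocations; the sizes are assumptions, and `generate_quantized_np` comes from the enclosing test module:

verify_bitserial_dense(16, 1024, 1000, 1, 1, True)
verify_bitserial_dense(16, 1024, 1000, 2, 1, False)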
Example #8
def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.decl_buffer(a.shape, a.dtype,
                         name="A",
                         offset_factor=1,
                         strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype,
                         name="B",
                         offset_factor=1,
                         strides=[tvm.var("s1"), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype,
                         name="C",
                         offset_factor=1,
                         strides=[1])
    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.call_extern("int32", "gemv_update",
                                cc.access_ptr("w"),
                                aa.access_ptr("r"),
                                bb.access_ptr("r"),
                                m, l, bb.strides[0]))
        return ib.get()
    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
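A sketch of how such an intrinsic is typically applied via tensorize; the outer compute and sizes below are assumptions, and actually building would require a linked gemv_update implementation:

M, L = 64, 32
x = tvm.placeholder((L,), name='x')
W = tvm.placeholder((M, L), name='W')
k = tvm.reduce_axis((0, L), name='k')
y = tvm.compute((M,), lambda i: tvm.sum(x[k] * W[i, k], axis=k), name='y')
s = tvm.create_schedule(y.op)
yo, yi = s[y].split(y.op.axis[0], factor=16)
s[y].tensorize(yi, intrin_gemv(16, L))  # yi covers m=16 rows, full reduction L
print(tvm.lower(s, [x, W, y], simple_mode=True))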
Example #9
def test_conv2d():
    if not tvm.module.enabled("metal"):
        print("skip because %s is not enabled..." % "metal")
        return
    n = 1
    h = 14
    w = 14
    ci = 2
    co = 4
    kh = 3
    kw = 3
    stride = 2
    A = tvm.placeholder((n, h, w, ci), name="x")
    B = tvm.placeholder((co, kh, kw, ci), name="w")
    C = mps.conv2d(A, B, 'SAME', 2)
    s1 = tvm.create_schedule(C.op)

    def verify(A, B, C, target="llvm"):
        if not tvm.get_global_func("tvm.contrib.mps.conv2d", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.metal(0)
        f = tvm.build(s1, [A, B, C], "metal")
        a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), ctx)
        f(a, b, c)
        # print(c.asnumpy())
        # print(c.shape)
        
    verify(A, B, C)
Example #10
def intrin_vadd(n, cache_read=False, cache_write=False):
    scope_ubuf = 'local'
    dtype = 'float32'
    x = tvm.placeholder((n,), dtype=dtype, name='vx')
    y = tvm.placeholder((n,), dtype=dtype, name='vy')
    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
    s = tvm.create_schedule(z.op)

    def create_buffer(t):
        return tvm.decl_buffer(t.shape, t.dtype,
                               name='W'+t.name,
                               scope=scope_ubuf,
                               offset_factor=16)

    binds = {}
    if cache_read:
        binds[x] = create_buffer(x)
        binds[y] = create_buffer(y)
    if cache_write:
        binds[z] = create_buffer(z)

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr')))
        return ib.get()

    with tvm.build_config(offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
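A hedged sketch of tensorizing a larger vector add with this intrinsic (sizes assumed; building would need a 'vadd' extern at run time):

n, factor = 1024, 16
x = tvm.placeholder((n,), dtype='float32', name='x')
y = tvm.placeholder((n,), dtype='float32', name='y')
z = tvm.compute((n,), lambda i: x[i] + y[i], name='z')
s = tvm.create_schedule(z.op)
xo, xi = s[z].split(z.op.axis[0], factor=factor)
s[z].tensorize(xi, intrin_vadd(factor))
print(tvm.lower(s, [x, y, z], simple_mode=True))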
Example #11
def test_add():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)

    def check_c():
        mhost = tvm.build(s, [A, B, C], "c", name="fadd")
        temp = util.tempdir()
        path_dso = temp.relpath("temp.so")
        mhost.export_library(path_dso)
        m = tvm.module.load(path_dso)
        fadd = m['fadd']
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_c()
Example #12
def test_conv_tiling():
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
        out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.LoopPartition(stmt, True)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
Example #13
def test_multiple_func():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], "llvm")
        fadd1 = m['fadd1']
        fadd2 = m['fadd2']
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd1(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
        fadd2(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_llvm()
Example #14
def intrin_gemv(m, n):
    w = tvm.placeholder((m, n), name='w')
    x = tvm.placeholder((n,), name='x')
    k = tvm.reduce_axis((0, n), name='k')
    z = tvm.compute((m,), lambda i:
                    tvm.sum(w[i, k] * x[k], axis=k), name='z')
    Wb = tvm.decl_buffer(w.shape, w.dtype,
                         name="W",
                         offset_factor=16,
                         strides=[tvm.var('ldw'), 1])
    def intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        ww_ptr = ww.access_ptr("r")
        xx_ptr = xx.access_ptr("r")
        zz_ptr = zz.access_ptr("w")
        body = tvm.call_packed(
            "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        reset = tvm.call_packed(
            "fill_zero", zz_ptr, n)
        update = tvm.call_packed(
            "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        return body, reset, update

    with tvm.build_config(data_alignment=16,
                          offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func,
                                      binds={w: Wb})
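Because intrin_func here returns a (body, reset, update) triple, tensorize can also cover a split reduction axis. A sketch under assumed sizes; building would additionally need the packed functions gemm, fill_zero, and gemv_add registered:

M, K = 64, 64
w = tvm.placeholder((M, K), name='w')
x = tvm.placeholder((K,), name='x')
k = tvm.reduce_axis((0, K), name='k')
z = tvm.compute((M,), lambda i: tvm.sum(w[i, k] * x[k], axis=k), name='z')
s = tvm.create_schedule(z.op)
mo, mi = s[z].split(z.op.axis[0], factor=16)
ko, ki = s[z].split(z.op.reduce_axis[0], factor=16)
s[z].reorder(mo, ko, mi, ki)
s[z].tensorize(mi, intrin_gemv(16, 16))  # reset/update handle the partial sums over ko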
Example #15
def test_sort_np():
    dshape = (1, 2, 3, 4, 5, 6)
    axis = 4
    reduced_shape = (1, 2, 3, 4, 6)
    is_descend = False
    data = tvm.placeholder(dshape, name='data')
    sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32")
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0],
                         ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)

    np_data = np.random.uniform(size=dshape)
    np_out = np.argsort(np_data, axis=axis)
    sort_num_input = np.full(reduced_shape, dshape[axis])
    a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
Example #16
def test_lstm_cell_inline():
    num_step = 128
    num_input = 256
    num_hidden = 1152
    batch_size = 4
    # Global transition matrix
    X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X")
    Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h")
    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
    s_init_c = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_c")
    s_init_h = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_h")
    # LSTM transition
    k = tvm.reduce_axis((0, num_input), name="ki2h")
    s_i2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
        name="s_i2h")
    k = tvm.reduce_axis((0, num_hidden), name="ki2h")
    s_h2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
        name="s_h2h")
    # Gate rules
    gates = tvm.compute(s_i2h.shape, lambda *i:
                        s_i2h(*i) + s_h2h(*i), name="gates")
    gshape = (num_step, batch_size, num_hidden)
    in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate")
    in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform")
    forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate")
    out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate")
    next_c = tvm.compute(gshape,
                         lambda t, i, j:
                         forget_gate[t, i, j] * s_state_c[t - 1, i, j] +
                         in_gate[t, i, j] * in_transform[t, i, j], name="next_c")
    next_h = tvm.compute(gshape,
                         lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h")
    update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
    # schedule
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h, update_c],
        [s_state_h, s_state_c],
        inputs=[X],
        name="lstm_scan")
    # schedule
    s = tvm.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()
    # verify we can lower correctly
    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
Example #17
def test_argsort():
    dshape = (1, 8)
    valid_count_shape = (2,)
    data = tvm.placeholder(dshape, name="data", dtype="float32")
    valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
    np_data = np.random.rand(dshape[0], dshape[1]).astype(data.dtype)
    np_valid_count = np.array([4]).astype(valid_count.dtype)
    np_result = np.argsort(-np_data)
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            out = argsort(data, valid_count, axis=-1, is_ascend=False, flag=False)
            s = topi.generic.schedule_argsort(out)

        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype="float32"), ctx)
        f = tvm.build(s, [data, valid_count, out], device)
        f(tvm_data, tvm_valid_count, tvm_out)
        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result.astype("float32"), rtol=1e0)

    for device in ['llvm', 'cuda', 'opencl']:
        check_device(device)
Example #18
def test_sort():
    n = 2
    l = 5
    m = 3
    data = tvm.placeholder((n, l, m), name='data')
    sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32")
    axis = 1
    is_descend = True
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0],
                         ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")
    input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]],
             [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]]
    sort_num_input = [[1, 2, 3], [4, 5, 5]]
    sorted_index = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
                    [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]]

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    a = tvm.nd.array(np.array(input).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
Example #19
def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024):
    # Test Buffer
    register_mem(scope_tb, max_bits)
    m = 10
    A = tvm.placeholder((m,), name='A')
    C = tvm.placeholder((m,), name='C')
    D = tvm.placeholder((m,), name='D')
    A0 = tvm.compute((m,), lambda i: A[i] + C[i], name='A0')
    A1 = tvm.compute((m,), lambda i: D[i] * D[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A0[i] + A1[i], name='A2')
    B = tvm.compute((m,), lambda i: A2[i], name='B')
    s = tvm.create_schedule(B.op)
    A0L = s.cache_read(A0, scope_tb, [A2])
    A1L = s.cache_read(A1, scope_tb, [A2])
    A2L = s.cache_read(A2, scope_tb, [B])
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Cc = tvm.decl_buffer(C.shape, B.dtype, name='C')
    Dd = tvm.decl_buffer(D.shape, B.dtype, name='D')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cc, D:Dd}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify that inplace folding works and only two allocations remain
    num_alloc = [0]
    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 2
Example #20
def test_schedule_create():
    m = tvm.var('m')
    n = tvm.var('n')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.placeholder((n, l), name='B')
    AA = tvm.compute((m, l), lambda i, j: A[i, j])
    T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k))
    s = tvm.create_schedule(T.op)
    s[AA].set_scope("shared")
    xo, xi = s[T].split(T.op.axis[0], factor=10)
    xi1, xi2 = s[T].split(xi, factor=2)
    s[AA].compute_at(s[T], xi1)
    xo, xi = s[AA].split(AA.op.axis[0], factor=10)
    s[T].reorder(xi2, xi1)
    assert T.op.axis[1] in s[T].leaf_iter_vars

    # save load json
    json_str = tvm.save_json(s)
    s_loaded = tvm.load_json(json_str)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))

    # pickle unpickle
    dump = pkl.dumps(s)
    s_loaded = pkl.loads(dump)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))
Example #21
def test_upstream():
    @tvm.hybrid.script
    def upstream(a):
        b = output_tensor((20, ), 'float32')
        for i in range(20):
            b[i] = a[i] * i
        return b

    a = tvm.placeholder((20, ), 'float32')
    b = tvm.placeholder((20, ), 'float32')
    c = tvm.compute((20, ), lambda x: a[x] + b[x])
    d = upstream(c)
    sch = tvm.create_schedule([c.op, d.op])
    ir = tvm.lower(sch, [a, b, d], simple_mode=True)
    func = tvm.build(sch, [a, b, d])
    assert(func)

    a = numpy.random.randn(20).astype('float32')
    b = numpy.random.randn(20).astype('float32')
    ref = numpy.zeros((20, ), 'float32')
    for i in range(20):
        ref[i] = (a[i] + b[i]) * i

    tvm_a = tvm.nd.array(a)
    tvm_b = tvm.nd.array(b)
    tvm_d = tvm.nd.array(numpy.zeros((20, )).astype('float32'))

    func(tvm_a, tvm_b, tvm_d)
    tvm.testing.assert_allclose(tvm_d.asnumpy(), ref, 1e-5, 1e-5)
Example #22
def test_compile_cache():
    x = sym.Variable("x")
    y = sym.Variable("y")
    z = sym.exp(y + x)
    shape = (10, 1)
    dtype = tvm.float32
    shape_dict = {"x": shape, "y": shape}
    def verify(graph, lib):
        m = graph_runtime.create(graph, lib, tvm.cpu(0))
        # get member functions
        na = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
        nb = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
        m.run(x=na, y=nb)
        # get outputs
        out = m.get_output(0, tvm.nd.empty(shape, dtype))
        tvm.testing.assert_allclose(
            out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))

    engine = nnvm.compiler.engine
    graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
    inputs = [tvm.placeholder((10,)), tvm.placeholder((10,))]

    gkey = nnvm.compiler.graph_key(nnvm.graph.create(z), inputs, "llvm")
    gkey2 = nnvm.compiler.graph_key(nnvm.graph.create(z), inputs + inputs, "llvm")
    gf = engine[gkey]
    assert gf is not None
    assert engine[gkey2] is None
    graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
    assert graph.index.num_nodes == 3
    verify(graph, lib)
    # Test various set external cache
    engine.clear_cache()
    engine[gkey] = gf
Example #23
def test_looptype():
    @script
    def looptype(a, b, c):
        d = output_tensor((16, ), 'int32')
        e = output_tensor((16, ), 'int32')
        f = output_tensor((16, ), 'int32')
        for i in parallel(16):
            d[i] = a[i]
        for j in vectorize(16):
            e[j] = b[j]
        for k in unroll(16):
            f[k] = c[k]
        return d, e, f

    a = tvm.placeholder((16, ), name='a', dtype='int32')
    b = tvm.placeholder((16, ), name='b', dtype='int32')
    c = tvm.placeholder((16, ), name='c', dtype='int32')
    try:
        d, e, f = looptype(a, b, c)
        ir = d.op.body
    except:
        return
    iloop = ir.first
    jloop = ir.rest.first
    kloop = ir.rest.rest
    assert iloop.for_type == tvm.stmt.For.Parallel
    assert jloop.for_type == tvm.stmt.For.Vectorized
    assert kloop.for_type == tvm.stmt.For.Unrolled

    func, ins, outs = run_and_check(looptype, [a, b, c])
    run_and_check(func, ins, outs=outs)
Example #24
def test_non_zero():
    @tvm.hybrid.script
    def blur(a):
        b = output_tensor((30, 30), 'float32')
        for i in range(2, 32):
            for j in range(2, 32):
                s = 0.0
                for di in range(3):
                    for dj in range(3):
                        s += a[i-di, j-dj]
                b[i-2, j-2] = s / 9.0
        return b

    a = tvm.placeholder((32, 32), 'float32', 'a')
    func, ins, outs = run_and_check(blur, [a])
    run_and_check(func, ins, outs=outs)

    @tvm.hybrid.script
    def triangle(a, b):
        c = output_tensor((10, 10), dtype='float32')
        for i in range(10):
            for j in range(i, 10):
                c[i, j] = a[i] * b[j]
        return c

    a = tvm.placeholder((10, ), dtype='float32', name='a')
    b = tvm.placeholder((10, ), dtype='float32', name='b')

    func, ins, outs = run_and_check(triangle, [a, b])
    run_and_check(func, ins, outs=outs)
Example #25
def test_dot():
    nn = 12
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    k = tvm.reduce_axis((0, n), 'k')
    C = tvm.compute((1,), lambda _: tvm.sum(A[k] * B[k], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    fapi = lower(s, [A, B, C])

    def verify(target):
        if not tvm.module.enabled(target):
            print("Target %s is not enabled" % target)
            return
        f = tvm.codegen.build_module(fapi, target)
        # verify
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)

    verify("llvm")
Example #26
    def get_gemm_feature(target):
        k = tvm.reduce_axis((0, N), 'k')
        A = tvm.placeholder((N, N), name='A')
        B = tvm.placeholder((N, N), name='B')
        C = tvm.compute(A.shape, lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k),
                        name='C')

        s = tvm.create_schedule(C.op)

        y, x = s[C].op.axis
        axes = list(s[C].tile(y, x, 8, 8)) + [k]
        perm = np.random.permutation(5)
        axes = [axes[x] for x in perm]
        s[C].reorder(*axes)

        if "gpu" in target.keys:
            pick = []
            # filter out reduction axis
            for i in range(len(perm)):
                if perm[i] != 4:
                    pick.append(axes[i])
            s[C].bind(pick[0], tvm.thread_axis("blockIdx.x"))
            s[C].bind(pick[1], tvm.thread_axis("vthread"))
            s[C].bind(pick[2], tvm.thread_axis("threadIdx.y"))

        with target:
            feas = feature.get_itervar_feature(s, [A, B, C])
            feas = feature.flatten_itervar_feature(feas)
        return feas
Example #27
def matmul(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]
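A sketch of tuning this template, following the usual autotvm flow; it assumes `matmul` is decorated with @autotvm.template, and the sizes, trial count, and log-file name are illustrative:

task = autotvm.task.create(matmul, args=(512, 512, 512, 'float32'), target='llvm')
measure_option = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(number=5))
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul.log')])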
Example #28
def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))

    in_height = in_width = in_size

    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
    bias = tvm.placeholder((num_filter, 1, 1), name='bias')

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            C = topi.nn.conv2d(A, W, (stride, stride), (padding, padding),
                               (dilation, dilation), layout='NCHW', out_dtype=dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.generic.schedule_conv2d_nchw([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
        if add_bias:
            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
            func(a, w, b, c)
        else:
            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
            func(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4)

    for device in get_all_backend():
        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
            check_device(device)
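Illustrative invocations; the workload numbers are assumptions in the same ResNet style as the print statement above:

verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1)
verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)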
Example #29
def test_cpu():
    n = 1024
    dtype = "float32"
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    def test_device_ir(A, B, C):
        n = A.shape[0]
        max_threads = 8
        ib = tvm.ir_builder.create()
        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        with ib.for_range(0, n, name="i") as i:
            Cptr[i] = Aptr[i] + Bptr[i]
        body = ib.get()
        return body
    C = tvm.extern(A.shape, [A, B], lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                   name="vector_add", dtype=dtype)
    s = tvm.create_schedule(C.op)
    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        fadd = tvm.build(s, [A, B, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_target("llvm")
Example #30
def test_matmul_add():
    n = 1024
    l = 128
    m = 235
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = rocblas.matmul(A, B)
    s = tvm.create_schedule(C.op)

    def verify(target="rocm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.rocm(0)
        f = tvm.build(s, [A, B, C], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
    verify()
Example #31
def test_add_pipeline():
    n = tvm.var('n')
    A = tvm.placeholder((n, ), name='A')
    B = tvm.placeholder((n, ), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    D = tvm.compute(A.shape, lambda *i: C(*i) + 1, name='D')
    s = tvm.create_schedule(D.op)

    # a GPU schedule has to be split over blockIdx and threadIdx axes
    num_thread = 256
    xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
    s[C].bind(xo, tvm.thread_axis("threadIdx.x"))
    s[C].bind(xi, tvm.thread_axis("blockIdx.x"))

    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("threadIdx.x"))
    s[D].bind(xi, tvm.thread_axis("blockIdx.x"))

    # compile to IR
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Cb = tvm.decl_buffer(C.shape, C.dtype, name='C')
    stmt = tvm.ir_pass.LoopPartition(stmt)
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 0, True)
    fsplits = [x for x in tvm.ir_pass.SplitHostDevice(fapi)]
    fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])

    def check_target(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        if not tvm.module.enabled(device):
            return
        ctx = tvm.context(device, 0)
        mhost = tvm.codegen.build_module(fsplits[0], host)
        mdev = tvm.codegen.build_module(fsplits[1:], device)
        mhost.import_module(mdev)
        code = mdev.get_source()
        f = mhost.entry_func
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
        f(a, b, c)
        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    def check_module_save(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        if not tvm.module.enabled(device):
            return
        ctx = tvm.context(device, 0)
        fmt = "ptx" if device == "cuda" else "cl"
        mhost = tvm.codegen.build_module(fsplits[0], host)
        mdev = tvm.codegen.build_module(fsplits[1:], device)
        temp = util.tempdir()
        mpath = temp.relpath("test.%s" % fmt)
        mdev.save(mpath)
        mdev2 = tvm.module.load(mpath)
        mhost.import_module(mdev2)
        f = mhost.entry_func
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
        f(a, b, c)
        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_target("cuda", host="stackvm")
    check_target("cuda", host="llvm")
    check_module_save("cuda", host="stackvm")
    check_target("nvptx", host="llvm")
    check_target("rocm", host="llvm")
Example #32
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32"):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A", dtype=dtype)
    A1 = topi.sqrt(topi.exp(A))
    out_dtype = dtype
    if type == "sum":
        B = topi.sum(A1, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A1, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A1, axis=axis, keepdims=keepdims)
    elif type == "argmax":
        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    elif type == "argmin":
        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    else:
        raise NotImplementedError

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_reduce(B)

        foo = tvm.build(s, [A, B], device, name=type)
        # Test
        in_npy = np.random.uniform(size=in_shape).astype(dtype)
        in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)
        if type == "sum":
            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
        elif type == "argmax":
            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
        elif type == "argmin":
            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError
        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
        for _ in range(1):
            foo(data_tvm, out_tvm)
        if type == "argmax" or type == "argmin":
            out_tvm_indices = out_tvm.asnumpy()
            if keepdims:
                out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
            if axis is None:
                out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
            else:
                other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis+1):]))
                sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
                out_tvm_val = in_npy_map[sel_indices]
            if type == "argmax":
                np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
            elif type == "argmin":
                np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
        else:
            np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
    for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan", "nvptx"]:
        check_device(device)
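Plausible calls covering both the reduce and arg-reduce paths; the shapes are assumptions, and _my_npy_argmax/_my_npy_argmin come from the enclosing test file:

verify_reduce_map_ele((32, 24), axis=1, keepdims=False, type="sum")
verify_reduce_map_ele((32, 24), axis=1, keepdims=True, type="argmax")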
Example #33
import timeit
# a sketch of the missing setup implied by the surviving fragment below:
# define the matrix sizes and time a NumPy matmul as the baseline
# (the concrete sizes and target here are assumptions).
M = K = N = 1024
dtype = "float32"
target = 'llvm'
ctx = tvm.context(target, 0)
np_repeat = 100
np_running_time = timeit.timeit(setup='import numpy as np\n'
                                'M = ' + str(M) + '\n'
                                'K = ' + str(K) + '\n'
                                'N = ' + str(N) + '\n'
                                'dtype = "float32"\n'
                                'a = np.random.rand(M, K).astype(dtype)\n'
                                'b = np.random.rand(K, N).astype(dtype)\n',
                                stmt='answer = np.dot(a, b)',
                                number=np_repeat)
print("Numpy running time: %f" % (np_running_time / np_repeat))

# ground truth
a = tvm.nd.array(np.random.rand(M, K).astype(dtype), ctx)
b = tvm.nd.array(np.random.rand(K, N).astype(dtype), ctx)
c = tvm.nd.array(np.zeros((M, N), dtype=dtype), ctx)
answer = np.dot(a.asnumpy(), b.asnumpy())

###################
# TVM part
# Algorithm
k = tvm.reduce_axis((0, K), 'k')
A = tvm.placeholder((M, K), name='A')
B = tvm.placeholder((K, N), name='B')
C = tvm.compute((M, N),
                lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k),
                name='C')

# Default schedule
s = tvm.create_schedule(C.op)
func = tvm.build(s, [A, B, C], target=target, name='mmult')
print(tvm.lower(s, [A, B, C], simple_mode=True))

func(a, b, c)
tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)

evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
print('Baseline: %f' % evaluator(a, b, c).mean)
Example #34
    def visit_Assign(self, node):
        """Visit targets = value

    Returns
    -------
    Stmt: Store node, tvm.var, tvm.buffer, or tvm.compute IR
    """
        # Currently, we only allow one output target
        target = node.targets[0]
        index = 0
        content = None
        is_tvm = False
        dtype = "float32"

        # Analyze right hand side first
        if isinstance(node.value, ast.Call):
            call = node.value
            call_type = self.check_call_type(call)
            if len(call_type) == 1:
                # External function call. We do not support it right now
                content = self.visit(call)
            else:
                args = call.args
                keywords = call.keywords
                # Currently we only support tvm calls
                if call_type[0] == "tvm":
                    is_tvm = True
                    if call_type[1] == "var":  # tvm.var
                        assert isinstance(
                            target,
                            ast.Name), "target of tvm.var must be a name"
                        for keyword in keywords:  # check every keyword in tvm.var
                            if keyword.arg == "dtype":
                                dtype = keyword.value.s
                            elif keyword.arg == "name":
                                pass
                            else:
                                raise ValueError(
                                    "Unknown/Unsupported keywords to tvm.var: "
                                    + str(keyword[0]))
                        name = target.id
                        tvm_var = tvm.var(name, dtype=dtype)
                        var = {
                            'var': tvm_var,
                            'type': 'tvm',
                            'allocated': False
                        }
                        if name in self.arg_list:  # check whether this var belongs to io
                            self.io_dict[name] = {'arg': tvm_var}
                            var['allocated'] = True
                        self.insert_var(name, var)
                        content = None
                    elif call_type[1] == "placeholder":  # tvm.placeholder
                        assert isinstance(
                            target, ast.Name
                        ), "target of tvm.placeholder must be a name"
                        for keyword in keywords:  # check every keyword in tvm.var
                            if keyword.arg == "dtype":
                                dtype = keyword.value.s
                            elif keyword.arg == "name":
                                pass
                            else:
                                raise ValueError(
                                    "Unknown/Unsupported keywords to tvm.placeholder: "
                                    + str(keyword[0]))
                        name = target.id
                        shape = self.get_shape(call.args[0])
                        placeholder = tvm.placeholder(shape,
                                                      name=name,
                                                      dtype=dtype)
                        buff = tvm.decl_buffer(placeholder.shape,
                                               placeholder.dtype,
                                               placeholder.name)
                        buffer = {
                            'tensor': placeholder,
                            'buffer': buff,
                            'type': 'input',
                            'ast': node,
                            'shape': shape,
                            'allocated': False
                        }
                        if name in self.arg_list:
                            self.io_dict[name] = {'arg': buff}
                            buffer['allocated'] = True
                        self.insert_buffer(name, buffer)
                        content = None
                    elif call_type[1] == "compute":
                        name = target.id
                        shape = self.get_shape(call.args[0])
                        placeholder = tvm.placeholder(shape,
                                                      name=name,
                                                      dtype=dtype)
                        buff = tvm.decl_buffer(placeholder.shape,
                                               placeholder.dtype,
                                               placeholder.name)
                        buffer = {
                            'tensor': placeholder,
                            'buffer': buff,
                            'type': 'compute',
                            'ast': node,
                            'shape': shape,
                            'allocated': False
                        }
                        if name in self.arg_list:
                            self.io_dict[name] = {'arg': buff}
                            buffer['allocated'] = True
                        self.insert_buffer(name, buffer)
                        lamb = call.args[1]
                        assert isinstance(
                            lamb, ast.Lambda
                        ), "The second argument to tvm.compute must be a lambda function"
                        self.scope += 1
                        ret = self.visit(lamb)[0]
                        args = lamb.args.args
                        if len(shape) == 1:
                            var_name = args[0].id
                            var = tvm.var(var_name, "int32")
                            st = tvm.make.Store(buff.data, ret, var, self.true)
                            if not isinstance(ret, tuple):
                                ret = self.ReplaceVar(var_name,
                                                      var).mutate(ret)
                                st = tvm.make.Store(buff.data, ret, var,
                                                    self.true)
                                content = tvm.make.For(var, 0, shape[0], 0, 0,
                                                       st)
                            else:
                                ret[0] = self.ReplaceVar(var_name,
                                                         var).mutate(ret[0])
                                ret[1] = self.ReplaceVar(var_name,
                                                         var).mutate(ret[1])
                                st = tvm.make.Store(buff.data, ret[1], var,
                                                    self.true)
                                content = tvm.make.For(
                                    var, 0, shape[0], 0, 0,
                                    tvm.make.Block(ret[0], st))
                        else:
                            var_name1 = args[0].id
                            var_name2 = args[1].id
                            var1 = tvm.var(var_name1, "int32")
                            var2 = tvm.var(var_name2, "int32")
                            if not isinstance(ret, tuple):
                                ret = self.ReplaceVar(var_name1,
                                                      var1).mutate(ret)
                                ret = self.ReplaceVar(var_name2,
                                                      var2).mutate(ret)
                                st = tvm.make.Store(buff.data, ret,
                                                    (var1 * shape[1] + var2),
                                                    self.true)
                                expr = tvm.make.For(var2, 0, shape[1], 0, 0,
                                                    st)
                            else:
                                if ret[0] is not None:
                                    ret0 = self.ReplaceVar(var_name1,
                                                           var1).mutate(ret[0])
                                    ret0 = self.ReplaceVar(var_name2,
                                                           var2).mutate(ret0)
                                ret1 = self.ReplaceVar(var_name1,
                                                       var1).mutate(ret[1])
                                ret1 = self.ReplaceVar(var_name2,
                                                       var2).mutate(ret1)
                                st = tvm.make.Store(buff.data, ret1,
                                                    (var1 * shape[1] + var2),
                                                    self.true)
                                if ret[0] is not None:
                                    expr = tvm.make.For(
                                        var2, 0, shape[1], 0, 0,
                                        tvm.make.Block(ret0, st))
                                else:
                                    expr = tvm.make.For(
                                        var2, 0, shape[1], 0, 0, st)
                            content = tvm.make.For(var1, 0, shape[0], 0, 0,
                                                   expr)
                            self.scope -= 1
                    else:
                        raise ValueError(
                            "Unknown/Unsupported tvm function: tvm." +
                            call_type[1])
                    return content
                else:  # if call_type[1] == "tvm"
                    raise ValueError("Currently we only support tvm functions")
        else:  # if isinstance(node.value, ast.Call)
            content = self.visit(node.value)
        # left hand side
        var, name, _type = self.get_target(target)
        if _type == 'name':
            if var is None:
                if isinstance(content, int):
                    var = tvm.var(name, "int32")
                elif isinstance(content, tvm.expr.Load):
                    var = tvm.var(name, content.dtype)
                else:
                    var = tvm.var(name, "float32")
                self.insert_var(name, {
                    'var': var,
                    'type': 'intermediate',
                    'allocated': False
                })
            else:
                var = var['var']
        else:
            index = self.visit(target)
            var = var['buffer'].data

        assert (not is_tvm)
        if isinstance(node.value, ast.IfExp):
            then = tvm.make.Store(var, content[1], index)
            orelse = tvm.make.Store(var, content[2], index)
            return tvm.make.IfThenElse(content[0], then, orelse)
        else:
            return tvm.make.Store(var, content, index)
Example #35
def test_schedule():
    @script
    def outer_product(a, b):
        c = output_tensor((64, 64), a.dtype)
        for i in range(64):
            for j in range(64):
                c[i, j] = a[i] * b[j]
        return c

    a = tvm.placeholder((64, ), name='a', dtype='float32')
    b = tvm.placeholder((64, ), name='b', dtype='float32')
    c = outer_product(a, b)

    # Test perfect loop split
    # Test loop reorder
    # Test loop annotation
    sch = tvm.create_schedule(c.op)
    i, j = c.op.axis
    io, ii = sch[c].split(i, 4)
    sch[c].parallel(ii)
    jo, ji = sch[c].split(j, 4)
    joo, joi = sch[c].split(jo, 4)
    sch[c].vectorize(ji)
    sch[c].reorder(ii, io, joo, joi, ji)
    ir = tvm.lower(sch, [a, b, c], simple_mode=True)
    assert isinstance(ir, tvm.stmt.ProducerConsumer)
    ir = ir.body
    assert isinstance(ir, tvm.stmt.AttrStmt)
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'i.inner'
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'i.outer'
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'j.outer.outer'
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'j.outer.inner'
    ir = ir.body
    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
    run_and_check(func, ins, outs=outs)

    # Test fuse
    sch = tvm.create_schedule(c.op)
    sch[c].fuse(c.op.axis[0], c.op.axis[1])
    ir = tvm.lower(sch, [a, b, c], simple_mode=True)
    assert isinstance(ir, tvm.stmt.ProducerConsumer)
    ir = ir.body
    assert isinstance(ir, tvm.stmt.AttrStmt)
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'i.j.fused'
    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
    run_and_check(func, ins, outs=outs)

    # Test imperfect loop split
    sch = tvm.create_schedule(c.op)
    sch[c].split(c.op.axis[0], 3)
    ir = tvm.lower(sch, [a, b, c], simple_mode=True)
    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
    run_and_check(func, ins, outs=outs)
Example #36
def _alter_conv2d_layout(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    if 'cudnn' in tvm.target.current_target().libs or \
            'miopen' in tvm.target.current_target().libs:
        return None

    import nnvm.symbol as sym
    copy_inputs = [s for s in inputs]

    new_attrs = {k: attrs[k] for k in attrs.keys()}

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    data, kernel = tinfos[0:2]
    N, CI, H, W = get_const_tuple(data.shape)
    CO, _, KH, KW = get_const_tuple(kernel.shape)

    dispatch_ctx = autotvm.DispatchContext.current
    target = tvm.target.current_target()

    if groups == 1:
        # query config of this workload
        workload = autotvm.task.args_to_workload([
            tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype
        ], conv2d)
        cfg = autotvm.DispatchContext.current.query(target, workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(target, workload)
            return None

        if cfg.template_key == 'direct':
            return None

        if cfg.template_key == 'int8':
            assert 'cuda' in target.keys
            new_layout = 'NCHW4c'
            new_attrs['layout'] = new_layout
            new_attrs['out_layout'] = new_layout
            new_attrs['kernel_layout'] = 'OIHW4o4i'
            ic_block_factor = oc_block_factor = 4

            # Store the same config for the altered operator (workload)
            new_data = tvm.placeholder(
                (N, CI // ic_block_factor, H, W, ic_block_factor),
                dtype=data.dtype)
            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW,
                                         oc_block_factor, ic_block_factor), dtype=kernel.dtype)
            new_workload = autotvm.task.args_to_workload([
                new_data, new_kernel, strides, padding, dilation, new_layout,
                out_dtype
            ], conv2d)
            dispatch_ctx.update(target, new_workload, cfg)
            return sym.conv2d(*copy_inputs, **new_attrs)

        if attrs.get_int_tuple("dilation") != (1, 1):
            warnings.warn(
                "Does not support weight pre-transform for dilated convolution."
            )
            return None

        # pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(tinfos[0], tinfos[1])

        weight = sym.contrib.conv2d_winograd_weight_transform(
            copy_inputs[1], tile_size=tile_size)
        weight = sym.transpose(weight, axes=[0, 1, 3, 2])
        copy_inputs[1] = weight
        new_attrs['tile_size'] = tile_size

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = tvm.placeholder(
            (KH + tile_size - 1, KW + tile_size - 1, CI, CO),
            dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload([
            new_data, new_weight, strides, padding, dilation, layout,
            out_dtype, tile_size
        ], conv2d_winograd_without_weight_transform)
        dispatch_ctx.update(target, new_workload, cfg)
        return sym.contrib.conv2d_winograd_without_weight_transform(
            *copy_inputs, **new_attrs)
    elif groups != CI:
        workload = autotvm.task.args_to_workload([
            tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype
        ], group_conv2d_nchw)
        cfg = autotvm.DispatchContext.current.query(target, workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(target, workload)
            return None

        if cfg.template_key == 'int8':
            assert 'cuda' in target.keys
            new_layout = 'NCHW4c'
            new_attrs['layout'] = new_layout
            new_attrs['out_layout'] = new_layout
            new_attrs['kernel_layout'] = 'OIHW4o4i'
            ic_block_factor = oc_block_factor = 4

            # Store the same config for the altered operator (workload)
            new_data = tvm.placeholder(
                (N, CI // ic_block_factor, H, W, ic_block_factor),
                dtype=data.dtype)
            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups,
                                         KH, KW, oc_block_factor, ic_block_factor),
                                         dtype=kernel.dtype)
            new_workload = autotvm.task.args_to_workload([
                new_data, new_kernel, strides, padding, dilation, groups,
                out_dtype
            ], group_conv2d_nchw)
            dispatch_ctx.update(target, new_workload, cfg)
            return sym.conv2d(*copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None
Example #37
def test_tensor_inputs():
    x = tvm.placeholder((1,), name='x')
    y = tvm.compute(x.shape, lambda i: x[i] + x[i])
    assert tuple(y.op.input_tensors) == (x,)
Example #38
def matvec(n, m, l):
    wei = tvm.placeholder((n, m), dtype='float32')
    data = tvm.placeholder((l, m), dtype='float32')
    res = topi.nn.dense(data, wei)

    cfg = autotvm.DispatchContext.current.query(None, None)

    s = tvm.create_schedule(res.op)

    if not tvm.gpu(0).exist:
        raise ValueError('GPU device is not available')

    n, k = get_const_tuple(data.shape)
    m, _ = get_const_tuple(wei.shape)
    cfg.add_flop(2 * n * l * m)

    output = res
    OL = s.cache_write(res, 'local')

    # create cache stage
    AA = s.cache_read(data, 'shared', [OL])
    WW = s.cache_read(wei, 'shared', [OL])

    # bind
    y, x = s[output].op.axis
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
    scope, y = s[output].split(y, nparts=1)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(scope, by, bx, vy, vx, ty, tx, yi, xi)
    s[OL].compute_at(s[output], tx)

    # tile and bind reduction axes
    y, x = s[OL].op.axis
    r, = s[OL].op.reduce_axis
    cfg.define_split("tile_r", cfg.axis(r), num_outputs=3)
    ro, rm, ri = cfg['tile_r'].apply(s, OL, r)
    s[OL].reorder(ro, rm, ri, y, x)

    s[AA].compute_at(s[OL], ro)
    s[WW].compute_at(s[OL], rm)
    # s[AL].compute_at(s[OL], rxm)
    # s[WL].compute_at(s[OL], rxm)

    for load in [AA, WW]:
        fused = s[load].fuse(*list(s[load].op.axis))
        fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
        fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    cfg.other_option("auto_unroll_max_step", [0, 512, 1500])
    cfg.other_option("unroll_explicit", [0, 1])
    s[output].pragma(scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    return s, [data, wei, res]
Example #39
"""
from __future__ import absolute_import, print_function

import tvm
import numpy as np

######################################################################
# Define Matrix Multiplication
# ----------------------------
# Take matrix multiplication as our example.
# Matmul first multiplies the corresponding elements of the two matrices,
# then accumulates across a certain axis.
# The following lines describe the computation :code:`A * B^T` in TVM.
#
N, M, L = 1024, 512, 64
A = tvm.placeholder((N, L), name='A')
B = tvm.placeholder((M, L), name='B')
k = tvm.reduce_axis((0, L), name='k')
C = tvm.compute((N, M),
                lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k),
                name='C')
s = tvm.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

######################################################################
# Schedule the Matmul
# -------------------
# Now, suppose we have an accelerator that supports
# matrix-vector multiplication (GEMV) as a hardware primitive,
# which can take a reduce axis of arbitrary size,
# but the other axis must be no larger than 16.
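#
# A hedged sketch of the natural next step (this is not the original
# tutorial's code): split the second output axis by the 16-element hardware
# limit and move the reduce axis innermost, which is the loop structure a
# GEMV intrinsic would expect to tensorize.
factor = 16
x, y = C.op.axis
z, = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
print(tvm.lower(s, [A, B, C], simple_mode=True))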
Example #40
env = nnpu.get_env()
nnpu.set_device(env, type=args.sim)

with ScheduleProcHelper():
    env = nnpu.get_env()
    shape = (32, 64)  # (32, 64) -> (32, )
    rshape = (16, 16)  # the shape that MReduceSum insn accepts
    assert shape[0] % rshape[0] == 0, 'height must be divisible by {0}'.format(
        rshape[0])
    assert shape[0] % env.cfg['vector_unit']['size'] == 0, \
        'height must be divisible by {0}'.format(env.cfg['vector_unit']['size'])
    assert shape[1] % rshape[1] == 0, 'width must be divisible by {0}'.format(
        rshape[1])
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_n, 'a')

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')

    k = tvm.reduce_axis((0, shape[1]), 'k0')
    re_shape = (shape[0], )
    re_buf = tvm.compute(
        re_shape, lambda i: tvm.sum(a_buf[i, k].astype(dtype_w), axis=k),
        're_buf')
    nnpu.utils.MarkScope(re_buf, 'acc')

    res_buf = nnpu.utils.CopyAccToBuf(re_buf, 'res')

    res_host, _ = nnpu.utils.CopyBufToH(res_buf, 'res')

    s = nnpu.create_schedule(res_host.op)
Example #41
def test_gemm():
    # graph
    nn = 1024
    n = tvm.var('n')
    n = tvm.convert(nn)
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((m, l), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute(
        (n, m),
        lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
        name='CC')
    # schedule
    s = tvm.create_schedule(C.op)
    xtile, ytile = 32, 32
    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis("threadIdx.y")

    CC = s.cache_write(C, "local")
    AA = s.cache_read(A, "shared", [CC])
    BB = s.cache_read(B, "shared", [CC])
    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].reorder(by, bx, yi, xi)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    ty, yi = s[C].split(yi, nparts=num_thread)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(ty, tx, yi, xi)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)


    s[CC].compute_at(s[C], tx)
    s[AA].compute_at(s[CC], k)
    s[BB].compute_at(s[CC], k)
    s[AA].double_buffer()
    s[BB].double_buffer()
    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)

    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)

    # lowering test
    s = s.normalize()

    # one line to build the function.
    def check_device(device):
        if not tvm.module.enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        f = tvm.build(s, [A, B, C], device)
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = nn
        m = n
        l = n
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        ftimer = f.time_evaluator(f.entry_name, ctx, number=1)
        tcost = ftimer(a, b, c).mean
        print("%s: exec=%g sec/op" % (ctx, tcost))
        np.testing.assert_allclose(
            c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5)

    check_device("nvptx -mcpu=sm_20")
    check_device("rocm")
    check_device("metal")
    check_device("opencl")
    check_device("cuda")
Example #42
def verify_resize3d(batch,
                    in_channel,
                    in_depth,
                    in_height,
                    in_width,
                    out_depth,
                    out_height,
                    out_width,
                    layout='NCDHW',
                    coordinate_transformation_mode="half_pixel",
                    method="trilinear"):
    if layout == 'NCDHW':
        A = tvm.placeholder((batch, in_channel, in_depth, in_height, in_width),
                            name='A',
                            dtype='float32')
        dtype = A.dtype
        out_shape = (batch, in_channel, out_depth, out_height, out_width)
        a_np = np.random.uniform(size=(batch, in_channel, in_depth, in_height,
                                       in_width)).astype(dtype)
    elif layout == 'NDHWC':
        A = tvm.placeholder((batch, in_depth, in_height, in_width, in_channel),
                            name='A',
                            dtype='float32')
        dtype = A.dtype
        out_shape = (batch, out_depth, out_height, out_width, in_channel)
        a_np = np.random.uniform(size=(batch, in_depth, in_height, in_width,
                                       in_channel)).astype(dtype)
    else:
        raise NotImplementedError('Layout not supported {} '.format(layout))

    B = topi.image.resize3d(
        A, (out_depth, out_height, out_width),
        layout=layout,
        coordinate_transformation_mode=coordinate_transformation_mode,
        method=method)

    if method == "trilinear":
        b_np = topi.testing.trilinear_resize3d_python(
            a_np, (out_depth, out_height, out_width), layout,
            coordinate_transformation_mode)
    else:
        scale_d = out_depth / in_depth
        scale_h = out_height / in_height
        scale_w = out_width / in_width
        b_np = topi.testing.upsampling3d_python(a_np,
                                                (scale_d, scale_h, scale_w),
                                                layout)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_injective(B)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx)
        f = tvm.build(s, [A, B], device)
        f(a, b)

        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)

    for device in get_all_backend():
        check_device(device)
Example #43
def gemm_int8(n, m, l):
    A = tvm.placeholder((n, l), name='A', dtype='int8')
    B = tvm.placeholder((m, l), name='B', dtype='int8')

    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype(
        'int32'), axis=k), name='C')

    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)
    y, x = C.op.axis

    AA = s.cache_read(A, 'shared', [C])
    BB = s.cache_read(B, 'shared', [C])
    AL = s.cache_read(AA, 'local', [C])
    BL = s.cache_read(BB, 'local', [C])
    CC = s.cache_write(C, 'local')

    k = CC.op.reduce_axis[0]

    cfg.define_split('tile_k', cfg.axis(k), num_outputs=3,
                     filter=lambda entity: entity.size[2] == 4 and \
                     entity.size[0] * 2 >= entity.size[1])

    ko, kt, ki = cfg['tile_k'].apply(s, CC, k)

    s[CC].tensorize(ki, intrin_dp4a)

    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')

    def block_size_filter(entity):
        return entity.size[0] * 2 >= entity.size[1] * 2 and \
                entity.size[1] <= 16 and entity.size[3] <= 4
    cfg.define_split('tile_y', cfg.axis(y), num_outputs=4, filter=block_size_filter)
    cfg.define_split('tile_x', cfg.axis(x), num_outputs=4, filter=block_size_filter)
    by, tyz, ty, yi = cfg['tile_y'].apply(s, C, y)
    bx, txz, tx, xi = cfg['tile_x'].apply(s, C, x)

    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)

    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)

    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
        _, xi = s[stage].split(stage.op.axis[1], factor=4)
        s[stage].vectorize(xi)
        s[stage].double_buffer()

    cfg.define_knob('storage_align', [16, 48])
    for stage in [AA, BB]:
        s[stage].storage_align(s[stage].op.axis[0],
                               cfg['storage_align'].val, 0)
        s[stage].compute_at(s[CC], ko)

        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['tile_y'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['tile_x'].size[2])
        _, xi = s[stage].split(xi, factor=16)

        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)

    cfg.define_knob('auto_unroll_max_step', [512, 1500])
    s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[C].pragma(by, 'unroll_explicit', False)

    cfg.add_flop(n*m*l*2)
    return s, [A, B, C]
Example #44
def test_stmt_constructor():
    v = tvm.var("aa")
    buffer_var = tvm.var("buf", dtype="handle")
    nop = tvm.stmt.Evaluate(1)
    x = tvm.stmt.LetStmt(v, 1, tvm.stmt.Evaluate(1))
    assert isinstance(x, tvm.stmt.LetStmt)
    assert x.var == v
    assert x.value.value == 1
    assert isinstance(x.body, tvm.stmt.Evaluate)

    x = tvm.stmt.AttrStmt(v == 1, "xx", 1, tvm.stmt.Evaluate(1))
    assert isinstance(x, tvm.stmt.AttrStmt)
    assert x.value.value == 1

    x = tvm.stmt.AssertStmt(tvm.const(1, "uint1"),
                            tvm.convert("hellow"),
                            nop)
    assert isinstance(x, tvm.stmt.AssertStmt)
    assert x.body == nop

    x = tvm.stmt.ProducerConsumer(None, True, nop)
    assert isinstance(x, tvm.stmt.ProducerConsumer)
    assert x.body == nop

    x = tvm.stmt.For(tvm.var("x"), 0, 10, 0, 0, nop)
    assert isinstance(x, tvm.stmt.For)
    assert x.min.value == 0
    assert x.extent.value == 10
    assert x.body == nop

    x = tvm.stmt.Store(buffer_var, 1, 10, tvm.const(1, "uint1"))
    assert isinstance(x, tvm.stmt.Store)
    assert x.buffer_var == buffer_var
    assert x.index.value == 10
    assert x.value.value == 1

    tensor = tvm.placeholder((), dtype="float32")
    x = tvm.stmt.Provide(tensor.op, 0, 10, [])
    assert isinstance(x, tvm.stmt.Provide)
    assert x.value_index == 0
    assert x.value.value == 10

    x = tvm.stmt.Allocate(buffer_var, "float32", [10],
                          tvm.const(1, "uint1"), nop)
    assert isinstance(x, tvm.stmt.Allocate)
    assert x.dtype == "float32"
    assert x.buffer_var == buffer_var
    assert x.body == nop

    x = tvm.stmt.AttrStmt(buffer_var, "xyz", 1, nop)
    assert isinstance(x, tvm.stmt.AttrStmt)
    assert x.node == buffer_var
    assert x.attr_key == "xyz"
    assert x.body == nop

    x = tvm.stmt.Free(buffer_var)
    assert isinstance(x, tvm.stmt.Free)
    assert x.buffer_var == buffer_var

    x = tvm.stmt.Realize(None, 0, "float", [], tvm.const(1, "uint1"), nop)
    assert isinstance(x, tvm.stmt.Realize)
    assert x.body == nop

    x = tvm.stmt.IfThenElse(tvm.const(1, "uint1"),
                            tvm.stmt.Evaluate(11),
                            nop)
    assert isinstance(x, tvm.stmt.IfThenElse)
    assert x.then_case.value.value == 11
    assert x.else_case == nop

    x = tvm.stmt.Prefetch(None, 1, "float32", [])
    assert isinstance(x, tvm.stmt.Prefetch)
    assert x.value_index == 1
Example #45
def test_dwarf_debug_information():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.placeholder((n, ), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)

    def check_llvm_object():
        if not tvm.module.enabled("llvm"):
            return
        if tvm.codegen.llvm_version_major() < 5:
            return
        if tvm.codegen.llvm_version_major() > 6:
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], "llvm")
        temp = util.tempdir()
        o_path = temp.relpath("temp.o")
        m.save(o_path)
        import re
        import shutil
        import subprocess
        import sys

        # Try the dwarfdump utility (OS X)
        if shutil.which("dwarfdump"):
            output = subprocess.check_output(["dwarfdump", o_path])
            assert re.search(r"""DW_AT_name\\t\("fadd1"\)""", str(output))
            assert re.search(r"""DW_AT_name\\t\("fadd2"\)""", str(output))

        # Try gobjdump (OS X)
        if shutil.which("gobjdump"):
            output = subprocess.check_output(["gobjdump", "--dwarf", o_path])
            assert re.search(r"""DW_AT_name.*fadd1""", str(output))
            assert re.search(r"""DW_AT_name.*fadd2""", str(output))

        # Try objdump (Linux) - Darwin objdump has different DWARF syntax.
        if shutil.which("objdump") and sys.platform != 'darwin':
            output = subprocess.check_output(["objdump", "--dwarf", o_path])
            assert re.search(r"""DW_AT_name.*fadd1""", str(output))
            assert re.search(r"""DW_AT_name.*fadd2""", str(output))

    def check_llvm_ir():
        if not tvm.module.enabled("llvm"):
            return
        if tvm.codegen.llvm_version_major() < 5:
            return
        if tvm.codegen.llvm_version_major() > 6:
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], target="llvm -target=aarch64-linux-gnu")
        ll = m.get_source("ll")

        # On non-Darwin OS, don't explicitly specify DWARF version.
        import re
        assert not re.search(r""""Dwarf Version""" "", ll)
        assert re.search(r"""llvm.dbg.value""", ll)

        # Try Darwin, require DWARF-2
        m = tvm.build([f1, f2],
                      target="llvm -target=x86_64-apple-darwin-macho")
        ll = m.get_source("ll")
        assert re.search(r"""i32 4, !"Dwarf Version", i32 2""", ll)
        assert re.search(r"""llvm.dbg.value""", ll)

    check_llvm_object()
    check_llvm_ir()
Example #46
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (2, 2, 16)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, 'a')

    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    k = tvm.reduce_axis((0, 2), 'k')
    add_buf = tvm.compute(
        (2, 16), lambda i, j: tvm.sum(a_buf[k, i, j], axis=k), 'add_buf')
    sph.MarkScope(add_buf)
    add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph)

    k1 = tvm.reduce_axis((0, 2), 'k1')
    mul_buf = tvm.compute(
        (2, 16), lambda i, j: tvm.sum(a_buf[k1, i, j], axis=k1), 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    s = tvm.create_schedule([add_host.op, mul_host.op])
    sph.Transform(s)

    ko, ki = s[add_buf].split(add_buf.op.reduce_axis[0], factor=1)
    s[add_buf].reorder(ko, ki, *(s[add_buf].op.axis))
    s[add_buf].tensorize(ki, env.intrins.get('MAddMerge',
                                             shape=shape,
                                             mode='w'))

    ko1, ki1 = s[mul_buf].split(mul_buf.op.reduce_axis[0], factor=1)
    s[mul_buf].reorder(ko1, ki1, *(s[mul_buf].op.axis))
    s[mul_buf].tensorize(ki1,
                         env.intrins.get('MMulMerge', shape=shape, mode='w'))

    print(nnpu.lower(s, [a, add_host, mul_host], simple_mode=True))

    func = nnpu.build(s, [a, add_host, mul_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_func')
    #exit()
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(2, 2, 16), dtype=a.dtype, low=-16, high=16)
    a_nd = tvm.nd.array(a_np, ctx)

    add_nd = tvm.nd.array(np.zeros((2, 16)).astype(add_host.dtype), ctx)

    mul_nd = tvm.nd.array(np.zeros((2, 16)).astype(mul_host.dtype), ctx)

    func(a_nd, add_nd, mul_nd)

    print('a = ')
    print(a_np)
    print('reduce sum row = ')
    print(add_nd.asnumpy())
    print('ground truth is: ')
    gt = np.sum(a_np, axis=0)
    print(gt)
    np.testing.assert_allclose(add_nd.asnumpy(), gt)

    print('reduce mul row = ')
    print(mul_nd.asnumpy())
    gt = np.multiply.reduce(a_np, axis=0, dtype=a.dtype)
    print(gt)
    np.testing.assert_allclose(mul_nd.asnumpy(), gt)
Example #47
def test_llvm_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)

    def verify_elf(path, e_machine):
        with open(path, "rb") as fi:
            arr = fi.read(20)
            assert struct.unpack('ccc', arr[1:4]) == (b'E',b'L',b'F')
            endian = struct.unpack('b', arr[0x5:0x6])[0]
            endian = '<' if endian == 1 else '>'
            assert struct.unpack(endian + 'h', arr[0x12:0x14])[0] == e_machine

    def build_i386():
        if not tvm.runtime.enabled("llvm"):
            print("Skip because llvm is not enabled..")
            return
        temp = util.tempdir()
        target = "llvm -target=i386-pc-linux-gnu"
        f = tvm.build(s, [A, B, C], target)
        path = temp.relpath("myadd.o")
        f.save(path)
        verify_elf(path, 0x03)

    def build_arm():
        target = "llvm -target=armv7-none-linux-gnueabihf"
        if not tvm.runtime.enabled(target):
            print("Skip because %s is not enabled.." % target)
            return
        temp = util.tempdir()
        f = tvm.build(s, [A, B, C], target)
        path = temp.relpath("myadd.o")
        f.save(path)
        verify_elf(path, 0x28)
        asm_path = temp.relpath("myadd.asm")
        f.save(asm_path)
        # Do a RPC verification, launch kernel on Arm Board if available.
        host = os.environ.get('TVM_RPC_ARM_HOST', None)
        remote = None
        if host:
            port = int(os.environ['TVM_RPC_ARM_PORT'])
            try:
                remote = rpc.connect(host, port)
            except tvm.TVMError as e:
                pass

        if remote:
            remote.upload(path)
            farm = remote.load_module("myadd.o")
            ctx = remote.cpu(0)
            n = nn
            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
            b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
            farm(a, b, c)
            tvm.testing.assert_allclose(
                c.asnumpy(), a.asnumpy() + b.asnumpy())
            print("Verification finish on remote..")

    build_i386()
    build_arm()
Example #48
def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))

    in_height = in_width = in_size

    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8')
    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8')
    bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias',
                            dtype='int8')

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype)
        w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype)

        # convert to NCHWc
        _, _, out_height, out_width = c_np.shape
        c_np = c_np.reshape((batch, num_filter // oc_block_factor, oc_block_factor, \
                out_height, out_width)).transpose(0, 1, 3, 4, 2)

        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)

        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version):
            print("Skip because int8 intrinsics are not available")
            return

        print("Running on target: %s" % device)
        with tvm.target.create(device):
            dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
            C = topi.nn.conv2d(A, dW, (stride, stride), (padding, padding),
                               layout='NCHW', out_dtype=dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.generic.schedule_conv2d_nchw([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
        if add_bias:
            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
            func(a, w, b, c)
        else:
            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
            func(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

    for device in ["cuda"]:
        check_device(device)
Example #49
def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96):
    A = tvm.placeholder((n, l), name='A', dtype='float16')
    B = tvm.placeholder((l, m), name='B', dtype='float16')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m), lambda i, j: tvm.sum(
        A[i, k].astype('float32') * B[k, j].astype('float32'), axis=k))
    s = tvm.create_schedule(C.op)
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CL = s.cache_write(C, "local")

    bx = 4
    by = 32
    step_k = 8
    v = 4
    TX = 8
    TY = 1
    tile_x = bx * TX
    tile_y = by * TY
    WX = min(warp_tile_m, tile_x)
    tile_k = 16
    vthread = 1

    yo, ty = s[C].split(y, tile_y * vthread)
    vy, ty = s[C].split(ty, tile_y)
    ty, yi = s[C].split(ty, TY)

    xo, xi = s[C].split(x, tile_x)
    tz, xi = s[C].split(xi, WX)
    tx, xi = s[C].split(xi, TX)
    ko, ki = s[CL].split(k, step_k * tile_k)
    kl, ki = s[CL].split(ki, tile_k)

    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
    s[C].bind(yo, tvm.thread_axis("blockIdx.y"))
    s[C].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[C].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[C].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[C].bind(vy, tvm.thread_axis((0, vthread), "vthread", name="vy"))
    s[CL].compute_at(s[C], tx)
    yo, xo = CL.op.axis
    s[CL].reorder(ko, kl, ki, yo, xo)

    s[AA].compute_at(s[CL], ko)
    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
    tx, vec = s[AA].split(tx, factor=v)
    fused = s[AA].fuse(s[AA].op.axis[0], xo)
    _, ty = s[AA].split(fused, factor=by)
    s[AA].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[AA].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[AA].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[AA].vectorize(vec)

    s[BB].compute_at(s[CL], ko)
    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
    tx, vec = s[BB].split(tx, factor=v)
    fused = s[BB].fuse(s[BB].op.axis[0], xo)
    _, ty = s[BB].split(fused, factor=by)
    s[BB].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[BB].bind(tz, tvm.thread_axis("threadIdx.z"))
    s[BB].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[BB].vectorize(vec)

    s[AL].compute_at(s[CL], kl)
    s[BL].compute_at(s[CL], kl)

    s[CL].pragma(ko, 'tensor_core')

    func = tvm.build(s, [A, B, C], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(size=(l, m)).astype(B.dtype)
    c_np = np.zeros((n, m), dtype=np.float32)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('gemm m=%d n=%d k=%d: %f ms' %
          (m, n, l, evaluator(a, b, c).mean * 1e3))

    c_np = np.dot(a_np, b_np)
    np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3)
Example #50
def test_rpc_remote_module():
    if not tvm.module.enabled("rpc"):
        return
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)

    def check_remote():
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_remote_link_cl():
        """Test function to run remote code such as cl

        This is not enabled because there is forking issue
        of TVM runtime when server launches after OpenCL
        runtime initializes. We leave it as an example
        on how to do rpc when we want to do linking on remote.
        """
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        if not tvm.module.enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cl(0)
        s = tvm.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        # Option 2: export library as a tar ball then handled by remote compiler
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote()
Example #51
def test_convolution_inference():
    BATCH = 8
    IH = 48
    IW = 48
    IC = 16
    OC = 16
    K = 3
    PAD = 1
    STRIDE = 1

    OH = (IH + 2 * PAD - K) + 1
    OW = (IW + 2 * PAD - K) + 1
    dshape = (BATCH, IC, IH, IW)
    kshape = (OC, IC, K, K)
    bshape = (OC, )
    oshape = (BATCH, OC, OH, OW)

    data = tvm.placeholder(dshape, name='data')
    kernel = tvm.placeholder(kshape, name='kernel')
    bias = tvm.placeholder(bshape, name='bias')

    def verify(target="llvm",
               algorithm=nnpack.ConvolutionAlgorithm.AUTO,
               with_bias=True):
        if not tvm.module.enabled(target):
            pytest.skip("%s is not enabled..." % target)
        if not tvm.get_global_func(
                "tvm.contrib.nnpack.fully_connected_inference", True):
            pytest.skip("extern function is not available")
        if not nnpack.is_available():
            pytest.skip("nnpack is not available")

        ctx = tvm.cpu(0)
        output = nnpack.convolution_inference(data,
                                              kernel,
                                              bias if with_bias else None,
                                              [PAD, PAD, PAD, PAD],
                                              [STRIDE, STRIDE],
                                              algorithm=algorithm)
        s = tvm.create_schedule(output.op)

        f = tvm.build(s, [data, kernel, bias, output], target)

        na = np.random.uniform(size=dshape).astype(data.dtype)
        nb = np.random.uniform(size=kshape).astype(kernel.dtype)
        nc = np.zeros(bshape, dtype=bias.dtype)
        ta = tvm.nd.array(na, ctx)
        tb = tvm.nd.array(nb, ctx)
        tc = tvm.nd.array(nc, ctx)
        td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
        f(ta, tb, tc, td)
        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD,
                     STRIDE) + nc.reshape(1, bshape[0], 1, 1)
        tvm.testing.assert_allclose(td.asnumpy(),
                                    nd.reshape(BATCH, IC, IH, IW),
                                    rtol=1e-5)

    for algorithm in [
            nnpack.ConvolutionAlgorithm.AUTO,
            nnpack.ConvolutionAlgorithm.FFT_8x8,
            nnpack.ConvolutionAlgorithm.FFT_16x16,
            nnpack.ConvolutionAlgorithm.WT_8x8,
            nnpack.ConvolutionAlgorithm.IMPLICIT_GEMM,
            nnpack.ConvolutionAlgorithm.WT_8x8_FP16,
    ]:
        for with_bias in [True, False]:
            verify(algorithm=algorithm, with_bias=with_bias)
Example #52
def verify_depthwise_conv2d_back_input(batch, in_channel, in_h,
                                       channel_multiplier, filter_h, stride_h,
                                       padding_h):
    in_w = in_h
    filter_channel = in_channel
    filter_w = filter_h
    stride_w = stride_h
    padding_w = padding_h

    out_h = int((in_h + 2 * padding_h - filter_h) / stride_h + 1)
    out_w = int((in_w + 2 * padding_w - filter_w) / stride_w + 1)
    out_channel = in_channel * channel_multiplier

    ishape = [batch, in_h, in_w, in_channel]
    oshape = [batch, out_h, out_w, out_channel]

    # placeholder
    Out_grad = tvm.placeholder(oshape, name='Out_grad')
    Filter = tvm.placeholder(
        (filter_h, filter_w, filter_channel, channel_multiplier))
    # declare
    In_grad = topi.nn.depthwise_conv2d_backward_input_nhwc(
        Filter,
        Out_grad,
        oshape,
        ishape,
        stride=[stride_h, stride_w],
        padding=[padding_h, padding_w])
    # schedule
    schedule = schedule_depthwise_conv2d_backward_input_nhwc(In_grad)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        # build the kernel
        f = tvm.build(schedule, [Filter, Out_grad, In_grad], device)
        # prepare pod type for test data closure
        dtype = Out_grad.dtype
        out_grad_shape = get_const_tuple(Out_grad.shape)
        filter_shape = get_const_tuple(Filter.shape)

        # use memoize to pickle the test data for next time use
        @memoize("topi.tests.test_topi_depthwise_conv2d_backward_input.nhwc")
        def get_ref_data():
            out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype)
            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
            dilated_out_grad_np = topi.testing.dilate_python(
                out_grad_np, [1, stride_h, stride_w, 1])
            # padding params in forward propagation
            fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(
                [padding_h, padding_w], (filter_h, filter_w))
            # padding params in backward propagation
            bpad_top = filter_h - 1 - fpad_top
            bpad_bottom = (filter_h - 1 - fpad_bottom) + (stride_h - 1)
            bpad_left = filter_w - 1 - fpad_left
            bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1)
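            # Worked example (illustrative numbers, not from the test suite):
            # with filter_h = 3, stride_h = 1, fpad_top = fpad_bottom = 1,
            # this gives bpad_top = 3 - 1 - 1 = 1 and
            # bpad_bottom = (3 - 1 - 1) + (1 - 1) = 1, i.e. the dilated
            # gradient is re-padded so that the 'valid' convolution below
            # reproduces the input-gradient shape.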

            padded_out_grad = np.zeros(
                (batch, dilated_out_grad_np.shape[1] + bpad_top + bpad_bottom,
                 dilated_out_grad_np.shape[2] + bpad_left + bpad_right,
                 out_channel))
            padded_out_grad[:,
                            bpad_top:dilated_out_grad_np.shape[1] + bpad_top,
                            bpad_left:dilated_out_grad_np.shape[2] +
                            bpad_left, :] = dilated_out_grad_np

            in_grad_np = np.zeros((batch, in_h, in_w, in_channel))
            for b in range(batch):
                for c in range(in_channel):
                    for m in range(channel_multiplier):
                        in_grad_np[b, :, :, c] += signal.convolve2d(padded_out_grad[b, :, :, c*channel_multiplier+m], \
                                filter_np[:, :, c, m], mode='valid')[0:in_h, 0:in_w]
            return (out_grad_np, filter_np, in_grad_np)

        (out_grad_np, filter_np, in_grad_np) = get_ref_data()

        out_grad_tvm = tvm.nd.array(out_grad_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        in_grad_tvm = tvm.nd.array(np.zeros(shape=ishape, dtype=dtype), ctx)
        # launch the kernel
        timer = f.time_evaluator(f.entry_name, ctx, number=1)
        tcost = timer(filter_tvm, out_grad_tvm, in_grad_tvm).mean
        np.testing.assert_allclose(in_grad_np,
                                   in_grad_tvm.asnumpy(),
                                   rtol=1e-5)

    check_device("opencl")
    check_device("cuda")
    check_device("metal")
    check_device("rocm")
    check_device("vulkan")
Example #53
def verify_conv2d_nchw(batch,
                       in_channel,
                       in_size,
                       num_filter,
                       kernel,
                       stride,
                       padding,
                       dilation=1,
                       add_bias=False,
                       add_relu=False):
    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding))

    in_height = in_width = in_size

    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
    bias = tvm.placeholder((num_filter, 1, 1), name='bias')

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
            C = topi.nn.conv2d(A,
                               dW, (stride, stride), (padding, padding),
                               layout='NCHW',
                               out_dtype=dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.generic.schedule_conv2d_nchw([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         ctx)
        if add_bias:
            func = tvm.build(s, [A, W, bias, C],
                             device,
                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                             (batch, in_channel, in_size, num_filter, kernel,
                              stride, padding, dilation))
            func(a, w, b, c)
        else:
            func = tvm.build(s, [A, W, C],
                             device,
                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                             (batch, in_channel, in_size, num_filter, kernel,
                              stride, padding, dilation))
            func(a, w, c)
        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

    for device in get_all_backend():
        check_device(device)
Example #54
def col2im_manual_schedule(shape, kernel, stride, pad, dtype, output_H_W, polyhedral=True, attrs=None):
    """
    Col2im operation with manual schedule.

     Args:
        shape (Union[list, tuple]): seven int numbers for the input's image size.
        kernel (Union[list, tuple]): two int numbers for the sliding window's size.
        stride (Union[list, tuple]): two int numbers for the sliding window's stride.
        pad (Union[list, tuple]): four int numbers for padding's sizes: top, bottom, left, and right.
        dtype (str): parameters' type.
        output_H_W (Union[list, tuple]): two int numbers for the output's height and width.
        polyhedral (bool): If True, use auto-schedule, else use manual-schedule, default value is True.
        attrs (dict): Specifies parameters used in manual-schedule.

    Returns:
        tvm.tensor.Tensor as result for col2im operation.
    """

    N, C1, KH, KW, OH, OW, C0 = shape
    H, W = output_H_W
    output_shape = (N, C1, H, W, C0)
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_t, pad_b, pad_l, pad_r = pad

    assert H == (OH - 1) * stride_h + kernel_h - (pad_t + pad_b), "Height of input and output do not match"
    assert W == (OW - 1) * stride_w + kernel_w - (pad_l + pad_r), "Width of input and output do not match"

    col2im = intrin_col2im(shape, output_shape, kernel, stride, pad, dtype)

    # tensor for the input data
    data = tvm.placeholder(shape, dtype, name="input_data")

    # assume we need the whole width of A
    # choose a section of the rows of A that encompasses all of the windows in the current window-batch
    res = tvm.compute(
        output_shape,
        lambda b, c1, h, w, c0:
            data(b, c1, h % KH, w % KW, h % OH, w % OW, c0),
        name="col2im_intrinsic"
    )

    # manual schedule for the col2im operation
    s = tvm.create_schedule([res.op])

    res_ub = s.cache_write(res, "local.UB")
    data_ub = s.cache_read(data, "local.UB", [res_ub])

    b, c1, h, w, c0 = res.op.axis

    s[data_ub].compute_at(s[res], c1)
    s[res_ub].compute_at(s[res], c1)

    s[res_ub].tensorize(res_ub.op.axis[0], col2im)

    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, res], "cce", name="col2im_manual_schedule", attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "col2im_manual_schedule"
        utils.create_code(kernel_name, "./", source_code)
    return mod
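# A hypothetical invocation, consistent with the docstring's constraints;
# every concrete value below is illustrative and not from the original
# source. With OH = OW = 14, a 3x3 kernel, stride 2, and padding 1 on each
# side, the output height is (14 - 1) * 2 + 3 - 2 = 27, which satisfies the
# assertions above.
mod = col2im_manual_schedule(
    shape=(1, 1, 3, 3, 14, 14, 16),   # (N, C1, KH, KW, OH, OW, C0)
    kernel=(3, 3), stride=(2, 2), pad=(1, 1, 1, 1),
    dtype="float16", output_H_W=(27, 27), polyhedral=False)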
"""
from __future__ import absolute_import, print_function

import tvm
import numpy as np

######################################################################
# Direct Declare Extern Math Call
# -------------------------------
# The most straightforward way to call a target-specific function is via
# the extern function call construct in tvm.
# In the following example, we use :any:`tvm.call_pure_extern` to call
# :code:`__expf` function, which is only available under CUDA.
#
n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape,
                lambda i: tvm.call_pure_extern("float32", "__expf", A[i]),
                name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
f = tvm.build(s, [A, B], "cuda", name="myexp")
print(f.imported_modules[0].get_source())

######################################################################
# Unified Intrinsic Call
# ----------------------
# The above code verifies that a direct external call can be used to
# call device-specific functions. TVM also provides unified intrinsics,
# such as :code:`tvm.exp`, which are lowered to the matching
# target-specific function during code generation.
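#
# A minimal sketch of the unified-intrinsic style (this block is an added
# illustration, not part of the scraped fragment); it reuses the schedule
# pattern of the extern-call example above, with :code:`tvm.exp` in place
# of the CUDA-only :code:`__expf`.
n = tvm.var("n")
A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")
print(fcuda.imported_modules[0].get_source())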
Example #56
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (2, 16)
    a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
    print('a host ' + str(a_host))
    a = tvm.compute(shape, lambda *i: a_host(*i), name='a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf')
    b_buf = tvm.compute(
        shape,
        lambda i, j: tvm.log(a_buf[i, j].astype(env.cfg['dtype_w'])),
        name='b_buf')
    b = tvm.compute(shape, lambda *i: b_buf(*i), name='b')
    b_host = tvm.compute(shape, lambda *i: b(*i), name='b_host')

    s = tvm.create_schedule(b_host.op)

    # mark variable scopes
    s[a].set_scope(env.dram_scope)
    s[b].set_scope(env.dram_scope)

    s[a_buf].set_scope(env.uni_scratchpad_scope)
    s[b_buf].set_scope(env.uni_scratchpad_scope)

    # print(dir(s[b].op.body))

    # mark compiler pragmas
    s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma)
    s[b_host].pragma(s[b_host].op.axis[0], env.dma_copy_pragma)

    s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls)
    s[b].pragma(s[b].op.axis[0], env.scratchpad_ls)

    s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0])

    # tensorize
    s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VLOG',
                                                            mode='inc'))

    # build
    print(tvm.lower(s, [a_host, b_host], simple_mode=True))
    print(nnpu.lower(s, [a_host, b_host], simple_mode=True))
    #exit()
    func = nnpu.build(s, [a_host, b_host], 'nnpu', 'llvm', name='nnpu_log')

    print('function built: ')
    #print(func.get_source())

    # prepare data
    ctx = tvm.nd.TVMContext(13, 0)  #???
    print('device context exists: %s' % ctx.exist)
    a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=1, high=20)
    a_nd = tvm.nd.array(a_np, ctx)
    b_nd = tvm.nd.array(np.zeros(shape).astype(b_host.dtype), ctx)

    # run
    func(a_nd, b_nd)

    print('run finished')

    b_np = b_nd.asnumpy()
    print('a=')
    print(a_np)
    print('b=')
    print(b_np)
    print('ground truth =')
    gt = np.log(a_np, dtype=b_host.dtype)
    print(gt)
    np.testing.assert_allclose(b_np, gt)
Example #57
def _alter_conv2d_layout(attrs, inputs, tinfo, F):
    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}
    data, kernel = tinfo[0], tinfo[1]
    batch_size, in_channel, height, width = get_const_tuple(data.shape)

    groups = attrs.get_int("groups")
    out_channel = attrs.get_int("channels") if F == sym else attrs.get_int(
        "channels").value
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    out_dtype = attrs["out_dtype"]

    layout_name = 'layout' if F == sym else 'data_layout'

    layout = attrs[layout_name]
    kh, kw = attrs.get_int_tuple("kernel_size")

    dtype = data.dtype
    out_dtype = dtype if out_dtype in ("same", "") else out_dtype
    is_depthwise = groups == in_channel and groups == out_channel

    # only optimize for NCHW
    if layout != 'NCHW':
        return None
    if groups != 1 and not is_depthwise:
        return None

    dispatch_ctx = autotvm.task.DispatchContext.current
    target = tvm.target.current_target()
    # query schedule and fallback if necessary
    workload = autotvm.task.args_to_workload(
        [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \
        if is_depthwise else \
        autotvm.task.args_to_workload(
            [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
    cfg = dispatch_ctx.query(target, workload)
    if cfg.is_fallback:
        _get_default_config(cfg, data, kernel, strides, padding, out_dtype,
                            is_depthwise)

    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

    new_attrs[layout_name] = 'NCHW%dc' % ic_bn
    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn

    new_data = tvm.placeholder(
        (batch_size, in_channel // ic_bn, height, width, ic_bn),
        dtype=data.dtype)

    if is_depthwise:
        new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn
        # Store altered operator's config
        new_kernel = tvm.placeholder(
            (out_channel // oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload([
            new_data, new_kernel, strides, padding, dilation,
            new_attrs[layout_name], new_attrs['out_layout'], out_dtype
        ], depthwise_conv2d_NCHWc)
    else:
        out_channel, _, kh, kw = get_const_tuple(kernel.shape)
        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)

        # Store altered operator's config
        new_kernel = tvm.placeholder(
            (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn, oc_bn),
            dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload([
            new_data, new_kernel, strides, padding, dilation,
            new_attrs[layout_name], new_attrs['out_layout'], out_dtype
        ], conv2d_NCHWc)

    dispatch_ctx.update(target, new_workload, cfg)

    if is_depthwise:
        if F == sym:
            logging.warning(
                "Use native layout for depthwise convolution on NNVM.")
            return None
        return F.nn.contrib_depthwise_conv2d_nchwc(*copy_inputs, **new_attrs)
    else:
        if F == sym:
            return F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
        return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs)
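
# To picture the kernel repacking that kernel_layout = 'OIHW%di%do'
# requests above, here is a small NumPy sketch with hypothetical sizes
# and tiling factors (the numbers are illustrative, not from the example):
import numpy as np

oc, ic, kh, kw = 8, 4, 3, 3   # hypothetical kernel shape (oc, ic, h, w)
ic_bn, oc_bn = 2, 4           # hypothetical tiling factors
kernel = np.random.rand(oc, ic, kh, kw).astype('float32')

# (oc, ic, h, w) -> (OC, IC, h, w, ic, oc), as the comment above describes
packed = kernel.reshape(oc // oc_bn, oc_bn,
                        ic // ic_bn, ic_bn, kh, kw).transpose(0, 2, 4, 5, 3, 1)
assert packed.shape == (oc // oc_bn, ic // ic_bn, kh, kw, ic_bn, oc_bn)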
Example #58
0
def _declaration_conv_NCHWc(cfg, data, kernel, strides, padding, dilation,
                            layout, out_layout, out_dtype):
    # layout and out_layout are not used here,
    # we keep them for debug convenience when dumping autotvm workload
    HPAD, WPAD = padding if isinstance(padding, (tuple, list)) else (padding, padding)
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
    assert (dh, dw) == (1, 1), "conv2d_NCHWc does not support dilation"

    n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
    in_channel = ic_chunk * ic_bn
    if data.dtype == 'uint8':
        oc_chunk, _, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple(
            kernel.shape)
    else:
        oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(
            kernel.shape)
    num_filter = oc_chunk * oc_bn

    if cfg.is_fallback:
        _get_default_config(
            cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
            tvm.placeholder(
                (num_filter, in_channel, kernel_height, kernel_width),
                dtype=kernel.dtype), strides, padding, out_dtype)

    # output shape
    out_height = (ih + 2 * HPAD - kernel_height) // HSTR + 1
    out_width = (iw + 2 * WPAD - kernel_width) // WSTR + 1
    oshape = (n, oc_chunk, out_height, out_width, oc_bn)

    # DOPAD
    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
    else:
        data_pad = data

    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')

    if data.dtype == 'uint8':
        assert out_dtype == "int32", \
            "INT8 convolution requires input dtype = uint8 and output dtype=int32"
        # Intel's int8 dot-product instruction multiplies groups of 4 int8
        # values and accumulates into int32, so ic_bn must be a multiple of 4
        n_elems = 4
        assert ic_bn % n_elems == 0

        ic_outer = tvm.reduce_axis((0, in_channel // ic_bn), name='ic_outer')
        ic_f_inner = tvm.reduce_axis((0, ic_bn // n_elems), name='ic_f_inner')
        ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
        return tvm.compute(
            oshape,
            lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
                data_pad[n, ic_outer, oh * HSTR + kh, ow * WSTR + kw,
                         ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) *
                kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block,
                       ic_s_inner].astype(out_dtype),
                axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]),
            name='conv2d_NCHWc_int8',
            tag="conv2d_NCHWc_int8")
    # else: fp implementation
    return tvm.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(
            data_pad[n, ic // ic_bn, oh * HSTR + kh, ow * WSTR + kw,
                     ic % ic_bn].astype(out_dtype) *
            kernel[oc_chunk, ic // ic_bn, kh, kw, ic % ic_bn, oc_block],
            axis=[ic, kh, kw]),
        name='conv2d_NCHWc',
        tag="conv2d_NCHWc")
Example #59
0
def test_ewise():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')

    shape = (20, 3)

    def test_apply(func,
                   name,
                   f_numpy,
                   low,
                   high,
                   check_round=False,
                   skip_name_check=False):
        B = func(A)
        assert tuple(B.shape) == tuple(A.shape)
        if not skip_name_check:
            assert B.op.body[0].name == name
        a_np = np.random.uniform(low=low, high=high, size=shape).astype(
            A.dtype) * 10
        # avoid round check too close to boundary
        if check_round:
            a_np += ((np.fmod(a_np, 1) - 0.5) < 1e-6) * 1e-5
        b_np = f_numpy(a_np)

        def check_device(device):
            ctx = tvm.context(device, 0)
            if not ctx.exist:
                print("Skip because %s is not enabled" % device)
                return
            print("Running on target: %s" % device)
            with tvm.target.create(device):
                s = topi.generic.schedule_injective(B)
            foo = tvm.build(s, [A, B], device, name=name)
            a = tvm.nd.array(a_np, ctx)
            b = tvm.nd.array(np.zeros_like(b_np), ctx)
            foo(a, b)
            tvm.testing.assert_allclose(b.asnumpy(),
                                        b_np,
                                        rtol=1e-5,
                                        atol=1e-5)

        for device in [
                'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx',
                'sdaccel', 'aocl_sw_emu'
        ]:
            check_device(device)

    test_apply(topi.floor, "floor", np.floor, -100, 100)
    test_apply(topi.ceil, "ceil", np.ceil, -100, 100)
    test_apply(topi.sign, "sign", np.sign, -100, 100, skip_name_check=True)
    test_apply(topi.trunc, "trunc", np.trunc, -100, 100)
    test_apply(topi.abs, "fabs", np.abs, -100, 100)
    test_apply(topi.round, "round", np.round, -100, 100, check_round=True)
    test_apply(topi.exp, "exp", np.exp, -1, 1)
    test_apply(topi.tanh, "tanh", np.tanh, -10, 10)
    test_apply(topi.sigmoid, "sigmoid", lambda x: 1 / (1 + np.exp(-x)), -1, 1)
    test_apply(topi.log, "log", np.log, 0, 100)
    test_apply(topi.sqrt, "sqrt", np.sqrt, 0, 100)
    test_apply(topi.rsqrt,
               "rsqrt",
               lambda x: np.ones_like(x) / np.sqrt(x),
               0,
               100,
               skip_name_check=True)
Example #60
0
def test_bind():
    if not tvm.gpu(0).exist:
        print('[Warning] No GPU found! Skip bind test!')
        return

    @script
    def vec_add(a, b):
        c = output_tensor((1000, ), 'float32')
        for tx in bind('threadIdx.x', 1000):
            c[tx] = a[tx] + b[tx]
        return c

    a = tvm.placeholder((1000, ), dtype='float32', name='a')
    b = tvm.placeholder((1000, ), dtype='float32', name='b')
    func, ins, outs = run_and_check(vec_add, [a, b], target='cuda')
    run_and_check(func, ins, outs=outs, target='cuda')

    @script
    def raw(a, b):
        c = output_tensor((1000, ), 'float32')
        for i in range(1000):
            c[i] = a[i] + b[i]
        return c

    c = raw(a, b)
    sch = tvm.create_schedule(c.op)
    x = tvm.thread_axis('threadIdx.x')
    sch[c].bind(c.op.axis[0], x)
    func, ins, outs = run_and_check(raw, [a, b],
                                    sch=sch,
                                    outs=[c],
                                    target='cuda')
    run_and_check(func, ins, outs=outs, target='cuda')

    @tvm.hybrid.script
    def foo(a):
        c = output_tensor((a.shape[0], ), a.dtype)
        total = allocate((1, ), a.dtype, 'local')
        len_i = a.shape[0]
        len_j = a.shape[1]
        for i in bind('threadIdx.x', len_i):
            total[0] = 0.
            for k in const_range(len_j):
                total[0] += a[i, k]
            c[i] = total[0]

        return c

    a = tvm.placeholder((8, 4), 'float32')
    c = foo(a)
    s = tvm.create_schedule(c.op)
    ir = tvm.lower(s, [a, c], simple_mode=True)
    assert not isinstance(ir, tvm.stmt.AttrStmt)
    func, ins, outs = run_and_check(foo, [a], target='cuda')
    run_and_check(func, ins, outs=outs, target='cuda')

    @tvm.hybrid.script
    def max_threads(a):
        b = output_tensor(a.shape, a.dtype)
        n = a.shape[0]
        m = max_num_threads(True)
        for i in bind('threadIdx.x', m):
            for j in bind('blockIdx.x', ceil_div(n, m)):
                if i * m + j < n:
                    b[i * m + j] = a[i * m + j] + a[i * m + j]
        return b

    a = tvm.placeholder((10000, ), 'float32')
    with tvm.target.create('cuda'):
        func, ins, outs = run_and_check(max_threads, [a], target='cuda')
        run_and_check(func, ins, outs=outs, target='cuda')
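
# For reference, ceil_div in the max_threads script above rounds the
# quotient up so the grid covers all n elements; a minimal Python
# equivalent (a sketch of its semantics, not the hybrid runtime source):
def ceil_div(n, m):
    # smallest k such that k * m >= n
    return (n + m - 1) // m

assert ceil_div(10000, 256) == 40  # 40 blocks of 256 threads cover 10000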