def test_multilevel_splitting_with_indivisble_factors():
    from tvm import topi

    A = te.placeholder((130, ), dtype="float32")
    B = topi.nn.relu(A)
    s = te.create_schedule(B.op)
    (y, ) = s[B].op.axis
    (yo, yi) = s[B].split(y, factor=8)
    (yoo, yoi) = s[B].split(yo, factor=16)
    s[B].reorder(yoo, yoi, yi)
    s[B].unroll(yi)

    # With partition_const_loop enabled, lowering handles the indivisible split correctly.
    with tvm.transform.PassContext(
            config={"tir.LoopPartition": {
                "partition_const_loop": True
            }}):
        lowered_body = tvm.lower(s, [A, B], name="x")["x"].body

        def visit_stmt(op):
            return isinstance(op, tvm.tir.Max)

        num_max = collect_visit(lowered_body, visit_stmt)
        assert num_max.count(True) == 10
    def check_cuda(dtype):
        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        m = 128
        A = te.placeholder((m,), name='A', dtype=dtype)
        B = te.compute((m,), lambda i: A[i // 32 * 32 + (i + 1) % 32], name='B')

        cuda_target = tvm.target.create("cuda")
        assert cuda_target.thread_warp_size == 32
        with cuda_target:
            s = te.create_schedule(B.op)
            AA = s.cache_read(A, "warp", [B])
            xo, xi = s[B].split(B.op.axis[0], 64)
            xi0, xi1 = s[B].split(xi, factor=32)
            tx = te.thread_axis("threadIdx.x")
            s[B].bind(xi1, tx)
            s[B].bind(xo, te.thread_axis("blockIdx.x"))
            s[AA].compute_at(s[B], xo)
            xo, xi = s[AA].split(s[AA].op.axis[0], 32)
            s[AA].bind(xi, tx)

            ctx = tvm.gpu(0)
            func = tvm.build(s, [A, B], "cuda")
            A_np = np.array(list(range(m)), dtype=dtype)
            B_np = np.array(
                    list(range(1, 32)) + [0] +
                    list(range(33, 64)) + [32] +
                    list(range(65, 96)) + [64] +
                    list(range(97, 128)) + [96],
                    dtype=dtype)
            A_nd = tvm.nd.array(A_np, ctx)
            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx)
            func(A_nd, B_nd)
            tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3)
def test_sort():
    n = 2
    l = 5
    m = 3
    data = te.placeholder((n, l, m), name="data")
    sort_num = te.placeholder((n, m), name="sort_num", dtype="int32")
    axis = 1
    is_ascend = False
    out = te.extern(
        data.shape,
        [data, sort_num],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.sort.argsort_nms", ins[0], ins[1], outs[0], axis, is_ascend
        ),
        dtype="int32",
        name="sort_tensor",
    )
    input = [
        [[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]],
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]],
    ]
    sort_num_input = [[1, 2, 3], [4, 5, 5]]
    sorted_index = [
        [[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
        [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]],
    ]

    ctx = tvm.cpu(0)
    target = "llvm"
    s = te.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    a = tvm.nd.array(np.array(input).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
def test_in_bounds_vectorize_llvm():
    n = 512
    lanes = 2
    A = te.placeholder((n, ), name="A", dtype="float32x%d" % lanes)
    B = te.compute((n, ), lambda i: A[i], name="B")
    C = te.compute((n, ), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C")
    s = te.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], nparts=2)
    _, xi = s[C].split(xi, factor=2)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    s[B].compute_at(s[C], xo)
    xo, xi = s[B].split(B.op.axis[0], factor=2)
    s[B].vectorize(xi)
    # build and invoke the kernel.
    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
    f = tvm.build(s, [A, C], "llvm")
    dev = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.empty((n, ), A.dtype).copyfrom(
        np.random.uniform(size=[n] + ([] if lanes == 1 else [lanes])))
    c = tvm.nd.empty((n, ), C.dtype, dev)
    f(a, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
def check_rfactor_no_reset_multi_reduction(factor, rfactor):
    s = te.create_schedule(C.op)
    x, y = C.op.axis
    rk = C.op.reduce_axis[0]
    yo, yi = s[C].split(y, factor=factor)
    ro, ri = s[C].split(rk, factor=rfactor)
    roo, roi = s[C].split(ro, factor=2)
    s[C].reorder(yo, roo, roi, yi, ri)
    gemv = intrin_gemv_no_reset(factor, rfactor)
    s[C].tensorize(yi, gemv)
    s = s.normalize()
    dom_map = tvm.te.schedule.InferBound(s)
    finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
    out_dom, in_dom = finfer(s[C], dom_map)
    assert tvm.ir.structural_equal(out_dom[x].extent, 1)
    assert tvm.ir.structural_equal(out_dom[y].extent, factor)
    assert tvm.ir.structural_equal(out_dom[y].min, yo * factor)
    fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
    body = fmatch(s[C], out_dom, in_dom, gemv)
    ana = tvm.arith.Analyzer()
    assert tvm.ir.structural_equal(ana.simplify(body[0]),
                                   ana.simplify(gemv.op.body[0]))
    stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
    tvm.lower(s, [A, B, C])
def test_tensor_scalar():
    # test te with scalar shape
    a = np.array(np.random.uniform(size=(1))[0], "float32")
    b = np.array(0.0, "float32")

    @tvm.register_func("tvm.test_tensor_scalar_copy")
    def mycopy(x, y):
        x.copyto(y)

    A = te.placeholder(a.shape, name="A")
    B = te.extern(
        a.shape,
        [A],
        lambda ins, outs: tvm.tir.call_packed("tvm.test_tensor_scalar_copy",
                                              ins[0], outs[0]),
        name="B",
    )
    s = te.create_schedule(B.op)
    f = tvm.build(s, [A, B], "llvm")

    ta = tvm.nd.array(a)
    tb = tvm.nd.array(b)
    f(ta, tb)
    tvm.testing.assert_allclose(ta.numpy(), tb.numpy())
    def mod(self, target, load_type, store_type, indirect_indices):
        target = tvm.target.Target(target)

        n = 4
        dtype = "int32"
        A = te.placeholder((n, ), dtype=dtype, name="A")
        R = te.placeholder((n, ), dtype=dtype, name="R")

        def do_compute(ins, outs):
            ib = tvm.tir.ir_builder.create()
            A, R = map(ib.buffer_ptr, ins)
            B = ib.buffer_ptr(outs[0])

            if "gpu" in target.keys:
                ib.scope_attr(te.thread_axis("blockIdx.x"), "thread_extent", 0)

            index_map = {
                "ramp": tvm.tir.Ramp(0, 1, 4),
                "broadcast": tvm.tir.Broadcast(0, 4),
            }

            load_index = index_map[load_type]
            store_index = index_map[store_type]

            if indirect_indices:
                load_index = tvm.tir.expr.Load("int32x4", R, load_index)

            transfer = tvm.tir.expr.Load("int32x4", A, load_index)
            ib.emit(tvm.tir.stmt.Store(B, transfer, store_index))

            return ib.get()

        B = te.extern(A.shape, [A, R], do_compute, dtype="int32")
        s = te.create_schedule(B.op)

        return tvm.lower(s, [A, R, B])
def test_large_input():
    @tvm.hybrid.script
    def compute(a, b):
        n = 16384
        c = output_tensor((n, n), 'int32')
        for i in range(n):
            for j in range(n):
                c[i, j] = a[i, j] - b[i, j]
        return c

    n = 16384
    shape = (n, n)
    a = te.placeholder(shape, name='a', dtype='int32')
    b = te.placeholder(shape, name='b', dtype='int32')
    c = te.compute(shape, lambda i, j: compute(a, b)[i, j])
    c = te.compute(shape, lambda i, j: 1 + c[i, j])
    s = te.create_schedule(c.op)
    stmt = tvm.lower(s, [a, b, c], simple_mode=True)

    def verify(n):
        if isinstance(n, tvm.tir.Allocate):
            assert n.extents[0].value == 268435456

    tvm.tir.ir_pass.PostOrderVisit(stmt, verify)
def try_warp_memory():
    """skip this in default test because it require higher arch"""
    m = 128
    A = te.placeholder((m, ), name='A')
    B = te.compute((m, ), lambda i: A[i] + 3, name='B')
    warp_size = 32
    s = te.create_schedule(B.op)
    AA = s.cache_read(A, "warp", [B])
    xo, xi = s[B].split(B.op.axis[0], warp_size * 2)
    xi0, xi1 = s[B].split(xi, factor=warp_size)
    tx = te.thread_axis("threadIdx.x")
    s[B].bind(xi1, tx)
    s[B].bind(xo, te.thread_axis("blockIdx.x"))
    s[AA].compute_at(s[B], xo)
    xo, xi = s[AA].split(s[AA].op.axis[0], warp_size)
    s[AA].bind(xi, tx)

    @tvm.register_func
    def tvm_callback_cuda_compile(code):
        ptx = nvcc.compile_cuda(code, target="ptx")
        return ptx

    # one line to build the function.
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("skip because %s is not enabled.." % device)
            return
        f = tvm.build(s, [A, B], device)
        a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype),
                         ctx)
        b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 3, rtol=1e-6)

    check_device("cuda")
def test_large_input():
    @te.hybrid.script
    def compute(a, b):
        n = 16384
        c = output_tensor((n, n), "int32")
        for i in range(n):
            for j in range(n):
                c[i, j] = a[i, j] - b[i, j]
        return c

    n = 16384
    shape = (n, n)
    a = te.placeholder(shape, name="a", dtype="int32")
    b = te.placeholder(shape, name="b", dtype="int32")
    c = te.compute(shape, lambda i, j: compute(a, b)[i, j])
    c = te.compute(shape, lambda i, j: 1 + c[i, j])
    s = te.create_schedule(c.op)
    stmt = tvm.lower(s, [a, b, c])["main"].body

    def verify(n):
        if isinstance(n, tvm.tir.Allocate):
            assert n.extents[0].value == 268435456

    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
def test_storage_share_gpu():
    m = te.var('m')
    A = [te.placeholder((m), name='A')]
    num_stage = 5
    for t in range(num_stage):
        A.append(
            te.compute((m, ), lambda i: A[-1][i] + (t + 1), name='A%d_s' % t))
        A.append(te.compute((m, ), lambda i: A[-1][i], name='A%d' % t))
    s = te.create_schedule(A[-1].op)
    for t in range(num_stage):
        x = A[2 * t + 2].op.axis[0]
        bx, tx = s[A[2 * t + 2]].split(x, factor=32)
        s[A[2 * t + 2]].bind(bx, te.thread_axis("blockIdx.x"))
        s[A[2 * t + 2]].bind(tx, te.thread_axis("threadIdx.x"))
        s[A[2 * t + 1]].compute_at(s[A[2 * t + 2]], tx)
        s[A[2 * t + 1]].set_scope("shared")

    bounds = tvm.te.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
    Ab = tvm.tir.decl_buffer(A[0].shape, A[0].dtype, name='A')
    Bb = tvm.tir.decl_buffer(A[0].shape, A[0].dtype, name='B')
    stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A[0]: Ab, A[-1]: Bb}, 64)
    stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.tir.ir_pass.Simplify(stmt)
    stmt = tvm.tir.ir_pass.StorageRewrite(stmt)
    alloc_stats = {"global": 0, "shared": 0}

    def verify(n):
        if isinstance(n, tvm.tir.AttrStmt):
            if n.attr_key == "storage_scope":
                alloc_stats[n.value.value] += 1

    tvm.tir.ir_pass.PostOrderVisit(stmt, verify)
    assert alloc_stats["global"] == 2
    assert alloc_stats["shared"] == num_stage
def test_tensor_intrin_scalar_params():
    n = te.size_var("n")
    x = te.placeholder((n, ), name='x')
    v = te.size_var("v")
    w = te.size_var("w")
    z = te.compute((n, ), lambda i: x[i] * v + w, name='z')

    def intrin_func(ins, outs, sp):
        assert (isinstance(ins[0], tvm.te.schedule.Buffer))
        assert (ins[0].shape[0] == n)
        assert (sp[0] == v)
        assert (sp[1] == w)
        return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0],
                                   sp[1])

    intrin = te.decl_tensor_intrin(z.op,
                                   intrin_func,
                                   scalar_params=[v, w],
                                   default_buffer_params={"offset_factor": 1})
    assert intrin.op == z.op
    assert intrin.reduce_init is None
    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
    assert (intrin.buffers[0].shape[0] == n)
    assert tuple(intrin.scalar_params) == tuple((v, w))

    A = te.placeholder((10, 10), name='A')
    # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs
    C = te.compute((10, 10),
                   lambda i, j: intrin(i * i, A[i, j], i + j),
                   name="C")
    s = te.create_schedule(C.op)
    stmt = tvm.lower(s, [A, C])["main"].body
    assert isinstance(stmt.body.body, tvm.tir.Evaluate)
    assert len(stmt.body.body.value.args) == 5
    assert str(stmt.body.body.value.args[3]) == "(i: int32*i)"
    assert str(stmt.body.body.value.args[4]) == "(i: int32 + j: int32)"
def check_value(expr, vx, vy, data, fref):
    n = len(data)
    A = te.placeholder((n, ), name="A", dtype=expr.dtype)
    B = te.placeholder((n, ), name="B", dtype=expr.dtype)

    def make_binds(i):
        x = expr
        x = tvm.tir.Let(vx, A[i], x)
        x = tvm.tir.Let(vy, B[i], x)
        return x

    C = te.compute((n, ), make_binds)
    s = te.create_schedule([C.op])

    if not tvm.runtime.enabled("llvm"):
        return

    f = tvm.build(s, [A, B, C], "llvm")
    a = tvm.nd.array(np.array([x for x, y in data], dtype=expr.dtype))
    b = tvm.nd.array(np.array([y for x, y in data], dtype=expr.dtype))
    c = tvm.nd.array(np.zeros(len(data), dtype=expr.dtype))
    f(a, b, c)
    cref = np.array([fref(x, y) for x, y in data])
    np.testing.assert_equal(c.asnumpy(), cref)
def test_matmul():
    n = 1024
    l = 128
    m = 235
    A = te.placeholder((n, l), name="A")
    B = te.placeholder((l, m), name="B")
    C = rocblas.matmul(A, B)
    s = te.create_schedule(C.op)

    def verify(target="rocm"):
        if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.rocm(0)
        f = tvm.build(s, [A, B, C], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(),
                                    np.dot(a.asnumpy(), b.asnumpy()),
                                    rtol=1e-5)

    verify()
def test_inline_multi_reduce():
    def argmax_comp(x, y):
        idx = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        val = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return idx, val

    def argmax_init(idx_typ, val_typ):
        return tvm.tir.const(-1, idx_typ), tvm.te.min_value(val_typ)

    argmax = te.comm_reducer(argmax_comp, argmax_init, name="argmax")
    m = te.var("m")
    n = te.var("n")
    val = te.placeholder((m, n), name="val", dtype="float32")
    val1 = te.compute((m, n), lambda i, j: val[i, j] + 1, name="val1")
    val2 = te.compute((m, n), lambda i, j: te.exp(val1[i, j]), name="val2")
    k = te.reduce_axis((0, n), "k")
    T_idx, T_val = te.compute((m, ),
                              lambda i: argmax((k.var, val2[i, k]), axis=k),
                              name="T")
    s = te.create_schedule(T_idx.op)
    s[val1].compute_inline()
    s = s.normalize()
    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
def test_large_uint_imm():
    value = (1 << 63) + 123
    other = tvm.tir.const(3, "uint64")
    n = 12
    num_thread = 2

    A = te.compute((n,), lambda *i: tvm.tir.const(value, "uint64") + other, name='A')
    s = te.create_schedule(A.op)
    xo, xi = s[A].split(A.op.axis[0], factor=num_thread)
    s[A].bind(xi, te.thread_axis("threadIdx.x"))
    s[A].bind(xo, te.thread_axis("blockIdx.x"))

    def check_target(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        f = tvm.build(s, [A], device)
        # launch the kernel.
        a = tvm.nd.empty((n, ), dtype=A.dtype, ctx=ctx)
        f(a)
        assert a.asnumpy()[0] == value + 3

    check_target("cuda")
    check_target("vulkan")
def test_multi_kernel():
    # graph
    n = tvm.runtime.convert(1024)
    A = te.placeholder((n, ), name='A')
    B = te.placeholder((n, ), name='B')
    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    D = te.compute(A.shape, lambda *i: A(*i) + C(*i), name='D')
    s = te.create_schedule(D.op)
    # create iter var and assign them tags.
    px, x = s[C].split(C.op.axis[0], nparts=1)
    s[C].bind(px, te.thread_axis("pipeline"))
    px, x = s[D].split(D.op.axis[0], nparts=1)
    s[D].bind(px, te.thread_axis("pipeline"))

    # one line to build the function.
    def check_device(device, host="llvm"):
        if not tvm.runtime.enabled(host):
            return
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        fadd = tvm.build(s, [A, B, C, D], device, host, name="myadd")
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), ctx)
        d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), ctx)
        fadd(a, b, c, d)
        tvm.testing.assert_allclose(d.asnumpy(),
                                    a.asnumpy() * 2 + b.asnumpy(),
                                    rtol=1e-5)

    check_device("sdaccel")
    check_device("aocl_sw_emu")
def test_matmul():
    n = 1024
    l = 128
    m = 256
    A = te.placeholder((n, l), name="A")
    B = te.placeholder((l, m), name="B")
    C = mps.matmul(A, B)
    D = te.compute(C.shape, lambda *i: C(*i) + 1.0)
    s = te.create_schedule(D.op)
    yo, xo = D.op.axis
    block_y = te.thread_axis("blockIdx.y")
    block_x = te.thread_axis("blockIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    by, ty = s[D].split(yo, factor=16)
    bx, tx = s[D].split(xo, factor=16)
    s[D].bind(by, block_y)
    s[D].bind(bx, block_x)
    s[D].bind(ty, thread_y)
    s[D].bind(tx, thread_x)

    def verify(A, B, D, s, target="metal"):
        if not tvm.get_global_func("tvm.contrib.mps.matmul", True):
            print("skip because extern function is not available")
            return
        dev = tvm.metal(0)
        f = tvm.build(s, [A, B, D], "metal")
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
        f(a, b, c)
        tvm.testing.assert_allclose(c.numpy(),
                                    np.dot(a.numpy(), b.numpy()) + 1,
                                    rtol=1e-5)

    verify(A, B, D, s)
def verify_dense_sw(batch, in_dim, out_dim, use_bias=True, dtype='float32'):
    nonzeros = te.var('nonzeros')
    A = te.placeholder((batch, in_dim), dtype=dtype, name='A')
    B = tvmsp.placeholder(shape=(out_dim, in_dim), nonzeros=nonzeros, dtype=dtype, name='B')
    C = te.placeholder((out_dim,), dtype=dtype, name='C')
    D = topi.sparse.dense(A, B, C if use_bias else None)
    s = te.create_schedule(D.op)

    # get the test data
    def get_ref_data():
        mag = 10.
        a_np = (mag*(np.random.uniform(size=(batch, in_dim)).astype('float32')-.5)).astype(dtype)
        b_np = np.maximum(mag*(np.random.uniform(size=(out_dim, in_dim)).astype('float32')-0.5), 0.).astype(dtype)
        c_np = (mag*(np.random.uniform(size=(out_dim,)).astype('float32')-.5)).astype(dtype)
        if use_bias:
            d_np = np.dot(a_np, b_np.T) + c_np
        else:
            d_np = np.dot(a_np, b_np.T)
        return (a_np, b_np, c_np, d_np)
    a_np, b_np, c_np, d_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        a = tvm.nd.array(a_np, ctx)
        b = tvmsp.array(b_np, ctx)
        c = tvm.nd.array(c_np, ctx)
        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
        f = tvm.build(s, [A, B.data, B.indices, B.indptr, C, D], device, name="dense")
        f(a, b.data, b.indices, b.indptr, c, d)
        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)

    check_device('llvm')
def test_add():
    """Test a module which performs addition."""
    if not tvm.runtime.enabled("micro_dev"):
        return
    shape = (1024, )
    dtype = "float32"

    reset_gdbinit()

    # Construct TVM expression.
    tvm_shape = tvm.runtime.convert(shape)
    A = te.placeholder(tvm_shape, name="A", dtype=dtype)
    B = te.placeholder(tvm_shape, name="B", dtype=dtype)
    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
    s = te.create_schedule(C.op)

    func_name = "fadd"
    c_mod = tvm.build(s, [A, B, C], target="c", name=func_name)

    with micro.Session(DEV_CONFIG_A) as sess:
        micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
        micro_func = micro_mod[func_name]
        ctx = tvm.micro_dev(0)

        a_np = np.random.uniform(size=shape).astype(dtype)
        a = tvm.nd.array(a_np, ctx)
        b_np = np.random.uniform(size=shape).astype(dtype)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
        micro_func(a, b, c)

        # ensure inputs weren't corrupted
        tvm.testing.assert_allclose(a.asnumpy(), a_np)
        tvm.testing.assert_allclose(b.asnumpy(), b_np)
        # ensure output is correct
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
def test_cuda_const_float_to_half():
    # This import is required so that nvcc performs the code generation;
    # otherwise code generation falls back to nvrtc.
    from tvm import autotvm

    shape = (2, 3, 4)
    a = te.placeholder(shape, dtype="float16", name="a")
    b = tvm.tir.const(0.5, dtype="float16")
    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name="c")
    s = te.create_schedule(c.op)
    axes = [axis for axis in c.op.axis]
    fused = s[c].fuse(*axes)
    bx, tx = s[c].split(fused, factor=64)
    s[c].bind(bx, te.thread_axis("blockIdx.x"))
    s[c].bind(tx, te.thread_axis("threadIdx.x"))

    func = tvm.build(s, [a, c], "cuda")
    dev = tvm.gpu(0)
    a_np = np.random.uniform(size=shape).astype(a.dtype)
    c_np = np.zeros(shape=shape, dtype=c.dtype)
    a = tvm.nd.array(a_np, dev)
    c = tvm.nd.array(c_np, dev)
    func(a, c)
    np.testing.assert_equal(c.asnumpy(), a_np > b.value)
def test_sort_by_key_gpu():
    size = 6
    keys = te.placeholder((size, ), name="keys", dtype="int32")
    values = te.placeholder((size, ), name="values", dtype="int32")

    for target in ["cuda", "nvptx", "opencl", "rocm"]:
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            continue

        with tvm.target.Target(target):
            keys_out, values_out = sort_by_key(keys, values)
            ctx = tvm.context(target)
            s = te.create_schedule([keys_out.op, values_out.op])
            f = tvm.build(s, [keys, values, keys_out, values_out], target)

            keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
            values_np = np.random.randint(0, 10,
                                          size=(size, )).astype(np.int32)
            keys_np_out = np.zeros(keys_np.shape, np.int32)
            values_np_out = np.zeros(values_np.shape, np.int32)
            keys_in = tvm.nd.array(keys_np, ctx)
            values_in = tvm.nd.array(values_np, ctx)
            keys_out = tvm.nd.array(keys_np_out, ctx)
            values_out = tvm.nd.array(values_np_out, ctx)
            f(keys_in, values_in, keys_out, values_out)

            ref_keys_out = np.sort(keys_np)
            ref_values_out = np.array(
                [values_np[i] for i in np.argsort(keys_np)])
            tvm.testing.assert_allclose(keys_out.asnumpy(),
                                        ref_keys_out,
                                        rtol=1e-5)
            tvm.testing.assert_allclose(values_out.asnumpy(),
                                        ref_values_out,
                                        rtol=1e-5)
    def check_correct_assembly(dtype):
        n = (1024, )
        A = te.placeholder(n, dtype=dtype, name='A')
        B = te.compute(
            A.shape,
            lambda i: tvm.tir.Select(A[i] >= 0, A[i] + tvm.tir.const(1, dtype),
                                     tvm.tir.const(0, dtype)),
            name='B')
        s = te.create_schedule(B.op)

        (bx, tx) = s[B].split(s[B].op.axis[0], factor=128)
        (tx, vx) = s[B].split(tx, factor=4)
        s[B].bind(bx, te.thread_axis("blockIdx.x"))
        s[B].bind(tx, te.thread_axis("threadIdx.x"))
        s[B].vectorize(vx)
        f = tvm.build(s, [A, B], target)

        # Verify we generate the boolx4 type declaration and the OpSelect
        # v4{float,half,int} instruction
        assembly = f.imported_modules[0].get_source()
        matches = re.findall("%v4bool = OpTypeVector %bool 4", assembly)
        assert len(matches) == 1
        matches = re.findall("OpSelect %v4.*", assembly)
        assert len(matches) == 1
def test_scan_inline2():
    m = te.var("m")
    n = te.var("n")
    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
    s_state1 = te.placeholder((m, n))
    s_state2 = te.placeholder((m, n))
    s_init1 = te.compute((1, n), lambda _, i: x[0, i])
    s_init2 = te.compute((1, n), lambda _, i: x[0, i])
    s_xx = te.compute((m, n),
                      lambda t, i: s_state1[t - 1, i] + x[t, i],
                      name="xx")
    s_x1 = te.compute((m, n), lambda t, i: s_xx[t, i] + 1, name="x1")
    s_x2 = te.compute((m, n),
                      lambda t, i: s_xx[t, i] + s_state2[t - 1, 2],
                      name="x2")
    s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1")
    s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2")
    res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2],
                             [s_state1, s_state2])
    s = te.create_schedule(res1.op)
    s[s_xx].compute_inline()
    s[s_x1].compute_inline()
    s[s_x2].compute_inline()
    stmt = tvm.lower(s, [x, res1, res2])
def test_single_likely():
    n = 60
    A = te.placeholder((n, ), name="A")
    B = te.placeholder((n, ), name="B")

    T = te.compute((n, ), lambda i: A[i] + B[i])
    s = te.create_schedule(T.op)
    x = T.op.axis[0]
    xo, xi = s[T].split(x, factor=16)

    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)

    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt))

    with tvm.transform.PassContext(
            config={"tir.LoopPartition": {
                "partition_const_loop": True
            }}):
        mod = tvm.tir.transform.LoopPartition()(mod)
        stmt = tvm.tir.transform.Simplify()(mod)["main"].body

    assert not any(
        collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
def test_tensor_core_batch_conv():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    # The sizes of inputs and filters
    batch_size = 32
    height = 14
    width = 14
    in_channels = 32
    out_channels = 64
    kernel_h = 3
    kernel_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    block_size = 16

    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    warp_size = 32
    chunk = 2

    # Input feature map: (N, H, W, IC, n, ic)
    data_shape = (batch_size // block_size, height, width,
                  in_channels // block_size, block_size, block_size)
    # Kernel: (H, W, IC, OC, ic, oc)
    kernel_shape = (kernel_h, kernel_w, in_channels // block_size,
                    out_channels // block_size, block_size, block_size)

    # Output feature map: (N, H, W, OC, n, oc)
    output_shape = (batch_size // block_size, height, width,
                    out_channels // block_size, block_size, block_size)

    assert (batch_size % block_size == 0)
    assert (in_channels % block_size == 0)
    assert (out_channels % block_size == 0)

    kh = te.reduce_axis((0, kernel_h), name='kh')
    kw = te.reduce_axis((0, kernel_w), name='kw')
    ic = te.reduce_axis((0, in_channels // block_size), name='ic')
    ii = te.reduce_axis((0, block_size), name='ii')

    # Algorithm
    A = te.placeholder(data_shape, name='A', dtype="float16")
    W = te.placeholder(kernel_shape, name='W', dtype="float16")
    Apad = te.compute(
        (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w,
         in_channels // block_size, block_size, block_size),
        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w <
                        width), A[n, h - pad_h, w - pad_w, i, nn, ii],
            tvm.tir.const(0., "float16")),
        name='Apad')
    Conv = te.compute(
        output_shape,
        lambda n, h, w, o, nn, oo: te.sum(Apad[
            n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype(
                "float32") * W[kh, kw, ic, o, ii, oo].astype("float32"),
                                          axis=[ic, kh, kw, ii]),
        name="Conv")

    s = te.create_schedule(Conv.op)
    s[Apad].compute_inline()

    AS = s.cache_read(Apad, 'shared', [Conv])
    WS = s.cache_read(W, 'shared', [Conv])
    AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
    WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
    ConvF = s.cache_write(Conv, 'wmma.accumulator')

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
    block_k = s[Conv].fuse(hc, wc)
    s[Conv].bind(block_k, block_z)
    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
    s[Conv].bind(block_i, block_x)
    s[Conv].bind(block_j, block_y)
    s[Conv].bind(nc, thread_y)
    s[Conv].bind(oc, thread_z)

    s[ConvF].compute_at(s[Conv], oc)
    n, h, w, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    s[WS].compute_at(s[ConvF], kh)
    s[AS].compute_at(s[ConvF], kh)

    n, h, w, i, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, yo = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, factor=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(ti, thread_x)

    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, yo = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    s[AF].tensorize(AF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_a'))
    s[WF].tensorize(WF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_b'))
    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))

    func = tvm.build(s, [A, W, Conv], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))

    if VERIFY:
        func(a, w, c)
        a_np = a_np.transpose(0, 4, 1, 2, 3,
                              5).reshape(batch_size, height, width,
                                         in_channels)
        w_np = w_np.transpose(0, 1, 2, 4, 3,
                              5).reshape(kernel_h, kernel_w, in_channels,
                                         out_channels)
        c_np = c.asnumpy().transpose(
            (0, 4, 1, 2, 3, 5)).reshape(batch_size, height, width,
                                        out_channels)
        c_std = conv2d_nhwc_python(a_np.astype(Conv.dtype),
                                   w_np.astype(Conv.dtype),
                                   (stride_h, stride_w),
                                   (pad_h, pad_w)).astype(Conv.dtype)
        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
def test_tensor_core_batch_matmal():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    batch_size = 4
    n = 512
    m, l = n, n
    assert (n % 32 == 0)
    assert (m % 8 == 0)
    assert (l % 16 == 0)
    nn, mm, ll = n // 32, m // 8, l // 16
    A = te.placeholder((batch_size, nn, ll, 32, 16), name='A', dtype='float16')
    B = te.placeholder((batch_size, ll, mm, 16, 8), name='B', dtype='float16')
    k1 = te.reduce_axis((0, ll), name='k1')
    k2 = te.reduce_axis((0, 16), name='k2')
    C = te.compute((batch_size, nn, mm, 32, 8),
                   lambda b, i, j, ii, jj: te.sum(A[b, i, k1, ii, k2].astype(
                       'float') * B[b, k1, j, k2, jj].astype('float'),
                                                  axis=[k1, k2]),
                   name='Fragment_C')
    s = te.create_schedule(C.op)

    warp_size = 32
    kernel_size = 16
    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    chunk = 4

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    AS = s.cache_read(A, 'shared', [C])
    BS = s.cache_read(B, 'shared', [C])
    AF = s.cache_read(AS, 'wmma.matrix_a', [C])
    BF = s.cache_read(BS, 'wmma.matrix_b', [C])
    CF = s.cache_write(C, 'wmma.accumulator')

    b, i, j, kernel_i, kernel_j = s[C].op.axis
    i, ii = s[C].split(i, factor=warp_row_tiles)
    block_i, i = s[C].split(i, factor=block_row_warps)
    j, jj = s[C].split(j, factor=warp_col_tiles)
    block_j, j = s[C].split(j, factor=block_col_warps)
    s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j)
    s[C].bind(b, block_z)
    s[C].bind(block_i, block_x)
    s[C].bind(block_j, block_y)
    s[C].bind(i, thread_y)
    s[C].bind(j, thread_z)

    s[CF].compute_at(s[C], j)
    b, warp_i, warp_j, _i, _j = s[CF].op.axis
    k, _k = CF.op.reduce_axis
    ko, ki = s[CF].split(k, factor=chunk)
    s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k)

    s[AF].compute_at(s[CF], ki)
    s[BF].compute_at(s[CF], ki)

    s[AS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = AS.op.axis
    tx, xo = s[AS].split(xo, nparts=block_row_warps)
    ty, yo = s[AS].split(yo, nparts=block_col_warps)
    t = s[AS].fuse(xi, yi)
    to, ti = s[AS].split(t, nparts=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(to, thread_x)

    s[BS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = BS.op.axis
    tx, xo = s[BS].split(xo, nparts=block_row_warps)
    ty, yo = s[BS].split(yo, nparts=block_col_warps)
    t = s[BS].fuse(xi, yi)
    to, ti = s[BS].split(t, nparts=warp_size)
    s[BS].bind(tx, thread_y)
    s[BS].bind(ty, thread_z)
    s[BS].bind(to, thread_x)

    s[AF].tensorize(AF.op.axis[-2],
                    intrin_wmma_load_matrix((32, 8, 16), 'wmma.matrix_a'))
    s[BF].tensorize(BF.op.axis[-2],
                    intrin_wmma_load_matrix((32, 8, 16), 'wmma.matrix_b'))
    s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16)))
    s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16)))

    func = tvm.build(s, [A, B, C], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype)
    b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), ctx)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('gemm with tensor core: %f ms' % (evaluator(a, b, c).mean * 1e3))

    if VERIFY:
        func(a, b, c)
        a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        c_np = c.asnumpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        np.testing.assert_allclose(c_np,
                                   np.matmul(a_np.astype(C.dtype),
                                             b_np.astype(C.dtype)),
                                   rtol=1e-4,
                                   atol=1e-4)
def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96):
    A = te.placeholder((n, l), name="A", dtype="float16")
    B = te.placeholder((l, m), name="B", dtype="float16")
    k = te.reduce_axis((0, l), name="k")
    C = te.compute((n, m), lambda i, j: te.sum(
        A[i, k].astype("float32") * B[k, j].astype("float32"), axis=k))
    s = te.create_schedule(C.op)
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CL = s.cache_write(C, "local")

    bx = 4
    by = 32
    step_k = 8
    v = 4
    TX = 8
    TY = 1
    tile_x = bx * TX
    tile_y = by * TY
    WX = min(warp_tile_m, tile_x)
    tile_k = 16
    vthread = 1

    yo, ty = s[C].split(y, tile_y * vthread)
    vy, ty = s[C].split(ty, tile_y)
    ty, yi = s[C].split(ty, TY)

    xo, xi = s[C].split(x, tile_x)
    tz, xi = s[C].split(xi, WX)
    tx, xi = s[C].split(xi, TX)
    ko, ki = s[CL].split(k, step_k * tile_k)
    kl, ki = s[CL].split(ki, tile_k)

    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
    s[C].bind(yo, te.thread_axis("blockIdx.y"))
    s[C].bind(xo, te.thread_axis("blockIdx.x"))
    s[C].bind(ty, te.thread_axis("threadIdx.y"))
    s[C].bind(tz, te.thread_axis("threadIdx.z"))
    s[C].bind(tx, te.thread_axis("threadIdx.x"))
    s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy"))
    s[CL].compute_at(s[C], tx)
    yo, xo = CL.op.axis
    s[CL].reorder(ko, kl, ki, yo, xo)

    s[AA].compute_at(s[CL], ko)
    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
    tx, vec = s[AA].split(tx, factor=v)
    fused = s[AA].fuse(s[AA].op.axis[0], xo)
    _, ty = s[AA].split(fused, factor=by)
    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
    s[AA].bind(tz, te.thread_axis("threadIdx.z"))
    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
    s[AA].vectorize(vec)

    s[BB].compute_at(s[CL], ko)
    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
    tx, vec = s[BB].split(tx, factor=v)
    fused = s[BB].fuse(s[BB].op.axis[0], xo)
    _, ty = s[BB].split(fused, factor=by)
    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
    s[BB].bind(tz, te.thread_axis("threadIdx.z"))
    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
    s[BB].vectorize(vec)

    s[AL].compute_at(s[CL], kl)
    s[BL].compute_at(s[CL], kl)

    s[CL].pragma(ko, "tensor_core")

    func = tvm.build(s, [A, B, C], "cuda")

    dev = tvm.cuda(0)
    a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(size=(l, m)).astype(B.dtype)
    c_np = np.zeros((n, m), dtype=np.float32)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
    print("gemm m=%d n=%d k=%d: %f ms" %
          (m, n, l, evaluator(a, b, c).mean * 1e3))

    c_np = np.dot(a_np, b_np)
    np.testing.assert_allclose(c_np, c.numpy(), rtol=1e-3)
def rnn_matexp():
    n_num_step = 128
    n_num_hidden = 1152
    n_batch_size = 4
    detect_global_barrier = DETECT_GLOBAL_BARRIER

    num_step = te.var("num_step")
    num_hidden = tvm.runtime.convert(n_num_hidden)
    batch_size = tvm.runtime.convert(n_batch_size)
    num_thread_y = 8
    num_thread_x = 16 * 3
    num_sm = 24

    Whh = te.placeholder((num_hidden, num_hidden), name="Whh")
    s_init = te.compute((1, batch_size, num_hidden),
                        lambda _, i, j: 1.0,
                        name="init")
    s_state = te.placeholder((num_step, batch_size, num_hidden))
    kh = te.reduce_axis((0, num_hidden), name="kh")
    s_update = te.compute(
        (num_step, batch_size, num_hidden),
        lambda t, i, j: te.sum(s_state[t - 1, i, kh] * Whh[kh, j], axis=kh),
        name="update")
    s_scan = tvm.te.scan(s_init, s_update, s_state)
    # schedule
    s = te.create_schedule(s_scan.op)
    CL = s_update
    SS = s.cache_read(s_state, "shared", [CL])
    SL = s.cache_read(SS, "local", [CL])
    WhhL = s.cache_read(Whh, "local", [CL])
    ko, ki = s[CL].split(s[CL].op.reduce_axis[0], nparts=num_thread_y)
    CLF = s.rfactor(CL, ko)

    block_x = te.thread_axis((0, num_sm), "blockIdx.x")
    thread_x = te.thread_axis((0, num_thread_x), "threadIdx.x")
    thread_y = te.thread_axis((0, num_thread_y), "threadIdx.y")
    if PERSIST_KERNEL:
        s[s_scan.op].env_threads([block_x, thread_y, thread_x])

    bx, xi = s[s_init].split(s_init.op.axis[2], nparts=num_sm)
    tx, xi = s[s_init].split(xi, nparts=num_thread_x)
    s[s_init].bind(bx, block_x)
    s[s_init].bind(tx, thread_x)

    bx, xi = s[s_update].split(s[CL].op.axis[2], nparts=num_sm)
    tx, xi = s[s_update].split(xi, nparts=num_thread_x)
    s[s_update].bind(bx, block_x)
    s[s_update].bind(tx, thread_x)
    s[CL].bind(s[CL].op.reduce_axis[0], thread_y)
    s[CLF].compute_at(s[CL], s[CL].op.reduce_axis[0])
    # Duplicate store predicate.
    s[CL].set_store_predicate(thread_y.equal(0))

    if PERSIST_KERNEL:
        s[WhhL].compute_at(s[s_scan], thread_x)
        s[WhhL].unroll(WhhL.op.axis[0])
    else:
        s[WhhL].compute_at(s[CLF], CLF.op.axis[3])

    kr, ki = s[CLF].split(CLF.op.reduce_axis[0], nparts=1)
    ko, ki = s[CLF].split(ki, factor=4)
    s[SS].compute_at(s[CLF], kr)
    s[SL].compute_at(s[CLF], ko)

    xo, xi = s[SS].split(SS.op.axis[2], factor=num_thread_x * num_thread_y * 3)
    ty, xi = s[SS].split(xi, nparts=num_thread_y)
    tx, xi = s[SS].split(xi, nparts=num_thread_x)
    s[SS].bind(ty, thread_y)
    s[SS].bind(tx, thread_x)

    def check_device(target):
        with tvm.target.build_config(
                detect_global_barrier=detect_global_barrier,
                auto_unroll_max_step=128,
                unroll_explicit=False):
            f = tvm.build(s, [s_scan, Whh], target)
        ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
        # launch the kernel.
        res_np = np.zeros(
            (n_num_step, n_batch_size, n_num_hidden)).astype("float32")
        Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
        Whh_np[:] = 2.0 / n_num_hidden
        Whh_np[:, n_num_hidden // 2:] = 0

        res_a = tvm.nd.array(res_np, ctx)
        Whh_a = tvm.nd.array(Whh_np, ctx)
        # The first call includes compilation time, so run it once before timing.
        f(res_a, Whh_a)
        ctx.sync()
        # measure time cost of second step.
        tstart = time.time()
        f(res_a, Whh_a)
        ctx.sync()
        tgap = time.time() - tstart
        print("Time cost=%g" % tgap)
        # correctness
        if not SKIP_CHECK:
            res_gpu = res_a.asnumpy()
            res_cmp = np.ones_like(res_np).astype("float64")
            Whh_np = Whh_np.astype("float64")
            for t in range(1, n_num_step):
                res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np)
            for i in range(n_num_step):
                for j in range(n_num_hidden):
                    if abs(res_cmp[i, 0, j] - res_gpu[i, 0, j]) > 1e-5:
                        print("%d, %d: %g vs %g" %
                              (i, j, res_cmp[i, 0, j], res_gpu[i, 0, j]))
            tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)

    check_device("cuda")
from __future__ import absolute_import, print_function
import tvm
from tvm import te
import numpy as np

######################################################################
# We first write a very simple vector add and build it with the default schedule. Then, we use
# our customized lowering pass to manipulate the IR directly instead of using schedule primitives.
#

n = tvm.tir.const(128, "int32")
a = te.placeholder((n, ), name="a")
b = te.placeholder((n, ), name="b")
c = te.compute((n, ), lambda i: a[i] + b[i], name='c')

sch = te.create_schedule(c.op)
ir = tvm.lower(sch, [a, b, c], simple_mode=True)
print(ir)

######################################################################
# Writing a Pass
# --------------
# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement.
# Thus, we define this vectorize function and implement it step by step.
#

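######################################################################
# As a minimal sketch (the name "identity_pass" is illustrative and not part
# of the tutorial's final code), such a pass is simply a Python callable that
# takes a tvm.tir.Stmt and returns a, possibly rewritten, statement:
#


def identity_pass(stmt):
    # A no-op pass: return the statement unchanged. A real pass would
    # rebuild selected sub-nodes before returning the new statement.
    return stmt
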
######################################################################
# TVM already provides two classes for users to analyze and transform IR.
#
# IR Visitor
# ~~~~~~~~~~
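# The visitor lets us attach a callback to every node of a lowered statement.
# A minimal sketch follows (the names "loops" and "find_width8" are ours, not
# required by the API): it uses tvm.tir.stmt_functor.post_order_visit to
# collect every tir.For loop whose constant extent is divisible by 8.
#

loops = []


def find_width8(op):
    """Record tir.For nodes whose extent is a constant multiple of 8."""
    if isinstance(op, tvm.tir.For):
        if isinstance(op.extent, tvm.tir.IntImm) and op.extent.value % 8 == 0:
            loops.append(op)


# Depending on the TVM version, "ir" above is either a Stmt or an IRModule
# whose "main" PrimFunc holds the statement; handle both before visiting.
stmt = ir["main"].body if isinstance(ir, tvm.IRModule) else ir
tvm.tir.stmt_functor.post_order_visit(stmt, find_width8)
print("loops with extent divisible by 8:", len(loops))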