Example #1
def test_static_tensor():
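    # Build a CSR sparse placeholder, bind its flat data array to an explicit TIR buffer,
    # and run an LLVM kernel that doubles the stored values.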
    dtype = "float32"
    stype = "csr"
    target = "llvm"
    ctx = tvm.context(target, 0)
    m = te.size_var("m")
    n = te.size_var("n")
    A = tvmsp.placeholder(shape=(m, n), name="A", dtype=dtype)
    assert A.stype == "csr"
    n = 3
    a = np.maximum(np.random.uniform(size=(n, n)).astype(dtype) - 0.6, 0.0)
    a = tvmsp.array(a, ctx)
    A.data = te.placeholder(a.data.shape, dtype, name="A_data")
    Ab = tvm.tir.decl_buffer(a.data.shape, dtype, name="A_data")
    binds = {A.data: Ab}
    C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter")
    s = te.create_schedule(C.op)
    f = tvm.build(s, [A.data, C], target, binds=binds)
    c = tvmsp.array(np.zeros((n, n), dtype), ctx)
    c.data = tvm.nd.empty(a.data.shape, dtype)
    c.indices = a.indices
    c.indptr = a.indptr
    f(a.data, c.data)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2.0, rtol=1e-5)
Example #2
def test_tensor_intrin_scalar_params():
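    # decl_tensor_intrin with scalar_params hands the scalar vars (v, w) to intrin_func alongside
    # the buffers; scalar expressions supplied at the call site show up in the lowered call_packed.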
    n = te.size_var("n")
    x = te.placeholder((n, ), name="x")
    v = te.size_var("v")
    w = te.size_var("w")
    z = te.compute((n, ), lambda i: x[i] * v + w, name="z")

    def intrin_func(ins, outs, sp):
        assert isinstance(ins[0], tvm.te.schedule.Buffer)
        assert ins[0].shape[0] == n
        assert sp[0] == v
        assert sp[1] == w
        return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0],
                                   sp[1])

    intrin = te.decl_tensor_intrin(z.op,
                                   intrin_func,
                                   scalar_params=[v, w],
                                   default_buffer_params={"offset_factor": 1})
    assert intrin.op == z.op
    assert intrin.reduce_init is None
    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
    assert intrin.buffers[0].shape[0] == n
    assert tuple(intrin.scalar_params) == tuple((v, w))

    A = te.placeholder((10, 10), name="A")
    # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs
    C = te.compute((10, 10),
                   lambda i, j: intrin(i * i, A[i, j], i + j),
                   name="C")
    s = te.create_schedule(C.op)
    stmt = tvm.lower(s, [A, C])["main"].body
    assert isinstance(stmt.body.body, tvm.tir.Evaluate)
    assert len(stmt.body.body.value.args) == 5
    assert str(stmt.body.body.value.args[3]) == "(i: int32*i)"
    assert str(stmt.body.body.value.args[4]) == "(i: int32 + j: int32)"
Example #3
def test_batch_matmul(executor_kind):
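    # Check type inference of nn.batch_matmul with symbolic shapes, then verify numerics
    # for several concrete shapes and transpose combinations.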
    b, m, n, k = te.size_var("b"), te.size_var("m"), te.size_var("n"), te.size_var("k")
    x = relay.var("x", relay.TensorType((b, m, k), "float32"))
    y = relay.var("y", relay.TensorType((b, n, k), "float32"))
    z = relay.nn.batch_matmul(x, y)
    zz = run_infer_type(z)
    assert zz.checked_type == relay.TensorType((b, m, n), "float32")

    verify_batch_matmul(executor_kind, (1, 16, 32), (1, 16, 32), (1, 16, 16),
                        trans_x=False,
                        trans_y=True)
    verify_batch_matmul(executor_kind, (5, 16, 32), (5, 16, 32), (5, 16, 16),
                        trans_x=False,
                        trans_y=True)
    verify_batch_matmul(executor_kind, (5, 16, 32), (5, 20, 32), (5, 16, 20),
                        trans_x=False,
                        trans_y=True)
    verify_batch_matmul(executor_kind, (30, 16, 32), (30, 20, 32),
                        (30, 16, 20),
                        trans_x=False,
                        trans_y=True)
    verify_batch_matmul(executor_kind, (1, 32, 16), (1, 16, 32), (1, 16, 16),
                        trans_x=True,
                        trans_y=True)
    verify_batch_matmul(executor_kind, (5, 16, 32), (5, 32, 16), (5, 16, 16),
                        trans_x=False,
                        trans_y=False)
    verify_batch_matmul(executor_kind, (5, 32, 16), (5, 32, 20), (5, 16, 20),
                        trans_x=True,
                        trans_y=False)

    x_np = np.random.randn(10, 27, 64).astype("float32")
    x = relay.var("x", shape=x_np.shape)
    verify_batch_matmul_with_inputs(executor_kind, x, x, x_np, x_np,
                                    (10, 27, 27))
Example #4
def test_thread_storage_sync():
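    # Lower a schedule with a shared-scope intermediate (A1), split off the device kernel,
    # and check that ThreadSync("shared") inserts a tir.tvm_storage_sync barrier.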
    m = te.size_var('m')
    l = te.size_var('l')
    A = te.placeholder((m, l), name='A')

    A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')

    s = te.create_schedule(A2.op)
    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
    s[A2].bind(xo, te.thread_axis("blockIdx.x"))
    s[A1].compute_at(s[A2], xo)
    s[A1].set_scope("shared")

    bounds = tvm.te.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)

    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A2], stmt, None)
    mod = tvm.IRModule.from_expr(func)
    mod = tvm.tir.transform.StorageFlatten(64)(mod._move())

    cuda_target = tvm.target.create("cuda")

    mod = tvm.tir.transform.Apply(lambda f: f.with_attr({
        "global_symbol": "test",
        "target": cuda_target
    }))(mod._move())

    fdevice = tvm.tir.transform.SplitHostDevice()(mod)["test_kernel0"]
    mod = tvm.IRModule.from_expr(fdevice)
    f = tvm.tir.transform.ThreadSync("shared")(mod)["test_kernel0"]
    body_list = tvm.tir.stmt_list(f.body.body.body.body)
    assert (body_list[1].value.op.same_as(
        tvm.ir.Op.get("tir.tvm_storage_sync")))
Example #5
def test_rocm_cross_thread_reduction():
    # based on the reduction tutorial
    n = te.size_var("n")
    m = te.size_var("m")
    A = te.placeholder((n, m), name="A")
    k = te.reduce_axis((0, m), "k")
    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
    s = te.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    bx = te.thread_axis("blockIdx.x")
    ty = te.thread_axis("threadIdx.y")
    tx = te.thread_axis("threadIdx.x")
    s[B].bind(xo, bx)
    s[B].bind(xi, ty)
    s[B].bind(s[B].op.reduce_axis[0], tx)
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
    s[B].set_store_predicate(tx.var.equal(0))
    frocm = tvm.build(s, [A, B], "rocm")

    nn = 128
    dev = tvm.rocm(0)
    a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev)
    b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev)
    frocm(a, b)
    tvm.testing.assert_allclose(b.numpy(), np.sum(a.numpy(), axis=1), rtol=1e-4)
Example #6
def test_basic():
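    # LoopPartition splits the factor-4 loop: the main partition needs no 'if' guard,
    # while the tail partition keeps one.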
    n = te.size_var('n')
    A = te.placeholder((n, ), name='A')
    B = te.placeholder((n, ), name='B')

    T = te.compute((n, ), lambda i: A[i]+B[i])
    s = te.create_schedule(T.op)
    xo, xi = s[T].split(T.op.axis[0], factor=4)

    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
    stmt = tvm.tir.ir_pass.LoopPartition(stmt, False)
    stmt = tvm.tir.ir_pass.Simplify(stmt)
    assert('if' not in str(stmt.body.body.body[0]))
    assert('if' in str(stmt.body.body.body[1]))
Example #7
def test_in_bounds_llvm():
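    # All accesses stay within the 1024-element buffers, so the built LLVM function runs cleanly.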
    n = te.size_var("n")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
    s = te.create_schedule(C.op)
    tgt = "llvm"
    tgt_host = "llvm"
    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
    fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
    ctx = tvm.context(tgt, 0)
    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx)
    fadd(a, b, c)
Example #8
def test_hoisting_block_scope_1():
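    # By default HoistIfThenElse leaves the body unchanged; only with the
    # support_block_scope_hosting option does it hoist across the thread-extent (block) scope.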
    n = te.size_var("n")
    m = te.size_var("m")
    A = te.placeholder((n, m), name="A")
    k = te.reduce_axis((0, m), "k")
    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
    s = te.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    s[B.op].bind(xo, te.thread_axis("blockIdx.x"))
    s[B.op].bind(xi, te.thread_axis("threadIdx.y"))
    s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.x"))
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
    func = tvm.driver.build_module.schedule_to_module(s, [A, B], "main", None)["main"]
    stmt = func.body
    new_stmt = tvm.tir.transform.HoistIfThenElse()(tvm.IRModule.from_expr(func))["main"].body
    tvm.ir.assert_structural_equal(new_stmt, stmt)

    with tvm.transform.PassContext(
        config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}}
    ):
        new_stmt = tvm.tir.transform.HoistIfThenElse()(tvm.IRModule.from_expr(func))["main"].body
    assert not tvm.ir.structural_equal(new_stmt, stmt)
Example #9
def test_basic_likely_elimination():
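    # The reduction extent W[i + 1] - W[i] is data-dependent; the lowered code should contain
    # no leftover 'if' guards.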
    n = te.size_var('n')
    X = te.placeholder(shape=(n, ), name="x")
    W = te.placeholder(shape=(n + 1, ), dtype="int32", name="w")

    def f(i):
        start = W[i]
        extent = W[i + 1] - W[i]
        rv = te.reduce_axis((0, extent))
        return te.sum(X[rv + start], axis=rv)

    Y = te.compute(X.shape, f, name="y")
    s = te.create_schedule([Y.op])
    stmt = tvm.lower(s, [X, W, Y], simple_mode=True)
    assert ('if' not in str(stmt))
Example #10
def test_multi_if():
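    # After LoopPartition and Simplify, the first partitioned region of the outer loop
    # should contain no IfThenElse nodes.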
    ib = tvm.tir.ir_builder.create()
    m = te.size_var("m")
    n = te.size_var("n")
    with ib.for_range(0, 4, "i") as i:
        with ib.for_range(0, n, "j") as j:
            with ib.for_range(0, m, "k") as k:
                with ib.if_scope(ib.likely(i * m + j + k < n)):
                    ib.emit(tvm.tir.Evaluate(m))
                with ib.else_scope():
                    ib.emit(tvm.tir.Evaluate(n))
                with ib.if_scope(ib.likely(i * m + j - k < n)):
                    ib.emit(tvm.tir.Evaluate(m))
                with ib.else_scope():
                    ib.emit(tvm.tir.Evaluate(n))
    stmt = ib.get()

    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt))
    mod = tvm.tir.transform.LoopPartition()(mod)
    stmt = tvm.tir.transform.Simplify()(mod)["main"].body

    assert not any(
        collect_visit(stmt.body[0],
                      lambda x: isinstance(x, tvm.tir.IfThenElse)))
Example #11
def test_thread_extent_simplify():
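    # n is bound to 10 by the LetStmt, so tx + ty < 12 is always true; CanonicalSimplify
    # drops the branch, leaving a plain Store.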
    ib = tvm.tir.ir_builder.create()
    A = ib.pointer("float32", name="A")
    C = ib.pointer("float32", name="C")
    n = te.size_var("n")
    tx = te.thread_axis("threadIdx.x")
    ty = te.thread_axis("threadIdx.y")
    ib.scope_attr(tx, "thread_extent", n)
    ib.scope_attr(tx, "thread_extent", n)
    ib.scope_attr(ty, "thread_extent", 1)
    with ib.if_scope(tx + ty < 12):
        A[tx] = C[tx + ty]
    body = tvm.tir.LetStmt(n, 10, ib.get())
    body = tvm.tir.ir_pass.CanonicalSimplify(body)
    assert isinstance(body.body.body.body, tvm.tir.Store)
Example #12
def test_dyn_shared_reuse_and_merge():
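    # A_sh and B_sh have the same static size and can share storage; C_sh is dynamically
    # sized and gets its own region, giving the single merged shared.dyn allocation noted below.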
    n = 64
    A = te.placeholder((n, ), name="A", dtype="float32")
    B = te.placeholder((n, ), name="B", dtype="float32")
    C = te.placeholder((te.size_var("n_dyn"), ), name="C", dtype="float32")

    def test_device_ir(A, B, C, D):
        ib = tvm.tir.ir_builder.create()

        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(tx, "thread_extent", n)

        A_sh = ib.allocate(A.dtype, (n, ), scope="shared.dyn", name="A_sh")
        B_sh = ib.allocate(B.dtype, (n, ), scope="shared.dyn", name="B_sh")
        C_sh = ib.allocate(C.dtype, (C.shape[0], ),
                           scope="shared.dyn",
                           name="C_sh")

        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        Dptr = ib.buffer_ptr(D)

        A_sh[tx] = Aptr[tx]
        Dptr[tx] = A_sh[tx]

        B_sh[tx] = Bptr[tx]
        Dptr[tx] += B_sh[tx]

        C_sh[tx] = Cptr[tx]  # C cannot reuse other buffers since its size is dynamic
        Dptr[tx] += C_sh[tx]

        return ib.get()

    D = te.extern(
        (n, ),
        [A, B, C],
        lambda ins, outs: test_device_ir(ins[0], ins[1], ins[2], outs[0]),
        name="vadd",
        dtype="float32",
    )
    s = te.create_schedule(D.op)

    mod = run_passes(s, [A, B, C, D])
    # merged allocation
    # allocate(buf_dyn_shmem: Pointer(shared.dyn uint8), uint8, [((n_dyn*4) + 256)]);
    verify_single_allocation(mod["main"].body)
Example #13
def test_thread_axis2():
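    # After splitting the innermost axis by the symbolic factor m and binding the outer axes
    # to blockIdx/threadIdx, the inner loop extent must not reference threadIdx.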
    n = tvm.runtime.convert(4096)
    m = te.size_var('m')
    A = te.placeholder((n, ), name='A')
    B = te.placeholder((n, ), name='B')
    C = te.compute(A.shape, lambda i: A[i] + B[i], name='C')
    s = te.create_schedule(C.op)
    num_thread = 32
    bx, x = s[C].split(C.op.axis[0], factor=32)
    tx, x = s[C].split(x, nparts=num_thread)
    _, x = s[C].split(x, factor=m)
    s[C].bind(bx, te.thread_axis("blockIdx.x"))
    s[C].bind(tx, te.thread_axis("threadIdx.x"))
    stmt = lower(s, [A, B])
    for_body = stmt.body.body.body.body[0]
    assert ('threadIdx' not in str(for_body.extent))
Example #14
def test_thread_extent_simplify():
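    # Same scenario through the pass-based API: Simplify removes the always-true guard,
    # so the innermost statement is a BufferStore.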
    ib = tvm.tir.ir_builder.create()
    A = ib.pointer("float32", name="A")
    C = ib.pointer("float32", name="C")
    n = te.size_var("n")
    tx = te.thread_axis("threadIdx.x")
    ty = te.thread_axis("threadIdx.y")
    ib.scope_attr(tx, "thread_extent", n)
    ib.scope_attr(tx, "thread_extent", n)
    ib.scope_attr(ty, "thread_extent", 1)
    with ib.if_scope(tx + ty < 12):
        A[tx] = C[tx + ty]
    body = tvm.tir.LetStmt(n, 10, ib.get())
    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, C, n], body))
    body = tvm.tir.transform.Simplify()(mod)["main"].body
    assert isinstance(body.body.body.body, tvm.tir.BufferStore)
Example #15
def get_shape(shape):
    """Convert the shape to correct dtype and vars."""
    ret = []
    for dim in shape:
        if isinstance(dim, tvm.tir.IntImm):
            if libinfo()["INDEX_DEFAULT_I64"] == "ON":
                ret.append(dim)
            else:
                val = int(dim)
                assert val <= np.iinfo(np.int32).max
                ret.append(tvm.tir.IntImm("int32", val))
        elif isinstance(dim, tvm.tir.Any):
            ret.append(te.size_var("any_dim", "int32"))
        else:
            ret.append(dim)
    return ret
Example #16
def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b):
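    # The compute reads A[i + index_a] and B[i + index_b], so nonzero offsets run past
    # the 32-element arrays at runtime.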
    n = te.size_var("n")
    A = te.placeholder((n, ), name="A")
    B = te.placeholder((n, ), name="B")

    T = te.compute((n, ), lambda i: A[i + index_a] + B[i + index_b])
    s = te.create_schedule(T.op)
    xo, xi = s[T].split(T.op.axis[0], factor=4)
    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
    dev = tvm.cpu(0)

    f = tvm.build(s, [A, B, T], "llvm")
    a = tvm.nd.array(np.random.uniform(size=(32, )).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=(32, )).astype(B.dtype), dev)
    t = tvm.nd.empty((32, ), T.dtype, dev)
    f(a, b, t)
Example #17
def test_in_bounds_loop_partition_basic_llvm():
    n = te.size_var('n')
    A = te.placeholder((n, ), name='A')
    B = te.placeholder((n, ), name='B')

    T = te.compute((n, ), lambda i: A[i] + B[i])
    s = te.create_schedule(T.op)
    xo, xi = s[T].split(T.op.axis[0], factor=4)
    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
    ctx = tvm.cpu(0)

    f = tvm.build(s, [A, B, T], "llvm")
    a = tvm.nd.array(np.random.uniform(size=(32, )).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(32, )).astype(B.dtype), ctx)
    t = tvm.nd.empty((32, ), T.dtype, ctx)
    f(a, b, t)
Example #18
def test_for():
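    # The builder wraps both loops in one Allocate; the outer loop body is a SeqStmt
    # whose second element is the nested For.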
    ib = tvm.tir.ir_builder.create()
    n = te.size_var("n")
    A = ib.allocate("float32", n, name="A", scope="global")
    with ib.for_range(0, n, name="i") as i:
        A[i] = A[i] + 1
        with ib.for_range(0, 10, name="j") as j:
            A[j] = A[j] + 2

    body = ib.get()
    assert isinstance(body, tvm.tir.Allocate)
    body = body.body
    assert isinstance(body, tvm.tir.For)
    body = body.body
    assert isinstance(body, tvm.tir.SeqStmt)
    assert isinstance(body[1], tvm.tir.For)
Example #19
def test_stack_vm_basic():
    a = tvm.nd.array(np.zeros(10, dtype='float32'))

    @tvm.register_func
    def tvm_call_back_get_shape(shape0):
        print(shape0)
        assert shape0 == a.shape[0]

    n = te.size_var('n')
    Ab = tvm.tir.decl_buffer((n, ), "float32")
    stmt = tvm.tir.Evaluate(
        tvm.tir.call_packed("tvm_call_back_get_shape", Ab.shape[0]))
    fapi = tvm.tir.ir_pass.MakeAPI(stmt, "print_shape", [Ab], 0, True)
    fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi)
    fapi = tvm.tir.ir_pass.LowerIntrin(fapi, "stackvm")
    run_jit(fapi, lambda f: f(a))
Example #20
def test_buffer_broadcast():
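    # auto_broadcast buffers let a kernel compiled for symbolic shapes accept inputs whose
    # broadcast dimensions are 1 (here B is (2, 1, 1) against A's (2, 4, 3)).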
    m0, m1, m2 = te.size_var("m0"), te.size_var("m1"), te.size_var("m2")
    n0, n1, n2 = te.size_var("n0"), te.size_var("n1"), te.size_var("n2")
    o0, o1, o2 = te.size_var("o0"), te.size_var("o1"), te.size_var("o2")

    A = te.placeholder((m0, m1, m2), name="A")
    B = te.placeholder((n0, n1, n2), name="B")

    C = te.compute((o0, o1, o2),
                   lambda i, j, k: A[i, j, k] + B[i, j, k],
                   name="C")

    Ab = tvm.tir.decl_buffer(A.shape,
                             A.dtype,
                             name="Ab",
                             buffer_type="auto_broadcast")
    Bb = tvm.tir.decl_buffer(B.shape,
                             B.dtype,
                             name="Bb",
                             buffer_type="auto_broadcast")
    s = te.create_schedule(C.op)

    def check():
        fadd = tvm.build(s, [A, B, C],
                         target="llvm",
                         name="bcast_add",
                         binds={
                             A: Ab,
                             B: Bb
                         })
        dev = tvm.cpu(0)
        a = tvm.nd.array(
            np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev)
        b = tvm.nd.array(
            np.random.uniform(size=(2, 1, 1)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

    check()
Example #21
def test_buffer_broadcast_expr():
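    # Same auto_broadcast idea, but the output shape involves an expression (o1 // x) and the
    # shape vars o1 and x are passed explicitly when the built function is called.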
    n0, m0, x = te.size_var("n0"), te.size_var("m0"), te.size_var("x")
    n1, m1 = te.size_var("n1"), te.size_var("m1")
    o0, o1 = te.size_var("o0"), te.size_var("o1")

    A = te.placeholder((m0, n0), name="A")
    B = te.placeholder((m1, n1), name="B")
    C = te.compute((o0, o1 // x), lambda i, j: A[i, j] + B[i, j], name="C")

    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
    Cc = tvm.tir.decl_buffer(C.shape, C.dtype, name="Cc", buffer_type="auto_broadcast")
    s = te.create_schedule(C.op)

    def check_stride():
        fadd = tvm.build(
            s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc}
        )
        dev = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

    def check_no_stride():
        fadd = tvm.build(
            s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc}
        )
        dev = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

    def check_auto_bind():
        # Let build bind buffers
        fadd = tvm.build(s, [A, B, C, o1, x], target="llvm", name="bcast_add")
        dev = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

    check_stride()
    check_no_stride()
    check_auto_bind()
Example #22
def verify_tensor_scalar_bop(shape, typ="add"):
    """Verify non-constant Tensor and scalar binary operations."""
    sh = [te.size_var("n%d" % i) for i in range(0, len(shape))]
    k = te.var("k")
    A = te.placeholder(sh, name="A")
    if typ == "add":
        B = A + k
    elif typ == "sub":
        B = A - k
    elif typ == "mul":
        B = A * k
    elif typ == "div":
        B = A / k
    else:
        raise NotImplementedError()

    def check_device(device):
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            s = tvm.topi.testing.get_elemwise_schedule(device)(B)

        k_ = 2
        foo = tvm.build(s, [A, B, k] + sh, device, name="tensor_scalar_" + typ)
        a_npy = np.random.uniform(size=shape).astype(A.dtype)
        if typ == "add":
            b_npy = a_npy + k_
        elif typ == "sub":
            b_npy = a_npy - k_
        elif typ == "mul":
            b_npy = a_npy * k_
        elif typ == "div":
            b_npy = a_npy / k_
        else:
            raise NotImplementedError()

        a_nd = tvm.nd.array(a_npy, ctx)
        b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx)
        foo(a_nd, b_nd, k_, *shape)
        tvm.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5)

    for device in ["llvm", "cuda", "opencl", "metal", "rocm", "vulkan"]:
        check_device(device)
Example #23
def test_if_likely():
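    # Nested, identical likely conditions should collapse to a single IfThenElse after Simplify.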
    ib = tvm.tir.ir_builder.create()
    A = ib.pointer("float32", name="A")
    C = ib.pointer("float32", name="C")
    n = te.size_var("n")
    tx = te.thread_axis("threadIdx.x")
    ty = te.thread_axis("threadIdx.y")
    ib.scope_attr(tx, "thread_extent", 32)
    ib.scope_attr(ty, "thread_extent", 32)
    with ib.if_scope(ib.likely(tx * 32 + ty < n)):
        with ib.if_scope(ib.likely(tx * 32 + ty < n)):
            A[tx] = C[tx * 32 + ty]
    body = ib.get()
    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, C, n], body))
    body = tvm.tir.transform.Simplify()(mod)["main"].body
    assert isinstance(body.body.body, tvm.tir.IfThenElse)
    assert not isinstance(body.body.body.then_case, tvm.tir.IfThenElse)
Example #24
def test_meta_data():
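    # The free SizeVar n prints as a meta[tir.SizeVar] reference in both printers, but only
    # astext appends the meta-data section (hence "type_key" appears only there).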
    n, c, h, w = te.size_var("n"), 10, 224, 224
    x = relay.var("x", shape=(n, c, h, w))
    w = relay.var("w")
    z = relay.nn.conv2d(x, w, kernel_size=(3, 3), padding=(1, 1), channels=2)
    f = relay.Function([x, w], z)
    text = astext(f, unify_free_vars=True)
    text_no_meta = str(f)
    assert "channels=2" in text
    assert "channels=2" in text_no_meta
    assert "meta[tir.SizeVar][0]" in text
    assert "meta[tir.SizeVar][0]" in text_no_meta
    assert "type_key" in text
    assert "type_key" not in text_no_meta

    text = astext(relay.const([1, 2, 3]))
    assert "meta[relay.Constant][0]" in text
Example #25
def test_unroll_fake_loop():
    ib = tvm.tir.ir_builder.create()
    dtype = 'int32'
    n = te.size_var('n')
    Ab = tvm.tir.decl_buffer((n, ), dtype)
    Aptr = ib.buffer_ptr(Ab)
    # outer loop has extent 1 (a "fake" loop), so UnrollLoop should eliminate it,
    # leaving a plain Store as the first statement
    with ib.for_range(0, 1, name="i") as i:
        Aptr[i * 2] = 3
        with ib.for_range(0, 10, name="j") as j:
            Aptr[j + 1] = Aptr[i] + 1

    stmt = ib.get()

    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([Ab], stmt))
    ret = tvm.tir.transform.UnrollLoop(8, 0, 1, False)(mod)["main"].body
    assert isinstance(ret[0], tvm.tir.Store)
Example #26
def test_stack_vm_basic():
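    # Transform-based flow: wrap the statement in a PrimFunc with a global_symbol; run_jit
    # (presumably building for the stackvm target) executes it, which invokes the registered
    # Python callback with Ab.shape[0].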
    a = tvm.nd.array(np.zeros(10, dtype="float32"))

    @tvm.register_func
    def tvm_call_back_get_shape(shape0):
        print(shape0)
        assert shape0 == a.shape[0]

    n = te.size_var("n")
    Ab = tvm.tir.decl_buffer((n, ), "float32")
    stmt = tvm.tir.Evaluate(
        tvm.tir.call_packed("tvm_call_back_get_shape", Ab.shape[0]))

    mod = tvm.IRModule.from_expr(
        tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "print_shape"))

    run_jit(mod, lambda f: f(a))
Example #27
def test_inline():
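    # Inline substitutes T's body into the expression and the result must remain SSA;
    # calling Inline with the wrong argument types must raise a TVMError.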
    m = te.size_var('m')
    A = te.placeholder((m, ), name='A')
    T = te.compute((m, ), lambda i, : A[i] + 10, name='T')
    stmt = tvm.tir.Evaluate(T[10] + 11 * T[100])
    stmt = tvm.tir.ir_pass.Inline(stmt, T.op, [x.var for x in T.op.axis],
                                  T.op.body[0])
    print(stmt)
    assert (tvm.tir.ir_pass.VerifySSA(stmt))

    try:
        # pass in int array(wrong argument type)
        # must raise an error
        stmt = tvm.tir.ir_pass.Inline(T.op, [1, 2, 3], T.op.body, stmt)
        assert False
    except tvm.error.TVMError:
        pass
Example #28
 def save_object(names):
     n = te.size_var('n')
     Ab = tvm.tir.decl_buffer((n, ), dtype)
     i = te.var('i')
     # for i in 0 to n-1:
     stmt = tvm.tir.For(
         i, 0, n - 1, 0, 0,
         tvm.tir.Store(Ab.data,
                        tvm.tir.Load(dtype, Ab.data, i) + 1,
                        i + 1))
     mod = tvm.IRModule.from_expr(
         tvm.tir.PrimFunc([Ab], stmt).with_attr(
             "global_symbol", "main")
     )
     m = tvm.driver.build(mod, target="llvm")
     for name in names:
         m.save(name)
Example #29
    def run(dtype):
        # graph
        n = te.size_var("n")
        A = te.placeholder((n, ), name="A", dtype=dtype)
        B = te.placeholder((n, ), name="B", dtype=dtype)
        bias = te.var("bias", dtype=dtype)
        scale = te.var("scale", dtype=dtype)
        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
        # schedule
        s = te.create_schedule(C.op)
        # create iter var and assign them tags.
        num_thread = 16
        bx, x = s[C].split(C.op.axis[0], factor=num_thread * 4)
        tx, x = s[C].split(x, nparts=num_thread)
        _, x = s[C].split(x, factor=4)
        s[C].bind(bx, te.thread_axis("blockIdx.x"))
        s[C].bind(tx, te.thread_axis("threadIdx.x"))
        s[C].vectorize(x)

        # one line to build the function.
        def check_device(device):
            ctx = tvm.context(device, 0)
            if not tvm.testing.device_enabled(device):
                print("skip because %s is not enabled.." % device)
                return
            fadd = tvm.build(s, [A, B, C], device, name="myadd")

            # launch the kernel.
            n = 1024
            a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype),
                             ctx)
            b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype),
                             ctx)
            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
            ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=1)
            tcost = ftimer(a, b, c).mean
            tvm.testing.assert_allclose(c.asnumpy(),
                                        a.asnumpy() + b.asnumpy(),
                                        rtol=1e-6)

        check_device("opencl")
        check_device("cuda")
        if dtype == "float32":
            check_device("metal")
            check_device("vulkan")
Example #30
 def check_llvm():
     # Specifically allow offset to test codepath when offset is available
     Ab = tvm.tir.decl_buffer(A.shape,
                              A.dtype,
                              elem_offset=te.size_var("Aoffset"),
                              offset_factor=8,
                              name="A")
     binds = {A: Ab}
     # BUILD and invoke the kernel.
     f = tvm.build(s, [A, B, C], "llvm", binds=binds)
     dev = tvm.cpu(0)
     # launch the kernel.
     n = nn
     a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
     b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
     c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
     f(a, b, c)
     tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())