def test_static_tensor():
    dtype = "float32"
    stype = "csr"
    target = "llvm"
    ctx = tvm.context(target, 0)
    m = te.size_var("m")
    n = te.size_var("n")
    A = tvmsp.placeholder(shape=(m, n), name="A", dtype=dtype)
    assert A.stype == "csr"
    n = 3
    a = np.maximum(np.random.uniform(size=(n, n)).astype(dtype) - 0.6, 0.0)
    a = tvmsp.array(a, ctx)
    A.data = te.placeholder(a.data.shape, dtype, name="A_data")
    Ab = tvm.tir.decl_buffer(a.data.shape, dtype, name="A_data")
    binds = {A.data: Ab}
    C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter")
    s = te.create_schedule(C.op)
    f = tvm.build(s, [A.data, C], target, binds=binds)
    c = tvmsp.array(np.zeros((n, n), dtype), ctx)
    c.data = tvm.nd.empty(a.data.shape, dtype)
    c.indices = a.indices
    c.indptr = a.indptr
    f(a.data, c.data)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2.0, rtol=1e-5)
def test_tensor_intrin_scalar_params():
    n = te.size_var("n")
    x = te.placeholder((n,), name="x")
    v = te.size_var("v")
    w = te.size_var("w")
    z = te.compute((n,), lambda i: x[i] * v + w, name="z")

    def intrin_func(ins, outs, sp):
        assert isinstance(ins[0], tvm.te.schedule.Buffer)
        assert ins[0].shape[0] == n
        assert sp[0] == v
        assert sp[1] == w
        return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0], sp[1])

    intrin = te.decl_tensor_intrin(
        z.op, intrin_func, scalar_params=[v, w], default_buffer_params={"offset_factor": 1}
    )
    assert intrin.op == z.op
    assert intrin.reduce_init is None
    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
    assert intrin.buffers[0].shape[0] == n
    assert tuple(intrin.scalar_params) == tuple((v, w))

    A = te.placeholder((10, 10), name="A")
    # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs
    C = te.compute((10, 10), lambda i, j: intrin(i * i, A[i, j], i + j), name="C")
    s = te.create_schedule(C.op)
    stmt = tvm.lower(s, [A, C])["main"].body
    assert isinstance(stmt.body.body, tvm.tir.Evaluate)
    assert len(stmt.body.body.value.args) == 5
    assert str(stmt.body.body.value.args[3]) == "(i: int32*i)"
    assert str(stmt.body.body.value.args[4]) == "(i: int32 + j: int32)"
def test_batch_matmul(executor_kind):
    b, m, n, k = te.size_var("b"), te.size_var("m"), te.size_var("n"), te.size_var("k")
    x = relay.var("x", relay.TensorType((b, m, k), "float32"))
    y = relay.var("y", relay.TensorType((b, n, k), "float32"))
    z = relay.nn.batch_matmul(x, y)
    zz = run_infer_type(z)
    assert zz.checked_type == relay.TensorType((b, m, n), "float32")

    verify_batch_matmul(executor_kind, (1, 16, 32), (1, 16, 32), (1, 16, 16), trans_x=False, trans_y=True)
    verify_batch_matmul(executor_kind, (5, 16, 32), (5, 16, 32), (5, 16, 16), trans_x=False, trans_y=True)
    verify_batch_matmul(executor_kind, (5, 16, 32), (5, 20, 32), (5, 16, 20), trans_x=False, trans_y=True)
    verify_batch_matmul(executor_kind, (30, 16, 32), (30, 20, 32), (30, 16, 20), trans_x=False, trans_y=True)
    verify_batch_matmul(executor_kind, (1, 32, 16), (1, 16, 32), (1, 16, 16), trans_x=True, trans_y=True)
    verify_batch_matmul(executor_kind, (5, 16, 32), (5, 32, 16), (5, 16, 16), trans_x=False, trans_y=False)
    verify_batch_matmul(executor_kind, (5, 32, 16), (5, 32, 20), (5, 16, 20), trans_x=True, trans_y=False)

    x_np = np.random.randn(10, 27, 64).astype("float32")
    x = relay.var("x", shape=x_np.shape)
    verify_batch_matmul_with_inputs(executor_kind, x, x, x_np, x_np, (10, 27, 27))
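# `verify_batch_matmul` and `verify_batch_matmul_with_inputs` are helpers from
# elsewhere in the test file. Below is a minimal, hypothetical sketch of the
# first one, assuming the standard Relay testing pattern; the body is inferred
# from the call sites above, not the original helper, and the `_sketch` suffix
# marks it as illustrative:
def verify_batch_matmul_sketch(executor_kind, x_shape, y_shape, out_shape, trans_x, trans_y):
    x = relay.var("x", relay.TensorType(x_shape, "float32"))
    y = relay.var("y", relay.TensorType(y_shape, "float32"))
    z = relay.nn.batch_matmul(x, y, transpose_a=trans_x, transpose_b=trans_y)
    func = relay.Function([x, y], z)
    x_np = np.random.uniform(size=x_shape).astype("float32")
    y_np = np.random.uniform(size=y_shape).astype("float32")
    # Reference result from topi's numpy-based testing helper.
    z_np = tvm.topi.testing.batch_matmul(x_np, y_np, trans_x=trans_x, trans_y=trans_y)
    assert z_np.shape == out_shape
    for target, dev in tvm.testing.enabled_targets():
        out = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
            x_np, y_np
        )
        tvm.testing.assert_allclose(out.numpy(), z_np, rtol=1e-5)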
def test_thread_storage_sync():
    m = te.size_var("m")
    l = te.size_var("l")
    A = te.placeholder((m, l), name="A")
    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")

    s = te.create_schedule(A2.op)
    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
    s[A2].bind(xo, te.thread_axis("blockIdx.x"))
    s[A1].compute_at(s[A2], xo)
    s[A1].set_scope("shared")

    bounds = tvm.te.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)

    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A2], stmt, None)
    mod = tvm.IRModule.from_expr(func)
    mod = tvm.tir.transform.StorageFlatten(64)(mod._move())

    cuda_target = tvm.target.create("cuda")
    mod = tvm.tir.transform.Apply(
        lambda f: f.with_attr({"global_symbol": "test", "target": cuda_target})
    )(mod._move())

    fdevice = tvm.tir.transform.SplitHostDevice()(mod)["test_kernel0"]
    mod = tvm.IRModule.from_expr(fdevice)
    f = tvm.tir.transform.ThreadSync("shared")(mod)["test_kernel0"]
    body_list = tvm.tir.stmt_list(f.body.body.body.body)
    assert body_list[1].value.op.same_as(tvm.ir.Op.get("tir.tvm_storage_sync"))
def test_rocm_cross_thread_reduction():
    # based on the reduction tutorial
    n = te.size_var("n")
    m = te.size_var("m")
    A = te.placeholder((n, m), name="A")
    k = te.reduce_axis((0, m), "k")
    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
    s = te.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    # Define the thread axes before binding (the original flattened snippet
    # used tx/ty/bx without declaring them).
    tx = te.thread_axis("threadIdx.x")
    ty = te.thread_axis("threadIdx.y")
    bx = te.thread_axis("blockIdx.x")
    s[B].bind(xo, bx)
    s[B].bind(xi, ty)
    s[B].bind(s[B].op.reduce_axis[0], tx)
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
    s[B].set_store_predicate(tx.var.equal(0))
    frocm = tvm.build(s, [A, B], "rocm")

    nn = 128
    dev = tvm.rocm(0)
    a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev)
    b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev)
    frocm(a, b)
    tvm.testing.assert_allclose(b.numpy(), np.sum(a.numpy(), axis=1), rtol=1e-4)
def test_basic():
    n = te.size_var("n")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    T = te.compute((n,), lambda i: A[i] + B[i])
    s = te.create_schedule(T.op)
    xo, xi = s[T].split(T.op.axis[0], factor=4)

    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
    stmt = tvm.tir.ir_pass.LoopPartition(stmt, False)
    stmt = tvm.tir.ir_pass.Simplify(stmt)
    # After partitioning, the main loop is condition-free; only the tail loop keeps an if.
    assert "if" not in str(stmt.body.body.body[0])
    assert "if" in str(stmt.body.body.body[1])
def test_in_bounds_llvm():
    n = te.size_var("n")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
    s = te.create_schedule(C.op)
    tgt = "llvm"
    tgt_host = "llvm"
    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
    fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
    ctx = tvm.context(tgt, 0)
    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx)
    fadd(a, b, c)
def test_hoisting_block_scope_1():
    n = te.size_var("n")
    m = te.size_var("m")
    A = te.placeholder((n, m), name="A")
    k = te.reduce_axis((0, m), "k")
    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
    s = te.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    s[B.op].bind(xo, te.thread_axis("blockIdx.x"))
    s[B.op].bind(xi, te.thread_axis("threadIdx.y"))
    s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.x"))
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
    func = tvm.driver.build_module.schedule_to_module(s, [A, B], "main", None)["main"]
    stmt = func.body
    new_stmt = tvm.tir.transform.HoistIfThenElse()(tvm.IRModule.from_expr(func))["main"].body
    tvm.ir.assert_structural_equal(new_stmt, stmt)

    with tvm.transform.PassContext(
        config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}}
    ):
        new_stmt = tvm.tir.transform.HoistIfThenElse()(tvm.IRModule.from_expr(func))["main"].body
    assert not tvm.ir.structural_equal(new_stmt, stmt)
def test_basic_likely_elimination():
    n = te.size_var("n")
    X = te.placeholder(shape=(n,), name="x")
    W = te.placeholder(shape=(n + 1,), dtype="int32", name="w")

    def f(i):
        start = W[i]
        extent = W[i + 1] - W[i]
        rv = te.reduce_axis((0, extent))
        return te.sum(X[rv + start], axis=rv)

    Y = te.compute(X.shape, f, name="y")
    s = te.create_schedule([Y.op])
    stmt = tvm.lower(s, [X, W, Y], simple_mode=True)
    assert "if" not in str(stmt)
def test_multi_if():
    ib = tvm.tir.ir_builder.create()
    m = te.size_var("m")
    n = te.size_var("n")
    with ib.for_range(0, 4, "i") as i:
        with ib.for_range(0, n, "j") as j:
            with ib.for_range(0, m, "k") as k:
                with ib.if_scope(ib.likely(i * m + j + k < n)):
                    ib.emit(tvm.tir.Evaluate(m))
                with ib.else_scope():
                    ib.emit(tvm.tir.Evaluate(n))
                with ib.if_scope(ib.likely(i * m + j - k < n)):
                    ib.emit(tvm.tir.Evaluate(m))
                with ib.else_scope():
                    ib.emit(tvm.tir.Evaluate(n))

    stmt = ib.get()
    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt))
    mod = tvm.tir.transform.LoopPartition()(mod)
    stmt = tvm.tir.transform.Simplify()(mod)["main"].body
    assert not any(collect_visit(stmt.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse)))
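# `collect_visit` is defined elsewhere in the test file; the conventional
# definition used throughout TVM's TIR transform tests is:
def collect_visit(stmt, f):
    # Apply f to every node in post-order and collect the results.
    ret = []
    tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x)))
    return ret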
def test_thread_extent_simplify():
    # Legacy ir_pass variant; a transform-based version of the same test appears below.
    ib = tvm.tir.ir_builder.create()
    A = ib.pointer("float32", name="A")
    C = ib.pointer("float32", name="C")
    n = te.size_var("n")
    tx = te.thread_axis("threadIdx.x")
    ty = te.thread_axis("threadIdx.y")
    ib.scope_attr(tx, "thread_extent", n)
    ib.scope_attr(tx, "thread_extent", n)
    ib.scope_attr(ty, "thread_extent", 1)
    with ib.if_scope(tx + ty < 12):
        A[tx] = C[tx + ty]
    body = tvm.tir.LetStmt(n, 10, ib.get())
    body = tvm.tir.ir_pass.CanonicalSimplify(body)
    assert isinstance(body.body.body.body, tvm.tir.Store)
def test_dyn_shared_reuse_and_merge():
    n = 64
    A = te.placeholder((n,), name="A", dtype="float32")
    B = te.placeholder((n,), name="B", dtype="float32")
    C = te.placeholder((te.size_var("n_dyn"),), name="C", dtype="float32")

    def test_device_ir(A, B, C, D):
        ib = tvm.tir.ir_builder.create()
        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(tx, "thread_extent", n)

        A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn", name="A_sh")
        B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn", name="B_sh")
        C_sh = ib.allocate(C.dtype, (C.shape[0],), scope="shared.dyn", name="C_sh")

        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        Dptr = ib.buffer_ptr(D)

        A_sh[tx] = Aptr[tx]
        Dptr[tx] = A_sh[tx]

        B_sh[tx] = Bptr[tx]
        Dptr[tx] += B_sh[tx]

        C_sh[tx] = Cptr[tx]  # C cannot reuse other buffers since its size is dynamic
        Dptr[tx] += C_sh[tx]

        return ib.get()

    D = te.extern(
        (n,),
        [A, B, C],
        lambda ins, outs: test_device_ir(ins[0], ins[1], ins[2], outs[0]),
        name="vadd",
        dtype="float32",
    )
    s = te.create_schedule(D.op)

    mod = run_passes(s, [A, B, C, D])
    # merged allocation
    # allocate(buf_dyn_shmem: Pointer(shared.dyn uint8), uint8, [((n_dyn*4) + 256)]);
    verify_single_allocation(mod["main"].body)
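# `run_passes` and `verify_single_allocation` live elsewhere in the test file.
# A minimal sketch of the verifier, assuming it only needs to confirm that the
# dynamic-shared-memory merge pass produced exactly one shared.dyn buffer; the
# body is hypothetical, inferred from the comment above, not the original code:
def verify_single_allocation_sketch(stmt):
    allocs = []

    def visit(node):
        # Collect every Allocate node in the lowered body.
        if isinstance(node, tvm.tir.Allocate):
            allocs.append(node)

    tvm.tir.stmt_functor.post_order_visit(stmt, visit)
    # All three shared.dyn buffers must have been merged into a single one.
    assert len(allocs) == 1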
def test_thread_axis2():
    n = tvm.runtime.convert(4096)
    m = te.size_var("m")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
    s = te.create_schedule(C.op)
    num_thread = 32
    bx, x = s[C].split(C.op.axis[0], factor=32)
    tx, x = s[C].split(x, nparts=num_thread)
    _, x = s[C].split(x, factor=m)
    s[C].bind(bx, te.thread_axis("blockIdx.x"))
    s[C].bind(tx, te.thread_axis("threadIdx.x"))
    # `lower` here is presumably a local helper in the test file, not tvm.lower
    # (the result is indexed as a statement, not as an IRModule).
    stmt = lower(s, [A, B])
    for_body = stmt.body.body.body.body[0]
    assert "threadIdx" not in str(for_body.extent)
def test_thread_extent_simplify():
    ib = tvm.tir.ir_builder.create()
    A = ib.pointer("float32", name="A")
    C = ib.pointer("float32", name="C")
    n = te.size_var("n")
    tx = te.thread_axis("threadIdx.x")
    ty = te.thread_axis("threadIdx.y")
    ib.scope_attr(tx, "thread_extent", n)
    ib.scope_attr(tx, "thread_extent", n)
    ib.scope_attr(ty, "thread_extent", 1)
    with ib.if_scope(tx + ty < 12):
        A[tx] = C[tx + ty]
    body = tvm.tir.LetStmt(n, 10, ib.get())
    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, C, n], body))
    body = tvm.tir.transform.Simplify()(mod)["main"].body
    assert isinstance(body.body.body.body, tvm.tir.BufferStore)
def get_shape(shape):
    """Convert the shape to correct dtype and vars."""
    ret = []
    for dim in shape:
        if isinstance(dim, tvm.tir.IntImm):
            if libinfo()["INDEX_DEFAULT_I64"] == "ON":
                ret.append(dim)
            else:
                val = int(dim)
                assert val <= np.iinfo(np.int32).max
                ret.append(tvm.tir.IntImm("int32", val))
        elif isinstance(dim, tvm.tir.Any):
            ret.append(te.size_var("any_dim", "int32"))
        else:
            ret.append(dim)
    return ret
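# A quick usage sketch for `get_shape`, assuming `libinfo` is imported from
# tvm.support as in the original file (the example values are illustrative):
# IntImm dims are narrowed to int32 unless the runtime indexes with int64,
# `Any` dims become fresh "any_dim" size vars, and anything else (such as an
# existing size var) passes through unchanged.
example = get_shape([tvm.tir.IntImm("int64", 8), tvm.tir.Any(), te.size_var("m")])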
def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b):
    n = te.size_var("n")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    T = te.compute((n,), lambda i: A[i + index_a] + B[i + index_b])
    s = te.create_schedule(T.op)
    xo, xi = s[T].split(T.op.axis[0], factor=4)
    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
    dev = tvm.cpu(0)

    f = tvm.build(s, [A, B, T], "llvm")
    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev)
    t = tvm.nd.empty((32,), T.dtype, dev)
    f(a, b, t)
def test_in_bounds_loop_partition_basic_llvm():
    n = te.size_var("n")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    T = te.compute((n,), lambda i: A[i] + B[i])
    s = te.create_schedule(T.op)
    xo, xi = s[T].split(T.op.axis[0], factor=4)
    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
    ctx = tvm.cpu(0)

    f = tvm.build(s, [A, B, T], "llvm")
    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx)
    t = tvm.nd.empty((32,), T.dtype, ctx)
    f(a, b, t)
def test_for():
    ib = tvm.tir.ir_builder.create()
    n = te.size_var("n")
    A = ib.allocate("float32", n, name="A", scope="global")
    with ib.for_range(0, n, name="i") as i:
        A[i] = A[i] + 1
        with ib.for_range(0, 10, name="j") as j:
            A[j] = A[j] + 2

    body = ib.get()
    assert isinstance(body, tvm.tir.Allocate)
    body = body.body
    assert isinstance(body, tvm.tir.For)
    body = body.body
    assert isinstance(body, tvm.tir.SeqStmt)
    assert isinstance(body[1], tvm.tir.For)
def test_stack_vm_basic():
    # Legacy ir_pass variant; an IRModule-based version of the same test appears below.
    a = tvm.nd.array(np.zeros(10, dtype="float32"))

    @tvm.register_func
    def tvm_call_back_get_shape(shape0):
        print(shape0)
        assert shape0 == a.shape[0]

    n = te.size_var("n")
    Ab = tvm.tir.decl_buffer((n,), "float32")
    stmt = tvm.tir.Evaluate(tvm.tir.call_packed("tvm_call_back_get_shape", Ab.shape[0]))
    fapi = tvm.tir.ir_pass.MakeAPI(stmt, "print_shape", [Ab], 0, True)
    fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi)
    fapi = tvm.tir.ir_pass.LowerIntrin(fapi, "stackvm")
    run_jit(fapi, lambda f: f(a))
def test_buffer_broadcast():
    m0, m1, m2 = te.size_var("m0"), te.size_var("m1"), te.size_var("m2")
    n0, n1, n2 = te.size_var("n0"), te.size_var("n1"), te.size_var("n2")
    o0, o1, o2 = te.size_var("o0"), te.size_var("o1"), te.size_var("o2")

    A = te.placeholder((m0, m1, m2), name="A")
    B = te.placeholder((n0, n1, n2), name="B")
    C = te.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name="C")

    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
    s = te.create_schedule(C.op)

    def check():
        fadd = tvm.build(s, [A, B, C], target="llvm", name="bcast_add", binds={A: Ab, B: Bb})
        dev = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=(2, 1, 1)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

    check()
def test_buffer_broadcast_expr():
    n0, m0, x = te.size_var("n0"), te.size_var("m0"), te.size_var("x")
    n1, m1 = te.size_var("n1"), te.size_var("m1")
    o0, o1 = te.size_var("o0"), te.size_var("o1")

    A = te.placeholder((m0, n0), name="A")
    B = te.placeholder((m1, n1), name="B")
    C = te.compute((o0, o1 // x), lambda i, j: A[i, j] + B[i, j], name="C")

    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
    Cc = tvm.tir.decl_buffer(C.shape, C.dtype, name="Cc", buffer_type="auto_broadcast")
    s = te.create_schedule(C.op)

    def check_stride():
        fadd = tvm.build(
            s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc}
        )
        dev = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

    def check_no_stride():
        fadd = tvm.build(
            s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc}
        )
        dev = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

    def check_auto_bind():
        # Let build bind buffers
        fadd = tvm.build(s, [A, B, C, o1, x], target="llvm", name="bcast_add")
        dev = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

    check_stride()
    check_no_stride()
    check_auto_bind()
def verify_tensor_scalar_bop(shape, typ="add"):
    """Verify non-constant Tensor and scalar binary operations."""
    sh = [te.size_var("n%d" % i) for i in range(0, len(shape))]
    k = te.var("k")
    A = te.placeholder(sh, name="A")
    if typ == "add":
        B = A + k
    elif typ == "sub":
        B = A - k
    elif typ == "mul":
        B = A * k
    elif typ == "div":
        B = A / k
    else:
        raise NotImplementedError()

    def check_device(device):
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            s = tvm.topi.testing.get_elemwise_schedule(device)(B)

        k_ = 2
        foo = tvm.build(s, [A, B, k] + sh, device, name="tensor_scalar_" + typ)
        a_npy = np.random.uniform(size=shape).astype(A.dtype)
        if typ == "add":
            b_npy = a_npy + k_
        elif typ == "sub":
            b_npy = a_npy - k_
        elif typ == "mul":
            b_npy = a_npy * k_
        elif typ == "div":
            b_npy = a_npy / k_
        else:
            raise NotImplementedError()

        a_nd = tvm.nd.array(a_npy, ctx)
        b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx)
        foo(a_nd, b_nd, k_, *shape)
        tvm.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5)

    for device in ["llvm", "cuda", "opencl", "metal", "rocm", "vulkan"]:
        check_device(device)
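# Typical invocations of the helper above (the shapes are arbitrary examples;
# "div" exercises the floating-point division branch):
verify_tensor_scalar_bop((16,), typ="add")
verify_tensor_scalar_bop((4, 5), typ="div")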
def test_if_likely():
    ib = tvm.tir.ir_builder.create()
    A = ib.pointer("float32", name="A")
    C = ib.pointer("float32", name="C")
    n = te.size_var("n")
    tx = te.thread_axis("threadIdx.x")
    ty = te.thread_axis("threadIdx.y")
    ib.scope_attr(tx, "thread_extent", 32)
    ib.scope_attr(ty, "thread_extent", 32)
    with ib.if_scope(ib.likely(tx * 32 + ty < n)):
        with ib.if_scope(ib.likely(tx * 32 + ty < n)):
            A[tx] = C[tx * 32 + ty]
    body = ib.get()
    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, C, n], body))
    body = tvm.tir.transform.Simplify()(mod)["main"].body
    # The redundant inner likely-if must be folded into the outer one.
    assert isinstance(body.body.body, tvm.tir.IfThenElse)
    assert not isinstance(body.body.body.then_case, tvm.tir.IfThenElse)
def test_meta_data():
    n, c, h, w = te.size_var("n"), 10, 224, 224
    x = relay.var("x", shape=(n, c, h, w))
    w = relay.var("w")
    z = relay.nn.conv2d(x, w, kernel_size=(3, 3), padding=(1, 1), channels=2)
    f = relay.Function([x, w], z)
    text = astext(f, unify_free_vars=True)
    text_no_meta = str(f)
    assert "channels=2" in text
    assert "channels=2" in text_no_meta
    assert "meta[tir.SizeVar][0]" in text
    assert "meta[tir.SizeVar][0]" in text_no_meta
    assert "type_key" in text
    assert "type_key" not in text_no_meta

    text = astext(relay.const([1, 2, 3]))
    assert "meta[relay.Constant][0]" in text
def test_unroll_fake_loop():
    ib = tvm.tir.ir_builder.create()
    dtype = "int32"
    n = te.size_var("n")
    Ab = tvm.tir.decl_buffer((n,), dtype)
    Aptr = ib.buffer_ptr(Ab)
    # A single-iteration ("fake") loop that the unroller should flatten away.
    with ib.for_range(0, 1, name="i") as i:
        Aptr[i * 2] = 3
        with ib.for_range(0, 10, name="j") as j:
            Aptr[j + 1] = Aptr[i] + 1

    stmt = ib.get()
    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([Ab], stmt))
    ret = tvm.tir.transform.UnrollLoop(8, 0, 1, False)(mod)["main"].body
    assert isinstance(ret[0], tvm.tir.Store)
def test_stack_vm_basic():
    a = tvm.nd.array(np.zeros(10, dtype="float32"))

    @tvm.register_func
    def tvm_call_back_get_shape(shape0):
        print(shape0)
        assert shape0 == a.shape[0]

    n = te.size_var("n")
    Ab = tvm.tir.decl_buffer((n,), "float32")
    stmt = tvm.tir.Evaluate(tvm.tir.call_packed("tvm_call_back_get_shape", Ab.shape[0]))
    mod = tvm.IRModule.from_expr(
        tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "print_shape")
    )
    run_jit(mod, lambda f: f(a))
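# `run_jit` is defined elsewhere in the test file. A plausible minimal sketch,
# assuming it simply builds the module for the stackvm target and hands the
# resulting function to the checker (hypothetical, not the original helper):
def run_jit_sketch(mod, check):
    f = tvm.driver.build(mod, target="stackvm")
    check(f)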
def test_inline():
    m = te.size_var("m")
    A = te.placeholder((m,), name="A")
    T = te.compute((m,), lambda i: A[i] + 10, name="T")
    stmt = tvm.tir.Evaluate(T[10] + 11 * T[100])
    stmt = tvm.tir.ir_pass.Inline(stmt, T.op, [x.var for x in T.op.axis], T.op.body[0])
    print(stmt)
    assert tvm.tir.ir_pass.VerifySSA(stmt)

    try:
        # pass in int array (wrong argument type); must raise an error
        stmt = tvm.tir.ir_pass.Inline(T.op, [1, 2, 3], T.op.body, stmt)
        assert False
    except tvm.error.TVMError:
        pass
def save_object(names):
    n = te.size_var("n")
    # `dtype` comes from the enclosing test scope in the original file (not shown here).
    Ab = tvm.tir.decl_buffer((n,), dtype)
    i = te.var("i")
    # for i in 0 to n-1:
    stmt = tvm.tir.For(
        i, 0, n - 1, 0, 0,
        tvm.tir.Store(Ab.data, tvm.tir.Load(dtype, Ab.data, i) + 1, i + 1),
    )
    mod = tvm.IRModule.from_expr(
        tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "main")
    )
    m = tvm.driver.build(mod, target="llvm")
    for name in names:
        m.save(name)
def run(dtype):
    # graph
    n = te.size_var("n")
    A = te.placeholder((n,), name="A", dtype=dtype)
    B = te.placeholder((n,), name="B", dtype=dtype)
    bias = te.var("bias", dtype=dtype)
    scale = te.var("scale", dtype=dtype)
    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
    # schedule
    s = te.create_schedule(C.op)
    # create iter var and assign them tags.
    num_thread = 16
    bx, x = s[C].split(C.op.axis[0], factor=num_thread * 4)
    tx, x = s[C].split(x, nparts=num_thread)
    _, x = s[C].split(x, factor=4)
    s[C].bind(bx, te.thread_axis("blockIdx.x"))
    s[C].bind(tx, te.thread_axis("threadIdx.x"))
    s[C].vectorize(x)

    # one line to build the function.
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        fadd = tvm.build(s, [A, B, C], device, name="myadd")

        # launch the kernel.
        n = 1024
        a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), ctx)
        b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=1)
        tcost = ftimer(a, b, c).mean
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy(), rtol=1e-6)

    check_device("opencl")
    check_device("cuda")
    if dtype == "float32":
        check_device("metal")
        check_device("vulkan")
def check_llvm():
    # A, B, C, s and nn come from the enclosing test scope.
    # Specifically allow offset to test codepath when offset is available
    Ab = tvm.tir.decl_buffer(
        A.shape, A.dtype, elem_offset=te.size_var("Aoffset"), offset_factor=8, name="A"
    )
    binds = {A: Ab}
    # BUILD and invoke the kernel.
    f = tvm.build(s, [A, B, C], "llvm", binds=binds)
    dev = tvm.cpu(0)
    # launch the kernel.
    n = nn
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())