def test_shape_func(): if not tvm.testing.device_enabled("cuda") or not tvm.cuda(0).exist: return mod = tvm.IRModule() data_shape = (relay.Any(), ) x = relay.var("x", shape=data_shape) y = relay.op.vm.shape_of(x) z = relay.nn.relu(y) p0 = relay.var("p0", shape=data_shape) fn = relay.Function([p0], z) out = relay.var("out", shape=(1, ), dtype="int64") ins = relay.Tuple([y]) outs = relay.Tuple([out]) is_inputs = [False] shape_func = relay.op.vm.shape_func(fn, ins, outs, is_inputs) mod["main"] = relay.Function([x, out], shape_func) ca = context_analysis(mod, tvm.cuda()) main = mod["main"] cpu_dev = tvm.cpu().device_type gpu_dev = tvm.cuda().device_type assert main.params[0] in ca and ca[main.params[0]][0].value == gpu_dev # The output of shape func should be on cpu. assert main.params[1] in ca and ca[main.params[1]][0].value == cpu_dev # shape func is the body and it should be on cpu assert main.body in ca and ca[main.body][0].value == cpu_dev
def run_test(tvm_intrin, np_func, dtype): if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): print("Skip because gpu does not have fp16 support") return # set of intrinsics does not support fp16 yet. skip_set = { tvm.tir.abs, tvm.tir.round, tvm.tir.tan, tvm.tir.atan, tvm.tir.tanh, tvm.tir.cosh, tvm.tir.sinh, } if dtype == "float16" and tvm_intrin in skip_set: print("Skip because '{0}' does not support fp16 yet".format(tvm_intrin.__name__)) return n = 128 A = te.placeholder((n,), dtype=dtype, name="A") B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B") s = sched(B) f = tvm.build(s, [A, B], "cuda") dev = tvm.cuda(0) a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-3, rtol=1e-3)
def check_cuda(dtype, n, l, padding, lanes): if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): print("Skip because gpu does not have fp16 support") return dev = tvm.cuda(0) A = tvm.te.placeholder((n, l), name="A", dtype=dtype) B = tvm.te.compute( (n // lanes, l + 2 * padding, lanes), lambda i, j, k: tvm.te.if_then_else( tvm.te.any(j < padding, j >= l + padding), tvm.runtime.convert(0).astype(dtype), A[i * lanes + k, j - padding], ), name="B", ) s = te.create_schedule(B.op) block, thread, vectorize = s[B].op.axis s[B].bind(block, bx) s[B].bind(thread, tx) s[B].vectorize(vectorize) fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad") np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype) a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a) b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev) fun(a, b) np_a_reshape = np_a.reshape(n // lanes, lanes, l).transpose(0, 2, 1) ref = np.pad( np_a_reshape, ((0, 0), (padding, padding), (0, 0)), mode="constant", constant_values=0 ) tvm.testing.assert_allclose(b.numpy(), ref)
def test_multi_targets(): # Build an IRModule. n = 10 x = relay.var("x", shape=(n,)) y = relay.var("y", shape=(n,)) z = relay.var("z", shape=(n,)) f = relay.Function([x, y, z], x + relay.op.annotation.on_device(y + z, tvm.cpu())) mod = IRModule.from_expr(f) # Compile to VMExecutable. with tvm.transform.PassContext( opt_level=3, config={"relay.fallback_device_type": tvm.cuda().device_type} ): exe = relay.vm.compile( mod, target={"cpu": tvm.target.Target("llvm"), "cuda": tvm.target.Target("cuda")} ) # Run vm = runtime.vm.VirtualMachine(exe, [tvm.cuda(), tvm.cpu()]) x_data = np.random.rand( n, ).astype("float32") y_data = np.random.rand( n, ).astype("float32") z_data = np.random.rand( n, ).astype("float32") actual_result = vm.invoke("main", x_data, y_data, z_data) # Test expected_result = x_data + y_data + z_data tvm.testing.assert_allclose(actual_result.numpy(), expected_result)
def test_alloc_storage(): if not tvm.testing.device_enabled("cuda") or not tvm.cuda(0).exist: return mod = tvm.IRModule() mod.import_from_std("core.rly") size = relay.Var("size", relay.scalar_type("int64")) alignment = relay.Var("alignment", relay.scalar_type("int64")) # allocate a chunk on of memory on gpu. sto = relay.op.memory.alloc_storage(size, alignment, tvm.cuda()) mod["main"] = relay.Function([size, alignment], sto) ca = context_analysis(mod, tvm.cuda()) main = mod["main"] body = main.body cpu_dev = tvm.cpu().device_type gpu_dev = tvm.cuda().device_type # Inputs are unified with alloc storage inputs which are on cpu assert main.params[0] in ca and ca[main.params[0]][0].value == cpu_dev assert main.params[1] in ca and ca[main.params[1]][0].value == cpu_dev assert isinstance(body, relay.Call) and len(body.args) == 2 # size of alloc_storage is on cpu assert body.args[0] in ca and ca[body.args[0]][0].value == cpu_dev # alignment of alloc_storage is on cpu assert body.args[1] in ca and ca[body.args[1]][0].value == cpu_dev # alloc_storage is on gpu as specified assert body in ca and ca[body][0].value == gpu_dev
def test_alloc_tensor(): if not tvm.testing.device_enabled("cuda") or not tvm.cuda(0).exist: return mod = tvm.IRModule() mod.import_from_std("core.rly") sto_type = relay.TypeCall(mod.get_global_type_var("Storage"), []) sto = relay.Var("x", sto_type) sh = relay.const(np.array([3, 2]), dtype="int64") at = relay.op.memory.alloc_tensor(sto, relay.const(0, dtype="int64"), sh) mod["main"] = relay.Function([sto], at) ca = context_analysis(mod, tvm.cuda()) main = mod["main"] body = main.body cpu_dev = tvm.cpu().device_type gpu_dev = tvm.cuda().device_type # Input of the function falls back to the default device gpu assert main.params[0] in ca and ca[main.params[0]][0].value == gpu_dev assert isinstance(body, relay.Call) and len(body.args) == 3 # storage of alloc_tensor falls back to the default device gpu assert body.args[0] in ca and ca[body.args[0]][0].value == gpu_dev # shape of alloc_tensor is on cpu assert body.args[1] in ca and ca[body.args[1]][0].value == cpu_dev # alloc_tensor keeps the same device context as storage which is is on gpu assert body in ca and ca[body][0].value == gpu_dev
def check(t0, t1, factor): if (t0 == "float16" or t1 == "float16") and not have_fp16(tvm.cuda(0).compute_version): print("Skip because gpu does not have fp16 support") return # compute n = 128 A = te.placeholder((n,), dtype=t0, name="A") B = te.placeholder((n,), dtype=t1, name="B") C = te.compute((n,), lambda i: A[i] + topi.cast(B[i], A.dtype), name="C") # schedule s = tvm.te.create_schedule(C.op) ob, ib = s[C].split(s[C].op.axis[0], factor=factor) s[C].vectorize(ib) s[C].bind(ob, tx) func = tvm.build(s, [A, B, C], "cuda") # correctness dev = tvm.cuda(0) low, high = (0, 20) if t0.startswith("u") or t1.startswith("u") else (-10, 10) a_np = np.random.randint(low, high, size=n).astype(A.dtype) b_np = np.random.randint(low, high, size=n).astype(B.dtype) c_np = (a_np + b_np).astype(A.dtype) a_nd = tvm.nd.array(a_np, dev) b_nd = tvm.nd.array(b_np, dev) c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), dev) func(a_nd, b_nd, c_nd) tvm.testing.assert_allclose(c_nd.numpy(), c_np, rtol=1e-3)
def check_cuda(dtype, n, lanes): if dtype == "int8" and not have_int8(tvm.cuda(0).compute_version): print("skip because gpu does not support int8") return A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes)) B = te.placeholder((n,), name="B", dtype="%sx%d" % (dtype, lanes)) C = te.placeholder((n,), name="C", dtype="int32") D = te.compute( (n,), lambda i: tvm.tir.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name="D" ) s = te.create_schedule(D.op) xo, xi = s[D].split(D.op.axis[0], factor=num_thread) s[D].bind(xo, bx) s[D].bind(xi, tx) fun = tvm.build(s, [A, B, C, D], "cuda") np_a = np.random.randint(low=-128, high=127, size=(n, lanes)) np_b = np.random.randint(low=-128, high=127, size=(n, lanes)) np_c = np.random.randint(low=0, high=127, size=(n,)) np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)] dev = tvm.cuda(0) a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a) b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np_b) c = tvm.nd.empty((n,), C.dtype, dev).copyfrom(np_c) d = tvm.nd.empty((n,), D.dtype, dev) fun(a, b, c, d) tvm.testing.assert_allclose(d.numpy(), np_d)
def check_cuda(dtype): if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): print("Skip because gpu does not have fp16 support") return n, m = 16, 16 A = te.placeholder( ( n, m, ), name="A", dtype=dtype, ) B = te.compute( ( n, m, ), lambda j, i: A[j, (i + 1) % m], name="B", ) cuda_target = tvm.target.Target("cuda") assert cuda_target.thread_warp_size == 2 * m with cuda_target: s = te.create_schedule(B.op) tx = te.thread_axis("threadIdx.x") ty = te.thread_axis("threadIdx.y") bx = te.thread_axis("blockIdx.x") AA = s.cache_read(A, "warp", [B]) y, x = B.op.axis z, y = s[B].split(y, nparts=2) s[B].bind(x, tx) s[B].bind(y, ty) s[B].bind(z, bx) s[AA].compute_at(s[B], y) _, x = AA.op.axis s[AA].bind(x, tx) dev = tvm.cuda(0) func = tvm.build(s, [A, B], "cuda") A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype) B_np = np.array( [list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype) A_nd = tvm.nd.array(A_np, dev) B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev) func(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"): global TASK TASK = ( "bcast_binary_" + typ + "_lhs" + "_".join([str(ele) for ele in lhs_shape]) + "rhs" + "_".join([str(ele) for ele in rhs_shape]) ) A = te.placeholder(shape=lhs_shape, name="A") B = te.placeholder(shape=rhs_shape, name="B") if typ == "add": C = topi.broadcast_add(A, B) elif typ == "sub": C = topi.broadcast_sub(A, B) elif typ == "div": C = topi.broadcast_div(A, B) elif typ == "mul": C = topi.broadcast_mul(A, B) elif typ == "maximum": C = topi.broadcast_maximum(A, B) elif typ == "minimum": C = topi.broadcast_minimum(A, B) else: raise NotImplementedError s = topi.cuda.schedule_broadcast(C) fcuda = tvm.build(s, [A, B, C], "cuda", name="broadcast_binary" + "_" + typ) lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype) if typ == "add": out_npy = lhs_npy + rhs_npy elif typ == "sub": out_npy = lhs_npy - rhs_npy elif typ == "div": rhs_npy = np.abs(rhs_npy) + 0.001 out_npy = lhs_npy / rhs_npy elif typ == "mul": out_npy = lhs_npy * rhs_npy elif typ == "maximum": out_npy = np.maximum(lhs_npy, rhs_npy) elif typ == "minimum": out_npy = np.minimum(lhs_npy, rhs_npy) lhs_nd = tvm.nd.array(lhs_npy, tvm.cuda()) rhs_nd = tvm.nd.array(rhs_npy, tvm.cuda()) out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.cuda()) for _ in range(2): fcuda(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.numpy(), out_npy)
def test_inject_async_copy_shared_dyn(): f = ptx_global_to_shared_dyn_copy_fp16x8 mod = tvm.IRModule.from_expr(f) mod = tvm.tir.transform.FlattenBuffer()(mod) mod = tvm.tir.transform.VectorizeLoop()(mod) mod = tvm.tir.transform.MergeDynamicSharedMemoryAllocations()(mod) mod = tvm.tir.transform.InjectPTXAsyncCopy()(mod) assert count_cp_async(mod["main"].body) == 2 if not tvm.testing.is_ampere_or_newer(): return with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}): mod = tvm.build(tvm.IRModule.from_expr(f), target="cuda") A_np = np.random.rand(32, 128).astype("float16") B_np = np.random.rand(32, 128).astype("float16") C_np = np.zeros((32, 128)).astype("float16") dev = tvm.cuda(0) A_nd = tvm.nd.array(A_np, device=dev) B_nd = tvm.nd.array(B_np, device=dev) C_nd = tvm.nd.array(C_np, device=dev) mod(A_nd, B_nd, C_nd) tvm.testing.assert_allclose(C_nd.numpy(), A_np + B_np)
def build_export_vm(device): """relay build & export graph""" x = relay.var("x", shape=(10, 5)) y = relay.var("y", shape=(1, 5)) z = relay.add(x, y) z = relay.exp(z) func = relay.Function([x, y], z) x_data = np.random.rand(10, 5).astype("float32") y_data = np.random.rand(1, 5).astype("float32") pt_device = torch.device(device) if pt_device.type == "cuda": target = "cuda" ctx = tvm.cuda(pt_device.index) else: target = "llvm" ctx = tvm.cpu(0) exe = relay.vm.compile(tvm.IRModule.from_expr(func), target=target, params={}) code, lib = exe.save() export_dir = tempfile.mkdtemp("tvm_export") # export to tempdir lib.export_library(os.path.join(export_dir, TVM_ASSETS[0])) with open(os.path.join(export_dir, TVM_ASSETS[1]), "wb") as fout: fout.write(code) vm = tvm.runtime.vm.VirtualMachine(exe, ctx) res = vm.run(x_data, y_data) ref_res = np.exp(y_data + x_data) tvm.testing.assert_allclose(res.numpy(), ref_res, atol=1e-5, rtol=1e-5) return export_dir
def test_gemm_mma_m8n8k4_row_row_fp16fp16fp32(): sch = tvm.tir.Schedule(gemm_mma_m8n8k4_row_row_fp16fp16fp32) arch = tvm.contrib.nvcc.get_target_compute_version() major, minor = tvm.contrib.nvcc.parse_compute_version(arch) if major < 7: # Require at least SM70 return cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-1, 1, [16, 4]).astype("float16") B_np = np.random.uniform(-1, 1, [4, 16]).astype("float16") C_np = np.zeros([16, 16]).astype("float32") ctx = tvm.cuda() A_tvm = tvm.nd.array(A_np, ctx) B_tvm = tvm.nd.array(B_np, ctx) C_tvm = tvm.nd.array(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) golden = np.matmul(A_np.astype("float32"), B_np.astype("float32")) C_numpy = C_tvm.numpy() tvm.testing.assert_allclose(golden, C_numpy, atol=1e-3, rtol=1e-3)
def test_inject_async_copy(): for dtype, vec_size in [("float16", 8), ("float16", 4), ("float32", 4), ("float32", 1)]: if vec_size == 1: f = ptx_global_to_shared_copy_fp32x1 else: f = generate_global_to_shared_vectorized_copy(dtype, vec_size) mod = tvm.IRModule.from_expr(f) mod = tvm.tir.transform.FlattenBuffer()(mod) if vec_size > 1: mod = tvm.tir.transform.VectorizeLoop()(mod) mod = tvm.tir.transform.InjectPTXAsyncCopy()(mod) assert count_cp_async(mod["main"].body) == 1 if not tvm.testing.is_ampere_or_newer(): continue with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}): mod = tvm.build(tvm.IRModule.from_expr(f), target="cuda") A_np = np.random.rand(32, 128).astype(dtype) B_np = np.zeros((32, 128)).astype(dtype) dev = tvm.cuda(0) A_nd = tvm.nd.array(A_np, device=dev) B_nd = tvm.nd.array(B_np, device=dev) mod(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), A_np)
def test_cuda_graph_executor(): mod, params = relay.testing.synthetic.get_workload() with tvm.transform.PassContext(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") dev = tvm.cuda() try: gmod = complied_graph_lib["cuda_graph_create"](dev) except: print("Skip because cuda_graph not enabled") return set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] set_input("data", tvm.nd.array(data)) run() out = get_output(0).numpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # cuda graph executor wrapper cu_gmod = cuda_graph_executor.GraphModuleCudaGraph(gmod) cu_gmod.set_input("data", data) cu_gmod.run() out = cu_gmod.get_output(0).numpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
def test_gemm_mma_m16n8k16_row_col_s8u8s32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_s8u8s32) arch = tvm.contrib.nvcc.get_target_compute_version() major, minor = tvm.contrib.nvcc.parse_compute_version(arch) if major < 8: # Require at least SM80 return cuda_mod = tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-10, 10, [16, 16]).astype("int8") B_np = np.random.uniform(-10, 10, [8, 16]).astype("uint8") C_np = np.zeros([16, 8]).astype("int32") ctx = tvm.cuda() A_tvm = tvm.nd.array(A_np, ctx) B_tvm = tvm.nd.array(B_np, ctx) C_tvm = tvm.nd.array(C_np, ctx) cuda_mod(A_tvm, B_tvm, C_tvm) golden = np.matmul(A_np.astype("int32"), B_np.astype("int32").T) C_numpy = C_tvm.numpy() tvm.testing.assert_allclose(golden, C_numpy, atol=1e-3, rtol=1e-3)
def get_tvm_elementwise_output( graph, model_path, input1: flow.tensor, input2: flow.tensor, target="llvm", dtype="float32", ): """Generic function to execute and get tvm elementwise output""" input1_numpy = input1.numpy() input2_numpy = input2.numpy() if target == "llvm": device = tvm.cpu(0) elif target == "cuda": device = tvm.cuda(0) mod, params = relay.frontend.from_oneflow(graph, model_path) with tvm.transform.PassContext(opt_level=10): intrp = relay.build_module.create_executor("graph", mod, device, target) tvm_output = intrp.evaluate()( tvm.nd.array(input1_numpy.astype(dtype)), tvm.nd.array(input2_numpy.astype(dtype)), **params, ).numpy() return tvm_output
def test_fp16_build(): dtype = "float16" dev = tvm.cuda(0) if dtype == "float16" and not have_fp16(dev.compute_version): print("skip because gpu does not support fp16") return x = relay.var("x", dtype=dtype, shape=(4, 4)) y = relay.var("y", dtype=dtype, shape=(4, 4)) z = x + y func = relay.Function([x, y], z) X = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev) Y = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev) params = { "x": X, "y": Y, } # build g_json, mmod, params = relay.build(func, "cuda", params=params) # test rt = tvm.contrib.graph_executor.create(g_json, mmod, dev) rt.load_params(runtime.save_param_dict(params)) rt.run() out = rt.get_output(0) np.testing.assert_allclose(out.numpy(), X.numpy() + Y.numpy(), atol=1e-5, rtol=1e-5)
def test_gpu(): mod, params = relay.testing.synthetic.get_workload() with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") dev = tvm.cuda() # raw api gmod = complied_graph_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] set_input("data", tvm.nd.array(data)) run() out = get_output(0).numpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph executor wrapper gmod = graph_executor.GraphModule(complied_graph_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).numpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
def eval_acc(model, dataset, batch_fn, target=tvm.target.cuda(), device=tvm.cuda(), log_interval=100): with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(model, target) # create runtime module m = tvm.contrib.graph_executor.create(graph, lib, device) m.set_input(**params) # setup evaluaiton metric dataset.reset() batch_size = dataset.batch_size acc_top1 = mx.metric.Accuracy() acc_top5 = mx.metric.TopKAccuracy(5) acc_top1.reset() acc_top5.reset() # Execute for i, batch in enumerate(dataset): data, label = batch_fn(batch, [mx.cpu(0)]) m.run(data=data[0].asnumpy()) out_arr = m.get_output(0) acc_top1.update(label, [mx.nd.array(out_arr.asnumpy())]) acc_top5.update(label, [mx.nd.array(out_arr.asnumpy())]) if not (i + 1) % log_interval: _, top1 = acc_top1.get() _, top5 = acc_top5.get() nsamples = (i + 1) * batch_size logging.info("[%d samples] validation: acc-top1=%f acc-top5=%f", nsamples, top1, top5) logging.info("[final] validation: acc-top1=%f acc-top5=%f", top1, top5) return top1
def test_bias_add(): for dtype in ["float16", "float32"]: xshape = (10, 2, 3, 4) bshape = (2,) rtol = 1e-2 if dtype == "float16" else 1e-5 x = relay.var("x", shape=xshape, dtype=dtype) bias = relay.var("bias", dtype=dtype) z = relay.nn.bias_add(x, bias) zz = run_infer_type(z) assert "axis=" not in zz.astext() assert zz.args[1].checked_type == relay.TensorType(bshape, dtype) func = relay.Function([x, bias], z) x_data = np.random.uniform(size=xshape).astype(dtype) y_data = np.random.uniform(size=bshape).astype(dtype) ref_res = x_data + y_data.reshape((2, 1, 1)) for target, dev in tvm.testing.enabled_targets(): if ( dtype == "float16" and target == "cuda" and not have_fp16(tvm.cuda(0).compute_version) ): continue op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)( x_data, y_data ) np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol)
def test_vm_shape_of(): if not tvm.testing.device_enabled("cuda") or not tvm.cuda(0).exist: return mod = tvm.IRModule() data_shape = (relay.Any(), ) x = relay.var("x", shape=data_shape) y = relay.op.vm.shape_of(x) mod["main"] = relay.Function([x], y) ca = context_analysis(mod, tvm.cuda()) main = mod["main"] cpu_dev = tvm.cpu().device_type gpu_dev = tvm.cuda().device_type assert main.params[0] in ca and ca[main.params[0]][0].value == gpu_dev assert main.body in ca and ca[main.body][0].value == cpu_dev
def test_cuda_lib(): dev = tvm.cuda(0) for device in ["llvm", "cuda"]: if not tvm.testing.device_enabled(device): print("skip because %s is not enabled..." % device) return nn = 12 n = tvm.runtime.convert(nn) A = te.placeholder((n, ), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") s = te.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], factor=4) s[B].bind(bx, te.thread_axis("blockIdx.x")) s[B].bind(tx, te.thread_axis("threadIdx.x")) from tvm.contrib import utils temp = utils.tempdir() fn_add = tvm.build(s, [A, B], target="cuda --host=llvm", name="add") path_lib = temp.relpath("deploy_lib.so") fn_add.export_library(path_lib) m = tvm.runtime.load_module(path_lib) a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev) b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev) m["add"](a, b) np.testing.assert_equal(b.numpy(), a.numpy() + 1)
def check_single_op(opfunc, ref, dtype): shape = (10, 4) dtype = dtype tp = relay.TensorType(shape) x = relay.var("x", tp, dtype=dtype) y = opfunc(x) # test printer assert ("{}(%x)".format(y.op.name)) in y.astext() # test type inference yy = run_infer_type(y) assert yy.checked_type == tp if ref is not None: data = np.random.rand(*shape).astype(dtype) ref_res = ref(data) func = relay.Function([x], y) for target, dev in tvm.testing.enabled_targets(): # use graph by execuor default for testing, as we need # create function explicitly to avoid constant-folding. if (dtype == "float16" and target == "cuda" and not have_fp16(tvm.cuda(0).compute_version)): continue intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(data) np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5): A = te.placeholder(Ashape, name="A", dtype=in_dtype) B = te.placeholder(Bshape, name="B", dtype=in_dtype) C = cublas.batch_matmul(A, B, dtype=out_dtype) s = te.create_schedule(C.op) dev = tvm.cuda(0) f = tvm.build(s, [A, B, C], "cuda") if "int" in in_dtype: a = tvm.nd.array( np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev) b = tvm.nd.array( np.random.uniform(1, 10, size=Bshape).astype(in_dtype), dev) else: a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev) b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev) c = tvm.nd.array(np.zeros(Cshape, dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.numpy(), np.matmul(a.numpy().astype(C.dtype), b.numpy().astype(C.dtype)).astype(C.dtype), rtol=rtol, )
def skip_runtime_test(): if not tvm.runtime.enabled("cuda") or not tvm.cuda(0).exist: print("Skip because CUDA is not enabled.") return True if not tensorrt.is_tensorrt_runtime_enabled(): print("Skip because TensorRT runtime is not available.") return True return False
def test_concatenate(): for dtype in ["float16", "float32"]: n, t, d = te.size_var("n"), te.size_var("t"), 100 x = relay.var("x", shape=(n, t, d)) y = relay.var("y", shape=(n, t, d)) z = relay.concatenate((x, y), axis=-1) assert "axis=" in z.astext() zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((n, t, 200)) x = relay.exp(x) z = relay.concatenate((x, y), axis=2) zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((n, t, 200)) z = relay.concatenate((x, y), axis=1) zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((n, t + t, 100)) # check shape mismatches (the following case is expected to raise tvm._ffi.base.TVMError. try: x = relay.var("p1", shape=(2, 5)) y = relay.var("p2", shape=(2, 3)) c = relay.concatenate([x, y], axis=0) func = relay.Function([x, y], c) zz = run_infer_type(func) except tvm._ffi.base.TVMError: pass else: assert False x = relay.var("x", shape=(10, 5), dtype=dtype) y = relay.var("y", shape=(10, 5), dtype=dtype) t = relay.var("z", shape=(), dtype=dtype) z = relay.concatenate((x, y), axis=1) z = relay.add(z, t) # Check result. func = relay.Function([x, y, t], z) x_data = np.random.rand(10, 5).astype(dtype) y_data = np.random.rand(10, 5).astype(dtype) t_data = np.random.uniform(size=()).astype(dtype) ref_res = np.concatenate((x_data, y_data), axis=1) + t_data for target, dev in tvm.testing.enabled_targets(): if ( dtype == "float16" and target == "cuda" and not have_fp16(tvm.cuda(0).compute_version) ): continue op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( x_data, y_data, t_data ) tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=0.01) op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( x_data, y_data, t_data ) tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=0.01)
def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1): in_channel = 4 out_channel = 16 filter_d = 3 filter_h = 3 filter_w = 3 pad_d = 1 pad_h = 1 pad_w = 1 stride_d = 1 stride_h = 1 stride_w = 1 dilation_d = 1 dilation_h = 1 dilation_w = 1 batch = 3 depth = 32 height = 32 width = 32 # schedule xshape = [batch, in_channel, depth, height, width] wshape = [out_channel, in_channel // groups, filter_d, filter_h, filter_w] X = te.placeholder(xshape, name="X", dtype=data_dtype) W = te.placeholder(wshape, name="W", dtype=data_dtype) Y = cudnn.conv_forward( X, W, [pad_d, pad_h, pad_w], [stride_d, stride_h, stride_w], [dilation_d, dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=-1, conv_dtype=conv_dtype, groups=groups, ) yshape = [x.value for x in Y.shape] s = te.create_schedule(Y.op) # validation dev = tvm.cuda(0) f = tvm.build(s, [X, W, Y], target="cuda --host=llvm", name="conv3d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) x = tvm.nd.array(x_np, dev) w = tvm.nd.array(w_np, dev) y = tvm.nd.array(y_np, dev) if tensor_format == 0: c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups) else: raise AssertionError( "For now, conv3d tensor format only support: 0(NCHW)") f(x, w, y) tvm.testing.assert_allclose(y.numpy(), c_np, atol=3e-5, rtol=1e-4)
def test_cuda_tensor_core(model_name, input_shape): """Integration tests of auto tensorization with CUDA tensor core""" target = tvm.target.Target("nvidia/geforce-rtx-3070") dev = tvm.cuda() if model_name.startswith("bert"): data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev) # embedding size else: data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev) mod, params, (input_name, _, _) = relay_workload.get_network(model_name, input_shape) seq = tvm.transform.Sequential( [ relay.transform.ToMixedPrecision(), ] ) with tvm.transform.PassContext(opt_level=3): mod = seq(mod) def convert_layout(mod): seq = tvm.transform.Sequential( [relay.transform.ConvertLayout({"nn.conv2d": ["NHWC", "OHWI"]})] ) with tvm.transform.PassContext(opt_level=3): mod = seq(mod) return mod with tempfile.TemporaryDirectory() as work_dir: with ms.Profiler() as profiler: rt_mod1: tvm.runtime.Module = ms.tune_relay( mod=convert_layout(mod), params=params, target=target, config=ms.TuneConfig( num_trials_per_iter=32, max_trials_per_task=200, max_trials_global=3000, ), sch_rules=ms.default_config._DefaultCUDATensorCore.schedule_rules, postprocs=ms.default_config._DefaultCUDATensorCore.postprocs, work_dir=work_dir, ) print(profiler.table()) # Compile without MetaSchedule for correctness check with tvm.transform.PassContext(opt_level=0): rt_mod2 = relay.build(mod, target=target, params=params) def get_output(data, lib): module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) module.set_input(input_name, data) module.run() return module.get_output(0).numpy() # Check correctness actual_output = get_output(data, rt_mod1) expected_output = get_output(data, rt_mod2) assert np.allclose(actual_output, expected_output, rtol=1e-2, atol=2e-2)
def verify_conv2d_backward_filter(data_dtype, conv_dtype, tensor_format=0, tol=1e-5): batch = 3 in_channel = 4 out_channel = 16 filter_h, filter_w = 3, 3 pad_h, pad_w = 1, 1 stride_h, stride_w = 1, 1 height, width = 32, 32 if tensor_format == 0: x_shape = [batch, in_channel, height, width] dy_shape = [batch, out_channel, height, width] else: x_shape = [batch, height, width, in_channel] dy_shape = [batch, height, width, out_channel] x_np = np.random.uniform(-1, 1, x_shape).astype(data_dtype) dy_np = np.random.uniform(-1, 1, dy_shape).astype(data_dtype) dw_np = tvm.topi.testing.conv2d_backward_weight_python( dy_np, x_np, (filter_h, filter_w), (stride_h, stride_w), (pad_h, pad_w), "NCHW" if tensor_format == 0 else "NHWC", ) x = te.placeholder(x_shape, name="x", dtype=data_dtype) dy = te.placeholder(dy_shape, name="dy", dtype=data_dtype) dw = cudnn.conv_backward_filter( dy, x, (filter_h, filter_w), [pad_h, pad_w], [stride_h, stride_w], [1, 1], conv_mode=1, tensor_format=tensor_format, conv_dtype=conv_dtype, ) s = te.create_schedule(dw.op) dev = tvm.cuda(0) f = tvm.build(s, [dy, x, dw], "cuda --host=llvm", name="conv2d_backward_filter") x = tvm.nd.array(x_np, dev) dy = tvm.nd.array(dy_np, dev) dw = tvm.nd.array(dw_np, dev) f(dy, x, dw) tvm.testing.assert_allclose(dw.numpy(), dw_np, atol=tol, rtol=tol)