def test_bigendian_rpc_param():
    """Test big endian RPC when there is a PowerPC RPC server available."""
    host = os.environ.get("TVM_POWERPC_TEST_HOST", None)
    port = os.environ.get("TVM_POWERPC_TEST_PORT", 9090)
    if host is None:
        return

    def verify_graph_runtime(remote, target, shape, dtype):
        x = relay.var('x')
        y = relay.const(1)
        z = relay.add(x, y)
        func = relay.Function([x], z)

        x_in = np.ones(shape).astype(dtype)
        params = {'x': x_in}
        graph, lib, params = relay.build(func, target=target, params=params)

        temp = util.tempdir()
        path_dso = temp.relpath("dev_lib.o")
        lib.save(path_dso)
        remote.upload(path_dso)
        lib = remote.load_module("dev_lib.o")
        ctx = remote.cpu(0)
        mod = graph_runtime.create(graph, lib, ctx)
        mod.load_params(relay.save_param_dict(params))
        mod.run()
        out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx))
        tvm.testing.assert_allclose(x_in + 1, out.asnumpy())

    print("Test RPC connection to PowerPC...")
    remote = rpc.connect(host, port)
    target = "llvm -mtriple=powerpc-linux-gnu"
    for dtype in ["float32", "float64", "int32", "int8"]:
        verify_graph_runtime(remote, target, (10,), dtype)
def test_rpc_array():
    if not tvm.module.enabled("rpc"):
        return
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)
    remote = rpc.connect(proxy_host, proxy_port, key="js")
    target = "llvm -target=asmjs-unknown-emscripten -system-lib"

    def check_remote():
        if not tvm.module.enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], target, name="myadd")
        path_obj = temp.relpath("dev_lib.bc")
        path_dso = temp.relpath("dev_lib.js")
        f.save(path_obj)
        emscripten.create_js(path_dso, path_obj, side_module=True)
        # Upload with a .dso suffix so it can be loaded remotely
        remote.upload(path_dso, "dev_lib.dso")
        data = remote.download("dev_lib.dso")
        f1 = remote.load_module("dev_lib.dso")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote()
def test_bigendian_rpc():
    """Test big endian RPC when there is a PowerPC RPC server available."""
    host = os.environ.get("TVM_POWERPC_TEST_HOST", None)
    port = os.environ.get("TVM_POWERPC_TEST_PORT", 9090)
    if host is None:
        return

    def verify_rpc(remote, target, shape, dtype):
        A = tvm.placeholder(shape, dtype=dtype)
        B = tvm.compute(A.shape, lambda i: A[i] + tvm.const(1, A.dtype))
        s = tvm.create_schedule(B.op)
        f = tvm.build(s, [A, B], target, name="myadd")

        ctx = remote.cpu(0)
        a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), ctx=ctx)
        b = tvm.nd.array(np.zeros(shape).astype(A.dtype), ctx=ctx)
        temp = util.tempdir()
        path_dso = temp.relpath("dev_lib.o")
        f.save(path_dso)
        remote.upload(path_dso)
        f = remote.load_module("dev_lib.o")
        f(a, b)
        tvm.testing.assert_allclose(a.asnumpy() + 1, b.asnumpy())

    print("Test RPC connection to PowerPC...")
    remote = rpc.connect(host, port)
    target = "llvm -mtriple=powerpc-linux-gnu"
    for dtype in ["float32", "float64", "int32", "int8"]:
        verify_rpc(remote, target, (10,), dtype)
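# How the remote end of the PowerPC tests in this collection is typically
# brought up (a hedged sketch: tvm.exec.rpc_server is the standard TVM RPC
# server entry point, the environment variables are the ones the tests read,
# and the board address is a placeholder):
#
#   # on the PowerPC board
#   python -m tvm.exec.rpc_server --host 0.0.0.0 --port 9090
#
#   # on the test host, before running the tests
#   export TVM_POWERPC_TEST_HOST=<board-ip>
#   export TVM_POWERPC_TEST_PORT=9090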
def test_rpc_simple():
    if not tvm.module.enabled("rpc"):
        return

    @tvm.register_func("rpc.test.addone")
    def addone(x):
        return x + 1

    @tvm.register_func("rpc.test.strcat")
    def strcat(name, x):
        return "%s:%d" % (name, x)

    @tvm.register_func("rpc.test.except")
    def remotethrow(name):
        raise ValueError("%s" % name)

    server = rpc.Server("localhost", key="x1")
    client = rpc.connect(server.host, server.port, key="x1")
    f1 = client.get_function("rpc.test.addone")
    assert f1(10) == 11
    f3 = client.get_function("rpc.test.except")
    try:
        f3("abc")
        assert False
    except tvm.TVMError as e:
        assert "abc" in str(e)
    f2 = client.get_function("rpc.test.strcat")
    assert f2("abc", 11) == "abc:11"
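# A minimal sketch of the same in-process server/client pattern, reduced to
# one registered function; the name "rpc.test.echo" is hypothetical and used
# only for illustration here.
import tvm
from tvm import rpc

@tvm.register_func("rpc.test.echo")
def echo(x):
    # The server started below runs in-process, so it can see this
    # registration, just as the test above relies on.
    return x

def demo_echo():
    server = rpc.Server("localhost", key="x1")
    client = rpc.connect(server.host, server.port, key="x1")
    f = client.get_function("rpc.test.echo")
    assert f(42) == 42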
def build_arm():
    target = "llvm -target=armv7-none-linux-gnueabihf"
    if not tvm.module.enabled(target):
        print("Skip because %s is not enabled.." % target)
        return
    temp = util.tempdir()
    f = tvm.build(s, [A, B, C], target)
    path = temp.relpath("myadd.o")
    f.save(path)
    verify_elf(path, 0x28)
    asm_path = temp.relpath("myadd.asm")
    f.save(asm_path)
    # Do an RPC verification, launching the kernel on an Arm board if available.
    host = os.environ.get('TVM_RPC_ARM_HOST', None)
    remote = None
    if host:
        port = int(os.environ['TVM_RPC_ARM_PORT'])
        try:
            remote = rpc.connect(host, port)
        except tvm.TVMError:
            pass
    if remote:
        remote.upload(path)
        farm = remote.load_module("myadd.o")
        ctx = remote.cpu(0)
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        farm(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
        print("Verification finished on remote.")
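# build_arm() above closes over definitions from its enclosing test file; a
# hedged reconstruction of that outer scope, inferred from the names it uses
# (nn, A, B, C, s and the old-style tvm API), not copied from the original:
import tvm

nn = 1024
n = tvm.convert(nn)
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
C = tvm.compute(A.shape, lambda i: A[i] + B[i], name='C')
s = tvm.create_schedule(C.op)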
def run_opencl():
    # NOTE: This is the setting for my rk3399 board. You will need to
    # modify these values according to your environment.
    target_host = "llvm -target=aarch64-linux-gnu"
    opencl_device_host = '10.77.1.145'
    opencl_device_port = 9090

    # create a schedule for the above "add one" compute declaration
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=32)
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    func = tvm.build(s, [A, B], "opencl", target_host=target_host)

    remote = rpc.connect(opencl_device_host, opencl_device_port)

    # export and upload
    path = temp.relpath('lib_cl.tar')
    func.export_library(path)
    remote.upload(path)
    func = remote.load_module('lib_cl.tar')

    # run
    ctx = remote.cl()
    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    func(a, b)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
    print("OpenCL test passed!")
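# run_opencl() refers to A, B and temp from the surrounding tutorial; a
# hedged reconstruction of those definitions (the "add one" computation the
# comment above mentions), assuming the same old-style tvm API:
import tvm
from tvm.contrib import util

n = tvm.convert(1024)
A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name='B')
temp = util.tempdir()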
def test_rpc_file_exchange():
    if not tvm.module.enabled("rpc"):
        return
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    blob = bytearray(np.random.randint(0, 10, size=(10)))
    remote.upload(blob, "dat.bin")
    rev = remote.download("dat.bin")
    assert rev == blob
def program_rpc_bitstream(path=None):
    """Program the FPGA on the RPC server

    Parameters
    ----------
    path : path to bitstream (optional)
    """
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)
    program_fpga(remote, path)
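# program_rpc_bitstream() reads `host` and `port` from module scope; a hedged
# sketch of how they are typically supplied, mirroring the environment
# variable convention used by the VTA snippets elsewhere in this collection:
import os

host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))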
def test_rpc_return_func():
    @tvm.register_func("rpc.test.remote_func")
    def addone(x):
        return lambda y: x + y

    server = rpc.Server("localhost", key="x1")
    client = rpc.connect(server.host, server.port, key="x1")
    f1 = client.get_function("rpc.test.remote_func")
    fadd = f1(10)
    assert fadd(12) == 22
def test_rpc_module():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    temp = util.tempdir()
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    # Build the dynamic lib.
    # If we do not want to use Metal and only use the CPU, just build with
    # `target` alone.
    f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd")
    path_dso1 = temp.relpath("dev_lib.dylib")
    f.export_library(path_dso1, xcode.create_dylib, arch=arch, sdk=sdk)
    xcode.codesign(path_dso1)

    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso2 = temp.relpath("cpu_lib.dylib")
    f.export_library(path_dso2, xcode.create_dylib, arch=arch, sdk=sdk)
    xcode.codesign(path_dso2)

    # Start an RPC test server that contains the compiled library.
    server = xcode.popen_test_rpc(proxy_host, proxy_port, key,
                                  destination=destination,
                                  libs=[path_dso1, path_dso2])

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.metal(0)
    f1 = remote.load_module("dev_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    # CPU
    ctx = remote.cpu(0)
    f2 = remote.load_module("cpu_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def try_remote_save_load():
    if not tvm.module.enabled("rpc"):
        return
    if not tvm.module.enabled("opengl"):
        return
    if not tvm.module.enabled("llvm"):
        return

    # Build the module.
    n = tvm.var("n")
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule(C.op)
    s[C].opengl()
    target_host = "llvm -target=asmjs-unknown-emscripten -system-lib"
    f = tvm.build(s, [A, B, C], "opengl", target_host=target_host, name="myadd")

    remote = rpc.connect(proxy_host, proxy_port, key="js")

    temp = util.tempdir()
    ctx = remote.opengl(0)
    path_obj = temp.relpath("myadd.bc")
    path_dso = temp.relpath("myadd.js")
    path_gl = temp.relpath("myadd.gl")
    path_json = temp.relpath("myadd.tvm_meta.json")

    f.save(path_obj)
    emscripten.create_js(path_dso, path_obj, side_module=True)
    f.imported_modules[0].save(path_gl)

    remote.upload(path_dso, "myadd.dso")
    remote.upload(path_gl)
    remote.upload(path_json)

    remote.download("myadd.dso")
    remote.download("myadd.gl")
    remote.download("myadd.tvm_meta.json")

    print('Loading myadd.dso')
    fhost = remote.load_module("myadd.dso")
    print('Loading myadd.gl')
    fdev = remote.load_module("myadd.gl")
    print('import_module')
    fhost.import_module(fdev)

    print('running...')
    a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx)
    c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx)
    fhost(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
def run(run_func):
    """Run the test function on all available environments.

    Parameters
    ----------
    run_func : function(env, remote)
    """
    env = get_env()

    if env.TARGET == "sim":
        # Talk to local RPC if necessary to debug the RPC server.
        # Compile vta on your host with make at the root.
        # Make sure TARGET is set to "sim" in the config.json file.
        # Then launch the RPC server on the host machine
        # with ./apps/pynq_rpc/start_rpc_server.sh
        # Set your VTA_LOCAL_SIM_RPC environment variable to
        # the port it's listening to, e.g. 9090
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            remote = rpc.connect("localhost", local_rpc)
            run_func(env, remote)
        else:
            # Make sure the simulation library exists.
            # If this fails, build vta on the host (make)
            # with TARGET="sim" in the config.json file.
            assert simulator.enabled()
            run_func(env, rpc.LocalSession())
    elif env.TARGET == "pynq":
        # Run on PYNQ if the env variables exist
        host = os.environ.get("VTA_PYNQ_RPC_HOST", None)
        port = os.environ.get("VTA_PYNQ_RPC_PORT", None)
        if host and port:
            remote = rpc.connect(host, int(port))
            run_func(env, remote)
        else:
            raise RuntimeError(
                "Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables")
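# An illustrative callback for run() above; a minimal sketch that assumes
# only the (env, remote) signature described in the docstring. `my_check`
# is a hypothetical name used for illustration.
def my_check(env, remote):
    # A real test would build a kernel for the VTA target, upload it
    # through `remote`, and execute it on a remote context.
    print("Running on target:", env.TARGET)

# run(my_check)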
def test_rpc_array():
    if not tvm.module.enabled("rpc"):
        return
    x = np.random.randint(0, 10, size=(3, 4))

    @tvm.register_func("rpc.test.remote_array_func")
    def remote_array_func(y):
        np.testing.assert_equal(y.asnumpy(), x)

    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    r_cpu = tvm.nd.array(x, remote.cpu(0))
    assert str(r_cpu.context).startswith("remote")
    np.testing.assert_equal(r_cpu.asnumpy(), x)
    fremote = remote.get_function("rpc.test.remote_array_func")
    fremote(r_cpu)
def check_remote():
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    temp = util.tempdir()
    ctx = remote.cpu(0)
    path_dso = temp.relpath("dev_lib.so")
    mlib.export_library(path_dso)
    remote.upload(path_dso)
    mlib = remote.load_module("dev_lib.so")
    mod = graph_runtime.create(graph, mlib, remote.cpu(0))
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.run(x=tvm.nd.array(a, ctx))
    out = tvm.nd.empty((n,), ctx=ctx)
    out = mod.get_output(0, out)
    np.testing.assert_equal(out.asnumpy(), a + 1)
def check():
    if not tvm.module.enabled("rpc"):
        return

    @tvm.register_func("rpc.test2.addone")
    def addone(x):
        return x + 1

    @tvm.register_func("rpc.test2.strcat")
    def strcat(name, x):
        return "%s:%d" % (name, x)

    server = multiprocessing.Process(
        target=proxy.websocket_proxy_server,
        args=("ws://localhost:%d/ws" % web_port, "x1"))
    # Need to make sure that the connection starts after the proxy comes up
    time.sleep(0.1)
    server.daemon = True
    server.start()
    # `prox` is the proxy instance created in the enclosing scope.
    client = rpc.connect(prox.host, prox.port, key="x1")
    f1 = client.get_function("rpc.test2.addone")
    assert f1(10) == 11
    f2 = client.get_function("rpc.test2.strcat")
    assert f2("abc", 11) == "abc:11"
def test_rpc_executor():
    host = "localhost"
    port = 9021
    server = rpc.Server(host, port, use_popen=True)
    time.sleep(1)
    x = sym.Variable("x")
    y = sym.Variable("y")
    z = sym.exp(y + x)
    shape = (10, 128)
    dtype = tvm.float32
    shape_dict = {"x": shape, "y": shape}
    tmp = util.tempdir()
    lib_name = tmp.relpath("net.o")

    graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
    # save module
    lib.save(lib_name)
    remote = rpc.connect(host, port)
    remote.upload(lib_name)
    ctx = remote.cpu(0)
    # load remote
    rlib = remote.load_module("net.o")

    # Create the remote module
    m = graph_runtime.create(graph, rlib, remote.cpu(0))
    # get member functions
    set_input, run, get_output = m["set_input"], m["run"], m["get_output"]
    na = tvm.nd.array(np.ones(shape).astype(dtype), ctx)
    nb = tvm.nd.array(np.ones(shape).astype(dtype), ctx)
    # set inputs
    set_input("x", na)
    set_input("y", nb)
    # execute
    run()
    # get outputs
    out = tvm.nd.empty(shape, dtype, ctx)
    get_output(0, out)
    tvm.testing.assert_allclose(
        out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
    server.terminate()
def test_rpc_return_ndarray():
    # Use a closure to check ref counter correctness
    nd = tvm.nd.array(np.zeros(10).astype("float32"))

    @tvm.register_func("rpc.test.remote_return_nd")
    def my_module(name):
        if name == "get_arr":
            return lambda: nd
        elif name == "ref_count":
            return lambda: tvm._api_internal._ndarray_use_count(nd)
        elif name == "get_elem":
            return lambda idx: nd.asnumpy()[idx]
        elif name == "get_arr_elem":
            return lambda arr, idx: arr.asnumpy()[idx]

    # start server
    server = rpc.Server("localhost", key="x1")
    client = rpc.connect(server.host, server.port, key="x1")
    m = client.get_function("rpc.test.remote_return_nd")
    get_arr = m("get_arr")
    ref_count = m("ref_count")
    get_elem = m("get_elem")
    get_arr_elem = m("get_arr_elem")

    # array test
    def run_arr_test():
        arr = get_arr()
        assert ref_count() == 2
        arr2 = get_arr()
        assert ref_count() == 3
        assert arr.context == client.cpu(0)
        arr.copyfrom(np.ones(10).astype(arr.dtype))
        assert arr2.asnumpy()[0] == 1.0
        assert get_elem(0) == 1.0
        assert get_arr_elem(arr2, 0) == 1.0

    assert ref_count() == 1
    run_arr_test()
    # check recycle correctness
    assert ref_count() == 1
def test_bigendian_rpc_param():
    """Test big endian RPC when there is a PowerPC RPC server available."""
    host = os.environ.get("TVM_POWERPC_TEST_HOST", None)
    port = os.environ.get("TVM_POWERPC_TEST_PORT", 9090)
    if host is None:
        return

    def verify_nnvm(remote, target, shape, dtype):
        x = nnvm.sym.Variable("x")
        y = x + 1
        graph, lib, _ = nnvm.compiler.build(
            y, target, shape={"x": shape}, dtype={"x": dtype})
        temp = util.tempdir()
        path_dso = temp.relpath("dev_lib.o")
        lib.save(path_dso)
        remote.upload(path_dso)
        lib = remote.load_module("dev_lib.o")
        a = np.random.randint(0, 256, size=shape).astype(dtype)
        a[:] = 1
        params = {"x": a}
        ctx = remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)
        # uses save_param_dict
        m.load_params(nnvm.compiler.save_param_dict(params))
        m.run()
        out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx))
        tvm.testing.assert_allclose(a + 1, out.asnumpy())

    print("Test RPC connection to PowerPC...")
    remote = rpc.connect(host, port)
    target = "llvm -mtriple=powerpc-linux-gnu"
    for dtype in ["float32", "float64", "int32", "int8"]:
        verify_nnvm(remote, target, (10,), dtype)
def run_model():
    kernelstr = " no kernel" if args.nokernel else ""
    print("A", args.activation_bits, "W", args.weight_bits, kernelstr, sep="")
    global net, params
    net = net[net.entry_func]

    # compile kernels with history best records.
    with autotvm.apply_history_best(log_file):
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                net, target=target, params=params)

    # Upload module to device
    host = os.environ['PI']
    port = int(os.environ['PORT'])
    remote = rpc.connect(host, port)
    ctx = remote.cpu()

    # export library
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    # upload the library to the remote device and load it
    remote.upload(lib_fname)
    rlib = remote.load_module('net.tar')

    # create the remote runtime module
    module = runtime.create(graph, rlib, ctx)

    # set parameters (upload params to the remote device; this may take a while)
    data = get_image()
    module.set_input(**params)

    synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
                          '4d0b62f3d01426887599d4f7ede23ee5/raw/',
                          '596b27d23537e5a1b5751d2b0481ef172f58b539/',
                          'imagenet1000_clsid_to_human.txt'])
    synset_name = 'imagenet1000_clsid_to_human.txt'
    synset_path = download_testdata(synset_url, synset_name, module='data')
    with open(synset_path) as f:
        synset = eval(f.read())

    # Confirm correctness with the TF model
    test_input = tf.constant(data.astype('float32'))
    output = model(test_input)
    top1_tf = np.argmax(output[0].numpy())
    print('TF top-1 id: {}, class name: {}'.format(top1_tf, synset[top1_tf]))

    if args.nokernel:
        data = data.transpose((0, 3, 1, 2))
    module.set_input('input_1', data)
    module.run()
    tvm_out = module.get_output(0)
    top1_tvm = np.argmax(tvm_out.asnumpy()[0])
    print('RPI top-1 id: {}, class name: {}'.format(top1_tvm, synset[top1_tvm]))

    # Check the actual vector output is within fp error
    np.testing.assert_allclose(output, tvm_out.asnumpy(), rtol=1e-3)

    # Benchmark time
    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", ctx, number=repeats,
                                          repeat=args.repeats)
    prof_res = np.array(ftimer().results) * 1000  # convert to milliseconds
    mean_ms = np.mean(prof_res)
    std_dev_ms = np.std(prof_res)
    print("Mean inference time (std dev): %.2f ms (%.2f ms)\n"
          % (mean_ms, std_dev_ms))

    with open("data/end2end.csv", "a") as f:
        ukernel = "no" if args.nokernel else "yes"
        f.write("ARM,A%dW%d,%s,%f,%f\n"
                % (args.activation_bits, args.weight_bits, ukernel,
                   mean_ms, std_dev_ms))
def test_can_call_remote_function_with_rpc_standalone(host, port):
    remote_session = rpc.connect(host, port)
    f = remote_session.get_function("runtime.GetFFIString")
    assert f("hello") == "hello"
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    assert bn <= N
    assert num_thread * num_thread * 16 <= N
    assert num_block * num_block * 2 <= N

    A = te.placeholder((N, N), name="A")
    B = te.placeholder((N, N), name="Btmp")
    k = te.reduce_axis((0, N), name="k")

    packedB = te.compute((N, N / bn, bn), lambda x, y, z: B[x, y * bn + z], name="B")

    C = te.compute(
        (N, N), lambda ii, jj: te.sum(A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k), name="C"
    )

    s = te.create_schedule(C.op)
    CC = s.cache_write(C, "local")

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")

    thread_xz = te.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = te.thread_axis((0, 2), "vthread", name="vy")

    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)

    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)

    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)

    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)

    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)

    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)

    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)

    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)

    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = utils.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    f.export_library(path_dso, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")

    evaluate(f, ctx, N, times)
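# test_gemm_gpu() calls an evaluate() helper defined elsewhere in the
# original script; a hedged sketch of such a helper follows (the GFLOPS
# arithmetic is the standard 2*N^3 operation count for an N x N matmul,
# but this exact body is an assumption, not the original code):
import numpy as np
import tvm

def evaluate(func, dev, N, times):
    a = tvm.nd.array(np.random.uniform(size=(N, N)).astype("float32"), dev)
    b = tvm.nd.array(np.random.uniform(size=(N, N)).astype("float32"), dev)
    c = tvm.nd.array(np.zeros((N, N), dtype="float32"), dev)
    # Time the compiled kernel on the remote device.
    time_f = func.time_evaluator(func.entry_name, dev, number=times)
    cost = time_f(a, b, c).mean
    gflops = 2.0 * N * N * N / cost / 1e9
    print("%g secs/op, %g GFLOPS" % (cost, gflops))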
lib.export_library(lib_fname)

######################################################################
# Deploy the Model Remotely by RPC
# --------------------------------
# With RPC, you can deploy the model remotely from your host machine
# to the remote device.

# obtain an RPC session from the remote device.
if local_demo:
    remote = rpc.LocalSession()
else:
    # The following is my environment; change this to the IP address of
    # your target device.
    host = '10.77.1.145'
    port = 9090
    remote = rpc.connect(host, port)

# upload the library to the remote device and load it
remote.upload(lib_fname)
rlib = remote.load_module('net.tar')

ctx = remote.cpu(0) if local_demo else remote.cl(0)
# upload the parameters
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

# create the remote runtime module
module = runtime.create(graph, rlib, ctx)
# set parameters
module.set_input(**rparams)
# set input data
module.set_input('data', tvm.nd.array(x.astype('float32')))
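# A hedged continuation of the tutorial step above: run the module and fetch
# the first output. `out_shape` here is an assumption standing in for the
# model's actual output shape, not a value from the original tutorial.
module.run()
out_shape = (1, 1000)
out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
print('First 10 output values:', out.asnumpy().flatten()[0:10])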
def test_can_call_remote_function_with_rpc_proxy(host, port):
    remote_session = rpc.connect(host, port, key=DEVICE_KEY)
    f = remote_session.get_function("runtime.GetFFIString")
    assert f("hello") == "hello"
# Otherwise, if target is 'sim', execute locally.
if env.TARGET not in ["sim", "tsim"]:
    # Get remote from tracker node if environment variable is set.
    # To set up the tracker, you'll need to follow the "Auto-tuning
    # a convolutional network for VTA" tutorial.
    tracker_host = os.environ.get("TVM_TRACKER_HOST", None)
    tracker_port = os.environ.get("TVM_TRACKER_PORT", None)
    # Otherwise if you have a device you want to program directly from
    # the host, make sure you've set the variables below to the IP of
    # your board.
    device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
    device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091")
    if not tracker_host or not tracker_port:
        remote = rpc.connect(device_host, int(device_port))
    else:
        remote = autotvm.measure.request_remote(
            env.TARGET, tracker_host, int(tracker_port), timeout=10000)

    # Reconfigure the JIT runtime and FPGA.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    reconfig_start = time.time()
    vta.reconfig_runtime(remote)
    vta.program_fpga(remote, bitstream=None)
    reconfig_time = time.time() - reconfig_start
    print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time))
def test_rpc_return_func():
    server = rpc.Server("localhost", key="x1")
    client = rpc.connect(server.host, server.port, key="x1")
    f1 = client.get_function("rpc.test.remote_func")
    fadd = f1(10)
    assert fadd(12) == 22
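# This variant of test_rpc_return_func assumes "rpc.test.remote_func" has
# already been registered when the server handles the call; a minimal
# registration sketch matching the call pattern above (the self-contained
# version of this test appears earlier in this collection):
import tvm

@tvm.register_func("rpc.test.remote_func")
def add_to_lhs(x):
    # Returning a closure exercises RPC's ability to return functions.
    return lambda y: x + y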
def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
    end_to_end: bool = False,
    options: dict = None,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package: TVMCPackage
        The compiled model package object that will be run.
    device: str,
        the device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will be
        assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not provided,
        inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random". Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug runtime.
    end_to_end : bool
        Whether to measure the time of memory copies as well as model
        execution. Turning this on can provide a more realistic estimate
        of how long running the model in production would take.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it.")

    with ExitStack() as stack:
        # Currently only two package formats are supported: "classic" and
        # "mlf". The latter can only be used for micro targets, i.e. with microTVM.
        if device == "micro":
            if tvmc_package.type != "mlf":
                raise TVMCException(
                    f"Model {tvmc_package.package_path} is not a MLF archive.")

            project_dir = get_project_dir(tvmc_package.project_dir)

            # This is guaranteed to work since project_dir was already checked when
            # building the dynamic parser to accommodate the project options, so no
            # checks are in place when calling GeneratedProject.
            project_ = project.GeneratedProject.from_directory(project_dir, options)
        else:
            if tvmc_package.type == "mlf":
                raise TVMCException(
                    "You're trying to run a model saved using the Model Library Format (MLF). "
                    "MLF can only be used to run on a micro device ('--device micro')."
                )

        if hostname:
            if isinstance(port, str):
                port = int(port)
            # Remote RPC
            if rpc_key:
                logger.debug("Running on remote RPC tracker with key %s.", rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("Running on remote RPC with no key.")
                session = rpc.connect(hostname, port)
        elif device == "micro":
            # Remote RPC (running on a micro target)
            logger.debug("Running on remote RPC (micro target).")
            try:
                session = tvm.micro.Session(project_.transport())
                stack.enter_context(session)
            except Exception as err:
                raise TVMCException("Could not open a session with the micro target.") from err
        else:
            # Local
            logger.debug("Running a local session.")
            session = rpc.LocalSession()

        # Micro targets don't support uploading a model. The model to be run
        # must already be flashed onto the micro target before one tries to
        # run it. Hence skip the model upload for micro targets.
        if device != "micro":
            session.upload(tvmc_package.lib_path)
            lib = session.load_module(tvmc_package.lib_name)

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("Device is %s.", device)
        if device == "cuda":
            dev = session.cuda()
        elif device == "cl":
            dev = session.cl()
        elif device == "metal":
            dev = session.metal()
        elif device == "vulkan":
            dev = session.vulkan()
        elif device == "rocm":
            dev = session.rocm()
        elif device == "micro":
            dev = session.device
            lib = session.get_system_lib()
        else:
            assert device == "cpu"
            dev = session.cpu()

        # TODO(gromero): Adjust for micro targets.
        if profile:
            logger.debug("Creating runtime with profiling enabled.")
            module = debug_executor.create(tvmc_package.graph, lib, dev, dump_root="./prof")
        else:
            if device == "micro":
                logger.debug("Creating runtime (micro) with profiling disabled.")
                module = tvm.micro.create_local_graph_executor(tvmc_package.graph, lib, dev)
            else:
                logger.debug("Creating runtime with profiling disabled.")
                module = runtime.create(tvmc_package.graph, lib, dev)

        logger.debug("Loading params into the runtime module.")
        module.load_params(tvmc_package.params)

        logger.debug("Collecting graph input shape and type:")
        shape_dict, dtype_dict = module.get_input_info()
        logger.debug("Graph input shape: %s", shape_dict)
        logger.debug("Graph input type: %s", dtype_dict)

        inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

        logger.debug("Setting inputs to the module.")
        module.set_input(**inputs_dict)

        # Run must be called explicitly if profiling
        if profile:
            logger.info("Running the module with profiling enabled.")
            report = module.profile()
            # This print is intentional
            print(report)

        if device == "micro":
            # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
            # fixed module.benchmark() can be used instead and this if/else can
            # be removed.
            module.run()
            times = []
        else:
            # Call the benchmarking function of the executor.
            # Optionally measure e2e data transfers from the
            # CPU to device memory overheads (e.g. PCIE
            # overheads if the device is a discrete GPU).
            if end_to_end:
                dev = session.cpu()
            times = module.benchmark(dev, number=number, repeat=repeat, end_to_end=end_to_end)

        logger.debug("Collecting the output tensors.")
        num_outputs = module.get_num_outputs()
        outputs = {}
        for i in range(num_outputs):
            output_name = "output_{}".format(i)
            outputs[output_name] = module.get_output(i).numpy()

        return TVMCResult(outputs, times)
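# A hedged usage sketch for run_module() above; the package path, host and
# port are placeholders, and constructing TVMCPackage directly from a .tar
# produced by tvmc.compile is an assumption about the surrounding API.
def demo_run(package_path="compiled_model.tar"):
    package = TVMCPackage(package_path=package_path)
    result = run_module(package, device="cpu",
                        hostname="192.168.2.99", port=9090)
    return result.outputs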
def run_module(
    module_file,
    device,
    hostname=None,
    port=9090,
    rpc_key=None,
    inputs=None,
    fill_mode="random",
    repeat=1,
    profile=False,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    module_file : str
        The path to the module file (a .tar file).
    device: str,
        the device (e.g. "cpu" or "gpu") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will be
        assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random". Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    profile : bool
        Whether to profile the run with the debug runtime.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        logger.debug("extracting module file %s", module_file)
        t = tarfile.open(module_file)
        t.extractall(tmp_dir)
        graph = open(os.path.join(tmp_dir, "mod.json")).read()
        params = bytearray(open(os.path.join(tmp_dir, "mod.params"), "rb").read())

        if hostname:
            # Remote RPC
            if rpc_key:
                logger.debug("running on remote RPC tracker with key %s", rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("running on remote RPC with no key")
                session = rpc.connect(hostname, port)
        else:
            # Local
            logger.debug("running a local session")
            session = rpc.LocalSession()

        session.upload(os.path.join(tmp_dir, "mod.so"))
        lib = session.load_module("mod.so")

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("device is %s", device)
        if device == "gpu":
            dev = session.gpu()
        elif device == "cl":
            dev = session.cl()
        else:
            assert device == "cpu"
            dev = session.cpu()

        if profile:
            logger.debug("creating runtime with profiling enabled")
            module = debug_executor.create(graph, lib, dev, dump_root="./prof")
        else:
            logger.debug("creating runtime with profiling disabled")
            module = runtime.create(graph, lib, dev)

        logger.debug("load params into the runtime module")
        module.load_params(params)

        shape_dict, dtype_dict = get_input_info(graph, params)
        inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

        logger.debug("setting inputs to the module")
        module.set_input(**inputs_dict)

        # Run must be called explicitly if profiling
        if profile:
            logger.debug("running the module with profiling enabled")
            module.run()

        # create the module time evaluator (returns a function)
        timer = module.module.time_evaluator("run", dev, 1, repeat=repeat)
        # call the evaluator function to invoke the module and save execution times
        prof_result = timer()
        # collect a list of execution times from the profiling results
        times = prof_result.results

        logger.debug("collecting the output tensors")
        num_outputs = module.get_num_outputs()
        outputs = {}
        for i in range(num_outputs):
            output_name = "output_{}".format(i)
            outputs[output_name] = module.get_output(i).asnumpy()

        return outputs, times
def test_rpc_remote_module():
    if not tvm.runtime.enabled("rpc"):
        return
    # graph
    n = tvm.runtime.convert(102)
    A = te.placeholder((n,), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    s = te.create_schedule(B.op)

    server0 = rpc.Server("localhost", key="x0")
    server1 = rpc.Server("localhost", key="x1")

    client = rpc.connect(
        server0.host,
        server0.port,
        key="x0",
        session_constructor_args=["rpc.Connect", server1.host, server1.port, "x1"],
    )

    def check_remote(remote):
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print("%g secs/op" % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_minrpc():
        if tvm.get_global_func("rpc.PopenSession", allow_missing=True) is None:
            return
        # export to minrpc
        temp = util.tempdir()
        f = tvm.build(s, [A, B], "llvm --system-lib", name="myadd")
        path_minrpc = temp.relpath("dev_lib.minrpc")
        f.export_library(path_minrpc, rpc.with_minrpc(cc.create_executable))

        with pytest.raises(RuntimeError):
            rpc.PopenSession("filenotexist")

        # start the minrpc session.
        remote = tvm.rpc.PopenSession(path_minrpc)
        ctx = remote.cpu(0)
        f1 = remote.system_lib()

        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator("myadd", remote.cpu(0), number=1)
        cost = time_f(a, b).mean
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

        # change to not executable
        os.chmod(path_minrpc, stat.S_IRUSR)
        with pytest.raises(RuntimeError):
            rpc.PopenSession(path_minrpc)

    def check_remote_link_cl(remote):
        """Test function to run remote code such as cl

        This is not enabled because there is a forking issue of the TVM
        runtime when the server launches after the OpenCL runtime
        initializes. We leave it as an example of how to do RPC when we
        want to do linking on the remote.
        """
        if not tvm.testing.device_enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cl(0)
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        # Option 2: export library as a tar ball then handled by remote compiler
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote(rpc.LocalSession())
    check_remote(client)
    check_minrpc()
def test_rpc_return_func():
    server = rpc.Server(key="x1")
    client = rpc.connect("127.0.0.1", server.port, key="x1")
    f1 = client.get_function("rpc.test.add_to_lhs")
    fadd = f1(10)
    assert fadd(12) == 22
def deploy_rpc():
    """Runs the demo that deploys a model remotely through RPC."""
    from tvm import rpc
    from tvm.contrib import util, emscripten

    # As usual, load the resnet18 model.
    net, params, data_shape, out_shape = load_mxnet_resnet()

    # Compile the model.
    # Note that this time we are changing the target.
    # This is because we want to translate the host library into JavaScript
    # through Emscripten.
    graph, lib, params = compile_net(
        net,
        target_host="llvm -target=asmjs-unknown-emscripten -system-lib",
        target="opengl",
        data_shape=data_shape,
        params=params)

    # Now we want to deploy our model through RPC.
    # First we need to prepare the module files locally.
    print("Saving the compiled module...")

    temp = util.tempdir()
    path_obj = temp.relpath("deploy.bc")  # host LLVM part
    path_dso = temp.relpath("deploy.js")  # host JavaScript part
    path_gl = temp.relpath("deploy.gl")  # device GLSL part
    path_json = temp.relpath("deploy.tvm_meta.json")

    lib.save(path_obj)
    emscripten.create_js(path_dso, path_obj, side_module=True)
    lib.imported_modules[0].save(path_gl)

    print("- Saved files:", temp.listdir())

    # Connect to the RPC server.
    print("Connecting to RPC server...")
    proxy_host = 'localhost'
    proxy_port = 9090
    remote = rpc.connect(proxy_host, proxy_port, key="js")
    print("- Connected to RPC server!")

    # Upload module to RPC server.
    print("Uploading module to RPC server...")
    remote.upload(path_dso, "deploy.dso")
    remote.upload(path_gl)
    remote.upload(path_json)
    print("- Upload completed!")

    # Load remote library.
    print("Loading remote library...")
    fdev = remote.load_module("deploy.gl")
    fhost = remote.load_module("deploy.dso")
    fhost.import_module(fdev)
    rlib = fhost
    print("- Remote library loaded!")

    ctx = remote.opengl(0)

    # Upload the parameters.
    print("Uploading parameters...")
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
    print("- Parameters uploaded!")

    # Create the remote runtime module.
    print("Running remote module...")
    from tvm.contrib import graph_runtime
    module = graph_runtime.create(graph, rlib, ctx)

    # Set parameters.
    module.set_input(**rparams)

    # Set input data.
    input_data = np.random.uniform(size=data_shape)
    module.set_input('data', tvm.nd.array(input_data.astype('float32')))

    # Run.
    module.run()
    print("- Remote module execution completed!")

    out = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx))

    # Print the first 10 elements of the output.
    print(out.asnumpy()[0][0:10])
def reconfig_rpc_runtime():
    """Reconfigure the RPC server runtime."""
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)
    reconfig_runtime(remote)
    A = tvm.placeholder((n,), name='a')
    B = tvm.placeholder((n,), name='b')
    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name='c')
    return A, B, C

A, B, C = vector_add(n)
s = tvm.create_schedule(C.op)
mod = tvm.build(s, [A, B, C])

mod_fname = 'vector-add.tar'
mod.export_library(mod_fname)

if target_url != '0.0.0.0':
    remote = rpc.connect(url=target_url, port=target_port)
else:
    remote = rpc.LocalSession()

# Even when running a pretrained model on a remote machine, we only need
# to upload `mod_fname`.
remote.upload(mod_fname)
remote_mod = remote.load_module(mod_fname)

ctx = remote.cpu()
a = tvm.nd.array(np.array([1, 2, 3], dtype=dtype), ctx=ctx)
b = tvm.nd.array(np.array([4, 5, 6], dtype=dtype), ctx=ctx)
c = tvm.nd.empty(b.shape, dtype, ctx=ctx)
remote_mod(a, b, c)
print(c.asnumpy())
def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package: TVMCPackage
        The compiled model package object that will be run.
    device: str,
        the device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will be
        assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not provided,
        inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random". Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug runtime.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it."
        )

    # Currently only two package formats are supported: "classic" and
    # "mlf". The latter can only be used for micro targets, i.e. with µTVM.
    if tvmc_package.type == "mlf":
        raise TVMCException(
            "You're trying to run a model saved using the Model Library Format (MLF). "
            "MLF can only be used to run micro targets (µTVM)."
        )

    if hostname:
        if isinstance(port, str):
            port = int(port)
        # Remote RPC
        if rpc_key:
            logger.debug("Running on remote RPC tracker with key %s.", rpc_key)
            session = request_remote(rpc_key, hostname, port, timeout=1000)
        else:
            logger.debug("Running on remote RPC with no key.")
            session = rpc.connect(hostname, port)
    else:
        # Local
        logger.debug("Running a local session.")
        session = rpc.LocalSession()

    session.upload(tvmc_package.lib_path)
    lib = session.load_module(tvmc_package.lib_name)

    # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
    logger.debug("Device is %s.", device)
    if device == "cuda":
        dev = session.cuda()
    elif device == "cl":
        dev = session.cl()
    else:
        assert device == "cpu"
        dev = session.cpu()

    if profile:
        logger.debug("Creating runtime with profiling enabled.")
        module = debug_executor.create(tvmc_package.graph, lib, dev, dump_root="./prof")
    else:
        logger.debug("Creating runtime with profiling disabled.")
        module = runtime.create(tvmc_package.graph, lib, dev)

    logger.debug("Loading params into the runtime module.")
    module.load_params(tvmc_package.params)

    shape_dict, dtype_dict = get_input_info(tvmc_package.graph, tvmc_package.params)
    inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

    logger.debug("Setting inputs to the module.")
    module.set_input(**inputs_dict)

    # Run must be called explicitly if profiling
    if profile:
        logger.info("Running the module with profiling enabled.")
        module.run()

    # create the module time evaluator (returns a function)
    timer = module.module.time_evaluator("run", dev, number=number, repeat=repeat)
    # call the evaluator function to invoke the module and save execution times
    prof_result = timer()
    # collect a list of execution times from the profiling results
    times = prof_result.results

    logger.debug("Collecting the output tensors.")
    num_outputs = module.get_num_outputs()
    outputs = {}
    for i in range(num_outputs):
        output_name = "output_{}".format(i)
        outputs[output_name] = module.get_output(i).numpy()

    return TVMCResult(outputs, times)
def reconfig_rpc_runtime():
    """Reconfigure the RPC server runtime."""
    assert tvm.runtime.enabled("rpc")
    remote = rpc.connect(host, port)
    reconfig_runtime(remote)
from vta.testing import simulator

# Load VTA parameters from the vta/config/vta_config.json file
env = vta.get_env()

# We read the Pynq RPC host IP address and port number from the OS environment
host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))

# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the vta_config.json file.
if env.TARGET == "pynq":
    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)

    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)

    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)

# In simulation mode, host the RPC server locally.
elif env.TARGET == "sim":
    remote = rpc.LocalSession()

######################################################################
# Computation Declaration
def test_rpc_remote_module():
    if not tvm.module.enabled("rpc"):
        return
    server = rpc.Server("localhost")
    client = rpc.connect(server.host, server.port)
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)

    def check_remote(remote):
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_remote_link_cl(remote):
        """Test function to run remote code such as cl

        This is not enabled because there is a forking issue of the TVM
        runtime when the server launches after the OpenCL runtime
        initializes. We leave it as an example of how to do RPC when we
        want to do linking on the remote.
        """
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        if not tvm.module.enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cl(0)
        s = tvm.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        # Option 2: export library as a tar ball then handled by remote compiler
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote(client)
    check_remote(rpc.LocalSession())
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    assert bn <= N
    assert num_thread * num_thread * 16 <= N
    assert num_block * num_block * 2 <= N

    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='Btmp')
    k = tvm.reduce_axis((0, N), name='k')

    packedB = tvm.compute((N, N / bn, bn),
                          lambda x, y, z: B[x, y * bn + z], name='B')

    C = tvm.compute(
        (N, N),
        lambda ii, jj: tvm.sum(A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k),
        name='C')

    s = tvm.create_schedule(C.op)
    CC = s.cache_write(C, "local")

    block_x = tvm.thread_axis("blockIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_x = tvm.thread_axis("threadIdx.x")
    thread_y = tvm.thread_axis("threadIdx.y")

    thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy")

    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)

    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)

    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)

    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)

    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)

    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)

    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)

    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)

    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = util.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    f.export_library(path_dso, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")

    evaluate(f, ctx, N, times)
dtype_dict = {input_name: data.dtype}

# parse the Caffe2 model and convert it into a Relay computation graph
from tvm import relay
mod, params = relay.frontend.from_caffe2(
    resnet50.init_net, resnet50.predict_net, shape_dict, dtype_dict)

# compile the model
target = 'metal'
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(
        mod, target, target_host=target_host, params=params)

# Save the library
temp = util.tempdir()
path_dso1 = temp.relpath("dev_lib.dylib")
lib.export_library(path_dso1, xcode.create_dylib, arch=arch, sdk=sdk)
xcode.codesign(path_dso1)

# Start an RPC test server that contains the compiled library.
server = xcode.popen_test_rpc(proxy_host, proxy_port, key,
                              destination=destination, libs=[path_dso1])

# connect to the proxy
remote = rpc.connect(proxy_host, proxy_port, key=key)
ctx = remote.metal(0)
loaded_lib = remote.load_module("dev_lib.dylib")
module = graph_runtime.create(graph, loaded_lib, ctx)
# load the serialized parameters into the remote module
module.load_params(relay.save_param_dict(params))
# run and fetch the output (run() itself returns nothing)
module.run(data=input_data)
caffe2_out = module.get_output(0).asnumpy()