def test_local_save_load(): if not tvm.module.enabled("opengl"): return if not tvm.module.enabled("llvm"): return n = tvm.var("n") A = tvm.placeholder((n,), name='A', dtype='int32') B = tvm.placeholder((n,), name='B', dtype='int32') C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") s = tvm.create_schedule(C.op) s[C].opengl() f = tvm.build(s, [A, B, C], "opengl", target_host="llvm", name="myadd") ctx = tvm.opengl(0) n = 10 a = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros((n), dtype=C.dtype), ctx) f(a, b, c) temp = util.tempdir() path_so = temp.relpath("myadd.so") f.export_library(path_so) f1 = tvm.module.load(path_so) f1(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
def check_c(): if not tvm.module.enabled("llvm"): return # Specifically allow offset to test codepath when offset is available Ab = tvm.decl_buffer( A.shape, A.dtype, elem_offset=tvm.var('Aoffset'), offset_factor=8, name='A') binds = {A : Ab} # BUILD and invoke the kernel. f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline") fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)] fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) mhost = tvm.codegen.build_module(fsplits[0], "c") temp = util.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.module.load(path_dso) fadd = m["fadd_pipeline"] ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) fadd(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy())
def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return temp = util.tempdir() name = "myadd_%s" % device if sys.platform == "darwin" or sys.platform.startswith('linux'): f = tvm.build(s, [A, B], device, "llvm -system-lib", name=name) elif sys.platform == "win32": f = tvm.build(s, [A, B], device, "llvm", name=name) else: raise ValueError("Unsupported platform") path_dso = temp.relpath("dev_lib.so") f.export_library(path_dso) f1 = tvm.module.load(path_dso) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) f1(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) if sys.platform != "win32": f2 = tvm.module.system_lib() f2[name](a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def test_min_repeat_ms(): tmp = tempdir() filename = tmp.relpath("log") @tvm.register_func def my_debug(filename): """one call lasts for 100 ms and writes one character to a file""" time.sleep(0.1) with open(filename, "a") as fout: fout.write("c") X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename)) s = tvm.create_schedule(X.op) func = tvm.build(s, [X]) x = tvm.nd.empty((), dtype="int32") ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1) ftimer(x) with open(filename, "r") as fin: ct = len(fin.readline()) assert ct == 2 ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1, min_repeat_ms=1000) ftimer(x) # make sure we get more than 10 calls with open(filename, "r") as fin: ct = len(fin.readline()) assert ct > 10 + 2
def test_forward_inception_v1(): '''test inception V1 model''' with tf.Graph().as_default(): graph_def = nnvm.testing.tf.get_workload("InceptionV1/classify_image_graph_def-with_shapes.pb") # Call the utility to import the graph definition into default graph. graph_def = nnvm.testing.tf.ProcessGraphDefParam(graph_def) # Build an image from random data. from PIL import Image from tvm.contrib import util img_array = np.random.uniform(size=(1, 600, 600, 3)).astype("uint8") img = Image.frombuffer('RGB', (600, 600), img_array.tostring(), 'raw', 'RGB', 0, 1) temp = util.tempdir() img_path = temp.relpath("tf-test.jpg") img.save(img_path); import os.path if not tf.gfile.Exists(os.path.join(img_path)): tf.logging.fatal('File does not exist %s', image) data = tf.gfile.FastGFile(os.path.join(img_path), 'rb').read() temp.remove() # Extract tensorflow decoded image frame for tvm input with tf.Session() as sess: tvm_data = run_tf_graph(sess, data, 'DecodeJpeg/contents:0', 'DecodeJpeg:0') with tf.Session() as sess: tf_output = run_tf_graph(sess, data, 'DecodeJpeg/contents:0', 'softmax:0') tvm_output = run_tvm_graph(graph_def, tvm_data, 'DecodeJpeg/contents') tvm.testing.assert_allclose(tf_output[0], tvm_output[0], rtol=1e-5, atol=1e-5)
def build_arm(): target = "llvm -target=armv7-none-linux-gnueabihf" if not tvm.module.enabled(target): print("Skip because %s is not enabled.." % target) return temp = util.tempdir() f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x28) asm_path = temp.relpath("myadd.asm") f.save(asm_path) # Do a RPC verification, launch kernel on Arm Board if available. host = os.environ.get('TVM_RPC_ARM_HOST', None) remote = None if host: port = int(os.environ['TVM_RPC_ARM_PORT']) try: remote = rpc.connect(host, port) except tvm.TVMError as e: pass if remote: remote.upload(path) farm = remote.load_module("myadd.o") ctx = remote.cpu(0) n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) farm(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) print("Verification finish on remote..")
def verify(s, check_correctness): mod = tvm.build(s, [data, kernel, res], target_host=env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") # verify ctx = remote.cpu(0) # Data in original format data_orig, kernel_orig, res_ref = get_ref_data() res_shape = topi.util.get_const_tuple(res.shape) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_orig, ctx) kernel_arr = tvm.nd.array(kernel_orig, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=5) cost = time_f(data_arr, kernel_arr, res_arr) res_unpack = res_arr.asnumpy() if check_correctness: assert wl.hpad == wl.wpad stride = (wl.hstride, wl.wstride) padding = wl.hpad res_ref = res_ref >> 8 res_ref = np.clip(res_ref, 0, 127).astype("int8") tvm.testing.assert_allclose(res_unpack, res_ref) return cost
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'], help="The model type.") parser.add_argument('--host', type=str, required=True, help="The host address of your Raspberry Pi.") parser.add_argument('--port', type=int, required=True, help="The port number of your Raspberry Pi.") parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.") parser.add_argument('--num-iter', type=int, default=50, help="Number of iteration during benchmark.") args = parser.parse_args() opt_level = args.opt_level num_iter = args.num_iter batch_size = 1 num_classes = 1000 image_shape = (3, 224, 224) data_shape = (batch_size,) + image_shape out_shape = (batch_size, num_classes) if args.model == 'resnet': net, params = nnvm.testing.resnet.get_workload( batch_size=1, image_shape=image_shape) elif args.model == 'mobilenet': net, params = nnvm.testing.mobilenet.get_workload( batch_size=1, image_shape=image_shape) else: raise ValueError('no benchmark prepared for {}.'.format(args.model)) with nnvm.compiler.build_config(opt_level=opt_level): graph, lib, params = nnvm.compiler.build( net, tvm.target.rasp(), shape={"data": data_shape}, params=params) tmp = util.tempdir() lib_fname = tmp.relpath('net.o') lib.save(lib_fname) remote = rpc.connect(args.host, args.port) remote.upload(lib_fname) ctx = remote.cpu(0) rlib = remote.load_module('net.o') rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} module = runtime.create(graph, rlib, ctx) module.set_input('data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype("float32"))) module.set_input(**rparams) module.run() out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx)) out.asnumpy() print('benchmark args: {}'.format(args)) ftimer = module.module.time_evaluator("run", ctx, num_iter) for i in range(3): prof_res = ftimer() print(prof_res) # sleep for avoiding cpu overheat time.sleep(45)
def _convert_to_remote(func, remote): """ convert module function to remote rpc function""" temp = util.tempdir() path_dso = temp.relpath("tmp_func.tar") func.export_library(path_dso) remote.upload(path_dso) func = remote.load_module("tmp_func.tar") return func
def test_variable_node_parsed(): sym = nnvm.sym.Variable('data') tempdir = util.tempdir() json_filename = 'test_nnvm_symbol.json' with open(tempdir.relpath(json_filename), 'w') as fo: fo.write(nnvm.graph.create(sym).json()) sym_str = open(tempdir.relpath(json_filename), 'r').read() sym = nnvm.graph.load_json(sym_str).symbol() sym = nnvm.sym.relu(sym)
def generate_graph(graph_fn, params_fn, device="vta"): # Measure build start time build_start = time.time() # Derive the TVM target target = tvm.target.create("llvm -device={}".format(device)) # Derive the LLVM compiler flags # When targetting the Pynq, cross-compile to ARMv7 ISA if env.TARGET == "sim": target_host = "llvm" elif env.TARGET == "pynq": target_host = "llvm -mtriple=armv7-none-linux-gnueabihf -mcpu=cortex-a9 -mattr=+neon" # Load the ResNet-18 graph and parameters sym = nnvm.graph.load_json(open(graph_fn).read()) params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read()) # Populate the shape and data type dictionary shape_dict = {"data": (1, 3, 224, 224)} dtype_dict = {"data": 'float32'} shape_dict.update({k: v.shape for k, v in params.items()}) dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Apply NNVM graph optimization passes sym = vta.graph.clean_cast(sym) sym = vta.graph.clean_conv_fuse(sym) if target.device_name == "vta": assert env.BLOCK_IN == env.BLOCK_OUT sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) # Compile NNVM graph with nnvm.compiler.build_config(opt_level=3): if target.device_name != "vta": graph, lib, params = nnvm.compiler.build( sym, target, shape_dict, dtype_dict, params=params, target_host=target_host) else: with vta.build_config(): graph, lib, params = nnvm.compiler.build( sym, target, shape_dict, dtype_dict, params=params, target_host=target_host) # Save the compiled inference graph library assert tvm.module.enabled("rpc") temp = util.tempdir() lib.save(temp.relpath("graphlib.o")) # Send the inference library over to the remote RPC server remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") # Measure build time build_time = time.time() - build_start print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time)) return graph, lib, params
def build_i386(): if not tvm.module.enabled("llvm"): print("Skip because llvm is not enabled..") return temp = util.tempdir() target = "llvm -target=i386-pc-linux-gnu" f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x03)
def test_rpc_module(): # graph n = tvm.convert(1024) A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') temp = util.tempdir() s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd") path_dso1 = temp.relpath("dev_lib.dylib") f.export_library(path_dso1, xcode.create_dylib, arch=arch, sdk=sdk) xcode.codesign(path_dso1) s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") s[B].pragma(xi, "parallel_barrier_when_finish") f = tvm.build(s, [A, B], target, name="myadd_cpu") path_dso2 = temp.relpath("cpu_lib.dylib") f.export_library(path_dso2, xcode.create_dylib, arch=arch, sdk=sdk) xcode.codesign(path_dso2) # Start RPC test server that contains the compiled library. server = xcode.popen_test_rpc(proxy_host, proxy_port, key, destination=destination, libs=[path_dso1, path_dso2]) # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) ctx = remote.metal(0) f1 = remote.load_module("dev_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # CPU ctx = remote.cpu(0) f2 = remote.load_module("cpu_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f2.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def try_remote_save_load(): if not tvm.module.enabled("rpc"): return if not tvm.module.enabled("opengl"): return if not tvm.module.enabled("llvm"): return # Build the module. n = tvm.var("n") A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") s = tvm.create_schedule(C.op) s[C].opengl() target_host = "llvm -target=asmjs-unknown-emscripten -system-lib" f = tvm.build(s, [A, B, C], "opengl", target_host=target_host, name="myadd") remote = rpc.connect(proxy_host, proxy_port, key="js") temp = util.tempdir() ctx = remote.opengl(0) path_obj = temp.relpath("myadd.bc") path_dso = temp.relpath("myadd.js") path_gl = temp.relpath("myadd.gl") path_json = temp.relpath("myadd.tvm_meta.json") f.save(path_obj) emscripten.create_js(path_dso, path_obj, side_module=True) f.imported_modules[0].save(path_gl) remote.upload(path_dso, "myadd.dso") remote.upload(path_gl) remote.upload(path_json) remote.download("myadd.dso") remote.download("myadd.gl") remote.download("myadd.tvm_meta.json") print('Loading myadd.dso') fhost = remote.load_module("myadd.dso") print('Loading myadd.gl') fdev = remote.load_module("myadd.gl") print('import_module') fhost.import_module(fdev) print('running...') a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx) c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx) fhost(a, b, c) np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
def test_outer_product(): n = tvm.var('n') m = tvm.var('m') a = tvm.placeholder((n, ), name='a') b = tvm.placeholder((m, ), name='b') try: c = outer_product(n, m, a, b) ir = c.op.body except IOError as err: assert sys.version_info[0] == 2 and str(err) == 'could not get source code' return #Check for i in (0, n) assert isinstance(ir, tvm.stmt.For) assert ir.loop_var.name == 'i' assert ir.min.value == 0 assert ir.extent.name == 'n' ibody = ir.body assert isinstance(ibody, tvm.stmt.For) #Check for j in (0, m) assert ibody.loop_var.name == 'j' assert ibody.min.value == 0 assert ibody.extent.name == 'm' #Check loop body jbody = ibody.body assert isinstance(jbody, tvm.stmt.AssertStmt) assert isinstance(jbody.message, tvm.expr.StringImm) assert jbody.message.value == "index out of range!" jbody = jbody.body assert isinstance(jbody, tvm.stmt.Provide) assert jbody.func.name == 'c' assert len(jbody.args) == 2 assert jbody.args[0].name == 'i' assert jbody.args[1].name == 'j' assert isinstance(jbody.value, tvm.expr.Mul) mul = jbody.value assert isinstance(mul.a, tvm.expr.Call) assert mul.a.name == 'a' assert mul.b.name == 'b' func, ins, outs = run_and_check(outer_product, [n, m, a, b], {n: 99, m: 101}) temp = util.tempdir() path = temp.relpath('%s.py' % func.name) func.save(path) func_ = tvm.hybrid.HybridModule() func_.load(path) run_and_check(func_, ins, {n: 99, m: 101}, outs=outs) for key, _ in HYBRID_GLOBALS.items(): assert key not in globals().keys() assert key not in outer_product.__globals__.keys()
def check_load_module(): temp = util.tempdir() path_lib = temp.relpath("deploy.so") mhost.export_library(path_lib) with open(temp.relpath("deploy.json"), "w") as out_file: out_file.write(graph) loaded_lib = tvm.module.load(path_lib) loaded_graph = open(temp.relpath("deploy.json")).read() mod = graph_runtime.create(loaded_graph, loaded_lib, ctx) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) np.testing.assert_equal( out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
def tune_and_evaluate(tuning_opt): # extract workloads from nnvm graph print("Extract tasks...") net, params, input_shape, out_shape = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_graph(net, target=target, target_host=target_host, shape={'data': input_shape}, dtype=dtype, symbols=(nnvm.sym.conv2d, nnvm.sym.dense)) # run tuning tasks print("Tuning...") tune_tasks(tasks, **tuning_opt) # compile kernels with history best records with autotvm.apply_history_best(log_file): print("Compile...") with nnvm.compiler.build_config(opt_level=3): graph, lib, params = nnvm.compiler.build( net, target=target, target_host=target_host, shape={'data': input_shape}, params=params, dtype=dtype) # export library tmp = tempdir() if use_android: from tvm.contrib import ndk filename = "net.so" lib.export_library(tmp.relpath(filename), ndk.create_shared) else: filename = "net.tar" lib.export_library(tmp.relpath(filename)) # upload module to device print("Upload...") remote = autotvm.measure.request_remote(device_key, 'localhost', 9190, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) # upload parameters to device ctx = remote.context(str(target), 0) module = runtime.create(graph, rlib, ctx) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input('data', data_tvm) module.set_input(**params) # evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, number==1, repeat=30) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def test_file_io(): temp = util.tempdir() file_path = temp.relpath("temp.log") tsk, target = get_sample_task() inputs = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(0, 10)] results = [MeasureResult((i, ), 0, 0, 0) for i in range(0, 10)] with open(file_path, "w") as fo: cb = autotvm.callback.log_to_file(fo) cb(None, inputs, results) ref = zip(inputs, results) for x, y in zip(ref, autotvm.record.load_from_file(file_path)): assert x[1] == y[1]
def check_stackvm(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return temp = util.tempdir() name = "myadd_%s" % device f = tvm.build(s, [A, B], device, "stackvm", name=name) path_dso = temp.relpath("dev_lib.stackvm") #f.export_library(path_dso) #f1 = tvm.module.load(path_dso) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) f(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def check_c(): mhost = tvm.build(s, [A, B, C], "c", name="fadd") temp = util.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.module.load(path_dso) fadd = m['fadd'] ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) fadd(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy())
def verify_rpc(remote, target, shape, dtype): A = tvm.placeholder(shape, dtype=dtype) B = tvm.compute(A.shape, lambda i: A[i]+tvm.const(1, A.dtype)) s = tvm.create_schedule(B.op) f = tvm.build(s, [A, B], target, name="myadd") ctx = remote.cpu(0) a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), ctx=ctx) b = tvm.nd.array(np.zeros(shape).astype(A.dtype), ctx=ctx) temp = util.tempdir() path_dso = temp.relpath("dev_lib.o") f.save(path_dso) remote.upload(path_dso) f = remote.load_module("dev_lib.o") f(a, b) tvm.testing.assert_allclose(a.asnumpy() + 1, b.asnumpy())
def check_remote_link_cl(remote): """Test function to run remote code such as cl This is not enabled because there is forking issue of TVM runtime when server launches after OpenCL runtime initializes. We leave it as an example on how to do rpc when we want to do linking on remote. """ if not tvm.module.enabled("llvm"): print("Skip because llvm is not enabled") return if not tvm.module.enabled("opencl"): print("Skip because opencl is not enabled") return temp = util.tempdir() ctx = remote.cl(0) s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd") # Option 1: save modules separately and rely on remote compiler path_o = temp.relpath("myadd.o") path_cl = temp.relpath("myadd.cl") path_json = temp.relpath("myadd.tvm_meta.json") f.save(path_o) f.imported_modules[0].save(path_cl) remote.upload(path_o) remote.upload(path_cl) # upload meta data remote.upload(path_json) fhost = remote.load_module("myadd.o") fdev = remote.load_module("myadd.cl") fhost.import_module(fdev) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Option 2: export library as a tar ball then handled by remote compiler path_tar = temp.relpath("myadd.tar") f.export_library(path_tar) remote.upload(path_tar) fhost = remote.load_module("myadd.tar") a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def check_remote(remote): if not tvm.module.enabled("llvm"): print("Skip because llvm is not enabled") return temp = util.tempdir() ctx = remote.cpu(0) f = tvm.build(s, [A, B], "llvm", name="myadd") path_dso = temp.relpath("dev_lib.so") f.export_library(path_dso) remote.upload(path_dso) f1 = remote.load_module("dev_lib.so") a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10) cost = time_f(a, b).mean print('%g secs/op' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def gemv_impl(): cc_code = """ extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) { for (int i = 0; i < m; ++i) { for (int j = 0; j < l; ++j) { cc[i] += aa[j] * bb[i * stride + j]; } } return 0; } """ from tvm.contrib import util, clang temp = util.tempdir() ll_path = temp.relpath("temp.ll") # Create LLVM ir from c source code ll_code = clang.create_llvm(cc_code, output=ll_path) return ll_code
def verify(s, check_correctness): mod = vta.build(s, [data, kernel, bias, res], "ext_dev", env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") # verify ctx = remote.ext_dev(0) # Data in original format data_orig, kernel_orig, res_ref = get_ref_data() bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32") bias_orig = np.abs(bias_orig) data_packed = data_orig.reshape( batch_size//env.BATCH, env.BATCH, wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) kernel_packed = kernel_orig.reshape( wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) bias_packed = bias_orig.reshape( 1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) res_shape = topi.util.get_const_tuple(res.shape) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_packed, ctx) kernel_arr = tvm.nd.array(kernel_packed, ctx) bias_arr = tvm.nd.array(bias_packed, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=5) cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) res_unpack = res_arr.asnumpy().transpose( (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width) if check_correctness: assert wl.hpad == wl.wpad stride = (wl.hstride, wl.wstride) padding = wl.hpad res_ref = res_ref >> 8 res_ref += bias_orig.reshape(wl.out_filter, 1, 1) res_ref = np.clip(res_ref, 0, 127).astype("int8") tvm.testing.assert_allclose(res_unpack, res_ref) return cost
def test_dso_module_load(): if not tvm.module.enabled("llvm"): return dtype = 'int64' temp = util.tempdir() def save_object(names): n = tvm.var('n') Ab = tvm.decl_buffer((n, ), dtype) i = tvm.var('i') # for i in 0 to n-1: stmt = tvm.make.For( i, 0, n - 1, 0, 0, tvm.make.Store(Ab.data, tvm.make.Load(dtype, Ab.data, i) + 1, i + 1)) fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True) fapi = tvm.ir_pass.LowerTVMBuiltin(fapi) m = tvm.codegen.build_module(fapi, "llvm") for name in names: m.save(name) path_obj = temp.relpath("test.o") path_ll = temp.relpath("test.ll") path_bc = temp.relpath("test.bc") path_dso = temp.relpath("test.so") save_object([path_obj, path_ll, path_bc]) cc.create_shared(path_dso, [path_obj]) f1 = tvm.module.load(path_dso) f2 = tvm.module.load(path_ll) a = tvm.nd.array(np.zeros(10, dtype=dtype)) f1(a) np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0])) a = tvm.nd.array(np.zeros(10, dtype=dtype)) f2(a) np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0])) path_runtime_py = temp.relpath("runtime.py") with open(path_runtime_py, "w") as fo: fo.write(runtime_py) subprocess.check_call( "python %s %s %s" % (path_runtime_py, path_dso, dtype), shell=True)
def check_remote(): if not tvm.module.enabled("llvm"): print("Skip because llvm is not enabled") return mlib = tvm.build(s, [A, B], "llvm", name="myadd") server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) temp = util.tempdir() ctx = remote.cpu(0) path_dso = temp.relpath("dev_lib.so") mlib.export_library(path_dso) remote.upload(path_dso) mlib = remote.load_module("dev_lib.so") mod = graph_runtime.create(graph, mlib, remote.cpu(0)) a = np.random.uniform(size=(n,)).astype(A.dtype) mod.run(x=tvm.nd.array(a, ctx)) out = tvm.nd.empty((n,), ctx=ctx) out = mod.get_output(0, out) np.testing.assert_equal(out.asnumpy(), a + 1)
def verify(s, check_correctness=True): mod = vta.build(s, [data, weight, res], "ext_dev", env.target_host, name="gemm") temp = util.tempdir() mod.save(temp.relpath("gemm.o")) remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") # verify ctx = remote.ext_dev(0) # Data in original format data_orig = np.random.randint( -128, 128, size=(batch_size, channel)).astype(data.dtype) weight_orig = np.random.randint( -128, 128, size=(channel, channel)).astype(weight.dtype) data_packed = data_orig.reshape( batch_size // env.BATCH, env.BATCH, channel // env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) weight_packed = weight_orig.reshape( channel // env.BLOCK_OUT, env.BLOCK_OUT, channel // env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_packed, ctx) weight_arr = tvm.nd.array(weight_packed, ctx) res_arr = tvm.nd.array(res_np, ctx) res_ref = np.zeros(res_shape).astype(env.acc_dtype) for b in range(batch_size // env.BATCH): for i in range(channel // env.BLOCK_OUT): for j in range(channel // env.BLOCK_IN): res_ref[b,i,:] += np.dot(data_packed[b,j,:].astype(env.acc_dtype), weight_packed[i,j].T.astype(env.acc_dtype)) res_ref = np.right_shift(res_ref, 8) res_ref = np.clip(res_ref, 0, (1<<(env.INP_WIDTH-1))-1).astype(res.dtype) time_f = f.time_evaluator("gemm", ctx, number=20) cost = time_f(data_arr, weight_arr, res_arr) res_unpack = res_arr.asnumpy().reshape(batch_size // env.BATCH, channel // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT) if check_correctness: tvm.testing.assert_allclose(res_unpack, res_ref) return cost
def verify_graph_runtime(remote, target, shape, dtype): x = relay.var('x') y = relay.const(1) z = relay.add(x, y) func = relay.Function([x], z) x_in = np.ones(shape).astype(dtype) params = {'x': x_in} graph, lib, params = relay.build(func, target=target, params=params) temp = util.tempdir() path_dso = temp.relpath("dev_lib.o") lib.save(path_dso) remote.upload(path_dso) lib = remote.load_module("dev_lib.o") ctx = remote.cpu(0) mod = graph_runtime.create(graph, lib, ctx) mod.load_params(relay.save_param_dict(params)) mod.run() out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx)) tvm.testing.assert_allclose(x_in + 1, out.asnumpy())
def check_remote(): if not tvm.module.enabled(target): print("Skip because %s is not enabled" % target) return temp = util.tempdir() ctx = remote.cpu(0) f = tvm.build(s, [A, B], target, name="myadd") path_obj = temp.relpath("dev_lib.bc") path_dso = temp.relpath("dev_lib.js") f.save(path_obj) emscripten.create_js(path_dso, path_obj, side_module=True) # Upload to suffix as dso so it can be loaded remotely remote.upload(path_dso, "dev_lib.dso") data = remote.download("dev_lib.dso") f1 = remote.load_module("dev_lib.dso") a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10) cost = time_f(a, b).mean print('%g secs/op' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
# connect to remote device tracker = tvm.rpc.connect_tracker(args.host, args.port) remote = tracker.request(args.rpc_key) print("--------------------------------------------------") print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)")) print("--------------------------------------------------") for network in networks: net, params, input_shape, output_shape = get_network(network, batch_size=1) with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']): graph, lib, params = nnvm.compiler.build( net, target=target, shape={'data': input_shape}, params=params, dtype=dtype) tmp = tempdir() if 'android' in str(target): from tvm.contrib import ndk filename = "%s.so" % network lib.export_library(tmp.relpath(filename), ndk.create_shared) else: filename = "%s.tar" % network lib.export_library(tmp.relpath(filename)) # upload library and params ctx = remote.context(str(target), 0) remote.upload(tmp.relpath(filename)) rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} rlib = remote.load_module(filename) module = runtime.create(graph, rlib, ctx)
def run_case(dtype, image): # Check image import os import json import sys STAT_REPEAT=os.environ.get('STAT_REPEAT','') if STAT_REPEAT=='' or STAT_REPEAT==None: STAT_REPEAT=10 STAT_REPEAT=int(STAT_REPEAT) # FGG: set model files via CK env CATEG_FILE = '../synset.txt' synset = eval(open(os.path.join(CATEG_FILE)).read()) files=[] val={} if image!=None and image!='': files=[image] else: ipath=os.environ.get('CK_ENV_DATASET_IMAGENET_VAL','') if ipath=='': print ('Error: path to ImageNet dataset is not set!') exit(1) if not os.path.isdir(ipath): print ('Error: path to ImageNet dataset was not found!') exit(1) # get all files d=os.listdir(ipath) for x in d: x1=x.lower() if x1.startswith('ilsvrc2012_val_'): files.append(os.path.join(ipath,x)) files=sorted(files) STAT_REPEAT=1 # Get correct labels ival=os.environ.get('CK_CAFFE_IMAGENET_VAL_TXT','') fval=open(ival).read().split('\n') val={} for x in fval: x=x.strip() if x!='': y=x.split(' ') val[y[0]]=int(y[1]) # FGG: set timers import time timers={} # Get first shape (expect that will be the same for all) dt=time.time() image = Image.open(os.path.join(files[0])).resize((224, 224)) if image.mode!='RGB': image=image.convert('RGB') timers['execution_time_load_image']=time.time()-dt dt=time.time() img = transform_image(image) timers['execution_time_transform_image']=time.time()-dt # load model from mxnet.gluon.model_zoo.vision import get_model from mxnet.gluon.utils import download model_path=os.environ['CK_ENV_MODEL_MXNET'] model_id=os.environ['MXNET_MODEL_ID'] block = get_model(model_id, pretrained=True, root=model_path) # We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon net, params = nnvm.frontend.from_mxnet(block) # we want a probability so add a softmax operator net = nnvm.sym.softmax(net) # convert to wanted dtype (https://github.com/merrymercy/tvm-mali/issues/3) if dtype!='float32': params = {k: tvm.nd.array(v.asnumpy().astype(dtype)) for k, v in params.items()} # compile opt_level = 2 if dtype == 'float32' else 1 with nnvm.compiler.build_config(opt_level=opt_level): graph, lib, params = nnvm.compiler.build( net, tvm.target.mali(), shape={"data": data_shape}, params=params, dtype=dtype, target_host=None) # upload model to remote device tmp = util.tempdir() lib_fname = tmp.relpath('net.tar') lib.export_library(lib_fname) ctx = tvm.cl(0) rlib = lib rparams = params # create graph runtime dt=time.time() module = runtime.create(graph, rlib, ctx) module.set_input('data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype))) module.set_input(**rparams) timers['execution_time_create_run_time_graph']=(time.time()-dt) total_images=0 correct_images_top1=0 correct_images_top5=0 # Shuffle files and pre-read JSON with accuracy to continue aggregating it # otherwise if FPGA board hangs, we can continue checking random images ... import random random.shuffle(files) if len(files)>1 and os.path.isfile('aggregate-ck-timer.json'): x=json.load(open('aggregate-ck-timer.json')) if 'total_images' in x: total_images=x['total_images'] if 'correct_images_top1' in x: correct_images_top1=x['correct_images_top1'] if 'correct_images_top5' in x: correct_images_top5=x['correct_images_top5'] dt1=time.time() for f in files: total_images+=1 print ('===============================================================================') print ('Image '+str(total_images)+' of '+str(len(files))+' : '+f) image = Image.open(os.path.join(f)).resize((224, 224)) if image.mode!='RGB': image=image.convert('RGB') img = transform_image(image) # set inputs module.set_input('data', tvm.nd.array(img.astype(dtype))) module.set_input(**rparams) # perform some warm up runs # print("warm up..") warm_up_timer = module.module.time_evaluator("run", ctx, 1) warm_up_timer() # execute print ('') print ("run ("+str(STAT_REPEAT)+" statistical repetitions)") dt=time.time() timer = module.module.time_evaluator("run", ctx, number=STAT_REPEAT) tcost = timer() timers['execution_time_classify']=(time.time()-dt)/STAT_REPEAT # get outputs tvm_output = module.get_output( 0, tvm.nd.empty((1000,), dtype, ctx)) top1 = np.argmax(tvm_output.asnumpy()) top5=[] atop5 = get_top5(tvm_output.asnumpy()) print ('') print('TVM prediction Top1:', top1, synset[top1]) print ('') print('TVM prediction Top5:') for q in atop5: x=q[1] y=synset[x] top5.append(x) print (x,y) print ('') print("Internal T-cost: %g" % tcost.mean) # Check correctness if available if len(val)>0: top=val[os.path.basename(f)] correct_top1=False if top==top1: correct_top1=True correct_images_top1+=1 print ('') if correct_top1: print ('Current prediction Top1: CORRECT') else: print ('Current prediction Top1: INCORRECT +('+str(top)+')') accuracy_top1=float(correct_images_top1)/float(total_images) print ('Current accuracy Top1: '+('%.5f'%accuracy_top1)) correct_top5=False if top in top5: correct_top5=True correct_images_top5+=1 print ('') if correct_top5: print ('Current prediction Top5: CORRECT') else: print ('Current prediction Top5: INCORRECT +('+str(top)+')') accuracy_top5=float(correct_images_top5)/float(total_images) print ('Current accuracy Top5: '+('%.5f'%accuracy_top5)) print ('') print ('Total elapsed time: '+('%.1f'%(time.time()-dt1))+' sec.') timers['total_images']=total_images timers['correct_images_top1']=correct_images_top1 timers['accuracy_top1']=accuracy_top1 timers['correct_images_top5']=correct_images_top5 timers['accuracy_top5']=accuracy_top5 timers['execution_time_classify_internal']=tcost.mean timers['execution_time']=tcost.mean with open ('tmp-ck-timer.json', 'w') as ftimers: json.dump(timers, ftimers, indent=2) with open ('aggregate-ck-timer.json', 'w') as ftimers: json.dump(timers, ftimers, indent=2) sys.stdout.flush() return
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from tvm import relay from tvm.relay import testing import tvm from tvm import te from tvm.contrib import util header_file_dir_path = util.tempdir() def gen_engine_header(): code = r''' #ifndef _ENGINE_H_ #define _ENGINE_H_ #include <cstdint> #include <string> #include <sstream> #include <vector> class Engine { }; #endif '''
def __init__(self): # pylint: disable=super-init-not-called self.context = nd.context self.get_function = tvm._ffi.get_global_func self._temp = util.tempdir()
def run_gemm(env, remote, target, batch_size, in_feat, out_feat, check_correctness=True, print_ir=True, samples=4): # Perform packing only if we are targeting the accelerator if "arm_cpu" in target.keys: data_pack = False elif "vta" in target.keys: data_pack = True # Derive shapes depending upon packing a_shape = (batch_size, in_feat) w_shape = (out_feat, in_feat) if data_pack: data_shape = (batch_size//env.BATCH, in_feat//env.BLOCK_IN, env.BATCH, env.BLOCK_IN) kernel_shape = (out_feat//env.BLOCK_OUT, in_feat//env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN) fcompute = vta.top.dense_packed fschedule = vta.top.schedule_dense_packed else: data_shape = a_shape kernel_shape = w_shape fcompute = topi.x86.dense_nopack fschedule = topi.x86.schedule_dense_nopack data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) # Define base computation schedule with target: res = fcompute( data, kernel, None, env.acc_dtype) res = topi.right_shift(res, 8) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule s = fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, res], simple_mode=True)) # Derive number of ops num_ops = 2 * batch_size * in_feat * out_feat # @memoize("vta.tests.test_benchmark_topi.dense.verify") def get_ref_data(): # derive min max for act, wgt types (max non inclusive) a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) r_np = np.dot(a_np.astype(env.acc_dtype), w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype) return a_np, w_np, r_np # Data in original format data_np, kernel_np, res_ref = get_ref_data() if data_pack: data_np = data_np.reshape( batch_size//env.BATCH, env.BATCH, in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) kernel_np = kernel_np.reshape( out_feat//env.BLOCK_OUT, env.BLOCK_OUT, in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3)) # Build if "vta" in target.keys: mod = vta.build(s, [data, kernel, res], target=target, target_host=env.target_host, name="dense") else: mod = tvm.build(s, [data, kernel, res], target=target, target_host=env.target_host, name="dense") temp = util.tempdir() mod.save(temp.relpath("dense.o")) remote.upload(temp.relpath("dense.o")) f = remote.load_module("dense.o") ctx = remote.context(str(target)) res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("dense", ctx, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} cost = None if env.TARGET in ["sim", "tsim"]: # Check if we're in local RPC mode (allows us to rebuild the # runtime on the fly when varying the VTA designs) local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) if local_rpc: if env.TARGET == "sim": remote.get_function("vta.simulator.profiler_clear")() else: remote.get_function("vta.tsim.profiler_clear")() cost = time_f(data_arr, kernel_arr, res_arr) if env.TARGET == "sim": stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) else: stats = json.loads(remote.get_function("vta.tsim.profiler_status")()) else: simulator.clear_stats() cost = time_f(data_arr, kernel_arr, res_arr) stats = simulator.stats() else: cost = time_f(data_arr, kernel_arr, res_arr) # Check correctness correct = False if check_correctness: res_orig = res_arr.asnumpy() if data_pack: res_orig = res_orig.reshape(batch_size, out_feat) res_ref = res_ref >> 8 res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) res_ref = res_ref.astype(env.out_dtype) correct = np.allclose(res_orig, res_ref) gops = (num_ops / cost.mean) / float(10 ** 9) status = "PASSED" if correct else "FAILED" if "arm_cpu" in target.keys: device = "CPU" elif "vta" in target.keys: device = "VTA" print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) return correct, cost, stats
def tune_and_evaluate(tuning_opt): # extract workloads from relay program print("Extract tasks...") mod, params, input_shape, _ = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program(mod["main"], target=target, target_host=target_host, params=params, ops=(relay.op.nn.conv2d, )) # run tuning tasks print("Tuning...") tune_tasks(tasks, **tuning_opt) # compile kernels with history best records with autotvm.apply_history_best(log_file): print("Compile...") with relay.build_config(opt_level=3): graph, lib, params = relay.build_module.build( mod, target=target, params=params, target_host=target_host) # export library tmp = tempdir() if use_android: from tvm.contrib import ndk filename = "{}.so".format(module_export_prefix) lib.export_library(tmp.relpath(filename), ndk.create_shared) else: filename = "{}.tar".format(module_export_prefix) lib.export_library(tmp.relpath(filename)) lib.imported_modules[0].save( "{}-cuda.ptx".format(module_export_prefix)) lib.export_library("{}-lib.tar".format(module_export_prefix)) with open("{}-graph.json".format(module_export_prefix), "w") as fo: fo.write(graph) with open("{}-params.params".format(module_export_prefix), "wb") as fo: fo.write(relay.save_param_dict(params)) # upload module to device print("Upload...") remote = autotvm.measure.request_remote(device_key, tracker_host, tracker_port, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) # upload parameters to device ctx = remote.context(str(target), 0) module = runtime.create(graph, rlib, ctx) data_tvm = tvm.nd.array( (np.random.uniform(size=input_shape)).astype(dtype)) module.set_input('data', data_tvm) module.set_input(**params) # evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def test_rpc_module(): # graph n = tvm.convert(1024) A = tvm.placeholder((n, ), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') a_np = np.random.uniform(size=1024).astype(A.dtype) temp = util.tempdir() # Establish remote connection with target hardware tracker = rpc.connect_tracker(tracker_host, tracker_port) remote = tracker.request(key, priority=0, session_timeout=60) # Compile the Graph for CPU target s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") s[B].pragma(xi, "parallel_barrier_when_finish") f = tvm.build(s, [A, B], target, name="myadd_cpu") path_dso_cpu = temp.relpath("cpu_lib.so") f.export_library(path_dso_cpu, ndk.create_shared) # Execute the portable graph on cpu target print('Run CPU test ...') ctx = remote.cpu(0) remote.upload(path_dso_cpu) f2 = remote.load_module("cpu_lib.so") a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f2.time_evaluator(f2.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op\n' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Compile the Graph for OpenCL target if test_opencl: s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd") path_dso_cl = temp.relpath("dev_lib_cl.so") f.export_library(path_dso_cl, ndk.create_shared) print('Run GPU(OpenCL Flavor) test ...') ctx = remote.cl(0) remote.upload(path_dso_cl) f1 = remote.load_module("dev_lib_cl.so") a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op\n' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Compile the Graph for Vulkan target if test_vulkan: s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd") path_dso_vulkan = temp.relpath("dev_lib_vulkan.so") f.export_library(path_dso_vulkan, ndk.create_shared) print('Run GPU(Vulkan Flavor) test ...') ctx = remote.vulkan(0) remote.upload(path_dso_vulkan) f1 = remote.load_module("dev_lib_vulkan.so") a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op\n' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def test_one_time(one_time_length=1000, Test_sparse=True, image_shape=(3, 32, 32)): # Hyper-parameter define batch_size = 1 num_class = 10 data_shape = (batch_size, ) + image_shape out_shape = (batch_size, num_class) sparse_kernel_shape = (batch_size, 12) dtype = "float32" data = sym.Variable("data") sparse_kernel = sym.Variable("sparse_kernel", init=np.random.randint( 0, 2, sparse_kernel_shape).astype(dtype)) if Test_sparse: y1 = sym.conv2d_sparse(data=data, sparsity=sparse_kernel, channels=12, kernel_size=(3, 3), padding=(0, 0), use_bias=False, out_layout='NCHW') else: y1 = sym.conv2d(data=data, channels=10, kernel_size=(3, 3), padding=(0, 0), use_bias=False, out_layout='NCHW') # y = sym.flatten(y1) # y = sym.dense(y, units=10, use_bias=False) # y = sym.softmax(y) out = y1 # Test Graph compilation # Once the API is well-defined, this part will be OK # g = graph.create(out) # print("-------------Starts----------------") # print(g.json()) # print("-----------------------------------") # print(g.ir()) # print("--------------Ends-----------------") # Create workload net, params = create_sparse_workload(out, batch_size, image_shape, dtype) # print("-------------Starts2---------------") # print(net.debug_str()) # print(params) # print("--------------Ends2----------------") # Test Forward # NNVM-compiler build opt_level = 0 target = tvm.target.mali() target_host = "llvm -target=aarch64-linux-gnu" with nnvm.compiler.build_config(opt_level=opt_level): graph, lib, params = nnvm.compiler.build(net, target=target, shape={"data": data_shape}, params=params, target_host=target_host) tmp = util.tempdir() lib_fname = tmp.relpath("net.tar") lib.export_library(lib_fname) remote = rpc.connect('59.78.6.204', 9090) remote.upload(lib_fname) rlib = remote.load_module("net.tar") ctx = remote.cl(0) # create random input real_data = np.random.uniform(-1, 1, size=data_shape).astype(dtype) real_sparse_kernel = np.array(([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])).astype(dtype) # real_sparse_kernel = np.random.randint(0, 2, sparse_kernel_shape).astype(dtype) # print(real_data) # print(real_sparse_kernel) # create module module = graph_runtime.create(graph, rlib, ctx) # set input and parameters module.set_input("data", real_data) if Test_sparse: module.set_input("sparse_kernel", real_sparse_kernel) module.set_input(**params) # run # localtime = time.asctime(time.localtime(time.time())) # print("Start time:" + localtime) starttime = time.time() for _ in range(one_time_length): module.run() endtime = time.time() # localtime = time.asctime(time.localtime(time.time())) # print("End time:" + localtime) print(endtime - starttime) # get output out = module.get_output(0) # convert to numpy out.asnumpy() # Print first 10 elements of output # print("-------------Starts3---------------") # # print(out.asnumpy().flatten()[0:10]) # print(out) # print("--------------Ends3----------------") return endtime - starttime
def __init__(self, model_dir): from tensorflow.core.framework import graph_pb2 self._tmp_dir = util.tempdir() self._model_dir = model_dir self._graph = graph_pb2.GraphDef()
def _run(env, remote): m = 2 n = 8 imm_shift = np.random.randint(0, 8) imm_scale = np.random.randint(1, 5) # compute a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype) a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf") # DRAM->SRAM res_shift = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a_buf(*i) + imm_shift, "res_shift") # compute res_scale = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_shift(*i) >> imm_scale, "res_scale") # compute res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_scale(*i).astype(env.inp_dtype), "res") # SRAM->DRAM # schedule s = tvm.create_schedule(res.op) s[a_buf].set_scope(env.acc_scope) # SRAM s[res_shift].set_scope(env.acc_scope) # SRAM s[res_scale].set_scope(env.acc_scope) # SRAM s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM s[res_shift].pragma(res_shift.op.axis[0], env.alu) # compute s[res_scale].pragma(res_scale.op.axis[0], env.alu) # compute s[res].pragma(res.op.axis[0], env.dma_copy) # SRAM->DRAM # build mod = vta.build(s, [a, res], "ext_dev", env.target_host) if not remote: return temp = util.tempdir() mod.save(temp.relpath("load_act.o")) remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify ctx = remote.ext_dev(0) a_np = np.random.randint(-10, 10, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) res_np = np.right_shift((a_np + imm_shift), imm_scale) res_np = res_np.astype(res.dtype) a_nd = tvm.nd.array(a_np, ctx) res_nd = tvm.nd.array( np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() f(a_nd, res_nd) np.testing.assert_equal(res_np, res_nd.asnumpy()) if env.TARGET in ["sim", "tsim"]: sim_stats = simulator.stats() print("Shift and scale execution statistics:") for k, v in sim_stats.items(): print("\t{:<16}: {:>16}".format(k, v))
def gemv_quantized_impl(M, N, data_type='uint8'): """ Assembly implementation of a blocked gemv. Given a block a of shape (4, k) and a block b' of shape (4, k) produces the output block c = a*b of shape (4,4) """ stepA = min(4, M) stepB = min(4, N) assert data_type in ['uint8', 'int8' ], 'Only uint8/int8 supported for this implementation' cc_code = """ extern "C" int gemv_{0}_{0}_int32_{1}_{2}(int *c_buffer, unsigned char *a_buffer, unsigned char *b_buffer, int K, int m, int n) """.format(data_type, stepA, stepB) cc_code += """ { unsigned char * a_ptr = a_buffer; unsigned char * b_ptr = b_buffer; int * c_ptr = c_buffer; int k = K / 16; __asm__ __volatile__ ( "movi v16.4s, #0\\n" "movi v17.4s, #0\\n" "movi v18.4s, #0\\n" "movi v19.4s, #0\\n" "movi v20.4s, #0\\n" "movi v21.4s, #0\\n" "movi v22.4s, #0\\n" "movi v23.4s, #0\\n" "movi v24.4s, #0\\n" "movi v25.4s, #0\\n" "movi v26.4s, #0\\n" "movi v27.4s, #0\\n" "movi v28.4s, #0\\n" "movi v29.4s, #0\\n" "movi v30.4s, #0\\n" "movi v31.4s, #0\\n" "1:" """ cc_code += ' "ldr q0, [%[a_ptr]]\\n" ' if M > 1: cc_code += ' "ldr q1, [%[a_ptr], #16]\\n" ' else: cc_code += ' "movi v1.4s, #0\\n" ' if M > 2: cc_code += ' "ldr q2, [%[a_ptr], #32]\\n" ' else: cc_code += ' "movi v2.4s, #0\\n" ' if M > 3: cc_code += ' "ldr q3, [%[a_ptr], #48]\\n" ' else: cc_code += ' "movi v3.4s, #0\\n" ' cc_code += ' "ldr q4, [%[b_ptr]]\\n" ' if N > 1: cc_code += ' "ldr q5, [%[b_ptr], #16]\\n" ' if N > 2: cc_code += ' "ldr q6, [%[b_ptr], #32]\\n" ' if N > 3: cc_code += ' "ldr q7, [%[b_ptr], #48]\\n" ' cc_code += """ // First half // Higher part of a0 * {b0,b1,b2,b3} "umull v8.8h, v0.8b, v4.8b\\n" "umull v9.8h, v0.8b, v5.8b\\n" "umull v10.8h, v0.8b, v6.8b\\n" "umull v11.8h, v0.8b, v7.8b\\n" // Higher part of a1 * {b0,b1,b2,b3} "umull v12.8h, v1.8b, v4.8b\\n" "umull v13.8h, v1.8b, v5.8b\\n" "umull v14.8h, v1.8b, v6.8b\\n" "umull v15.8h, v1.8b, v7.8b\\n" // Accumulate "uadalp v16.4s, v8.8h\\n" "uadalp v17.4s, v9.8h\\n" "uadalp v18.4s, v10.8h\\n" "uadalp v19.4s, v11.8h\\n" "uadalp v20.4s, v12.8h\\n" "uadalp v21.4s, v13.8h\\n" "uadalp v22.4s, v14.8h\\n" "uadalp v23.4s, v15.8h\\n" // Lower part of a0 * {b0,b1,b2,b3} "umull2 v8.8h, v0.16b, v4.16b\\n" "umull2 v9.8h, v0.16b, v5.16b\\n" "umull2 v10.8h, v0.16b, v6.16b\\n" "umull2 v11.8h, v0.16b, v7.16b\\n" // Lower part of a1 * {b0,b1,b2,b3} "umull2 v12.8h, v1.16b, v4.16b\\n" "umull2 v13.8h, v1.16b, v5.16b\\n" "umull2 v14.8h, v1.16b, v6.16b\\n" "umull2 v15.8h, v1.16b, v7.16b\\n" // Accumulate again "uadalp v16.4s, v8.8h\\n" "uadalp v17.4s, v9.8h\\n" "uadalp v18.4s, v10.8h\\n" "uadalp v19.4s, v11.8h\\n" "uadalp v20.4s, v12.8h\\n" "uadalp v21.4s, v13.8h\\n" "uadalp v22.4s, v14.8h\\n" "uadalp v23.4s, v15.8h\\n" // Second half // Lower part of a2 * {b0,b1,b2,b3} "umull v8.8h, v2.8b, v4.8b\\n" "umull v9.8h, v2.8b, v5.8b\\n" "umull v10.8h, v2.8b, v6.8b\\n" "umull v11.8h, v2.8b, v7.8b\\n" // Lower part of a3 * {b0,b1,b2,b3} "umull v12.8h, v3.8b, v4.8b\\n" "umull v13.8h, v3.8b, v5.8b\\n" "umull v14.8h, v3.8b, v6.8b\\n" "umull v15.8h, v3.8b, v7.8b\\n" // Accumulate "uadalp v24.4s, v8.8h\\n" "uadalp v25.4s, v9.8h\\n" "uadalp v26.4s, v10.8h\\n" "uadalp v27.4s, v11.8h\\n" "uadalp v28.4s, v12.8h\\n" "uadalp v29.4s, v13.8h\\n" "uadalp v30.4s, v14.8h\\n" "uadalp v31.4s, v15.8h\\n" // Higher part of a2 * {b0,b1,b2,b3} "umull2 v8.8h, v2.16b, v4.16b\\n" "umull2 v9.8h, v2.16b, v5.16b\\n" "umull2 v10.8h, v2.16b, v6.16b\\n" "umull2 v11.8h, v2.16b, v7.16b\\n" // Higher part of a3 * {b0,b1,b2,b3} "umull2 v12.8h, v3.16b, v4.16b\\n" "umull2 v13.8h, v3.16b, v5.16b\\n" "umull2 v14.8h, v3.16b, v6.16b\\n" "umull2 v15.8h, v3.16b, v7.16b\\n" // Accumulate again "uadalp v24.4s, v8.8h\\n" "uadalp v25.4s, v9.8h\\n" "uadalp v26.4s, v10.8h\\n" "uadalp v27.4s, v11.8h\\n" "uadalp v28.4s, v12.8h\\n" "uadalp v29.4s, v13.8h\\n" "uadalp v30.4s, v14.8h\\n" "uadalp v31.4s, v15.8h\\n" """ blockA = min(64, M * 16) blockB = min(64, N * 16) cc_code += """ // Increment pointers and decrement k "add %[a_ptr], %[a_ptr], #{0}\\n" "add %[b_ptr], %[b_ptr], #{1}\\n" "subs %w[k], %w[k], #1\\n" """.format(blockA, blockB) stepC = min(4, N) cc_code += """ "cbnz %w[k], 1b\\n" // Final additions // v16 contains the four partial sums of a[0, 0:K].*b[0,0:K], let's call them (a,b,c,d) // v17 contains the four partial sums of a[0, 0:K].*b[1,0:K], let's call them (e,f,g,h) // v18 contains the four partial sums of a[0, 0:K].*b[2,0:K], let's call them (i,j,k,l) // v19 contains the four partial sums of a[0, 0:K].*b[3,0:K], let's call them (m,n,o,p) "addp v16.4s, v16.4s, v17.4s\\n" // v16 = (a+b, c+d, e+f, g+h) "addp v17.4s, v18.4s, v19.4s\\n" // v17 = (i+j, k+l, m+n, o+p) "addp v16.4s, v16.4s, v17.4s\\n" // v16 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) // v20 contains the four partial sums of a[1, 0:K].*b[0,0:K], let's call them (a,b,c,d) // v21 contains the four partial sums of a[1, 0:K].*b[1,0:K], let's call them (e,f,g,h) // v22 contains the four partial sums of a[1, 0:K].*b[2,0:K], let's call them (i,j,k,l) // v23 contains the four partial sums of a[1, 0:K].*b[3,0:K], let's call them (m,n,o,p) "addp v20.4s, v20.4s, v21.4s\\n" // v20 = (a+b, c+d, e+f, g+h) "addp v21.4s, v22.4s, v23.4s\\n" // v21 = (i+j, k+l, m+n, o+p) "addp v20.4s, v20.4s, v21.4s\\n" // v20 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) // v24 contains the four partial sums of a[2, 0:K].*b[0,0:K], let's call them (a,b,c,d) // v25 contains the four partial sums of a[2, 0:K].*b[1,0:K], let's call them (e,f,g,h) // v26 contains the four partial sums of a[2, 0:K].*b[2,0:K], let's call them (i,j,k,l) // v27 contains the four partial sums of a[2, 0:K].*b[3,0:K], let's call them (m,n,o,p) "addp v24.4s, v24.4s, v25.4s\\n" // v24 = (a+b, c+d, e+f, g+h) "addp v25.4s, v26.4s, v27.4s\\n" // v25 = (i+j, k+l, m+n, o+p) "addp v24.4s, v24.4s, v25.4s\\n" // v24 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) // v28 contains the four partial sums of a[3, 0:K].*b[0,0:K], let's call them (a,b,c,d) // v29 contains the four partial sums of a[3, 0:K].*b[1,0:K], let's call them (e,f,g,h) // v30 contains the four partial sums of a[3, 0:K].*b[2,0:K], let's call them (i,j,k,l) // v31 contains the four partial sums of a[3, 0:K].*b[3,0:K], let's call them (m,n,o,p) "addp v28.4s, v28.4s, v29.4s\\n" // v28 = (a+b, c+d, e+f, g+h) "addp v29.4s, v30.4s, v31.4s\\n" // v29 = (i+j, k+l, m+n, o+p) "addp v28.4s, v28.4s, v29.4s\\n" // v28 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) "str q16, [%[c_ptr]]\\n" """ if M > 1: cc_code += ' "str q20, [%[c_ptr], #{0}]\\n" '.format(stepC * 4) if M > 2: cc_code += ' "str q24, [%[c_ptr], #{0}]\\n" '.format(stepC * 8) if M > 3: cc_code += ' "str q28, [%[c_ptr], #{0}]\\n" '.format(stepC * 12) cc_code += """ : [c_ptr] "+r" (c_ptr), [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [k] "+r" (k) : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); return 0; } """ if data_type == 'int8': cc_code = cc_code.replace('unsigned char', 'char') cc_code = cc_code.replace('umull', 'smull') cc_code = cc_code.replace('uadalp', 'sadalp') temp = util.tempdir() ll_path = temp.relpath("temp.ll") # Create LLVM ir from c source code ll_code = clang.create_llvm( cc_code, options=["-mtriple=aarch64-linux-gnu -mattr=+neon"], output=ll_path) return ll_code
def _run(env, remote): m = 8 n = 10 # compute a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype) a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf") # DRAM->SRAM max_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.max(a_buf(*i), 0), "res_buf") # relu min_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1), "max_buf") # relu res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: min_buf(*i).astype(env.inp_dtype), "min_buf") # SRAM->DRAM # schedule s = tvm.create_schedule(res.op) s[a_buf].set_scope(env.acc_scope) # SRAM s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM s[max_buf].set_scope(env.acc_scope) # SRAM s[min_buf].set_scope(env.acc_scope) # SRAM s[max_buf].pragma(max_buf.op.axis[0], env.alu) # compute s[min_buf].pragma(min_buf.op.axis[0], env.alu) # compute s[res].pragma(res.op.axis[0], env.dma_copy) # SRAM->DRAM # build with vta.build_config(): mod = vta.build(s, [a, res], "ext_dev", env.target_host) if not remote: return temp = util.tempdir() mod.save(temp.relpath("load_act.o")) remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify ctx = remote.ext_dev(0) a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype) a_nd = tvm.nd.array(a_np, ctx) res_nd = tvm.nd.array( np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() f(a_nd, res_nd) np.testing.assert_equal(res_np, res_nd.asnumpy()) if env.TARGET in ["sim", "tsim"]: sim_stats = simulator.stats() print("Relu execution statistics:") for k, v in sim_stats.items(): print("\t{:<16}: {:>16}".format(k, v))
def create_micro_lib_base(out_obj_path, in_src_path, toolchain_prefix, device_id, lib_type, options=None): """Compiles code into a binary for the target micro device. Parameters ---------- out_obj_path : str path to generated object file in_src_path : str path to source file toolchain_prefix : str toolchain prefix to be used. For example, a prefix of "riscv64-unknown-elf-" means "riscv64-unknown-elf-gcc" is used as the compiler and "riscv64-unknown-elf-ld" is used as the linker, etc. device_id : str unique identifier for the target device lib_type : micro.LibType whether to compile a MicroTVM runtime or operator library options : List[str] additional options to pass to GCC """ base_compile_cmd = [ f"{toolchain_prefix}gcc", "-std=c11", "-Wall", "-Wextra", "--pedantic", "-c", "-O0", "-g", "-nostartfiles", "-nodefaultlibs", "-nostdlib", "-fdata-sections", "-ffunction-sections", ] if options is not None: base_compile_cmd += options src_paths = [] include_paths = find_include_path() + [get_micro_host_driven_dir()] tmp_dir = _util.tempdir() # we might transform the src path in one of the branches below new_in_src_path = in_src_path if lib_type == LibType.RUNTIME: dev_dir = _get_device_source_dir(device_id) dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]") # there needs to at least be a utvm_timer.c file assert dev_src_paths assert "utvm_timer.c" in map(os.path.basename, dev_src_paths) src_paths += dev_src_paths elif lib_type == LibType.OPERATOR: # create a temporary copy of the source, so we can inject the dev lib # header without modifying the original. temp_src_path = tmp_dir.relpath("temp.c") with open(in_src_path, "r") as f: src_lines = f.read().splitlines() src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"") with open(temp_src_path, "w") as f: f.write("\n".join(src_lines)) new_in_src_path = temp_src_path base_compile_cmd += ["-c"] else: raise RuntimeError("unknown lib type") src_paths += [new_in_src_path] for path in include_paths: base_compile_cmd += ["-I", path] prereq_obj_paths = [] for src_path in src_paths: curr_obj_path = Path(src_path).with_suffix(".o").name assert curr_obj_path not in prereq_obj_paths prereq_obj_paths.append(curr_obj_path) curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path] run_cmd(curr_compile_cmd) ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"] ld_cmd += prereq_obj_paths ld_cmd += ["-o", out_obj_path] run_cmd(ld_cmd)
def _listen_loop(sock, port, rpc_key, tracker_addr, load_library, custom_addr): """Listening loop of the server master.""" def _accept_conn(listen_sock, tracker_conn, ping_period=2): """Accept connection from the other places. Parameters ---------- listen_sock: Socket The socket used by listening process. tracker_conn : connnection to tracker Tracker connection ping_period : float, optional ping tracker every k seconds if no connection is accepted. """ old_keyset = set() # Report resource to tracker if tracker_conn: matchkey = base.random_key(rpc_key + ":") base.sendjson( tracker_conn, [TrackerCode.PUT, rpc_key, (port, matchkey), custom_addr]) assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS else: matchkey = rpc_key unmatch_period_count = 0 unmatch_timeout = 4 # Wait until we get a valid connection while True: if tracker_conn: trigger = select.select([listen_sock], [], [], ping_period) if not listen_sock in trigger[0]: base.sendjson(tracker_conn, [TrackerCode.GET_PENDING_MATCHKEYS]) pending_keys = base.recvjson(tracker_conn) old_keyset.add(matchkey) # if match key not in pending key set # it means the key is acquired by a client but not used. if matchkey not in pending_keys: unmatch_period_count += 1 else: unmatch_period_count = 0 # regenerate match key if key is acquired but not used for a while if unmatch_period_count * ping_period > unmatch_timeout + ping_period: logger.info( "no incoming connections, regenerate key ...") matchkey = base.random_key(rpc_key + ":", old_keyset) base.sendjson(tracker_conn, [ TrackerCode.PUT, rpc_key, (port, matchkey), custom_addr ]) assert base.recvjson( tracker_conn) == TrackerCode.SUCCESS unmatch_period_count = 0 continue conn, addr = listen_sock.accept() magic = struct.unpack("<i", base.recvall(conn, 4))[0] if magic != base.RPC_MAGIC: conn.close() continue keylen = struct.unpack("<i", base.recvall(conn, 4))[0] key = py_str(base.recvall(conn, keylen)) arr = key.split() expect_header = "client:" + matchkey server_key = "server:" + rpc_key if arr[0] != expect_header: conn.sendall(struct.pack("<i", base.RPC_CODE_MISMATCH)) conn.close() logger.warning("mismatch key from %s", addr) continue conn.sendall(struct.pack("<i", base.RPC_CODE_SUCCESS)) conn.sendall(struct.pack("<i", len(server_key))) conn.sendall(server_key.encode("utf-8")) return conn, addr, _parse_server_opt(arr[1:]) # Server logic tracker_conn = None while True: try: # step 1: setup tracker and report to tracker if tracker_addr and tracker_conn is None: tracker_conn = base.connect_with_retry(tracker_addr) tracker_conn.sendall(struct.pack("<i", base.RPC_TRACKER_MAGIC)) magic = struct.unpack("<i", base.recvall(tracker_conn, 4))[0] if magic != base.RPC_TRACKER_MAGIC: raise RuntimeError("%s is not RPC Tracker" % str(tracker_addr)) # report status of current queue cinfo = {"key": "server:" + rpc_key} base.sendjson(tracker_conn, [TrackerCode.UPDATE_INFO, cinfo]) assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS # step 2: wait for in-coming connections conn, addr, opts = _accept_conn(sock, tracker_conn) except (socket.error, IOError): # retry when tracker is dropped if tracker_conn: tracker_conn.close() tracker_conn = None continue except RuntimeError as exc: raise exc # step 3: serving work_path = util.tempdir() logger.info("connection from %s", addr) server_proc = multiprocessing.Process(target=_serve_loop, args=(conn, addr, load_library, work_path)) server_proc.deamon = True server_proc.start() # close from our side. conn.close() # wait until server process finish or timeout server_proc.join(opts.get("timeout", None)) if server_proc.is_alive(): logger.info("Timeout in RPC session, kill..") # pylint: disable=import-outside-toplevel import psutil parent = psutil.Process(server_proc.pid) # terminate worker childs for child in parent.children(recursive=True): child.terminate() # terminate the worker server_proc.terminate() work_path.remove()
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'], help="The model type.") parser.add_argument('--host', type=str, required=True, help="The host address of your Raspberry Pi.") parser.add_argument('--port', type=int, required=True, help="The port number of your Raspberry Pi.") parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.") parser.add_argument('--num-iter', type=int, default=50, help="Number of iteration during benchmark.") args = parser.parse_args() opt_level = args.opt_level target = "llvm -target=armv7l-none-linux-anueabihf -mcpu=cortex-a53 -mattr=+neon" num_iter = args.num_iter batch_size = 1 num_classes = 1000 image_shape = (3, 224, 224) data_shape = (batch_size, ) + image_shape out_shape = (batch_size, num_classes) if args.model == 'resnet': net, params = nnvm.testing.resnet.get_workload(batch_size=1, image_shape=image_shape) elif args.model == 'mobilenet': net, params = nnvm.testing.mobilenet.get_workload( batch_size=1, image_shape=image_shape) else: raise ValueError('no benchmark prepared for {}.'.format(args.model)) with nnvm.compiler.build_config(opt_level=opt_level): with tvm.target.rasp(): graph, lib, params = nnvm.compiler.build( net, target, shape={"data": data_shape}, params=params) tmp = util.tempdir() lib_fname = tmp.relpath('net.o') lib.save(lib_fname) remote = rpc.connect(args.host, args.port) remote.upload(lib_fname) ctx = remote.cpu(0) rlib = remote.load_module('net.o') rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} module = runtime.create(graph, rlib, ctx) module.set_input( 'data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype("float32"))) module.set_input(**rparams) module.run() out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx)) out.asnumpy() print('benchmark args: {}'.format(args)) ftimer = module.module.time_evaluator("run", ctx, num_iter) for i in range(3): prof_res = ftimer() print(prof_res) # sleep for avoiding cpu overheat time.sleep(45)
def tune_and_evaluate(tuning_opt): # extract workloads from nnvm graph print("Extract tasks...") net, params, input_shape, out_shape = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_graph(net, target=target, target_host=target_host, shape={'data': input_shape}, dtype=dtype, symbols=(nnvm.sym.conv2d, nnvm.sym.dense)) # run tuning tasks print("Tuning...") tune_tasks(tasks, **tuning_opt) # compile kernels with history best records with autotvm.apply_history_best(log_file): print("Compile...") with nnvm.compiler.build_config(opt_level=3): graph, lib, params = nnvm.compiler.build( net, target=target, target_host=target_host, shape={'data': input_shape}, params=params, dtype=dtype) # export library tmp = tempdir() if use_android: from tvm.contrib import ndk filename = "net.so" lib.export_library(tmp.relpath(filename), ndk.create_shared) else: filename = "net.tar" lib.export_library(tmp.relpath(filename)) # upload module to device print("Upload...") remote = autotvm.measure.request_remote(device_key, 'localhost', 9190, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) # upload parameters to device ctx = remote.context(str(target), 0) rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} data_tvm = tvm.nd.array( (np.random.uniform(size=input_shape)).astype(dtype)) module = runtime.create(graph, rlib, ctx) module.set_input('data', data_tvm) module.set_input(**rparams) # evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, number=50, repeat=3) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def test_gemm_gpu(N, times, bn, num_block, num_thread): assert (bn <= N) assert (num_thread * num_thread * 16 <= N) assert (num_block * num_block * 2 <= N) A = te.placeholder((N, N), name='A') B = te.placeholder((N, N), name='Btmp') k = te.reduce_axis((0, N), name='k') packedB = te.compute((N, N / bn, bn), lambda x, y, z: B[x, y * bn + z], name='B') C = te.compute( (N, N), lambda ii, jj: te.sum(A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k), name='C') s = te.create_schedule(C.op) CC = s.cache_write(C, "local") block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_xz = te.thread_axis((0, 2), "vthread", name="vx") thread_yz = te.thread_axis((0, 2), "vthread", name="vy") pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread) pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread) s[packedB].bind(pby, thread_y) s[packedB].bind(pbx, thread_x) pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8) s[packedB].vectorize(pbk) by, yi = s[C].split(C.op.axis[0], nparts=num_block) bx, xi = s[C].split(C.op.axis[1], nparts=num_thread) s[C].bind(by, block_y) s[C].bind(bx, thread_y) s[C].reorder(by, bx, yi, xi) tyz, yi = s[C].split(yi, nparts=2) ty, yi = s[C].split(yi, nparts=num_block) txz, xi = s[C].split(xi, nparts=2) tx, xi = s[C].split(xi, nparts=num_thread) s[C].reorder(tyz, txz, ty, tx, yi, xi) s[C].bind(tyz, thread_yz) s[C].bind(txz, thread_xz) s[C].bind(ty, block_x) s[C].bind(tx, thread_x) xyi, xxi = s[C].split(xi, factor=8) s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi) s[C].vectorize(xxi) s[CC].compute_at(s[C], yi) yo, xo = CC.op.axis s[CC].reorder(k, yo, xo) xo, xi = s[CC].split(xo, factor=8) s[CC].vectorize(xi) ko, ki = s[CC].split(k, factor=2) s[CC].unroll(ki) print(tvm.lower(s, [A, B, C], simple_mode=True)) f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu") temp = util.tempdir() path_dso = temp.relpath("gemm_gpu.so") f.export_library(path_dso, ndk.create_shared) # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) ctx = remote.cl(0) remote.upload(path_dso) f = remote.load_module("gemm_gpu.so") evaluate(f, ctx, N, times)
def test_mobilenet(): temp = util.tempdir() image, synset = prepare_input() model, params = get_model("mobilenetv2_1.0", image.shape) def run(mod, target): with relay.build_config(opt_level=3): lib = relay.build(mod, target=target, target_host=target_host, params=params) path_dso = temp.relpath("deploy.dylib") lib.export_library(path_dso, xcode.create_dylib, arch=arch, sdk=sdk) xcode.codesign(path_dso) # Start RPC test server that contains the compiled library. xcode.popen_test_rpc(proxy_host, proxy_port, key, destination=destination, libs=[path_dso]) # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) if target == "metal": ctx = remote.metal(0) else: ctx = remote.cpu(0) lib = remote.load_module("deploy.dylib") m = graph_runtime.GraphModule(lib["default"](ctx)) m.set_input("data", tvm.nd.array(image, ctx)) m.run() tvm_output = m.get_output(0) top1 = np.argmax(tvm_output.asnumpy()[0]) print("TVM prediction top-1:", top1, synset[top1]) # evaluate ftimer = m.module.time_evaluator("run", ctx, number=3, repeat=10) prof_res = np.array(ftimer().results) * 1000 print("%-19s (%s)" % ("%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res))) def annotate(func, compiler): """ An annotator for Core ML. """ # Bind free variables to the constant values. bind_dict = {} for arg in func.params: name = arg.name_hint if name in params: bind_dict[arg] = relay.const(params[name]) func = relay.bind(func, bind_dict) # Annotate the entire graph for Core ML mod = tvm.IRModule() mod["main"] = func seq = tvm.transform.Sequential([ transform.SimplifyInference(), transform.FoldConstant(), transform.FoldScaleAxis(), transform.AnnotateTarget(compiler), transform.MergeCompilerRegions(), transform.PartitionGraph(), ]) with relay.build_config(opt_level=3): mod = seq(mod) return mod # CPU run(model, target_host) # Metal run(model, "metal") # CoreML run(annotate(model, "coremlcompiler"), target_host)
def check_alu(tvm_op, np_op=None, use_imm=False): """Test ALU""" m = 8 n = 8 imm = np.random.randint(1, 5) # compute a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype) a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf") #DRAM->SRAM if use_imm: res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), imm), "res_buf") #compute else: b = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype) b_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: b(*i), "b_buf") #DRAM->SRAM res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), b_buf(*i)), "res_buf") #compute5B res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_buf(*i).astype(env.inp_dtype), "res") #SRAM->DRAM # schedule s = tvm.create_schedule(res.op) s[a_buf].set_scope(env.acc_scope) # SRAM s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM s[res_buf].set_scope(env.acc_scope) # SRAM s[res_buf].pragma(res_buf.op.axis[0], env.alu) # compute s[res].pragma(res.op.axis[0], env.dma_copy) # SRAM->DRAM if not use_imm: s[b_buf].set_scope(env.acc_scope) # SRAM s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy) # DRAM->SRAM if not remote: return # build with vta.build_config(): if use_imm: mod = vta.build(s, [a, res], "ext_dev", env.target_host) else: mod = vta.build(s, [a, b, res], "ext_dev", env.target_host) temp = util.tempdir() mod.save(temp.relpath("load_act.o")) remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify ctx = remote.ext_dev(0) a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) if use_imm: res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm) else: b_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(b.dtype) res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np) res_np = res_np.astype(res.dtype) a_nd = tvm.nd.array(a_np, ctx) res_nd = tvm.nd.array( np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx) if env.TARGET == "tsim": simulator.tsim_init("libvta_hw") if use_imm: f(a_nd, res_nd) else: b_nd = tvm.nd.array(b_np, ctx) f(a_nd, b_nd, res_nd) np.testing.assert_equal(res_np, res_nd.asnumpy())
def run_conv2d_transpose(env, remote, wl, target, check_correctness=True, print_ir=False, samples=4): # Workload assertions assert wl.hpad == wl.wpad # Perform packing only if we are targeting the accelerator if "arm_cpu" in target.keys: data_pack = False layout = "NCHW" elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) # Derive shapes depending upon packing a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) if data_pack: data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN, wl.height, wl.width, env.BATCH, env.BLOCK_IN) kernel_shape = (wl.out_filter // env.BLOCK_OUT, wl.in_filter // env.BLOCK_IN, wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) else: data_shape = a_shape kernel_shape = w_shape data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) # Define base computation schedule with target: res = topi.nn.conv2d_transpose_nchw(data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), env.acc_dtype) res = topi.right_shift(res, env.WGT_WIDTH) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule s = topi.generic.schedule_conv2d_transpose_nchw([res]) if print_ir: print(vta.lower(s, [data, kernel, res], simple_mode=True)) # Derive number of ops fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") def get_ref_data(): # derive min max for act and wgt types (max non inclusive) a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) w_np = np.random.randint(w_min, w_max, size=(wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)).astype(kernel.dtype) r_np = topi.testing.conv2d_transpose_nchw_python( a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype), (wl.hstride, wl.wstride), wl.hpad).astype(env.acc_dtype) return a_np, w_np, r_np # Data in original format data_np, kernel_np, res_ref = get_ref_data() if data_pack: data_np = data_np.reshape(wl.batch // env.BATCH, env.BATCH, wl.in_filter // env.BLOCK_IN, env.BLOCK_IN, wl.height, wl.width).transpose( (0, 2, 4, 5, 1, 3)) kernel_np = kernel_np.reshape(wl.in_filter // env.BLOCK_IN, env.BLOCK_IN, wl.out_filter // env.BLOCK_OUT, env.BLOCK_OUT, wl.hkernel, wl.wkernel).transpose((2, 0, 4, 5, 3, 1)) kernel_np = np.flip(kernel_np, 2) kernel_np = np.flip(kernel_np, 3) # Build if "vta" in target.keys: mod = vta.build(s, [data, kernel, res], target=target, target_host=env.target_host, name="conv2d_transpose") else: mod = tvm.build(s, [data, kernel, res], target=target, target_host=env.target_host, name="conv2d_transpose") temp = util.tempdir() mod.save(temp.relpath("conv2d_transpose.o")) remote.upload(temp.relpath("conv2d_transpose.o")) f = remote.load_module("conv2d_transpose.o") ctx = remote.context(str(target)) res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d_transpose", ctx, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} cost = None if env.TARGET in ["sim", "tsim"]: # Check if we're in local RPC mode (allows us to rebuild the # runtime on the fly when varying the VTA designs) local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) if local_rpc: if env.TARGET == "sim": remote.get_function("vta.simulator.profiler_clear")() else: remote.get_function("vta.tsim.profiler_clear")() cost = time_f(data_arr, kernel_arr, res_arr) if env.TARGET == "sim": stats = json.loads( remote.get_function("vta.simulator.profiler_status")()) else: stats = json.loads( remote.get_function("vta.tsim.profiler_status")()) else: simulator.clear_stats() cost = time_f(data_arr, kernel_arr, res_arr) stats = simulator.stats() else: cost = time_f(data_arr, kernel_arr, res_arr) # Check correctness correct = False if check_correctness: res_orig = res_arr.asnumpy() if data_pack: res_orig = res_orig.transpose( (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width) res_ref = res_ref >> env.WGT_WIDTH res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) res_ref = res_ref.astype(env.out_dtype) correct = np.allclose(res_orig, res_ref) gops = (num_ops / cost.mean) / float(10**9) status = "PASSED" if correct else "FAILED" if "arm_cpu" in target.keys: device = "CPU" elif "vta" in target.keys: device = "VTA" print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) return correct, cost, stats
def tune_and_evaluate(tuning_opt): if env.TARGET != "sim": # Get remote from fleet node remote = autotvm.measure.request_remote(env.TARGET, tracker_host, tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. vta.reconfig_runtime(remote) vta.program_fpga(remote, bitstream=None) else: # In simulation mode, host the RPC server locally. remote = rpc.LocalSession() # Register VTA tuning tasks register_vta_tuning_tasks() # Perform task extraction on Relay program print("Extract tasks...") relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) mod = tvm.IRModule.from_expr(relay_prog) tasks = autotvm.task.extract_from_program(mod, params=params, ops=(relay.op.get("nn.conv2d"),), target=target, target_host=env.target_host) # filter out non-packed conv2d task tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) # We should have extracted 10 convolution tasks assert len(tasks) == 10 print("Extracted {} conv2d tasks:".format(len(tasks))) for tsk in tasks: inp = tsk.args[0][1] wgt = tsk.args[1][1] batch = inp[0] * inp[4] in_filter = inp[1] * inp[5] out_filter = wgt[0] * wgt[4] height, width = inp[2], inp[3] hkernel, wkernel = wgt[2], wgt[3] hstride, wstride = tsk.args[2][0], tsk.args[2][1] hpad, wpad = tsk.args[3][0], tsk.args[3][1] print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format( batch, height, width, in_filter, out_filter, hkernel, wkernel, hpad, wpad, hstride, wstride)) # We do not run the tuning in our webpage server since it takes too long. # Comment the following line to run it by yourself. return # run tuning tasks print("Tuning...") tune_tasks(tasks, **tuning_opt) # compile kernels with history best records with autotvm.tophub.context(target, extra_files=[log_file]): # Compile network print("Compile...") if target.device_name != "vta": with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build(relay_prog, target=target, params=params, target_host=env.target_host) else: with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, target=target, params=params, target_host=env.target_host) # Export library print("Upload...") temp = util.tempdir() lib.save(temp.relpath("graphlib.o")) remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") # Generate the graph runtime ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) m = graph_runtime.create(graph, lib, ctx) # upload parameters to device image = tvm.nd.array( (np.random.uniform(size=(1, 3, 224, 224))).astype('float32')) m.set_input(**params) m.set_input('data', image) # evaluate print("Evaluate inference time cost...") timer = m.module.time_evaluator("run", ctx, number=1, repeat=10) tcost = timer() prof_res = np.array(tcost.results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def deploy_rpc(): """Runs the demo that deploys a model remotely through RPC. """ from tvm import rpc from tvm.contrib import util, emscripten # As usual, load the resnet18 model. net, params, data_shape, out_shape = load_mxnet_resnet() # Compile the model. # Note that this time we are changing the target. # This is because we want to translate the host library into JavaScript # through Emscripten. graph, lib, params = compile_net( net, target_host="llvm -target=asmjs-unknown-emscripten -system-lib", target="opengl", data_shape=data_shape, params=params) # Now we want to deploy our model through RPC. # First we ned to prepare the module files locally. print("Saving the compiled module...") temp = util.tempdir() path_obj = temp.relpath("deploy.bc") # host LLVM part path_dso = temp.relpath("deploy.js") # host JavaScript part path_gl = temp.relpath("deploy.gl") # device GLSL part path_json = temp.relpath("deploy.tvm_meta.json") lib.save(path_obj) emscripten.create_js(path_dso, path_obj, side_module=True) lib.imported_modules[0].save(path_gl) print("- Saved files:", temp.listdir()) # Connect to the RPC server. print("Connecting to RPC server...") proxy_host = 'localhost' proxy_port = 9090 remote = rpc.connect(proxy_host, proxy_port, key="js") print("- Connected to RPC server!") # Upload module to RPC server. print("Uploading module to RPC server...") remote.upload(path_dso, "deploy.dso") remote.upload(path_gl) remote.upload(path_json) print("- Upload completed!") # Load remote library. print("Loading remote library...") fdev = remote.load_module("deploy.gl") fhost = remote.load_module("deploy.dso") fhost.import_module(fdev) rlib = fhost print("- Remote library loaded!") ctx = remote.opengl(0) # Upload the parameters. print("Uploading parameters...") rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} print("- Parameters uploaded!") # Create the remote runtime module. print("Running remote module...") from tvm.contrib import graph_runtime module = graph_runtime.create(graph, rlib, ctx) # Set parameter. module.set_input(**rparams) # Set input data. input_data = np.random.uniform(size=data_shape) module.set_input('data', tvm.nd.array(input_data.astype('float32'))) # Run. module.run() print("- Remote module execution completed!") out = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx)) # Print first 10 elements of output. print(out.asnumpy()[0][0:10])
# Compile network print("Compiling network with best tuning parameters...") if target.device_name != "vta": with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, target=target, params=params, target_host=env.target_host ) else: with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, target=target, params=params, target_host=env.target_host ) # Export library temp = util.tempdir() lib.save(temp.relpath("graphlib.o")) remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") # If detailed runtime info is needed build with debug runtime if opt.debug_profile: m = debug_runtime.create(graph, lib, ctx) else: m = graph_runtime.create(graph, lib, ctx) # Set the network parameters and synthetic input image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32")) m.set_input(**params) m.set_input("data", image)
def compile_model(self): if device == 'vta': self.remote = rpc.connect(self.pynq_addr, 9091) vta.reconfig_runtime(self.remote) vta.program_fpga(self.remote, bitstream=None) else: self.remote = rpc.LocalSession() self.ctx = self.remote.ext_dev( 0) if device == 'vta' else self.remote.cpu(0) # Load pre-configured AutoTVM schedules with autotvm.tophub.context(target): # Populate the shape and data type dictionary for ResNet input dtype_dict = {'data': 'float32'} shape_dict = {'data': (env.BATCH, 3, 224, 224)} gluon_model = vision.resnet18_v1( pretrained=True, ctx=ctx ).features if args.nonsplit else splitnet.resnet18_v1_split( self.id + 1) # Measure build start time build_start = time.time() # Start front end compilation mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): relay_prog = relay.quantize.quantize(mod['main'], params=params) # Perform graph packing and constant folding for VTA target if target.device_name == 'vta': assert env.BLOCK_IN == env.BLOCK_OUT relay_prog = graph_pack(relay_prog, env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH, start_name=start_pack, stop_name=stop_pack) # Compile Relay program with AlterOpLayout disabled with relay.build_config(opt_level=3, disabled_pass={'AlterOpLayout'}): if target.device_name != 'vta': graph, lib, params = relay.build( relay_prog, target=target, params=params, target_host=env.target_host) else: with vta.build_config(): graph, lib, params = relay.build( relay_prog, target=target, params=params, target_host=env.target_host) self.params = params # Measure Relay build time build_time = time.time() - build_start print(f'inference graph for thread {self.id} built in {0:.4f}s!'. format(build_time)) # Send the inference library over to the remote RPC server temp = util.tempdir() lib.save(temp.relpath('graphlib.o')) self.remote.upload(temp.relpath('graphlib.o')) lib = self.remote.load_module('graphlib.o') # Graph runtime self.m = graph_runtime.create(graph, lib, self.ctx)
def main(): # extract workloads from relay program input_shape = (1, 3, 224, 224) print("Extrack tasks...") mod, params = get_workload(image_shape=input_shape[1:], batch_size=input_shape[0]) tasks = autotvm.task.extract_from_program(mod["main"], target=target, target_host=target_host, params=params, ops=( relay.op.nn.conv2d, relay.op.nn.dense, )) # run tuning tasks print("Tuning...") tune_tasks(tasks, **tuning_option) with autotvm.apply_history_best(log_file): print("Compile...") with relay.build_config(opt_level=0): graph, lib, params = relay.build_module.build( mod, target=target, params=params, target_host=target_host) tmp = tempdir() filename = "net.tar" lib.export_library(tmp.relpath(filename)) remote = autotvm.measure.request_remote(device_key, '0.0.0.0', 9192, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) ctx = remote.context(str(target), 0) module = runtime.create(graph, rlib, ctx) data_tvm = tvm.nd.array( (np.random.uniform(size=input_shape)).astype(dtype)) print("Run...") print("Set_input(\"data\")") module.set_input('data', data_tvm) print("Set_input(**param)") module.set_input(**params) #evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600) prof_res = np.array(ftimer().results) * 1000 #print(ftimer().results) tmp = sorted(ftimer().results) print(tmp[0]) print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def generate_graph(graph_fn, params_fn, device="vta"): # Measure build start time build_start = time.time() # Derive the TVM target target = tvm.target.create("llvm -device={}".format(device)) # Derive the LLVM compiler flags # When targetting the Pynq, cross-compile to ARMv7 ISA if env.TARGET == "sim": target_host = "llvm" elif env.TARGET == "pynq": target_host = "llvm -mtriple=armv7-none-linux-gnueabihf -mcpu=cortex-a9 -mattr=+neon" # Load the ResNet-18 graph and parameters sym = nnvm.graph.load_json(open(graph_fn).read()) params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read()) # Populate the shape and data type dictionary shape_dict = {"data": (1, 3, 224, 224)} dtype_dict = {"data": 'float32'} shape_dict.update({k: v.shape for k, v in params.items()}) dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Create NNVM graph graph = nnvm.graph.create(sym) graph_attr.set_shape_inputs(sym, shape_dict) graph_attr.set_dtype_inputs(sym, dtype_dict) graph = graph.apply("InferShape").apply("InferType") # Apply NNVM graph optimization passes sym = vta.graph.clean_cast(sym) sym = vta.graph.clean_conv_fuse(sym) if target.device_name == "vta": assert env.BLOCK_IN == env.BLOCK_OUT sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) # Compile NNVM graph with nnvm.compiler.build_config(opt_level=3): if target.device_name != "vta": graph, lib, params = nnvm.compiler.build( sym, target, shape_dict, dtype_dict, params=params, target_host=target_host) else: with vta.build_config(): graph, lib, params = nnvm.compiler.build( sym, target, shape_dict, dtype_dict, params=params, target_host=target_host) # Save the compiled inference graph library assert tvm.module.enabled("rpc") temp = util.tempdir() lib.save(temp.relpath("graphlib.o")) # Send the inference library over to the remote RPC server remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") # Measure build time build_time = time.time() - build_start print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time)) return graph, lib, params
def check_local(coreml_model): temp = util.tempdir() compiled_model = xcode.compile_coreml(coreml_model, out_dir=temp.temp_dir) ctx = tvm.cpu(0) verify(coreml_model, compiled_model, ctx)
def __init__(self, model_dir): self._tmp_dir = util.tempdir() self._model_dir = model_dir self._graph = graph_pb2.GraphDef()
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from shutil import which import json import pytest import sys import numpy as np import tvm import tvm.runtime._ffi_api from tvm import relay from tvm.contrib import util tmp_path = util.tempdir() def generate_csource_module(): """Mock the codegen with an external library (e.g., CBLAS/cuDNN)""" code = r''' #include <tvm/runtime/c_runtime_api.h> #include <tvm/runtime/packed_func.h> #include <dlpack/dlpack.h> #include <cstdint> #include <cstring> #include <iostream> #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ extern "C" void p_ID_(float* a, float* b, float* out) { \
def export_library(self, file_name, fcompile=None, **kwargs): """Export the module and its imported device code one library. This function only works on host llvm modules. It will pack all the imported modules Parameters ---------- file_name : str The name of the shared library. fcompile : function(target, file_list, kwargs), optional Compilation function to use create dynamic library. If fcompile has attribute object_format, will compile host library to that format. Otherwise, will use default format "o". kwargs : dict, optional Additional arguments passed to fcompile """ # NOTE: this function depends on contrib library features # which are only available in when TVM function is available. if _RUNTIME_ONLY: raise RuntimeError( "Cannot call export_library in runtime only mode") # Extra dependencies during runtime. from pathlib import Path from tvm.contrib import cc as _cc, tar as _tar, util as _util if isinstance(file_name, Path): file_name = str(file_name) if self.type_key == "stackvm": if not file_name.endswith(".stackvm"): raise ValueError( "Module[%s]: can only be saved as stackvm format." "did you build with LLVM enabled?" % self.type_key) self.save(file_name) return modules = self._collect_dso_modules() temp = _util.tempdir() files = [] is_system_lib = False has_c_module = False llvm_target_triple = None for index, module in enumerate(modules): if fcompile is not None and hasattr(fcompile, "object_format"): object_format = fcompile.object_format else: if module.type_key == "llvm": object_format = "o" else: assert module.type_key == "c" object_format = "cc" has_c_module = True path_obj = temp.relpath("lib" + str(index) + "." + object_format) module.save(path_obj) files.append(path_obj) is_system_lib = (module.type_key == "llvm" and module.get_function("__tvm_is_system_module")()) llvm_target_triple = (module.type_key == "llvm" and module.get_function("_get_target_triple")()) if not fcompile: if file_name.endswith(".tar"): fcompile = _tar.tar else: fcompile = _cc.create_shared if llvm_target_triple is None and hasattr(fcompile, "get_target_triple"): llvm_target_triple = fcompile.get_target_triple() if self.imported_modules: if enabled("llvm") and llvm_target_triple: path_obj = temp.relpath("devc.o") m = _ffi_api.ModulePackImportsToLLVM(self, is_system_lib, llvm_target_triple) m.save(path_obj) files.append(path_obj) else: path_cc = temp.relpath("devc.cc") with open(path_cc, "w") as f: f.write(_ffi_api.ModulePackImportsToC(self, is_system_lib)) files.append(path_cc) if has_c_module: options = [] if "options" in kwargs: opts = kwargs["options"] options = opts if isinstance(opts, (list, tuple)) else [opts] opts = options + ["-I" + path for path in find_include_path()] kwargs.update({'options': opts}) fcompile(file_name, files, **kwargs)