def benchmark(network, target, log_file):
    mod, params, input_shape, output_shape = get_network(network)

    # convert to NCHW
    desired_layouts = {'nn.conv2d': ['NCHW', 'default']}
    seq = tvm.transform.Sequential([
        relay.transform.RemoveUnusedFunctions(),
        relay.transform.ConvertLayout(desired_layouts)
    ])
    with tvm.transform.PassContext(opt_level=3):
        mod = seq(mod)

    if network in ["bert"]:
        with autotvm.apply_history_best(log_file):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build_module.build(mod, target=target, params=params)

        # upload parameters to device
        ctx = tvm.context(str(target), 0)
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=input_shape[0])).astype(dtype))
        token_types_tvm = tvm.nd.array(
            np.random.uniform(size=input_shape[1]).astype(dtype))
        valid_length_tvm = tvm.nd.array(
            np.random.uniform(size=input_shape[2]).astype(dtype))
        module = runtime.GraphModule(lib["default"](ctx))
        module.set_input(data0=data_tvm, data1=token_types_tvm,
                         data2=valid_length_tvm)
    else:
        with autotvm.apply_history_best(log_file):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build_module.build(mod, target=target, params=params)

        # upload parameters to device
        ctx = tvm.context(str(target), 0)
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=input_shape)).astype(dtype))
        module = runtime.GraphModule(lib["default"](ctx))
        module.set_input(args.inputname, data_tvm)

    # evaluate
    print("Evaluate...")
    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=args.repeat)
    prof_res = np.array(
        ftimer().results) * 1000  # multiply by 1000 to convert to milliseconds
    print("%-20s %-19s (%s)" %
          (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
def benchmark(network, batch_size, dtype, target, log_prefix, repeat):
    layout = "NCHW"
    mod, params, input_name, input_shape, output_shape = get_network(
        network, batch_size, dtype, layout)

    if use_graph_tuner(network, batch_size, dtype, target):
        log_file = log_prefix + ".graph.log"
        history_best_context = autotvm.apply_graph_best(log_file)
    else:
        log_file = log_prefix + ".kernel.log"
        history_best_context = autotvm.apply_history_best(log_file)

    assert os.path.exists(
        log_file), "The log file '%s' does not exist." % log_file
    print("Use log file %s" % log_file)

    if network in ["bert"]:
        # Build module
        with history_best_context:
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(mod, target=target, params=params)
        ctx = tvm.context(str(target), 0)
        module = runtime.GraphModule(lib["default"](ctx))

        # Feed input data
        seq_length = input_shape[0][1]
        data = np.random.uniform(size=input_shape[0])
        token_types = np.random.uniform(size=input_shape[1])
        valid_length = np.array([seq_length] * batch_size)
        module.set_input(data0=data, data1=token_types, data2=valid_length)
    else:
        # Build module
        with history_best_context:
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(mod, target=target, params=params)
        ctx = tvm.context(str(target), 0)
        module = runtime.GraphModule(lib["default"](ctx))

        # Feed input data
        data = np.random.uniform(size=input_shape)
        module.set_input(input_name, data)

    # Evaluate
    ftimer = module.module.time_evaluator("run", ctx, min_repeat_ms=500, repeat=repeat)
    return np.array(ftimer().results)
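# A hypothetical driver for the benchmark() above, shown only as a sketch: it assumes
# that get_network()/use_graph_tuner() are defined as in the snippet, that numpy is
# imported as np, and that AutoTVM tuning logs named "<prefix>.graph.log" or
# "<prefix>.kernel.log" already exist. The network names are illustrative.
def run_autotvm_benchmarks():
    target = "llvm -mcpu=skylake-avx512"  # example target, adjust to your hardware
    for network in ["resnet-50", "mobilenet_v2", "bert"]:
        res = benchmark(network, batch_size=1, dtype="float32",
                        target=target, log_prefix=network, repeat=3)
        # time_evaluator results are in seconds; report them in milliseconds
        print("%-20s %.2f ms (std %.2f ms)"
              % (network, np.mean(res) * 1000, np.std(res) * 1000))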
def get_graph_runtime_output(mod, data, params, target, ctx, dtype="float32", number=2, repeat=20): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) m = graph_runtime.GraphModule(lib["default"](ctx)) # set inputs m.set_input("data", data) m.run() out = m.get_output(0, tvm.nd.empty(out_shape, dtype)) if measure: print("Evaluate graph runtime inference cost of {} on " "{}".format(model, repr(ctx))) ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=20) # Measure in millisecond. prof_res = np.array(ftimer().results) * 1000 print( "Mean graph runtime inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) return out.asnumpy()
def test_gpu():
    mod, params = relay.testing.synthetic.get_workload()
    with relay.build_config(opt_level=3):
        compiled_graph_lib = relay.build_module.build(mod, "cuda", params=params)
    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
    dev = tvm.gpu()

    # raw api
    gmod = compiled_graph_lib["default"](dev)
    set_input = gmod["set_input"]
    run = gmod["run"]
    get_output = gmod["get_output"]
    set_input("data", tvm.nd.array(data))
    run()
    out = get_output(0).asnumpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)

    # graph runtime wrapper
    gmod = graph_runtime.GraphModule(compiled_graph_lib["default"](dev))
    gmod.set_input("data", data)
    gmod.run()
    out = gmod.get_output(0).asnumpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
def prepare(self, ctx_id):
    if self.use_tvm_build:
        print("Use TVM ArcFace Model")
        loaded_lib = tvm.runtime.load_module(self.param_file)
        self.model = graph_runtime.GraphModule(loaded_lib["default"](tvm.cpu()))
    else:
        if self.param_file:
            print("Use mxnet ArcFace Model")
            pos = self.param_file.rfind('-')
            prefix = self.param_file[0:pos]
            pos2 = self.param_file.rfind('.')
            epoch = int(self.param_file[pos + 1:pos2])
            sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
            all_layers = sym.get_internals()
            sym = all_layers['fc1_output']
            if ctx_id >= 0:
                ctx = mx.gpu(ctx_id)
            else:
                ctx = mx.cpu()
            model = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
            data_shape = (1, 3) + self.image_size
            model.bind(data_shapes=[('data', data_shape)])
            model.set_params(arg_params, aux_params)
            # warmup
            data = mx.nd.zeros(shape=data_shape)
            db = mx.io.DataBatch(data=(data,))
            model.forward(db, is_train=False)
            embedding = model.get_outputs()[0].asnumpy()
            self.model = model
        else:
            pass
def run(mod, target):
    with relay.build_config(opt_level=3):
        lib = relay.build(mod, target=target, target_host=target_host, params=params)
    path_dso = temp.relpath("deploy.dylib")
    lib.export_library(path_dso, xcode.create_dylib, arch=arch, sdk=sdk)
    xcode.codesign(path_dso)

    # Start RPC test server that contains the compiled library.
    xcode.popen_test_rpc(proxy_host, proxy_port, key,
                         destination=destination, libs=[path_dso])

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)

    if target == "metal":
        ctx = remote.metal(0)
    else:
        ctx = remote.cpu(0)
    lib = remote.load_module("deploy.dylib")
    m = graph_runtime.GraphModule(lib["default"](ctx))

    m.set_input("data", tvm.nd.array(image, ctx))
    m.run()
    tvm_output = m.get_output(0)
    top1 = np.argmax(tvm_output.asnumpy()[0])
    print("TVM prediction top-1:", top1, synset[top1])

    # evaluate
    ftimer = m.module.time_evaluator("run", ctx, number=3, repeat=10)
    prof_res = np.array(ftimer().results) * 1000
    print("%-19s (%s)" % ("%.2f ms" % np.mean(prof_res),
                          "%.2f ms" % np.std(prof_res)))
def test_cpu(): if not tvm.testing.device_enabled("llvm"): print("Skip because llvm is not enabled") return mod, params = relay.testing.synthetic.get_workload() with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "llvm", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") # raw api ctx = tvm.cpu() gmod = complied_graph_lib["default"](ctx) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] set_input("data", tvm.nd.array(data)) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper gmod = graph_runtime.GraphModule(complied_graph_lib["default"](ctx)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
def _get_tvm_output(net, data, build_dtype="float32", states=None): """Compute TVM output""" dtype = "float32" mod, params = relay.frontend.from_darknet(net, data.shape, dtype) # verify that from_darknet creates a valid, parsable relay program mod = relay.transform.InferType()(mod) astext(mod) target = "llvm" shape_dict = {"data": data.shape} lib = relay.build(mod, target, params=params) # Execute on TVM ctx = tvm.cpu(0) m = graph_runtime.GraphModule(lib["default"](ctx)) # set inputs m.set_input("data", tvm.nd.array(data.astype(dtype))) if states: for name in states.keys(): m.set_input(name, tvm.nd.array(states[name].astype(dtype))) m.run() # get outputs tvm_out = [] for i in range(m.get_num_outputs()): tvm_out.append(m.get_output(i).asnumpy()) return tvm_out
def tune_and_evaluate(tuning_opt): # extract workloads from relay program print("Extract tasks...") mod, params, data_shape, out_shape = get_network(model_name, batch_size) tasks = autotvm.task.extract_from_program( mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"), )) # run tuning tasks tune_kernels(tasks, **tuning_opt) tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file) # compile kernels with graph-level best records with autotvm.apply_graph_best(graph_opt_sch_file): print("Compile...") with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, target=target, params=params) # upload parameters to device dev = tvm.cpu() data_tvm = tvm.nd.array( (np.random.uniform(size=data_shape)).astype(dtype)) module = runtime.GraphModule(lib["default"](dev)) module.set_input(input_name, data_tvm) # evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", dev, number=100, repeat=3) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def get_output(data, lib):
    dev = tvm.cpu()
    module = graph_runtime.GraphModule(lib["default"](dev))
    module.set_input("data", data)
    module.run()
    return module.get_output(0).asnumpy()
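# A minimal sketch of producing the `lib` argument consumed by get_output(); it reuses
# relay.testing.resnet from the other snippets in this file and assumes tvm/relay/np
# are imported at module level. Shown only as an illustrative usage example.
def example_get_output():
    mod, params = relay.testing.resnet.get_workload(num_layers=18)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm", params=params)
    data = np.random.uniform(-1, 1, size=(1, 3, 224, 224)).astype("float32")
    return get_output(data, lib)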
def test_gpu(): if not tvm.runtime.enabled("cuda"): print("Skip because cuda is not enabled") return mod, params = relay.testing.resnet.get_workload(num_layers=18) with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) data = np.random.uniform(-1, 1, size=(1, 3, 224, 224)).astype("float32") ctx = tvm.gpu() # raw api gmod = complied_graph_lib['default'](ctx) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] set_input("data", tvm.nd.array(data)) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper gmod = graph_runtime.GraphModule(complied_graph_lib['default'](ctx)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
def tune_and_evaluate(tuning_opt): # extract workloads from relay program print("Extract tasks...") mod, params, input_shape, out_shape = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program( mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"), )) # run tuning tasks print("Tuning...") tune_tasks(tasks, **tuning_opt) # compile kernels with history best records with autotvm.apply_history_best(log_file): print("Compile...") with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, target=target, params=params) # load parameters ctx = tvm.context(str(target), 0) module = runtime.GraphModule(lib["default"](ctx)) data_tvm = tvm.nd.array( (np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def verify_result(
    mod,
    map_inputs,
    out_shape,
    result,
    tol=1e-5,
    target="llvm",
    device=tvm.cpu(),
    params=None,
    dpu_target="DPUCADX8G",
    tvm_ops=0,
):
    """Check the result of the BYOC Vitis-AI flow against the reference."""
    lib = build_module(mod, target, params=params, dpu_target=dpu_target, tvm_ops=tvm_ops)
    lib = update_lib(lib)
    rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu()))

    for name, data in map_inputs.items():
        rt_mod.set_input(name, data)
    rt_mod.set_input(**params)
    rt_mod.run()

    out_shapes = out_shape if isinstance(out_shape, list) else [out_shape]
    results = result if isinstance(result, list) else [result]

    for idx, shape in enumerate(out_shapes):
        out = tvm.nd.empty(shape, device=device)
        out = rt_mod.get_output(idx, out)
        tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=tol, atol=tol)
def build_and_run(
    mod,
    inputs,
    outputs,
    params,
    device,
    enable_acl=True,
    no_runs=1,
    tvm_ops=0,
    acl_partitions=1,
    config=None,
):
    """Build and run the relay module."""
    if config is None:
        config = {}

    try:
        lib = build_module(mod, device.target, params, enable_acl, tvm_ops, acl_partitions)
    except Exception as e:
        err_msg = "The module could not be built.\n"
        if config:
            err_msg += f"The test failed with the following parameters: {config}\n"
        err_msg += str(e)
        raise Exception(err_msg)

    lib = update_lib(lib, device.device, device.cross_compile)
    gen_module = graph_runtime.GraphModule(lib["default"](device.device.cpu(0)))
    gen_module.set_input(**inputs)

    out = []
    for _ in range(no_runs):
        gen_module.run()
        out.append([gen_module.get_output(i) for i in range(outputs)])
    return out
def infer_value(input_val, params, mod=None):
    """A hack for getting the value of an expression by evaluating a
    portion of the relay graph. This is often needed for functions whose
    output shape depends on the value of a tensor.
    """
    # Check that all free variables have associated parameters.
    assert all(
        var.name_hint in params.keys() for var in analysis.free_vars(input_val)
    ), "All inputs to infer must be available in params."
    try:
        # TODO(kevinthesun): Use VM for all cases.
        # pylint: disable=import-outside-toplevel
        from tvm.contrib import graph_runtime

        func = _function.Function(analysis.free_vars(input_val), input_val)
        with tvm.transform.PassContext(opt_level=0):
            lib = tvm.relay.build(func, target="llvm", params=params)
        ctx = tvm.cpu(0)
        m = graph_runtime.GraphModule(lib["default"](ctx))
        m.run()
        return m.get_output(0)
    except Exception:
        if isinstance(mod, IRModule):
            mod["main"] = _function.Function(analysis.free_vars(input_val), input_val)
        else:
            mod = IRModule.from_expr(input_val)
        exc = tvm.relay.create_executor("debug", mod=mod, ctx=tvm.cpu(), target="llvm")
        inputs = []
        for param in mod["main"].params:
            inputs.append(params[param.name_hint])
        result = exc.evaluate()(*inputs)
        return result
def run_tvm(lib):
    from tvm.contrib import graph_runtime

    rt_mod = graph_runtime.GraphModule(lib['default'](tvm.cpu(0)))
    rt_mod.set_input('input', data)
    rt_mod.run()
    tvm_res = rt_mod.get_output(0).asnumpy()
    tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1]
    return tvm_pred, rt_mod
def tune_and_evaluate(tuning_opt): # extract workloads from relay program print("Extract tasks...") mod, params, input_shape, _ = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program( mod["main"], target=target, target_host=target_host, params=params, ops=(relay.op.get("nn.conv2d"), ), ) # run tuning tasks print("Tuning...") tune_tasks(tasks, **tuning_opt) # compile kernels with history best records with autotvm.apply_history_best(log_file): print("Compile...") with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, target=target, params=params, target_host=target_host) # export library tmp = tempdir() if use_android: from tvm.contrib import ndk filename = "net.so" lib.export_library(tmp.relpath(filename), ndk.create_shared) else: filename = "net.tar" lib.export_library(tmp.relpath(filename)) # upload module to device print("Upload...") remote = autotvm.measure.request_remote(device_key, "0.0.0.0", 9190, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) # upload parameters to device ctx = remote.context(str(target), 0) module = runtime.GraphModule(rlib["default"](ctx)) data_tvm = tvm.nd.array( (np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def tune_and_evaluate(): print("Begin tuning...") tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( num_measure_trials= 200, # change this to 20000 to achieve the best performance builder=auto_scheduler.LocalBuilder( build_func="ndk" if use_ndk else "default"), runner=auto_scheduler.RPCRunner(device_key, host="0.0.0.0", port=9190, repeat=3, timeout=50), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) tuner.tune(tune_option) # Compile the whole network print("Compile...") with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, target_host=target_host, params=params) # Create graph runtime print("=============== Request Remote ===============") from tvm.auto_scheduler.utils import request_remote remote = request_remote(device_key, "0.0.0.0", 9190) ctx = remote.cl() from tvm.contrib import utils, ndk temp = utils.tempdir() filename = "deploy_lib.so" path_lib = temp.relpath(filename) lib.export_library(path_lib, ndk.create_shared) remote.upload(path_lib) loaded_lib = remote.load_module(filename) module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) data = (np.random.uniform(size=input_shape)).astype(dtype) data_tvm = tvm.nd.array(data) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def benchmark(network, batch_size, dtype, target, log_file, repeat):
    layout = "NHWC"
    mod, params, input_name, input_shape, output_shape = get_network(
        network, batch_size, dtype, layout
    )

    assert os.path.exists(log_file), "The log file '%s' does not exist." % log_file
    print("Use log file %s" % log_file)

    if network in ["bert"]:
        # Build module
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
            ):
                lib = relay.build(mod, target=target, params=params)
        ctx = tvm.context(str(target), 0)
        module = runtime.GraphModule(lib["default"](ctx))

        # Feed input data
        seq_length = input_shape[0][1]
        data = np.random.uniform(size=input_shape[0])
        token_types = np.random.uniform(size=input_shape[1])
        valid_length = np.array([seq_length] * batch_size)
        module.set_input(data0=data, data1=token_types, data2=valid_length)
    else:
        # Build module
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
            ):
                lib = relay.build(mod, target=target, params=params)
        ctx = tvm.context(str(target), 0)
        module = runtime.GraphModule(lib["default"](ctx))

        # Feed input data
        data = np.random.uniform(size=input_shape)
        module.set_input(input_name, data)

    # Evaluate
    ftimer = module.module.time_evaluator("run", ctx, min_repeat_ms=500, repeat=repeat)
    return np.array(ftimer().results)
def run(lib, ctx):
    # Build TVM runtime
    m = graph_runtime.GraphModule(lib["default"](ctx))
    tvm_input = tvm.nd.array(x.asnumpy(), ctx=ctx)
    m.set_input("data", tvm_input)
    # execute
    m.run()
    # get outputs
    class_IDs, scores, bounding_boxs = m.get_output(0), m.get_output(1), m.get_output(2)
    return class_IDs, scores, bounding_boxs
def get_tvm_output(xs, target, ctx, dtype="float32"): shape_dict = {name: x.shape for (name, x) in zip(keras_model.input_names, xs)} mod, params = relay.frontend.from_keras(keras_model, shape_dict, layout=layout) with tvm.transform.PassContext(opt_level=2): lib = relay.build(mod, target, params=params) m = graph_runtime.GraphModule(lib["default"](ctx)) for name, x in zip(keras_model.input_names, xs): m.set_input(name, tvm.nd.array(x.astype(dtype))) m.run() return [m.get_output(i).asnumpy() for i in range(m.get_num_outputs())]
def verify_rpc_gpu_remove_package_params(obj_format):
    if not tvm.testing.device_enabled("cuda"):
        print("Skip because cuda is not enabled")
        return
    mod, params = relay.testing.synthetic.get_workload()
    with relay.build_config(opt_level=3):
        compiled_graph_lib = relay.build_module.build(mod, "cuda", params=params)

    from tvm.contrib import utils

    temp = utils.tempdir()
    if obj_format == ".so":
        file_name = "deploy_lib.so"
    else:
        assert obj_format == ".tar"
        file_name = "deploy_lib.tar"
    path_lib = temp.relpath(file_name)
    compiled_graph_lib_no_params = compiled_graph_lib["remove_params"]()
    compiled_graph_lib_no_params.export_library(path_lib)
    path_params = temp.relpath("deploy_param.params")
    with open(path_params, "wb") as fo:
        fo.write(runtime.save_param_dict(compiled_graph_lib.get_params()))

    from tvm import rpc

    remote = rpc.LocalSession()
    remote.upload(path_lib)
    loaded_lib = remote.load_module(path_lib)
    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
    dev = remote.gpu()

    # raw api
    gmod = loaded_lib["default"](dev)
    set_input = gmod["set_input"]
    run = gmod["run"]
    get_output = gmod["get_output"]
    load_params = gmod["load_params"]
    loaded_params = bytearray(open(path_params, "rb").read())
    set_input("data", tvm.nd.array(data, device=dev))
    load_params(loaded_params)
    run()
    out = get_output(0).asnumpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)

    # graph runtime wrapper
    gmod = graph_runtime.GraphModule(loaded_lib["default"](dev))
    loaded_params = bytearray(open(path_params, "rb").read())
    gmod.set_input("data", data)
    gmod.load_params(loaded_params)
    gmod.run()
    out = gmod.get_output(0).asnumpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
def tf2tvm_runner(model_name, batch_size=1, backend='cuda'):
    # tvm's cuda backend has an issue with mobilenet
    if model_name == 'mobilenet' and backend == 'cuda':
        return None
    model, shape = util.tf_keras_model(model_name)
    # TODO: why does tvm need the reversed shape?
    shape = shape[::-1]
    data = np.random.rand(batch_size, *shape)
    # input_name has to match the model's input name; use model.input_names[0]
    # instead of 'input_1' so that different models can be compiled in the same round
    # TODO: why can the same model be compiled for cuda and llvm in the same process?
    # (models built for different backends don't affect each other?)
    input_name = model.input_names[0]
    shape_dict = {input_name: data.shape}
    mod, params = relay.frontend.from_keras(model, shape_dict)
    if backend == 'llvm':
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target='llvm', target_host='llvm', params=params)
        ctx = tvm.cpu()
        module = graph_runtime.GraphModule(lib["default"](ctx))
    else:
        with tvm.transform.PassContext(opt_level=3):
            # the target has to be tvm.target.cuda(); the plain string 'cuda' doesn't work
            lib = relay.build(mod, target=tvm.target.cuda(), params=params)
        ctx = tvm.gpu()
        module = graph_runtime.GraphModule(lib["default"](ctx))
    # FIXME: why is float32 necessary here? float64 failed
    dtype = "float32"
    data = tvm.nd.array(data.astype(dtype))

    def runner(data_size):
        for _ in range(data_size // batch_size):
            module.set_input(input_name, data)
            module.run()  # execute the graph before reading the output
            tvm_output = module.get_output(0)

    return runner
def verify_rpc_gpu_remove_package_params(obj_format):
    if not tvm.runtime.enabled("cuda"):
        print("Skip because cuda is not enabled")
        return
    mod, params = relay.testing.resnet.get_workload(num_layers=18)
    with relay.build_config(opt_level=3):
        compiled_graph_lib = relay.build_module.build(mod, "cuda", params=params)

    from tvm.contrib import util

    temp = util.tempdir()
    if obj_format == ".so":
        file_name = "deploy_lib.so"
    else:
        assert obj_format == ".tar"
        file_name = "deploy_lib.tar"
    path_lib = temp.relpath(file_name)
    compiled_graph_lib_no_params = compiled_graph_lib["remove_params"]()
    compiled_graph_lib_no_params.export_library(path_lib)
    path_params = temp.relpath("deploy_param.params")
    with open(path_params, "wb") as fo:
        fo.write(relay.save_param_dict(compiled_graph_lib.get_params()))

    from tvm import rpc

    server = rpc.Server("localhost", use_popen=True)
    remote = rpc.connect(server.host, server.port)
    remote.upload(path_lib)
    loaded_lib = remote.load_module(path_lib)
    data = np.random.uniform(-1, 1, size=(1, 3, 224, 224)).astype("float32")
    ctx = remote.gpu()

    # raw api
    gmod = loaded_lib['default'](ctx)
    set_input = gmod["set_input"]
    run = gmod["run"]
    get_output = gmod["get_output"]
    load_params = gmod["load_params"]
    loaded_params = bytearray(open(path_params, "rb").read())
    set_input("data", tvm.nd.array(data, ctx=ctx))
    load_params(loaded_params)
    run()
    out = get_output(0).asnumpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)

    # graph runtime wrapper
    gmod = graph_runtime.GraphModule(loaded_lib['default'](ctx))
    loaded_params = bytearray(open(path_params, "rb").read())
    gmod.set_input("data", data)
    gmod.load_params(loaded_params)
    gmod.run()
    out = gmod.get_output(0).asnumpy()
    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
def verify(data):
    mod, params = relay.testing.synthetic.get_workload(input_shape=input_shape)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build_module.build(mod, "llvm", params=params)
    ctx = tvm.cpu()
    module = graph_runtime.GraphModule(lib["default"](ctx))
    module.set_input("data", data)
    module.run()
    out = module.get_output(0).asnumpy()
    return out
def torch2tvm_runner(model_name, batch_size=1, backend='cuda'):
    # TODO: add batch
    input_name = "input0"
    model, shape = util.torch_model(model_name)
    model.eval()
    data = torch.randn([batch_size] + shape, dtype=torch.float32)
    shape_list = [(input_name, data.shape)]
    scripted_model = torch.jit.trace(model, data).eval()
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
    # TODO: how opt_level affects performance
    opt_level = 3
    if backend == 'llvm':
        with tvm.transform.PassContext(opt_level=opt_level):
            lib = relay.build(mod, target='llvm', target_host='llvm', params=params)
        ctx = tvm.cpu()
        module = graph_runtime.GraphModule(lib["default"](ctx))
    else:
        target = tvm.target.cuda()
        with tvm.transform.PassContext(opt_level=opt_level):
            lib = relay.build(mod, target, params=params)
        ctx = tvm.gpu()
        module = graph_runtime.GraphModule(lib["default"](ctx))
    data = tvm.nd.array(data)

    def runner(data_size):
        for _ in range(data_size // batch_size):
            data = torch.randn([batch_size] + shape, dtype=torch.float32)
            module.set_input(input_name, data)
            module.run()
            tvm_output = module.get_output(0)

    return runner
def onnx2tvm_runner(model_name, batch_size=1, backend='cuda'):
    model, shape = util.onnx_model(model_name)
    data = np.random.rand(batch_size, *shape)
    input_name = model.graph.input[0].name
    shape_dict = {input_name: tuple([batch_size, *shape])}
    mod, params = relay.frontend.from_onnx(model, shape_dict)
    # TODO: how opt_level affects performance
    opt_level = 3
    if backend == 'llvm':
        with tvm.transform.PassContext(opt_level=opt_level):
            lib = relay.build(mod, target='llvm', target_host='llvm', params=params)
        ctx = tvm.cpu()
        module = graph_runtime.GraphModule(lib["default"](ctx))
        module.set_input(input_name, data)
    else:
        target = tvm.target.cuda()
        with tvm.transform.PassContext(opt_level=opt_level):
            lib = relay.build(mod, target, params=params)
        ctx = tvm.gpu()
        module = graph_runtime.GraphModule(lib["default"](ctx))
        module.set_input(input_name, data)
    dtype = "float32"
    data = tvm.nd.array(data.astype(dtype))

    def runner(data_size):
        for _ in range(data_size // batch_size):
            module.set_input(input_name, data)
            module.run()

    return runner
def run_tvm_graph(coreml_model, target, ctx, input_data, input_name,
                  output_shape, output_dtype="float32"):
    """Generic function to compile on relay and execute on tvm"""
    if isinstance(input_data, list):
        shape_dict = {}
        dtype_dict = {}
        for i, e in enumerate(input_name):
            shape_dict[e] = input_data[i].shape
            dtype_dict[e] = input_data[i].dtype
    else:
        shape_dict = {input_name: input_data.shape}
        dtype_dict = {input_name: input_data.dtype}
    mod, params = relay.frontend.from_coreml(coreml_model, shape_dict)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target, params=params)

    from tvm.contrib import graph_runtime

    m = graph_runtime.GraphModule(lib["default"](ctx))
    # set inputs
    if isinstance(input_data, list):
        for i, e in enumerate(input_name):
            m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
    else:
        m.set_input(input_name, tvm.nd.array(input_data.astype(input_data.dtype)))
    # execute
    m.run()
    # get outputs
    if isinstance(output_shape, list) and isinstance(output_dtype, list):
        tvm_output_list = []
        for i, s in enumerate(output_shape):
            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
            tvm_output_list.append(tvm_output.asnumpy())
        return tvm_output_list
    else:
        if not output_shape:
            tvm_output = m.get_output(0)
        else:
            tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype))
        return tvm_output.asnumpy()
def evaluate(lib, ctx, name_n_data, dtype):
    # Setup runtime module.
    mod = runtime.GraphModule(lib["default"](ctx))
    for name, data in name_n_data.items():
        mod.set_input(name, data)

    # Evaluate performance.
    sys.stderr.write("Evaluate inference time cost...\n")
    ftimer = mod.module.time_evaluator("run", ctx, number=5, min_repeat_ms=1000)
    prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
    sys.stderr.write("Median inference time: %.2f ms\n" % np.median(prof_res))
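# A hedged usage sketch for evaluate() above: build a library for the ResNet-18
# workload used elsewhere in this file and time it on CPU. The names are illustrative
# only; it assumes tvm/relay/np are imported at module level and that the graph
# runtime is imported as `runtime`, as the snippet above expects.
def example_evaluate():
    mod, params = relay.testing.resnet.get_workload(num_layers=18)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm", params=params)
    ctx = tvm.cpu()
    data = np.random.uniform(size=(1, 3, 224, 224)).astype("float32")
    evaluate(lib, ctx, {"data": data}, "float32")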
def run(lib, inputs, outputs, npu=True):
    # Export and load lib to confirm this works
    lib_name = "mod.so"
    temp = util.tempdir()
    lib_path = temp.relpath(lib_name)
    lib.export_library(lib_path)
    lib = tvm.runtime.load_module(lib_path)
    module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
    module.set_input(**inputs)
    module.run()
    out = [module.get_output(i) for i in range(outputs)]
    if not npu:
        inference_result(0, out)
    return out