def test_graph_executor(target, dev):
    mod, params = mlp.get_workload(1)

    exe = relay.build(mod, target, params=params)
    gr = debug_executor.create(exe.get_graph_json(), exe.lib, dev)

    data = np.random.rand(1, 1, 28, 28).astype("float32")
    report = gr.profile(data=data)
    assert "fused_nn_softmax" in str(report)
    assert "Total time" in str(report)
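# Hedged follow-up (an addition, not part of the test above): the object that
# gr.profile() returns is a tvm.runtime.profiling.Report. str(report) renders
# the table, and report.csv() (assumed available in recent TVM versions)
# exports the raw numbers for offline analysis.
def save_profile(report, path="profile.csv"):
    print(str(report))
    with open(path, "w") as f:
        f.write(report.csv())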
def test_run_single_node(graph, n, A, myadd):
    mlib_proxy = tvm.support.FrontendTestModule()
    mlib_proxy["myadd"] = myadd
    mod: debug_executor.GraphModuleDebug = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))

    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)

    assert len(mod.debug_datum.get_graph_nodes()) == 2
    assert mod.debug_datum.get_graph_nodes()[0]["op"] == "param"
    assert mod.debug_datum.get_graph_nodes()[1]["op"] == "myadd"

    # Running a node with no associated function should return instantly and have 0 runtime
    assert mod.run_individual_node(0, number=1).mean == 0

    # Meanwhile the actual function should take some time, more time if you run it more times
    repeat_1_result = mod.run_individual_node(1, repeat=1)
    assert repeat_1_result.mean > 0

    # Running multiple times (repeat=3) should take longer than a single run
    repeat_3_results = mod.run_individual_node(1, repeat=3)
    assert sum(repeat_3_results.results) > sum(repeat_1_result.results)

    # Increasing the number of repeats should give you the number of results asked for
    assert len(mod.run_individual_node(1, repeat=10).results) == 10

    # With min_repeat_ms set, the run time should be at least the requested amount
    start = time.time()
    mod.run_individual_node(1, min_repeat_ms=500)
    end = time.time()
    elapsed_time_in_seconds = end - start
    assert elapsed_time_in_seconds >= 0.5

    # Setting `cooldown_interval_ms` should increase the total execution time
    start = time.time()
    mod.run_individual_node(1, repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000)
    end = time.time()
    elapsed_time_in_seconds_with_def_rep = end - start
    assert elapsed_time_in_seconds_with_def_rep >= 3

    # Setting `repeats_to_cooldown` to a value other than 1 should not trigger
    # a cooldown after every repeat
    start = time.time()
    mod.run_individual_node(
        1, repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000, repeats_to_cooldown=2
    )
    end = time.time()
    elapsed_time_in_seconds_with_rep_2 = end - start
    assert elapsed_time_in_seconds_with_rep_2 >= 2 and (
        elapsed_time_in_seconds_with_rep_2 < elapsed_time_in_seconds_with_def_rep
    )

    # Going out of bounds of the node index throws a TVM error
    with pytest.raises(TVMError):
        mod.run_individual_node(2)
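# A minimal sketch (an addition, not part of the test above) of the wall-clock
# lower bound implied by the timing knobs exercised above: each of `repeat`
# repeats runs for at least `min_repeat_ms`, and a cooldown of
# `cooldown_interval_ms` follows every `repeats_to_cooldown` repeats. The
# formula mirrors the bounds asserted in test_run_single_node.
def min_wall_time_s(repeat, min_repeat_ms, cooldown_interval_ms=0, repeats_to_cooldown=1):
    cooldowns = repeat // repeats_to_cooldown
    return (repeat * min_repeat_ms + cooldowns * cooldown_interval_ms) / 1000.0

assert min_wall_time_s(repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000) == 3.0
assert min_wall_time_s(repeat=2, min_repeat_ms=500, cooldown_interval_ms=1000, repeats_to_cooldown=2) == 2.0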
def f_per_layer(rt_mod, dev, input_data):
    # pylint: disable=import-outside-toplevel
    from tvm.contrib.debugger.debug_executor import create

    # pylint: enable=import-outside-toplevel
    mod = create(graph, rt_mod, dev)
    mod.set_input(input_name, input_data)
    graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
    graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
    print("|graph_nodes| = ", len(graph_nodes))
    print("|graph_time| = ", len(graph_time))
    graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)}
    for k, v in graph_nodes_time.items():
        print(f"{k} : {v:.3f}")
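# Small illustrative helper (an addition), reusing the graph_nodes_time dict
# built above: rank layers by measured time so the hotspots surface first.
def top_layers(graph_nodes_time, k=5):
    return sorted(graph_nodes_time.items(), key=lambda kv: kv[1], reverse=True)[:k]

# e.g. for name, t in top_layers(graph_nodes_time): print(f"{name} : {t:.3f}")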
def test_rpc_graph():
    server = rpc.Server(key="profiling")
    remote = rpc.connect("127.0.0.1", server.port, key="profiling")

    mod, params = mlp.get_workload(1)
    exe = relay.build(mod, "llvm", params=params)
    temp = utils.tempdir()
    path = temp.relpath("lib.tar")
    exe.export_library(path)
    remote.upload(path)
    rexec = remote.load_module("lib.tar")

    gr = debug_executor.create(exe.get_graph_json(), rexec, remote.cpu())

    data = np.random.rand(1, 1, 28, 28).astype("float32")
    report = gr.profile(data=data)
    assert len(report.calls) > 0
def _build_tvm(self, debug_runtime=False):
    # compile kernels with history best records
    with autotvm.apply_history_best(self.log_file):
        with tvm.transform.PassContext(opt_level=3):
            self.tvm_graph, self.tvm_lib, self.tvm_params = relay.build(
                self.mod, target=self.target, params=self.params
            )

    if not debug_runtime:
        self.tvm_module = graph_executor.create(self.tvm_graph, self.tvm_lib, device=self.dev)
    else:
        self.tvm_module = debug_executor.create(self.tvm_graph, self.tvm_lib, device=self.dev)
    self.tvm_module.set_input(**self.tvm_params)
    return self.tvm_module
def check_remote(server):
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    remote = rpc.connect(server.host, server.port)
    temp = utils.tempdir()
    dev = remote.cpu(0)
    path_dso = temp.relpath("dev_lib.so")
    mlib.export_library(path_dso)
    remote.upload(path_dso)
    mlib = remote.load_module("dev_lib.so")
    try:
        mod = debug_executor.create(graph, mlib, remote.cpu(0))
    except ValueError:
        print("Skip because debug runtime not enabled")
        return
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.run(x=tvm.nd.array(a, dev))
    out = tvm.nd.empty((n,), device=dev)
    out = mod.get_output(0, out)
    np.testing.assert_equal(out.numpy(), a + 1)
def f_time_per_layer(
    rt_mod: tvm.runtime.Module,
    dev: tvm.device,
    input_data: Dict[str, NDArray],
) -> None:
    """Run and benchmark the per-layer performance of the given runtime module,
    and print out the result.

    Parameters
    ----------
    rt_mod : tvm.runtime.Module
        The runtime module.
    dev : tvm.device
        The device to run the workload on.
    input_data : Dict[str, NDArray]
        The input data as a dictionary.
    """
    # pylint:disable=import-outside-toplevel
    from tvm.contrib.debugger.debug_executor import create

    # pylint:enable=import-outside-toplevel
    try:
        mod = create(graph, rt_mod, dev)
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
        graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)

        print("Running time of each layer:")
        print("---------------------------")
        print("|graph_nodes| = ", len(graph_nodes))
        print("|graph_time| = ", len(graph_time))
        for k, v in zip(graph_nodes, graph_time):
            print(k, float(v) * 1e6, "us")
    except Exception as exc:  # pylint: disable=broad-except
        print(f"Run module f_time_per_layer via RPC failed, exception: {exc}")
def check_verify():
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")

    def myadd(*args):
        to_return = mlib["myadd"](*args)
        time.sleep(0.25)
        return to_return

    mlib_proxy = tvm.support.FrontendTestModule()
    mlib_proxy["myadd"] = myadd
    try:
        mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
    except ValueError:
        return

    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=a)

    # verify dump root created
    directory = mod._dump_path
    assert os.path.exists(directory)

    # verify graph is there
    GRAPH_DUMP_FILE_NAME = "_tvmdbg_graph_dump.json"
    assert len(os.listdir(directory)) == 1

    # verify the file name is proper
    graph_dump_path = os.path.join(directory, GRAPH_DUMP_FILE_NAME)
    assert os.path.exists(graph_dump_path)

    # verify the graph contains some expected keys
    with open(graph_dump_path) as graph_f:
        dumped_graph = json.load(graph_f)

    assert isinstance(dumped_graph, dict)
    for k in ("nodes", "arg_nodes", "node_row_ptr", "heads", "attrs"):
        assert k in dumped_graph, f"key {k} not in dumped graph {graph!r}"

    mod.run()
    # Verify the tensors are dumped
    assert len(os.listdir(directory)) > 1

    debug_lines = mod.debug_datum.get_debug_result().split("\n")

    def split_debug_line(i):
        to_return = re.split(r"  [ ]*", debug_lines[i])
        assert to_return[-1] == ""
        to_return = to_return[:-1]  # strip empty trailing part
        return to_return

    assert split_debug_line(0) == [
        "Node Name",
        "Ops",
        "Time(us)",
        "Time(%)",
        "Shape",
        "Inputs",
        "Outputs",
    ]
    myadd_lines = split_debug_line(2)
    assert myadd_lines[0] == "add"
    assert myadd_lines[1] == "myadd"
    runtime_sec = float(myadd_lines[2]) / 1e6  # printed in us

    # Ensure runtime is at least the sleep time and less than a unit prefix order of magnitude.
    # Here we just care that the prefix is correct.
    assert runtime_sec > 0.25 and runtime_sec < 0.25 * 1000

    total_lines = split_debug_line(3)
    assert total_lines[0] == "Total_time"
    assert total_lines[2] == myadd_lines[2]

    CHROME_TRACE_FILE_NAME = "_tvmdbg_execution_trace.json"
    assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME))

    with open(os.path.join(directory, CHROME_TRACE_FILE_NAME)) as f:
        trace = json.load(f)
    assert trace["displayTimeUnit"] == "ns"
    events = trace["traceEvents"]
    assert len(events) == 4
    assert all(event["ph"] in ("B", "E") for event in events)
    assert all(event["pid"] == 1 for event in events)
    assert all(event["tid"] == 1 for event in events)
    assert all(event["name"] == "x" for event in events[:2])
    assert all(event["name"] == "add" for event in events[2:])
    assert events[0]["ts"] == 0
    assert events[0]["ph"] == "B"

    # verify the output is correct
    out = mod.get_output(0, tvm.nd.empty((n,)))
    np.testing.assert_equal(out.asnumpy(), a + 1)

    mod.exit()
    # verify dump root deleted after cleanup
    assert not os.path.exists(directory)
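# Hedged sketch (an addition): recover per-node durations from the Chrome trace
# dumped above by pairing "B" (begin) and "E" (end) phase events. Only fields
# asserted in check_verify ("ph", "name", "ts") are relied upon.
def durations_from_trace(events):
    starts = {}
    durations = {}
    for event in events:
        if event["ph"] == "B":
            starts[event["name"]] = event["ts"]
        elif event["ph"] == "E":
            durations[event["name"]] = event["ts"] - starts.pop(event["name"])
    return durations

# e.g. durations_from_trace(trace["traceEvents"]) -> {"x": ..., "add": ...}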
def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
    end_to_end: bool = False,
    options: dict = None,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package : TVMCPackage
        The compiled model package object that will be run.
    device : str
        The device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will be
        assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not provided,
        inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random".
        Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug executor.
    end_to_end : bool
        Whether to measure the time of memory copies as well as model
        execution. Turning this on can provide a more realistic estimate
        of how long running the model in production would take.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it."
        )

    with ExitStack() as stack:
        # Currently only two package formats are supported: "classic" and
        # "mlf". The latter can only be used for micro targets, i.e. with microTVM.
        if device == "micro":
            if tvmc_package.type != "mlf":
                raise TVMCException(f"Model {tvmc_package.package_path} is not a MLF archive.")

            project_dir = get_project_dir(tvmc_package.project_dir)

            # This is guaranteed to work since project_dir was already checked when
            # building the dynamic parser to accommodate the project options, so no
            # checks are in place when calling GeneratedProject.
            project_ = project.GeneratedProject.from_directory(project_dir, options)
        else:
            if tvmc_package.type == "mlf":
                raise TVMCException(
                    "You're trying to run a model saved using the Model Library Format (MLF). "
                    "MLF can only be used to run micro devices ('--device micro')."
                )

        if hostname:
            if isinstance(port, str):
                port = int(port)
            # Remote RPC
            if rpc_key:
                logger.debug("Running on remote RPC tracker with key %s.", rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("Running on remote RPC with no key.")
                session = rpc.connect(hostname, port)
        elif device == "micro":
            # Remote RPC (running on a micro target)
            logger.debug("Running on remote RPC (micro target).")
            try:
                session = tvm.micro.Session(project_.transport())
                stack.enter_context(session)
            except:
                raise TVMCException("Could not open a session with the micro target.")
        else:
            # Local
            logger.debug("Running a local session.")
            session = rpc.LocalSession()

        # Micro targets don't support uploading a model. The model to be run
        # must already be flashed into the micro target before one tries
        # to run it. Hence skip model upload for micro targets.
        if device != "micro":
            session.upload(tvmc_package.lib_path)
            lib = session.load_module(tvmc_package.lib_name)

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("Device is %s.", device)
        if device == "cuda":
            dev = session.cuda()
        elif device == "cl":
            dev = session.cl()
        elif device == "metal":
            dev = session.metal()
        elif device == "vulkan":
            dev = session.vulkan()
        elif device == "rocm":
            dev = session.rocm()
        elif device == "micro":
            dev = session.device
            lib = session.get_system_lib()
        else:
            assert device == "cpu"
            dev = session.cpu()

        if tvmc_package.type == "vm":
            assert inputs is not None, "vm runner requires inputs to be provided as a dict"

            input_tensor = {}
            for e, i in inputs.items():
                input_tensor[e] = tvm.nd.array(i, dev)

            if profile:
                logger.debug("Creating vm with profile enabled.")
                exe = profiler_vm.VirtualMachineProfiler(lib, dev)
                res = exe.profile(**input_tensor, func_name="main")
                # This print is intentional
                print(res)
            else:
                exe = vm.VirtualMachine(lib, dev)

            exe_outputs = exe.invoke("main", **input_tensor)
            times = exe.benchmark(
                dev,
                **input_tensor,
                func_name="main",
                repeat=repeat,
                number=number,
                end_to_end=end_to_end,
            )

            # Special handling if the output only has a single value
            if not isinstance(exe_outputs, list):
                exe_outputs = [exe_outputs]

            outputs = {}
            for i, val in enumerate(exe_outputs):
                output_name = "output_{}".format(i)
                outputs[output_name] = val.numpy()
        else:
            # TODO(gromero): Adjust for micro targets.
            if profile:
                logger.debug("Creating runtime with profiling enabled.")
                module = debug_executor.create(tvmc_package.graph, lib, dev, dump_root="./prof")
            else:
                if device == "micro":
                    logger.debug("Creating runtime (micro) with profiling disabled.")
                    module = tvm.micro.create_local_graph_executor(tvmc_package.graph, lib, dev)
                else:
                    logger.debug("Creating runtime with profiling disabled.")
                    module = executor.create(tvmc_package.graph, lib, dev)

            logger.debug("Loading params into the runtime module.")
            module.load_params(tvmc_package.params)

            logger.debug("Collecting graph input shape and type:")
            shape_dict, dtype_dict = module.get_input_info()
            logger.debug("Graph input shape: %s", shape_dict)
            logger.debug("Graph input type: %s", dtype_dict)

            inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)
            logger.debug("Setting inputs to the module.")
            module.set_input(**inputs_dict)

            # Run must be called explicitly if profiling
            if profile:
                logger.info("Running the module with profiling enabled.")
                report = module.profile()
                # This print is intentional
                print(report)

            if device == "micro":
                # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
                # fixed module.benchmark() can be used instead and this if/else can
                # be removed.
                module.run()
                times = []
            else:
                # Call the benchmarking function of the executor.
                # Optionally measure e2e data transfer overheads from the
                # CPU to device memory (e.g. PCIe overheads if the device
                # is a discrete GPU).
                if end_to_end:
                    dev = session.cpu()
                times = module.benchmark(dev, number=number, repeat=repeat, end_to_end=end_to_end)

            logger.debug("Collecting the output tensors.")
            num_outputs = module.get_num_outputs()
            outputs = {}
            for i in range(num_outputs):
                output_name = "output_{}".format(i)
                outputs[output_name] = module.get_output(i).numpy()

        return TVMCResult(outputs, times)
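# Hypothetical usage sketch of run_module above ("net.tar" is a made-up package
# path, given only for illustration): load a compiled package from disk, run it
# locally on CPU with profiling, and print the timings from the TVMCResult.
from tvm.driver.tvmc.model import TVMCPackage

package = TVMCPackage(package_path="net.tar")
result = run_module(package, device="cpu", profile=True)
print(result.times)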
# print(len(task.config_space))
# tune_kernels(tasks, **tuning_option)  # tuning
# tune_graph(mod["main"], data_shape, 'unet_cpu_2_thread.log', graph_opt_sch_file, exec_num=1000)  # tuning
# We only need to obtain this opt_sch_file.

with autotvm.apply_graph_best(graph_opt_sch_file):  # graph_opt_sch_file
    print("compile...")
    with tvm.transform.PassContext(opt_level=3):  # set < 3
        # lib = relay.build_module.build(mod, target, params=params)
        lib = relay.build(mod, target, params=params)

    # m = graph_executor.GraphModule(lib["default"](dev))
    # with open(graph_opt_sch_file, 'r') as f:  # note: this file is a tuning log, not the graph JSON
    #     graph = f.read()
    graph = lib.get_graph_json()
    # dump_root is only supported by the debug executor, not graph_executor.create
    m = debug_executor.create(graph, lib.lib, dev, dump_root="/tmp/tvmdbg")

    # set input and get_output
    m.set_input(input_name, tvm.nd.array(x.astype(dtype)))  # input_name = 'x'
    # We must set 'x' as the input here because the earlier channel translation
    # automatically changed our original model's input name. We also have to
    # keep the real image size and data type consistent with the model's inputs.

    # evaluate
    # print("Evaluate inference time cost...")
    # ftimer = m.module.time_evaluator("run", dev, number=10, repeat=3)  # an easy one
    # prof_res = np.array(ftimer().results) * 1000  # convert to milliseconds
    # print(
    #     "Mean inference time (std dev): %.2f ms (%.2f ms)"
def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package : TVMCPackage
        The compiled model package object that will be run.
    device : str
        The device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will be
        assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not provided,
        inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random".
        Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug runtime.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it."
        )

    # Currently only two package formats are supported: "classic" and
    # "mlf". The latter can only be used for micro targets, i.e. with µTVM.
    if tvmc_package.type == "mlf":
        raise TVMCException(
            "You're trying to run a model saved using the Model Library Format (MLF). "
"MLF can only be used to run micro targets (µTVM).") if hostname: if isinstance(port, str): port = int(port) # Remote RPC if rpc_key: logger.debug("Running on remote RPC tracker with key %s.", rpc_key) session = request_remote(rpc_key, hostname, port, timeout=1000) else: logger.debug("Running on remote RPC with no key.") session = rpc.connect(hostname, port) else: # Local logger.debug("Running a local session.") session = rpc.LocalSession() session.upload(tvmc_package.lib_path) lib = session.load_module(tvmc_package.lib_name) # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron) logger.debug("Device is %s.", device) if device == "cuda": dev = session.cuda() elif device == "cl": dev = session.cl() elif device == "metal": dev = session.metal() else: assert device == "cpu" dev = session.cpu() if profile: logger.debug("Creating runtime with profiling enabled.") module = debug_executor.create(tvmc_package.graph, lib, dev, dump_root="./prof") else: logger.debug("Creating runtime with profiling disabled.") module = runtime.create(tvmc_package.graph, lib, dev) logger.debug("Loading params into the runtime module.") module.load_params(tvmc_package.params) shape_dict, dtype_dict = get_input_info(tvmc_package.graph, tvmc_package.params) inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode) logger.debug("Setting inputs to the module.") module.set_input(**inputs_dict) # Run must be called explicitly if profiling if profile: logger.info("Running the module with profiling enabled.") module.run() # create the module time evaluator (returns a function) timer = module.module.time_evaluator("run", dev, number=number, repeat=repeat) # call the evaluator function to invoke the module and save execution times prof_result = timer() # collect a list of execution times from the profiling results times = prof_result.results logger.debug("Collecting the output tensors.") num_outputs = module.get_num_outputs() outputs = {} for i in range(num_outputs): output_name = "output_{}".format(i) outputs[output_name] = module.get_output(i).numpy() return TVMCResult(outputs, times)
disabled_pass={"AlterOpLayout"}):
    graph, lib, params = relay.build(
        relay_prog,
        target=tvm.target.Target(target, host=env.target_host),
        params=params,
    )

# Export library
temp = utils.tempdir()
lib.save(temp.relpath("graphlib.o"))
remote.upload(temp.relpath("graphlib.o"))
lib = remote.load_module("graphlib.o")

# If detailed runtime info is needed build with debug runtime
if opt.debug_profile:
    m = debug_executor.create(graph, lib, ctx)
else:
    m = graph_executor.create(graph, lib, ctx)

# Set the network parameters and synthetic input
image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32"))
m.set_input(**params)
m.set_input("data", image)

# Perform inference
timer = m.module.time_evaluator("run", ctx, number=4, repeat=opt.measurements)
tcost = timer()
def run_module(
    module_file,
    device,
    hostname=None,
    port=9090,
    rpc_key=None,
    inputs=None,
    fill_mode="random",
    repeat=1,
    profile=False,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    module_file : str
        The path to the module file (a .tar file).
    device : str
        The device (e.g. "cpu" or "gpu") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will be
        assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random".
        Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    profile : bool
        Whether to profile the run with the debug runtime.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        logger.debug("extracting module file %s", module_file)
        t = tarfile.open(module_file)
        t.extractall(tmp_dir)
        graph = open(os.path.join(tmp_dir, "mod.json")).read()
        params = bytearray(open(os.path.join(tmp_dir, "mod.params"), "rb").read())

        if hostname:
            # Remote RPC
            if rpc_key:
                logger.debug("running on remote RPC tracker with key %s", rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("running on remote RPC with no key")
                session = rpc.connect(hostname, port)
        else:
            # Local
            logger.debug("running a local session")
            session = rpc.LocalSession()

        session.upload(os.path.join(tmp_dir, "mod.so"))
        lib = session.load_module("mod.so")

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("device is %s", device)
        if device == "gpu":
            dev = session.gpu()
        elif device == "cl":
            dev = session.cl()
        else:
            assert device == "cpu"
            dev = session.cpu()

        if profile:
            logger.debug("creating runtime with profiling enabled")
            module = debug_executor.create(graph, lib, dev, dump_root="./prof")
        else:
            logger.debug("creating runtime with profiling disabled")
            module = runtime.create(graph, lib, dev)

        logger.debug("load params into the runtime module")
        module.load_params(params)

        shape_dict, dtype_dict = get_input_info(graph, params)
        inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

        logger.debug("setting inputs to the module")
        module.set_input(**inputs_dict)

        # Run must be called explicitly if profiling
        if profile:
            logger.debug("running the module with profiling enabled")
            module.run()

        # create the module time evaluator (returns a function)
        timer = module.module.time_evaluator("run", dev, 1, repeat=repeat)
        # call the evaluator function to invoke the module and save execution times
        prof_result = timer()
        # collect a list of execution times from the profiling results
        times = prof_result.results

        logger.debug("collecting the output tensors")
        num_outputs = module.get_num_outputs()
        outputs = {}
        for i in range(num_outputs):
            output_name = "output_{}".format(i)
            outputs[output_name] = module.get_output(i).asnumpy()

        return outputs, times