def test_task_extraction_cuda():
    """Check how many tasks are extracted for several networks on CUDA.

    For each workload returned by ``get_network`` the test asserts both the
    number of extracted tasks and the sum of their weights.
    """
    auto_scheduler.enable_relay_integration()
    try:
        target = tvm.target.Target("cuda")

        mod, params = get_network("mlp")
        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
        assert len(tasks) == 1
        assert sum(task_weights) == 2

        for layout in ["NHWC", "NCHW"]:
            mod, params = get_network("resnet-18", layout=layout)
            tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
            assert len(tasks) == 21
            assert sum(task_weights) == 22

            mod, params = get_network("mobilenet", layout=layout)
            tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
            assert len(tasks) == 20
            assert sum(task_weights) == 28

        for layout in ["NCDHW", "NDHWC"]:
            mod, params = get_network("resnet3d-18", layout=layout)
            tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
            assert len(tasks) == 21
            assert sum(task_weights) == 22
    finally:
        # The switch is toggled through a module-level call, so reset it even
        # when an assertion above fails; otherwise it would stay enabled for
        # whatever runs next in the same process.
        auto_scheduler.enable_relay_integration(False)
def test_task_extraction_cuda():
    """Verify task counts and summed task weights extracted for CUDA.

    The MLP is checked once; ResNet-18 and MobileNet are checked in both 2D
    layouts and the 3D ResNet-18 in both 3D layouts.
    """
    cuda = tvm.target.Target("cuda")

    def extract(network, **network_kwargs):
        # Returns (number of tasks, summed task weights) for one workload.
        relay_mod, relay_params = get_network(network, **network_kwargs)
        found, weights = auto_scheduler.extract_tasks(
            relay_mod["main"], relay_params, cuda)
        return len(found), sum(weights)

    num_tasks, total_weight = extract("mlp")
    assert num_tasks == 1
    assert total_weight == 2

    for layout in ("NHWC", "NCHW"):
        num_tasks, total_weight = extract("resnet-18", layout=layout)
        assert num_tasks == 24
        assert total_weight == 25

        num_tasks, total_weight = extract("mobilenet", layout=layout)
        assert num_tasks == 22
        assert total_weight == 30

    for layout in ("NCDHW", "NDHWC"):
        num_tasks, total_weight = extract("resnet3d-18", layout=layout)
        assert num_tasks == 23
        assert total_weight == 24, total_weight
def test_task_extraction_cuda():
    """Check the tasks extracted from the MLP and ResNet-18 workloads on CUDA.

    Asserts the number of extracted tasks and the sum of their weights.
    """
    auto_scheduler.enable_relay_integration()
    try:
        # One target object serves both workloads.
        target = tvm.target.Target("cuda")

        mod, params = get_network("mlp")
        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
        assert len(tasks) == 1
        assert sum(task_weights) == 2

        mod, params = get_network("resnet-18")
        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
        assert len(tasks) == 21
        assert sum(task_weights) == 22
    finally:
        # Undo the enable call above so the switch does not stay on after this
        # test, whether it passed or failed.
        auto_scheduler.enable_relay_integration(False)
def auto_scheduler_tune(network, target, input_name, log_file):
    """Tune all auto-scheduler tasks of ``network`` and log records to ``log_file``.

    Any existing ``log_file`` is removed first. Every network except "bert"
    is passed through ``ConvertLayout`` so that ``nn.conv2d`` uses NHWC.
    ``input_name`` is accepted but not used here.
    """
    if os.path.exists(log_file):
        os.remove(log_file)

    mod, net_params, _, _ = get_network(network)

    if network != "bert":
        # convert to NHWC layout
        layout_passes = tvm.transform.Sequential([
            relay.transform.RemoveUnusedFunctions(),
            relay.transform.ConvertLayout({'nn.conv2d': ['NHWC', 'default']}),
        ])
        with tvm.transform.PassContext(opt_level=3):
            mod = layout_passes(mod)

    # Only the runner differs between the CPU and non-CPU paths.
    if "cpu" in target.keys:
        runner = auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True)
    else:
        # The context object stays bound to a local for the rest of the call.
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            repeat=1, min_repeat_ms=300, timeout=10)
        runner = measure_ctx.runner

    tuning_opt = auto_scheduler.TuningOptions(
        num_measure_trials=20000,
        runner=runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], net_params, target)
    auto_scheduler.TaskScheduler(tasks, task_weights).tune(tuning_opt)
def local_auto_scheduler(self, repeat=1, min_repeat_ms=300, timeout=10, num_measure_trials=200):
    """Tune ``self.mod`` with a local RPC measure context and rebuild ``self._lib``.

    Parameters
    ----------
    repeat, min_repeat_ms, timeout
        Forwarded to ``auto_scheduler.LocalRPCMeasureContext``.
    num_measure_trials
        Forwarded to ``auto_scheduler.TuningOptions``.

    Tuning records are written to ``self.log_file``; the library is then
    rebuilt under ``ApplyHistoryBest`` for that file and stored on
    ``self._lib``.
    """
    # extract tasks
    tasks, task_weights = auto_scheduler.extract_tasks(
        self.mod["main"], self.params, self.target)
    for idx, task in enumerate(tasks):
        # Arguments are passed separately so the message is only formatted
        # when DEBUG logging is actually enabled.
        logger.debug("========== Task %d (workload key: %s) ==========",
                     idx, task.workload_key)
        logger.debug(task.compute_dag)

    # generate tuner
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)

    # Use the module logger here as well, so every message of this method
    # goes through the same logger instead of the root logger.
    logger.info("Begin tuning...")
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(
        repeat=repeat, min_repeat_ms=min_repeat_ms, timeout=timeout)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=num_measure_trials,
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(self.log_file)],
    )
    tuner.tune(tune_option)

    # update self.lib
    with auto_scheduler.ApplyHistoryBest(self.log_file):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True}):
            self._lib = relay.build(self.mod, target=self.target, params=self.params)
    logger.info("load optimized library from %s", self.log_file)
def test_custom_hash_func_extract_tasks():
    """Check that a registered custom DAG hash function shows up in workload keys.

    A counter-based hash is registered under
    ``auto_scheduler.compute_dag.hash_func``; the leading integer of every
    extracted task's workload key must then be one of the counter values.
    """

    @_ffi_api.register_func("auto_scheduler.compute_dag.hash_func")
    def counting_unique_hash(str_dag):
        # Hands out 0, 1, 2, ... without looking at the DAG string.
        issued = counting_unique_hash.i
        counting_unique_hash.i = issued + 1
        return issued

    counting_unique_hash.i = 0

    mod, _ = get_network("mobilenet", layout="NHWC")
    tasks, _ = auto_scheduler.extract_tasks(mod["main"], None, "llvm", include_simple_tasks=True)

    # A workload key looks like
    #   [43, [3, 3, 1024, 1], [1024], [3, 3, 1024, 1]]
    # where the first integer is the hash. Drop the opening bracket and keep
    # the text before the first comma.
    hash_values = [int(task.workload_key[1:].partition(",")[0]) for task in tasks]

    # Uniqueness together with the known minimum and maximum is sufficient to
    # conclude the hashes are exactly 0 .. counting_unique_hash.i - 1.
    distinct_hashes = set(hash_values)
    assert len(hash_values) == len(distinct_hashes)
    assert min(hash_values) == 0
    assert max(hash_values) == counting_unique_hash.i - 1
def test_tuning_cuda():
    """Tune the MLP workload for CUDA with a custom objective and build it.

    The objective passed to ``TaskScheduler`` is the weighted sum of the
    per-task costs. Tuning records go to a temporary file that is then used
    by ``ApplyHistoryBest`` for the final build.
    """
    auto_scheduler.enable_relay_integration()
    try:
        # Extract tasks
        mod, params = get_network("mlp")
        target = tvm.target.Target("cuda")
        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

        def objective(costs):
            # Weighted sum of the per-task costs.
            return sum(c * w for c, w in zip(costs, task_weights))

        with tempfile.NamedTemporaryFile() as fp:
            log_file = fp.name

            # Tuning
            measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=100)
            tuner = auto_scheduler.TaskScheduler(tasks, objective)
            tune_option = auto_scheduler.TuningOptions(
                num_measure_trials=2,
                num_measures_per_round=1,
                runner=measure_ctx.runner,
                measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
            )
            tuner.tune(tune_option, search_policy="sketch.random")
            del measure_ctx

            # Compile with the history best
            with auto_scheduler.ApplyHistoryBest(log_file):
                with tvm.transform.PassContext(opt_level=3):
                    lib = relay.build(mod, target=target, params=params)

        # Todo(merrymercy): compile without any history to test the fallback mechanism
    finally:
        # Reset the switch even if tuning or compilation raised, so a failure
        # here does not leave it enabled for later code.
        auto_scheduler.enable_relay_integration(False)
def tune_network(network, target):
    """Briefly tune ``network`` for ``target`` and build it from the best records.

    ``target`` is converted with ``tvm.target.Target``. Records are written
    to a temporary file, which is then handed to ``ApplyHistoryBest`` for the
    build.
    """
    # Task extraction
    relay_mod, relay_params = get_network(network)
    tvm_target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(
        relay_mod["main"], relay_params, tvm_target)

    auto_sched_cfg = {"relay.backend.use_auto_scheduler": True}

    with tempfile.NamedTemporaryFile() as tmp:
        records_path = tmp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
        options = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(records_path)],
        )
        scheduler.tune(options, search_policy="sketch.random")
        del measure_ctx

        # Build using the best recorded schedules
        with auto_scheduler.ApplyHistoryBest(records_path), tvm.transform.PassContext(
            opt_level=3, config=auto_sched_cfg
        ):
            lib = relay.build(relay_mod, target=tvm_target, params=relay_params)
def verify_task_extraction(func, expected_task, include_simple_tasks=False):
    """Assert that ``func`` yields ``expected_task`` tasks and as many weights.

    ``func`` is wrapped in an ``IRModule`` and its ``main`` function is given
    to ``auto_scheduler.extract_tasks``. ``target`` is not a parameter; it is
    looked up in the surrounding scope.
    """
    main_fn = tvm.IRModule.from_expr(func)["main"]
    extracted_tasks, extracted_weights = auto_scheduler.extract_tasks(
        main_fn, None, target, include_simple_tasks=include_simple_tasks
    )

    for extracted in (extracted_tasks, extracted_weights):
        assert len(extracted) == expected_task
def tune_network(network, target):
    """Tune ``network``, build it from the best records and check its output.

    The tuned build is compared with a build made at ``opt_level=0`` without
    the auto-scheduler, using random input data. Only the "mlp" and
    "winograd-test" networks have an input shape defined here; any other
    name raises ``ValueError`` after tuning.
    """
    # Task extraction
    relay_mod, relay_params = get_network(network)
    tvm_target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(
        relay_mod["main"], relay_params, tvm_target)

    auto_sched_cfg = {"relay.backend.use_auto_scheduler": True}

    with tempfile.NamedTemporaryFile() as tmp:
        records_path = tmp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
        options = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(records_path)],
        )
        scheduler.tune(options, search_policy="sketch.random")
        del measure_ctx

        # Build using the best recorded schedules
        with auto_scheduler.ApplyHistoryBest(records_path), tvm.transform.PassContext(
            opt_level=3, config=auto_sched_cfg
        ):
            tuned_lib = relay.build(relay_mod, target=tvm_target, params=relay_params)

    # Reference build: no auto-scheduler and no other optimization
    with tvm.transform.PassContext(opt_level=0):
        reference_lib = relay.build(relay_mod, target=tvm_target, params=relay_params)

    def get_output(data, lib):
        # Run one inference on the GPU and return the first output as numpy.
        executor = graph_runtime.GraphModule(lib["default"](tvm.gpu()))
        executor.set_input("data", data)
        executor.run()
        return executor.get_output(0).asnumpy()

    input_shapes = {
        "mlp": (1, 32),
        "winograd-test": (1, 23, 40, 32),
    }

    np.random.seed(0)
    if network not in input_shapes:
        raise ValueError("Unknown network: " + network)
    data = np.random.uniform(size=input_shapes[network])

    actual_output = get_output(data, tuned_lib)
    expected_output = get_output(data, reference_lib)
    tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4, atol=1e-4)
def test_task_extraction_cuda(params):
    """Check extracted task count and summed weights for one CUDA test case.

    ``params`` is a 4-tuple ``(network, layout, expected_task,
    expected_weights)``. Each task's description, workload key and weight is
    printed before the assertions.
    """
    network, layout, expected_task, expected_weights = params
    cuda = tvm.target.Target("cuda")

    # Use a separate name so the network parameters do not shadow the test
    # case tuple received as ``params``.
    mod, net_params = get_network(network, layout=layout)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], net_params, cuda)

    for position, task in enumerate(tasks):
        if position >= len(task_weights):
            # Mirrors zip(): stop at the shorter of the two sequences.
            break
        print(task.desc, task.workload_key, task_weights[position])

    assert len(tasks) == expected_task
    assert sum(task_weights) == expected_weights
def test_dump_workload_to_dag_extract_tasks():
    """Check the workload-to-DAG dump written during task extraction.

    The JSON read back from the dump file must equal a mapping from each
    task's workload key to the string form of its compute DAG.
    """
    mod, _ = get_network("mobilenet", layout="NHWC")

    with tempfile.NamedTemporaryFile() as dag_log:
        tasks, _ = auto_scheduler.extract_tasks(
            mod["main"],
            None,
            "llvm",
            include_simple_tasks=True,
            dump_workload_to_dag_log=dag_log.name,
        )

        expected = {}
        for task in tasks:
            expected[task.workload_key] = str(task.compute_dag)

        # Read the dump through the still-open temporary file handle.
        actual = json.load(dag_log)
        assert expected == actual
def autoscheduler_get_tuning_tasks(
    mod: tvm.IRModule,
    params: Dict[str, tvm.nd.NDArray],
    target: str,
    target_host: Optional[str] = None,
    alter_layout: Optional[str] = None,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
):
    """Extract the auto-scheduler tuning tasks of a relay module.

    Parameters
    ----------
    mod : tvm.IRModule
        The relay module from which to extract tuning tasks.
    params : dict
        The params for the relay module.
    target : str
        The compilation target. It is reconciled with ``target_host`` through
        ``Target.check_and_update_host_consist`` before extraction.
    target_host : str, optional
        The compilation target for the host.
    alter_layout : str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph
        will be converted to the chosen layout.
    hardware_params : Optional[HardwareParams]
        Hardware parameters used for the search tasks.
    include_simple_tasks : bool
        Forwarded unchanged to ``auto_scheduler.extract_tasks``.

    Returns
    -------
    tasks : list
        The tasks returned by ``auto_scheduler.extract_tasks``.
    weights : List[int]
        The weight (i.e. the number of appearance) of extracted tasks.
    """
    checked_target, target_host = Target.check_and_update_host_consist(
        target, target_host)

    # Layout conversion is optional and happens before extraction.
    relay_mod = common.convert_graph_layout(mod, alter_layout) if alter_layout else mod

    extracted_tasks, extracted_weights = auto_scheduler.extract_tasks(
        relay_mod["main"],
        params,
        target=checked_target,
        hardware_params=hardware_params,
        include_simple_tasks=include_simple_tasks,
    )
    return extracted_tasks, extracted_weights
def tune_and_check(mod, data, weight):
    """Tune ``mod`` for LLVM, build it and compare against an untuned build.

    ``weight`` is bound as the ``"weight"`` parameter in every extraction and
    build call; ``data`` is fed to the ``"data"`` input of both builds. The
    reference build uses ``opt_level=0`` without the auto-scheduler.
    """
    llvm_target = tvm.target.Target("llvm")
    weight_params = {"weight": weight}

    # Pull the tuning tasks out of the relay program
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod, target=llvm_target, params=weight_params)

    with tempfile.NamedTemporaryFile() as tmp:
        records_path = tmp.name

        # Tuning
        scheduler = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
        options = auto_scheduler.TuningOptions(
            num_measure_trials=1,
            num_measures_per_round=1,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(records_path)],
        )
        scheduler.tune(options, search_policy="sketch.random")

        # Tuned build
        with auto_scheduler.ApplyHistoryBest(records_path), tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_auto_scheduler": True},
        ):
            tuned_lib = relay.build(mod, target=llvm_target, params=weight_params)

    # Reference build without the auto-scheduler
    with tvm.transform.PassContext(opt_level=0):
        reference_lib = relay.build(mod, target=llvm_target, params=weight_params)

    def get_output(data, lib):
        # Run one inference on the CPU and return the first output as numpy.
        executor = graph_executor.GraphModule(lib["default"](tvm.cpu()))
        executor.set_input("data", data)
        executor.run()
        return executor.get_output(0).numpy()

    # Both builds must agree within tolerance
    actual_output = get_output(data, tuned_lib)
    expected_output = get_output(data, reference_lib)
    tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
def test_check_auto_schedule_tuning(host, port):  # pylint: disable=too-many-locals
    """Run a short auto-scheduler tuning session over RPC and check its log.

    The first two tasks of the MLP workload are tuned through an
    ``RPCRunner`` reached at ``host``/``port``. Any exception, including a
    measure record whose error code is not ``NO_ERROR``, is printed and turns
    into a single assertion failure at the end.
    """
    log_file = TEMPORARY_DIRECTORY.relpath("ios_tuning_stat.log")
    target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin")
    mod, params = relay.testing.mlp.get_workload(batch_size=4, image_shape=(1, 4, 4))

    status_ok = True
    try:
        rpc_runner = auto_scheduler.RPCRunner(
            DEVICE_KEY,
            host,
            port,
            min_repeat_ms=1,
            timeout=10,
            n_parallel=multiprocessing.cpu_count(),
        )
        dylib_builder = auto_scheduler.LocalBuilder(timeout=10, build_func=ios_create_dylib)
        options = auto_scheduler.TuningOptions(
            builder=dylib_builder,
            num_measure_trials=2,
            num_measures_per_round=1,
            runner=rpc_runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
            verbose=0,
        )

        all_tasks, all_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
        # Only the first two tasks are tuned.
        scheduler = auto_scheduler.TaskScheduler(all_tasks[:2], all_weights[:2])
        scheduler.tune(options, search_policy="sketch.random")

        # Every record in the tuning log must be error-free
        for _, measure_result in list(load_records(log_file)):
            if measure_result.error_no == MeasureErrorNo.NO_ERROR:
                continue
            raise ValueError(
                f"Error for MeasureResult. Error code: {measure_result.error_no},"
                f" for details see MeasureErrorNO.")
    except Exception as e:  # pylint: disable=broad-except
        status_ok = False
        print(e)

    assert status_ok, "Tuning failed, see logs."
def auto_scheduler_tune(network, batch_size, dtype, target, log_file): os.makedirs(os.path.dirname(log_file), exist_ok=True) #if os.path.exists(log_file): # os.remove(log_file) layout = "NHWC" mod, params, input_name, input_shape, output_shape = get_network( network, batch_size, dtype, layout) n_trials = network_to_n_trials[(network, batch_size, dtype, str(target.kind))] if "cpu" in target.keys: tuning_opt = auto_scheduler.TuningOptions( num_measure_trials=n_trials, runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) else: min_repeat_ms = 450 if network in ["bert"] else 300 measure_ctx = auto_scheduler.LocalRPCMeasureContext( repeat=1, min_repeat_ms=min_repeat_ms, timeout=10) tuning_opt = auto_scheduler.TuningOptions( num_measure_trials=n_trials, runner=measure_ctx.runner, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) print(log_file) update_file(log_file, tasks) return for idx, task in enumerate(tasks): print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) print(task.compute_dag) tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tuner.tune(tuning_opt)
def remote_auto_scheduler(self, device_key, rpc_host, rpc_port):
    """Tune ``self.mod`` on a remote device over RPC and rebuild ``self._lib``.

    ``device_key``, ``rpc_host`` and ``rpc_port`` are passed to
    ``auto_scheduler.RPCRunner``. Tuning records are written to
    ``self.log_file``; the library is then rebuilt under ``ApplyHistoryBest``
    for that file and stored on ``self._lib``.
    """
    # Task extraction
    tasks, task_weights = auto_scheduler.extract_tasks(
        self.mod["main"], self.params, self.target)
    for idx, task in enumerate(tasks):
        logger.debug("========== Task %d (workload key: %s) ==========",
                     idx, task.workload_key)
        logger.debug(task.compute_dag)

    # Tuner and its options
    scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
    local_builder = auto_scheduler.LocalBuilder()
    rpc_runner = auto_scheduler.RPCRunner(
        device_key,
        host=rpc_host,
        port=rpc_port,
        timeout=30,
        repeat=1,
        min_repeat_ms=200,
        enable_cpu_cache_flush=True,
    )
    options = auto_scheduler.TuningOptions(
        num_measure_trials=200,
        builder=local_builder,
        runner=rpc_runner,
        measure_callbacks=[auto_scheduler.RecordToFile(self.log_file)],
    )
    scheduler.tune(options)

    # Rebuild the library from the best records
    auto_sched_cfg = {"relay.backend.use_auto_scheduler": True}
    with auto_scheduler.ApplyHistoryBest(self.log_file), tvm.transform.PassContext(
        opt_level=3, config=auto_sched_cfg
    ):
        self._lib = relay.build(self.mod, target=self.target, params=self.params)
    logger.info(f"load optimized library from {self.log_file}")
def tune_network(network, target):
    """Briefly tune ``network`` for ``target`` and build it from the best records.

    Relay integration is enabled for the duration of the call and disabled
    again afterwards, including when tuning or compilation raises.
    """
    auto_scheduler.enable_relay_integration()
    try:
        # Extract tasks
        mod, params = get_network(network)
        target = tvm.target.Target(target)
        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

        with tempfile.NamedTemporaryFile() as fp:
            log_file = fp.name

            # Tuning
            measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tune_option = auto_scheduler.TuningOptions(
                num_measure_trials=100,
                num_measures_per_round=2,
                early_stopping=1,
                runner=measure_ctx.runner,
                builder=auto_scheduler.LocalBuilder(timeout=60),
                measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
            )
            tuner.tune(tune_option, search_policy="sketch.random")
            del measure_ctx

            # Compile with the history best
            with auto_scheduler.ApplyHistoryBest(log_file):
                with tvm.transform.PassContext(opt_level=3):
                    lib = relay.build(mod, target=target, params=params)

        # Todo(merrymercy): when the cpu backend is upstreamed, do the following things:
        # 1. compile without history to test the fallback mechanism
        # 2. check the correctness of layout rewrite / winograd pre-transform
    finally:
        # Reset the switch on every exit path so a failure here does not
        # leave it enabled for later code.
        auto_scheduler.enable_relay_integration(False)
################################################################# # Extract Search Tasks # -------------------- # Next, we extract the search tasks and their weights from a network. # The weight of a task is the number of appearances of the task's subgraph # in the whole network. # By using the weight, we can approximate the end-to-end latency of the network # as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the # latency of a task and :code:`weight[t]` is the weight of the task. # The task scheduler will just optimize this objective. # Extract tasks from the network print("Extract tasks...") mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host) for idx, task in enumerate(tasks): print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) print(task.compute_dag) ###################################################################### # .. note:: How to get the hardware parameters from remote device # # .. code-block:: python # # from tvm.auto_scheduler.utils import request_remote # remote = request_remote(device_key, "0.0.0.0", 9190) # dev = remote.cl() # max_shared_memory_per_block = dev.max_shared_memory_per_block # # There is no explicit local memory limition # # so we can use INT32_MAX to disalbe the check on local_memory.
def _tvm_compile(fx_module, example_inputs, target=None, tuning_logfile=None, use_ansor_tuning=False):
    """Compile ``fx_module`` with TVM and return a callable that executes it.

    Parameters
    ----------
    fx_module
        Module that is scripted with ``torch.jit.script`` and imported through
        ``relay.frontend.from_pytorch``.
    example_inputs
        Example inputs; only their ``.shape`` is read, to declare the relay
        inputs ``inp_0``, ``inp_1``, ...
    target : str or tvm.target.Target
        TVM target. A target mentioning "cuda" runs on ``tvm.cuda(0)``,
        anything else on ``tvm.cpu(0)``.
    tuning_logfile : str, optional
        Base name of the auto-scheduler log; ``".json"`` is appended.
    use_ansor_tuning : bool
        When true, tasks are extracted and tuned before building.

    Returns
    -------
    exec_tvm : callable
        ``exec_tvm(*args)`` sets the inputs, runs the graph and returns the
        outputs as a list of torch tensors.

    Raises
    ------
    ValueError
        If ``target`` is ``None``.
    FileNotFoundError
        If a tuning log is expected but does not exist.
    """
    import os
    import time

    import tvm
    from tvm import auto_scheduler, relay
    from tvm.contrib import graph_executor

    # Find the target and device for TVM.
    dev = tvm.cpu(0)
    if target is None:
        raise ValueError("Setup the TVM target correctly.")
    elif isinstance(target, str):
        if "cuda" in target:
            dev = tvm.cuda(0)
        target = tvm.target.Target(target)
    elif isinstance(target, tvm.target.target.Target):
        if "cuda" in target.keys:
            dev = tvm.cuda(0)

    # JIT the model and pass it to Torchscript to Relay frontend parser. TVM
    # tutorials suggest tracing instead of scripting. The main reason is to
    # avoid Pythonic computation to show up in JIT module. However, with Python
    # key tracing, AOT Autograd leads to simpler graphs. Therefore, we use
    # scripting here to retrieve the JIT module.
    jit_mod = torch.jit.script(fx_module)
    shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
    mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)

    # TVM Autotuning. ``log_file`` stays None when no tuning history applies.
    log_file = None
    if use_ansor_tuning:
        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
        if tuning_logfile is None:
            log_file = f"{time.time()}.json"
        else:
            log_file = f"{tuning_logfile}.json"
        if tasks:
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tune_option = auto_scheduler.TuningOptions(
                num_measure_trials=20000,
                measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
                # early_stopping=1000,
                # verbose=2,
            )
            tuner.tune(tune_option)
        elif not os.path.exists(log_file):
            # No task was extracted, so nothing was tuned and no log was
            # written. Build without history instead of failing on a file
            # that was never going to exist.
            log_file = None
    elif tuning_logfile is not None:
        log_file = f"{tuning_logfile}.json"

    if log_file is not None:
        # Explicit check instead of ``assert`` so it also runs under ``-O``.
        if not os.path.exists(log_file):
            raise FileNotFoundError(f"TVM tuning log file not found: {log_file}")
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)
    else:
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=target, params=params)

    # Get a graph executor graph module
    m = graph_executor.GraphModule(lib["default"](dev))

    def exec_tvm(*args):
        for idx, arg in enumerate(args):
            # Zero-dimensional arguments are not passed to the graph.
            if arg.dim() != 0:
                m.set_input(
                    f"inp_{idx}",
                    tvm.nd.from_dlpack(
                        torch.utils.dlpack.to_dlpack(arg.contiguous())),
                )
        m.run()
        outs = [
            torch.utils.dlpack.from_dlpack(m.get_output(i).to_dlpack())
            for i in range(m.get_num_outputs())
        ]
        return outs

    return exec_tvm
def test_task_extraction():
    """Check the number of tasks extracted from several small relay functions.

    Covered cases: two conv2d+relu stages, the same graph marked primitive, a
    function made only of ``affine_grid`` (with and without
    ``include_simple_tasks``), and a graph containing NMS followed by a
    primitive ReLU function.
    """
    dtype = "float32"
    target = tvm.target.Target("llvm")
    ishape = (1, 3, 224, 224)
    w1shape = (32, 3, 3, 3)
    w2shape = (32, 32, 3, 3)

    def make_inputs():
        # Fresh data/weight variables shared by the conv2d-based graphs.
        return [
            relay.var("data", shape=ishape, dtype=dtype),
            relay.var("weight1", shape=w1shape, dtype=dtype),
            relay.var("weight2", shape=w2shape, dtype=dtype),
        ]

    def get_func():
        data, weight1, weight2 = make_inputs()
        body = relay.nn.conv2d(data, weight1, kernel_size=(3, 3), padding=(1, 1))
        body = relay.nn.relu(body)
        body = relay.nn.conv2d(body, weight2, kernel_size=(3, 3), padding=(1, 1))
        body = relay.nn.relu(body)
        return relay.Function([data, weight1, weight2], body)

    def get_fused_func():
        outer_inputs = make_inputs()
        # Mark the inner function primitive so that fuse_ops leaves it alone.
        primitive = get_func().with_attr("Primitive", tvm.tir.IntImm("int32", 1))
        return relay.Function(outer_inputs, relay.Call(primitive, outer_inputs))

    def get_simple_func():
        data = relay.var("data", relay.TensorType((1, 2, 3), "float32"))
        return relay.Function([data], relay.image.affine_grid(data, (150, 150)))

    def get_func_with_unsupported_op():
        def get_postproc_func():
            data = relay.var("data", shape=(1, 3, 6), dtype=dtype)
            postproc = relay.Function([data], relay.nn.relu(data))
            return postproc.with_attr("Primitive", tvm.tir.IntImm("int32", 1))

        cls_prob = relay.var("cls_prob", relay.ty.TensorType((1, 3, 3), "float32"))
        loc_pred = relay.var("loc_pred", relay.ty.TensorType((1, 3 * 4), "float32"))
        anchors = relay.var("anchors", relay.ty.TensorType((1, 3, 4), "float32"))

        mtl = relay.vision.multibox_transform_loc(
            cls_prob=cls_prob, loc_pred=loc_pred, anchor=anchors)
        nms = relay.vision.non_max_suppression(mtl[0], mtl[1], mtl[0], return_indices=False)
        out = relay.Call(get_postproc_func(), [nms])
        return relay.Function([cls_prob, loc_pred, anchors], out)

    def assert_task_count(mod, expected, **extract_kwargs):
        # Extraction must yield ``expected`` tasks and as many weights.
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"], None, target, **extract_kwargs)
        assert len(tasks) == expected
        assert len(task_weights) == expected

    # Relay FuseOps puts two conv2ds to separate functions and results in two tasks.
    assert_task_count(tvm.IRModule.from_expr(get_func()), 2)

    # By setting the function to primitive, Relay FuseOps will not break it and result in one task.
    assert_task_count(tvm.IRModule.from_expr(get_fused_func()), 1)

    simple_mod = tvm.IRModule.from_expr(get_simple_func())
    # The Relay function without complex ops will not form a task by default.
    assert_task_count(simple_mod, 0)
    # Every Relay function becomes a task regardless what ops in its body.
    assert_task_count(simple_mod, 1, include_simple_tasks=True)

    # Func1 (with NMS) -> Func2 (injective).
    # The function with NMS should fail, but the other function with ReLU should be a task.
    unsupported_mod = tvm.IRModule.from_expr(get_func_with_unsupported_op())
    assert_task_count(unsupported_mod, 1, include_simple_tasks=True)
def main():
    """Tune an ONNX model with the auto-scheduler over RPC and run it remotely.

    Everything is configured through the module-level ``ARGS`` object. Tuning
    records are written to ``<work_dir>/<model_name>.json``. Only the "llvm"
    and "cuda" target kinds are handled; any other kind raises
    ``NotImplementedError``.
    """
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
        timeout=ARGS.rpc_config.session_timeout_sec,
    )
    # Hardware parameters are derived from the target's attributes.
    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)
    # Map every declared input name to its shape for the ONNX importer.
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f" input_name : {item['name']}")
        print(f" input_shape: {item['shape']}")
        print(f" input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    input_data = {
        item["name"]: generate_input_data(item["shape"], item["dtype"])
        for item in ARGS.input_shape
    }

    # NOTE(review): extraction, tuning and the build are assumed to all run
    # inside the profiler context -- confirm against the original layout.
    with ms.Profiler() as profiler:
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"],
            params,
            target=ARGS.target,
            hardware_params=hardware_params,
        )
        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
            print(f"==== Task {idx}: {task.desc} "
                  f"(weight {task_weight} key: {task.workload_key}) =====")
            print(task.compute_dag)

        # Tuning is skipped when no trials are requested; the build below
        # then relies on whatever ``log_file`` already contains.
        if ARGS.num_trials > 0:
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tuner.tune(
                auto_scheduler.TuningOptions(
                    num_measure_trials=ARGS.num_trials,
                    runner=runner,
                    measure_callbacks=[
                        auto_scheduler.RecordToFile(log_file),
                    ],
                ),
                adaptive_training=ARGS.adaptive_training,
            )

        # Select the compile function for the requested backend; an unknown
        # backend name raises KeyError here.
        relay_build = {
            "graph": relay.build,
            "vm": relay.vm.compile
        }[ARGS.backend]
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True},
            ):
                lib = relay_build(
                    mod,
                    target=ARGS.target,
                    params=params,
                )
    print("Tuning Time:")
    print(profiler.table())

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=create_timer(ARGS.backend),
        backend=ARGS.backend,
    )
from matplotlib import pyplot as plt out_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode="L") out_cb = img_cb.resize(out_y.size, Image.BICUBIC) out_cr = img_cr.resize(out_y.size, Image.BICUBIC) result = Image.merge("YCbCr", [out_y, out_cb, out_cr]).convert("RGB") canvas = np.full((672, 672 * 2, 3), 255) canvas[0:224, 0:224, :] = np.asarray(img) canvas[:, 672:, :] = np.asarray(result) plt.imshow(canvas.astype(np.uint8)) plt.show() ###################################################################### # Tune the model # --------------------------------------------- tasks, task_weights = auto_scheduler.extract_tasks(mod['main'], params, target) dtype = "float32" batch_size = 1 log_file = "%s-B%d-%s.json" % ("superres", batch_size, target) for idx, task in enumerate(tasks): print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) print(task.compute_dag) def run_tuning(): print("Begin tuning...") measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300,
def tune_network(network, target):
    """Tune ``network``, exercise the history loaders and check the outputs.

    Two tuned builds are produced, one under ``ApplyHistoryBest`` with the
    recorded log and one under ``ApplyHistoryBestOrSample(None, ...)``. Both
    are compared with a build made at ``opt_level=0`` without the
    auto-scheduler, on random input. ``ApplyHistoryBest`` is also entered
    with a list of log files and with an iterator of loaded records.

    Only "mlp" and "winograd-test" have an input shape defined here; any
    other name raises ``ValueError`` after tuning.
    """
    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60, device=0)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
            ):
                lib = relay.build(mod, target=target, params=params)

        # Also test that multiple log files can be loaded.
        with auto_scheduler.ApplyHistoryBest([log_file, log_file]) as best:
            assert isinstance(
                best, auto_scheduler.dispatcher.ApplyHistoryBest
            ), "Unable to load multiple log files jointly."

        # Confirm iterables can be directly loaded.
        loaded_recs = auto_scheduler.dispatcher.load_records(log_file)
        with auto_scheduler.ApplyHistoryBest(iter(loaded_recs)) as best:
            assert isinstance(
                best, auto_scheduler.dispatcher.ApplyHistoryBest
            ), "Unable to ingest logs from an iterator."

    # Sample a schedule when missing
    with auto_scheduler.ApplyHistoryBestOrSample(None, num_measure=2):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib2 = relay.build(mod, target=target, params=params)

    # Compile without auto-scheduler and any other optimization for correctness check
    with tvm.transform.PassContext(opt_level=0):
        ref_lib = relay.build(mod, target=target, params=params)

    # Check the correctness
    def get_output(data, lib):
        # Run one inference on the CUDA device and return the first output.
        dev = tvm.cuda()
        module = graph_executor.GraphModule(lib["default"](dev))
        module.set_input("data", data)
        module.run()
        return module.get_output(0).numpy()

    np.random.seed(0)
    if network == "mlp":
        data = np.random.uniform(size=(1, 32))
    elif network == "winograd-test":
        data = np.random.uniform(size=(1, 23, 40, 32))
    else:
        raise ValueError("Unknown network: " + network)

    actual_output1 = get_output(data, lib)
    actual_output2 = get_output(data, lib2)
    expected_output = get_output(data, ref_lib)

    tvm.testing.assert_allclose(actual_output1, expected_output, rtol=1e-4, atol=1e-4)
    tvm.testing.assert_allclose(actual_output2, expected_output, rtol=1e-4, atol=1e-4)
def main():
    """Tune an ONNX model with the auto-scheduler, then profile it over RPC.

    All settings are read from the module-level ``ARGS``.  The steps are:
    build an RPC runner and target-specific hardware parameters, import the
    ONNX model, extract and tune the tasks while recording to
    ``<work_dir>/<model_name>.json``, compile with the best records, and
    finally run the result remotely twice -- once timing the whole graph and
    once timing each graph node.

    Raises
    ------
    NotImplementedError
        If the target kind is neither ``"llvm"`` nor ``"cuda"``.
    """
    records_path = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    rpc_runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )

    # Collect the hardware description as keyword arguments per target kind.
    target_kind = ARGS.target.kind.name
    if target_kind == "llvm":
        hw_kwargs = {
            "num_cores": int(ARGS.target.attrs["num-cores"]),
            "target": ARGS.target,
        }
    elif target_kind == "cuda":
        hw_kwargs = {
            "num_cores": -1,
            "vector_unit_bytes": 16,
            "cache_line_bytes": 64,
            "max_shared_memory_per_block": int(
                ARGS.target.attrs["max_shared_memory_per_block"]
            ),
            "max_threads_per_block": int(ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            "max_local_memory_per_block": 12345678,
            "max_vthread_extent": 8,
            "warp_size": 32,
        }
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")
    hardware_params = auto_scheduler.HardwareParams(**hw_kwargs)

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)

    # Echo every declared input and remember its shape for the importer.
    shape_dict = {}
    for spec in ARGS.input_shape:
        print(f" input_name: {spec['name']}")
        print(f" input_shape: {spec['shape']}")
        print(f" input_dtype: {spec['dtype']}")
        shape_dict[spec["name"]] = spec["shape"]

    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"],
        params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    for idx, (task, weight) in enumerate(zip(tasks, task_weights)):
        print(
            f"==== Task {idx}: {task.desc} (weight {weight} key: {task.workload_key}) ====="
        )
        print(task.compute_dag)

    tuning_options = auto_scheduler.TuningOptions(
        num_measure_trials=ARGS.num_trials,
        runner=rpc_runner,
        measure_callbacks=[auto_scheduler.RecordToFile(records_path)],
    )
    scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
    scheduler.tune(tuning_options)

    # Compile using the best schedules found during tuning.
    with auto_scheduler.ApplyHistoryBest(records_path), tvm.transform.PassContext(
        opt_level=3,
        config={"relay.backend.use_auto_scheduler": True},
    ):
        lib = relay.build(mod, target=ARGS.target, params=params)

    graph = lib.graph_json
    rt_mod = lib.lib
    params = lib.params

    # Random inputs: uniform floats for float dtypes, integers in [0, 10000) otherwise.
    input_data = {}
    for spec in ARGS.input_shape:
        name = spec["name"]
        shape = spec["shape"]
        dtype = spec["dtype"]
        if not dtype.startswith("float"):
            input_data[name] = np.random.randint(
                low=0, high=10000, size=shape, dtype=dtype
            )
        else:
            input_data[name] = np.random.uniform(size=shape).astype(dtype)

    def f_timer(rt_mod, dev, input_data):
        """Time the whole graph with ``time_evaluator`` and print the results in ms."""
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.graph_executor import GraphModule

        # pylint: enable=import-outside-toplevel
        executor = GraphModule(rt_mod["default"](dev))
        for key, value in input_data.items():
            executor.set_input(key, value)
        ftimer = executor.module.time_evaluator(
            "run",
            dev,
            min_repeat_ms=500,
            repeat=3,
        )
        seconds = np.array(ftimer().results)  # type: ignore
        results = list(seconds * 1000.0)
        print("Running time in time_evaluator: ", results)

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_timer,
    )

    def f_per_layer(rt_mod, dev, input_data):
        """Run each graph node individually and print its time next to its name."""
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint: enable=import-outside-toplevel
        debugger = create(graph, rt_mod, dev)
        for key, value in input_data.items():
            debugger.set_input(key, value)
        graph_nodes = [node["name"] for node in json.loads(graph)["nodes"]]
        graph_time = debugger.run_individual(number=10, repeat=1, min_repeat_ms=5000)
        print("|graph_nodes| = ", len(graph_nodes))
        print("|graph_time| = ", len(graph_time))
        # Pair names with times; a repeated name keeps only its last time.
        graph_nodes_time = dict(zip(graph_nodes, map(float, graph_time)))
        for node_name, node_time in graph_nodes_time.items():
            print(f"{node_name} : {node_time:.3f}")

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=rt_mod,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_per_layer,
    )
def main():
    """Convert an ONNX model to a sparse model, tune it, and time it remotely.

    Settings are read from the module-level ``args`` (``onnx_path``, ``bsr``,
    ``bsc``, ``sparsity``).  The model is imported, rewritten with the
    ``simplify_fc_transpose`` and ``bsr_conv2d`` converters, tuned with the
    auto-scheduler through an RPC runner, compiled with the best records,
    uploaded to the remote device and timed there.

    The tuning log is written next to the ONNX file, using the same path with
    the final extension replaced by ``.json``.
    """
    # Local import: only needed for deriving the log-file name below.
    import os

    # Get Model
    print("Get Model...")
    onnx_model = onnx.load(args.onnx_path)
    # Map every graph input name to its declared dimensions.
    shape_dict = {
        graph_input.name: [
            dim.dim_value for dim in graph_input.type.tensor_type.shape.dim
        ]
        for graph_input in onnx_model.graph.input
    }

    mod, params = relay.frontend.from_onnx(onnx_model)

    bs_r = args.bsr
    bs_c = args.bsc
    sparsity = args.sparsity

    # Convert to Sparse Model
    mod, params = ddo.simplify_fc_transpose.convert(mod["main"], params)
    mod, params = ddo.bsr_conv2d.convert(
        mod, params, (bs_r, bs_c), sparsity_threshold=sparsity, layout="NHWC"
    )
    mod = tvm.IRModule.from_expr(mod)

    # Set tune config
    target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon")
    device_key = "pixel2"
    rpc_host = "127.0.0.1"
    rpc_port = 9190

    # Strip only the final extension.  Splitting on every "." mangles paths
    # such as "./model.onnx" or "model.v2.onnx" and fails when there is no dot.
    log_file = f"{os.path.splitext(str(args.onnx_path))[0]}.json"

    # Extract tasks
    print("Extract tasks...")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    for idx, task in enumerate(tasks):
        print(f"========== Task {idx} (workload key: {task.workload_key}) ==========")
        print(task.compute_dag)

    def tune_and_evaluate():
        """Tune the extracted tasks, build the model, and time it on the device."""
        print("Begin tuning...")
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=200,
            builder=auto_scheduler.LocalBuilder(build_func="ndk"),
            runner=auto_scheduler.RPCRunner(
                device_key,
                host=rpc_host,
                port=rpc_port,
                timeout=30,
                repeat=1,
                min_repeat_ms=200,
                enable_cpu_cache_flush=True,
            ),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option)

        # Compile with the history best
        print("Compile...")
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
            ):
                lib = relay.build(mod, target=target, params=params)

        # Export library
        tmp = tempdir()
        filename = "net.so"
        lib.export_library(tmp.relpath(filename), ndk.create_shared)

        # Upload module to device
        print("Upload...")
        remote = auto_scheduler.utils.request_remote(
            device_key, rpc_host, rpc_port, timeout=10000
        )
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # Create graph executor
        dev = remote.cpu()
        module = graph_executor.GraphModule(rlib["default"](dev))
        # Feed every declared graph input with random float32 data.
        for key, value in shape_dict.items():
            data_tvm = tvm.nd.array(np.random.uniform(size=value).astype("float32"))
            module.set_input(key, data_tvm)

        # Evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500)
        prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
        print(
            f"Mean inference time (std dev): {np.mean(prof_res):.2f} ms "
            f"({np.std(prof_res):.2f} ms)"
        )

    tune_and_evaluate()