def tune_and_evaluate(): print("Begin tuning...") tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( num_measure_trials= 200, # change this to 20000 to achieve the best performance builder=auto_scheduler.LocalBuilder( build_func="ndk" if use_ndk else "default"), runner=auto_scheduler.RPCRunner( device_key, host=rpc_host, port=rpc_port, timeout=30, repeat=1, min_repeat_ms=200, enable_cpu_cache_flush=True, ), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) tuner.tune(tune_option) # Compile with the history best print("Compile...") with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, params=params) # Export library tmp = tempdir() if use_ndk: from tvm.contrib import ndk filename = "net.so" lib.export_library(tmp.relpath(filename), ndk.create_shared) else: filename = "net.tar" lib.export_library(tmp.relpath(filename)) # Upload module to device print("Upload...") remote = auto_scheduler.utils.request_remote(device_key, rpc_host, rpc_port, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) # Create graph executor dev = remote.cpu() module = graph_executor.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array( (np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") print(module.benchmark(dev, repeat=3, min_repeat_ms=500))
def tune_and_evaluate(): print("Begin tuning...") tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( num_measure_trials=200, builder=auto_scheduler.LocalBuilder(build_func="ndk"), runner=auto_scheduler.RPCRunner( device_key, host=rpc_host, port=rpc_port, timeout=30, repeat=1, min_repeat_ms=200, enable_cpu_cache_flush=True, ), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) tuner.tune(tune_option) # Compile with the history best print("Compile...") with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, params=params) # Export library tmp = tempdir() filename = "net.so" lib.export_library(tmp.relpath(filename), ndk.create_shared) # Upload module to device print("Upload...") remote = auto_scheduler.utils.request_remote(device_key, rpc_host, rpc_port, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) # Create graph executor dev = remote.cpu() module = graph_executor.GraphModule(rlib["default"](dev)) for key, value in shape_dict.items(): data_tvm = tvm.nd.array( (np.random.uniform(size=value)).astype("float32")) module.set_input(key, data_tvm) # Evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def main(): log_file = os.path.join(ARGS.log_dir, f"{ARGS.workload}.json") workload_func, params = CONFIGS[ARGS.workload] params = params[0] # type: ignore workload_func = auto_scheduler.register_workload(workload_func) if ARGS.target.kind.name == "llvm": hardware_params = auto_scheduler.HardwareParams( num_cores=int(ARGS.target.attrs["num-cores"]), target=ARGS.target, ) elif ARGS.target.kind.name == "cuda": hardware_params = auto_scheduler.HardwareParams( num_cores=-1, vector_unit_bytes=16, cache_line_bytes=64, max_shared_memory_per_block=int( ARGS.target.attrs["max_shared_memory_per_block"]), max_threads_per_block=int( ARGS.target.attrs["max_threads_per_block"]), max_vthread_extent=8, warp_size=32, ) else: raise NotImplementedError(f"Unsupported target {ARGS.target}") task = auto_scheduler.SearchTask( func=workload_func, args=params, target=ARGS.target, hardware_params=hardware_params, ) runner = auto_scheduler.RPCRunner( key=ARGS.rpc_key, host=ARGS.rpc_host, port=ARGS.rpc_port, n_parallel=ARGS.rpc_workers, number=3, repeat=1, min_repeat_ms=100, enable_cpu_cache_flush=False, ) # Inspect the computational graph print("Computational DAG:") print(task.compute_dag) tune_option = auto_scheduler.TuningOptions( num_measure_trials=ARGS.num_trials, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], verbose=2, runner=runner, ) print("Running AutoTuning:") task.tune(tune_option) print("History Best:") print(task.print_best(log_file)) sch, args = task.apply_best(log_file) print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True))
def tune_and_evaluate(): print("Begin tuning...") tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( num_measure_trials= 200, # change this to 20000 to achieve the best performance builder=auto_scheduler.LocalBuilder( build_func="ndk" if use_ndk else "default"), runner=auto_scheduler.RPCRunner(device_key, host="0.0.0.0", port=9190, repeat=3, timeout=50), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) tuner.tune(tune_option) # Compile the whole network print("Compile...") with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, target_host=target_host, params=params) # Create graph runtime print("=============== Request Remote ===============") from tvm.auto_scheduler.utils import request_remote remote = request_remote(device_key, "0.0.0.0", 9190) ctx = remote.cl() from tvm.contrib import utils, ndk temp = utils.tempdir() filename = "deploy_lib.so" path_lib = temp.relpath(filename) lib.export_library(path_lib, ndk.create_shared) remote.upload(path_lib) loaded_lib = remote.load_module(filename) module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) data = (np.random.uniform(size=input_shape)).astype(dtype) data_tvm = tvm.nd.array(data) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def run_tuning(tasks, task_weights, log_file): print("Begin tuning...") measure_runner = auto_scheduler.RPCRunner("m1", "127.0.0.1", 9190, min_repeat_ms=300, timeout=30, repeat=2) tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( num_measure_trials=10000, runner=measure_runner, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], verbose=2, ) tuner.tune(tune_option)
def test_check_auto_schedule_tuning(host, port): # pylint: disable=too-many-locals log_file = TEMPORARY_DIRECTORY.relpath("ios_tuning_stat.log") target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin") mod, params = relay.testing.mlp.get_workload(batch_size=4, image_shape=(1, 4, 4)) try: status_ok = True measure_runner = auto_scheduler.RPCRunner( DEVICE_KEY, host, port, min_repeat_ms=1, timeout=10, n_parallel=multiprocessing.cpu_count(), ) builder = auto_scheduler.LocalBuilder(timeout=10, build_func=ios_create_dylib) tune_option = auto_scheduler.TuningOptions( builder=builder, num_measure_trials=2, num_measures_per_round=1, runner=measure_runner, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], verbose=0, ) tasks, task_weights = auto_scheduler.extract_tasks( mod["main"], params, target) tasks, task_weights = tasks[:2], task_weights[:2] tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tuner.tune(tune_option, search_policy="sketch.random") # Check tuning log tuning_statistic = list(load_records(log_file)) for _, measure_result in tuning_statistic: if measure_result.error_no != MeasureErrorNo.NO_ERROR: raise ValueError( f"Error for MeasureResult. Error code: {measure_result.error_no}," f" for details see MeasureErrorNO.") except Exception as e: # pylint: disable=broad-except status_ok = False print(e) assert status_ok, "Tuning failed, see logs."
def remote_auto_scheduler(self, device_key, rpc_host, rpc_port): # generate tasks tasks, task_weights = auto_scheduler.extract_tasks( self.mod["main"], self.params, self.target) for idx, task in enumerate(tasks): logger.debug("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) logger.debug(task.compute_dag) # generate tuner tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( num_measure_trials=200, builder=auto_scheduler.LocalBuilder(), runner=auto_scheduler.RPCRunner( device_key, host=rpc_host, port=rpc_port, timeout=30, repeat=1, min_repeat_ms=200, enable_cpu_cache_flush=True, ), measure_callbacks=[auto_scheduler.RecordToFile(self.log_file)], ) tuner.tune(tune_option) # update self.lib with auto_scheduler.ApplyHistoryBest(self.log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}): self._lib = relay.build(self.mod, target=self.target, params=self.params) logger.info(f"load optimized library from {self.log_file}")
# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `conv2d.json`. # The measurement records can be used to query the history best, resume the search, # and do more analyses later. # * see :any:`auto_scheduler.TuningOptions`, # :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) tune_option = auto_scheduler.TuningOptions( # change this to 1000 to achieve the best performance num_measure_trials=1024 * len(tasks) + 1, # runner=measure_ctx.runner, runner=auto_scheduler.RPCRunner( "nico2_v100_32", # change the device key to your key "0.0.0.0", 9190, n_parallel=2, number=5, repeat=1, timeout=20, min_repeat_ms=300, ), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], verbose=2, ) print("Begin tuning multiple convs...") # measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400, timeout=10) tuner = auto_scheduler.TaskScheduler(tasks, strategy='round-robin') tuner.tune(tune_option) sys.exit() ######################################################################
if __name__ == "__main__": target = tvm.target.Target("metal --max_num_threads=1024") train_flag = True #target = tvm.target.Target("llvm -mcpu=apple-latest -mtriple=arm64-apple-darwin20.1.0") M, N, K = 128, 3072, 768 task = auto_scheduler.SearchTask(func=gemm, args=(M, N, K), target=target) log_file = "gemm_{M}_{N}_{K}.json".format(M=M, N=N, K=K) # Inspect the computational graph print(task.compute_dag) if train_flag: measure_runner = auto_scheduler.RPCRunner("m1", "127.0.0.1", 9190, min_repeat_ms=300, timeout=30, repeat=3) tune_option = auto_scheduler.TuningOptions( num_measure_trials=1000, check_correctness=True, builder_n_parallel=1, runner=measure_runner, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], verbose=2, ) task.tune(tune_option) sch, args = task.apply_best(log_file) # Kill the process for measurement del measure_runner
# `sparse_dense.json`. # The measurement records can be used to query the history best, resume the search, # and do more analyses later. # * see :any:`auto_scheduler.TuningOptions` for more parameters # * Here, we need to create a :code:`auto_scheduler.SketchPolicy` object, and add the custom sketch # rule as a `init_search_callbacks`. log_file = "spmm_mali.json" tune_option = auto_scheduler.TuningOptions( num_measure_trials=1000, builder=auto_scheduler.LocalBuilder(build_func="default"), runner=auto_scheduler.RPCRunner( device_key, host=rpc_host, port=rpc_port, timeout=30, repeat=1, min_repeat_ms=200, enable_cpu_cache_flush=True, ), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], verbose=2, ) search_policy = auto_scheduler.SketchPolicy( task, program_cost_model=auto_scheduler.XGBModel(), init_search_callbacks=[ auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func, "SparseDense") ],
def tune_and_evaluate(): print("Begin tuning...") tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( num_measure_trials= 200, # change this to 20000 to achieve the best performance runner=auto_scheduler.RPCRunner( device_key, host="0.0.0.0", port=9191, timeout=30, repeat=1, min_repeat_ms=200, enable_cpu_cache_flush=True, ), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) tuner.tune(tune_option) # Compile with the history best print("Compile...") with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, params=params) # Export library tmp = tempdir() if use_ndk: from tvm.contrib import ndk filename = "net.so" lib.export_library(tmp.relpath(filename), ndk.create_shared) else: filename = "net.tar" lib.export_library(tmp.relpath(filename)) # Upload module to device print("Upload...") remote = auto_scheduler.utils.request_remote(device_key, "0.0.0.0", 9191, timeout=10000) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) # Create graph runtime dev = remote.cpu() module = graph_runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array( (np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
def main(): log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json") runner = auto_scheduler.RPCRunner( key=ARGS.rpc_key, host=ARGS.rpc_host, port=ARGS.rpc_port, n_parallel=cpu_count(logical=True), number=ARGS.number, repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, timeout=ARGS.rpc_config.session_timeout_sec, ) if ARGS.target.kind.name == "llvm": hardware_params = auto_scheduler.HardwareParams( num_cores=int(ARGS.target.attrs["num-cores"]), target=ARGS.target, ) elif ARGS.target.kind.name == "cuda": hardware_params = auto_scheduler.HardwareParams( num_cores=-1, vector_unit_bytes=16, cache_line_bytes=64, max_shared_memory_per_block=int( ARGS.target.attrs["max_shared_memory_per_block"]), max_threads_per_block=int( ARGS.target.attrs["max_threads_per_block"]), # The value `max_local_memory_per_block` is not used in AutoScheduler, # but is required by the API. max_local_memory_per_block=12345678, max_vthread_extent=8, warp_size=32, ) else: raise NotImplementedError(f"Unsupported target {ARGS.target}") describe() print(f"Workload: {ARGS.model_name}") onnx_model = onnx.load(ARGS.onnx_path) shape_dict = {} for item in ARGS.input_shape: print(f" input_name : {item['name']}") print(f" input_shape: {item['shape']}") print(f" input_dtype: {item['dtype']}") shape_dict[item["name"]] = item["shape"] mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) input_data = { item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape } with ms.Profiler() as profiler: tasks, task_weights = auto_scheduler.extract_tasks( mod["main"], params, target=ARGS.target, hardware_params=hardware_params, ) for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): print(f"==== Task {idx}: {task.desc} " f"(weight {task_weight} key: {task.workload_key}) =====") print(task.compute_dag) if ARGS.num_trials > 0: tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tuner.tune( auto_scheduler.TuningOptions( num_measure_trials=ARGS.num_trials, runner=runner, measure_callbacks=[ auto_scheduler.RecordToFile(log_file), ], ), adaptive_training=ARGS.adaptive_training, ) relay_build = { "graph": relay.build, "vm": relay.vm.compile }[ARGS.backend] with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}, ): lib = relay_build( mod, target=ARGS.target, params=params, ) print("Tuning Time:") print(profiler.table()) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, continuation=create_timer(ARGS.backend), backend=ARGS.backend, )
def main(): log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json") runner = auto_scheduler.RPCRunner( key=ARGS.rpc_key, host=ARGS.rpc_host, port=ARGS.rpc_port, n_parallel=cpu_count(logical=True), number=ARGS.number, repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, ) if ARGS.target.kind.name == "llvm": hardware_params = auto_scheduler.HardwareParams( num_cores=int(ARGS.target.attrs["num-cores"]), target=ARGS.target, ) elif ARGS.target.kind.name == "cuda": hardware_params = auto_scheduler.HardwareParams( num_cores=-1, vector_unit_bytes=16, cache_line_bytes=64, max_shared_memory_per_block=int( ARGS.target.attrs["max_shared_memory_per_block"]), max_threads_per_block=int( ARGS.target.attrs["max_threads_per_block"]), # The value `max_local_memory_per_block` is not used in AutoScheduler, # but is required by the API. max_local_memory_per_block=12345678, max_vthread_extent=8, warp_size=32, ) else: raise NotImplementedError(f"Unsupported target {ARGS.target}") describe() print(f"Workload: {ARGS.model_name}") onnx_model = onnx.load(ARGS.onnx_path) shape_dict = {} for item in ARGS.input_shape: print(f" input_name: {item['name']}") print(f" input_shape: {item['shape']}") print(f" input_dtype: {item['dtype']}") shape_dict[item["name"]] = item["shape"] mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) tasks, task_weights = auto_scheduler.extract_tasks( mod["main"], params, target=ARGS.target, hardware_params=hardware_params, ) for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): print( f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====" ) print(task.compute_dag) tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tuner.tune( auto_scheduler.TuningOptions( num_measure_trials=ARGS.num_trials, runner=runner, measure_callbacks=[ auto_scheduler.RecordToFile(log_file), ], )) with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}, ): lib = relay.build( mod, target=ARGS.target, params=params, ) graph, rt_mod, params = lib.graph_json, lib.lib, lib.params input_data = {} for item in ARGS.input_shape: input_name, input_shape, input_dtype = item["name"], item[ "shape"], item["dtype"] if input_dtype.startswith("float"): input_data[input_name] = np.random.uniform( size=input_shape).astype(input_dtype) else: input_data[input_name] = np.random.randint(low=0, high=10000, size=input_shape, dtype=input_dtype) def f_timer(rt_mod, dev, input_data): # pylint: disable=import-outside-toplevel from tvm.contrib.graph_executor import GraphModule # pylint: enable=import-outside-toplevel mod = GraphModule(rt_mod["default"](dev)) for input_name, input_value in input_data.items(): mod.set_input(input_name, input_value) ftimer = mod.module.time_evaluator( "run", dev, min_repeat_ms=500, repeat=3, ) results = list(np.array(ftimer().results) * 1000.0) # type: ignore print("Running time in time_evaluator: ", results) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, continuation=f_timer, ) def f_per_layer(rt_mod, dev, input_data): # pylint: disable=import-outside-toplevel from tvm.contrib.debugger.debug_executor import create # pylint: enable=import-outside-toplevel mod = create(graph, rt_mod, dev) for input_name, input_value in input_data.items(): mod.set_input(input_name, input_value) graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) print("|graph_nodes| = ", len(graph_nodes)) print("|graph_time| = ", len(graph_time)) graph_nodes_time = { k: float(v) for k, v in zip(graph_nodes, graph_time) } for k, v in graph_nodes_time.items(): print(f"{k} : {v:.3f}") run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=rt_mod, dev_type=ARGS.target.kind.name, args=input_data, continuation=f_per_layer, )
def main(): log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json") runner = auto_scheduler.RPCRunner( key=ARGS.rpc_key, host=ARGS.rpc_host, port=ARGS.rpc_port, n_parallel=cpu_count(logical=True), number=ARGS.number, repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, timeout=ARGS.rpc_config.session_timeout_sec, ) if ARGS.target.kind.name == "llvm": hardware_params = auto_scheduler.HardwareParams( num_cores=int(ARGS.target.attrs["num-cores"]), target=ARGS.target, ) elif ARGS.target.kind.name == "cuda": hardware_params = auto_scheduler.HardwareParams( num_cores=-1, vector_unit_bytes=16, cache_line_bytes=64, max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]), max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]), # The value `max_local_memory_per_block` is not used in AutoScheduler, # but is required by the API. max_local_memory_per_block=12345678, max_vthread_extent=8, warp_size=32, ) else: raise NotImplementedError(f"Unsupported target {ARGS.target}") describe() print(f"Workload: {ARGS.workload}") with ms.Profiler() as profiler: # Same as MetaSchedule Tune TE # Does not count ApplyHistoryBest time workload_func, params = CONFIGS[ARGS.workload] params = params[0] # type: ignore workload_func = auto_scheduler.register_workload(workload_func) task = auto_scheduler.SearchTask( func=workload_func, args=params, target=ARGS.target, hardware_params=hardware_params, ) # Inspect the computational graph print("Computational DAG:") print(task.compute_dag) tune_option = auto_scheduler.TuningOptions( num_measure_trials=ARGS.num_trials, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], verbose=2, runner=runner, ) if ARGS.num_trials > 0: print("Running AutoTuning:") task.tune(tune_option, adaptive_training=ARGS.adaptive_training) print("Tuning Time:") print(profiler.table()) print("History Best:") print(task.print_best(log_file)) sch, args = task.apply_best(log_file) print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True))