def main():
    """Tune one registered TE workload with auto-scheduler over RPC and print
    the best schedule found.

    All configuration comes from the module-level ``ARGS`` namespace and the
    ``CONFIGS`` workload registry; tuning records are written to
    ``<log_dir>/<workload>.json``.
    """
    log_file = os.path.join(ARGS.log_dir, f"{ARGS.workload}.json")
    workload_func, params = CONFIGS[ARGS.workload]
    # CONFIGS stores a sequence of parameter sets; only the first is tuned here.
    params = params[0]  # type: ignore
    workload_func = auto_scheduler.register_workload(workload_func)

    # Derive hardware parameters from the target. Only llvm and cuda targets
    # are supported by this script.
    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    task = auto_scheduler.SearchTask(
        func=workload_func,
        args=params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    # Measurements run on a remote device behind an RPC tracker.
    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=ARGS.rpc_workers,
        number=3,
        repeat=1,
        min_repeat_ms=100,
        enable_cpu_cache_flush=False,
    )

    # Inspect the computational graph
    print("Computational DAG:")
    print(task.compute_dag)

    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=ARGS.num_trials,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
        runner=runner,
    )
    print("Running AutoTuning:")
    task.tune(tune_option)
    print("History Best:")
    print(task.print_best(log_file))
    # Apply the best record and show the resulting lowered TIR.
    sch, args = task.apply_best(log_file)
    print("Lowered TIR:")
    print(tvm.lower(sch, args, simple_mode=True))
def _autoscheduler_test_helper(model, tmpdir_name, early_stopping=1, prior_records=None):
    """Tune *model* with the auto-scheduler via tvmc and verify the results.

    Runs a tiny (2-trial) tuning session, checks that a record file was
    written and that it can be loaded as a dispatch context.

    Returns the path of the tuning-record file created under *tmpdir_name*.
    """
    records_path = os.path.join(tmpdir_name, "autoscheduler.json")
    loaded_model = tvmc.frontends.load_model(model)
    hw_params = auto_scheduler.HardwareParams(num_cores=4, target="llvm")

    tvmc.tune(
        loaded_model,
        target="llvm",
        tuning_records=records_path,
        prior_records=prior_records,
        early_stopping=early_stopping,
        enable_autoscheduler=True,
        trials=2,
        hardware_params=hw_params,
    )

    # The tuning session must have produced a log file on disk.
    assert path.exists(records_path), "autoscheduler log file should exist"

    # The log must round-trip into a usable history-best dispatch context.
    with auto_scheduler.ApplyHistoryBest(records_path) as best:
        assert isinstance(
            best, auto_scheduler.dispatcher.ApplyHistoryBest
        ), "unable to load the best results of tuning"

    return records_path
def generate_sketches(
    workload_func, args, target, print_for_debug=False, init_search_callbacks=None
):
    """Create a SearchTask for *workload_func* and return the sketches its
    SketchPolicy generates."""
    # NOTE: test_cpu_matmul_sketch and test_cpu_max_pool2d_sketch assume 4 cores to trigger all
    # possible sketch generations.
    hw_params = auto_scheduler.HardwareParams(num_cores=4, target=target)
    search_task = auto_scheduler.SearchTask(
        func=workload_func,
        args=args,
        target=target,
        hardware_params=hw_params,
    )
    sketch_policy = auto_scheduler.SketchPolicy(
        search_task,
        verbose=0,
        init_search_callbacks=init_search_callbacks,
    )
    return sketch_policy.generate_sketches(print_for_debug)
def drive_tune(args):
    """Invoke auto-tuning with command line arguments

    Parameters
    ----------
    args: argparse.Namespace
        Arguments from command line parser.
    """
    tvmc_model = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes)

    # Resolve the optional RPC tracker endpoint; a tracker without a key is a
    # configuration error that should fail fast.
    if args.rpc_tracker:
        tracker_url = urlparse("//%s" % args.rpc_tracker)
        rpc_hostname = tracker_url.hostname
        rpc_port = tracker_url.port or 9090
        logger.info("RPC tracker hostname: %s", rpc_hostname)
        logger.info("RPC tracker port: %s", rpc_port)
        if not args.rpc_key:
            raise common.TVMCException(
                "need to provide an RPC tracker key (--rpc-key) for remote tuning"
            )
    else:
        rpc_hostname = None
        rpc_port = None

    # Specify hardware parameters, although they'll only be used if autoscheduling.
    hardware_params = auto_scheduler.HardwareParams(
        num_cores=args.num_cores,
        vector_unit_bytes=args.vector_unit_bytes,
        cache_line_bytes=args.cache_line_bytes,
        max_shared_memory_per_block=args.max_shared_memory_per_block,
        max_local_memory_per_block=args.max_local_memory_per_block,
        max_threads_per_block=args.max_threads_per_block,
        max_vthread_extent=args.max_vthread_extent,
        warp_size=args.warp_size,
        target=args.target,
        target_host=args.target_host,
    )

    tune_model(
        tvmc_model,
        args.target,
        tuning_records=args.output,
        prior_records=args.tuning_records,
        enable_autoscheduler=args.enable_autoscheduler,
        rpc_key=args.rpc_key,
        hostname=rpc_hostname,
        port=rpc_port,
        trials=args.trials,
        target_host=args.target_host,
        tuner=args.tuner,
        min_repeat_ms=args.min_repeat_ms,
        early_stopping=args.early_stopping,
        desired_layout=args.desired_layout,
        timeout=args.timeout,
        repeat=args.repeat,
        number=args.number,
        parallel=args.parallel,
        hardware_params=hardware_params,
        include_simple_tasks=args.include_simple_tasks,
        log_estimated_latency=args.log_estimated_latency,
        additional_target_options=reconstruct_target_args(args),
    )
def test_gpu_feature():
    """Check GPU-related per-store features extracted from a tuned CUDA matmul record."""
    # Use records to build a complicated GPU program
    json_records = "\n".join(
        (
            """{"i": [["[\\"matmul_auto_scheduler_test\\", 512, 512, 512]", "cuda"], [[], [["CHW", 2, "local"], ["SP", 2, 0, 512, [1, 16, 32, 1], 1], ["SP", 2, 5, 512, [4, 1, 1, 16], 1], ["SP", 2, 10, 512, [1, 2], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 3, 0, 1, 3], ["FSP", 3, 4, 2, 3], ["RE", 3, [0, 4, 1, 5, 2, 6, 3, 7]], ["FU", 2, [0, 1]], ["FU", 3, [0, 1]], ["FU", 2, [1, 2]], ["FU", 3, [1, 2]], ["FU", 2, [2, 3]], ["FU", 3, [2, 3]], ["CA", 2, 3, 2], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 3], ["FU", 2, [0, 1]], ["FFSP", 2, 0, [1, 2], 1, 1], ["AN", 2, 1, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 3], ["FU", 1, [0, 1]], ["FFSP", 1, 0, [1, 2], 1, 1], ["AN", 1, 1, 6], ["AN", 5, 0, 5], ["AN", 5, 1, 4], ["AN", 5, 2, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00536798], 0, 2.49277, 1585564852], "v": "v0.1"}""",
        )
    )

    # load states
    with tempfile.NamedTemporaryFile(mode="w") as f:
        f.write(json_records)
        f.flush()
        inputs, _ = auto_scheduler.RecordReader(f.name).read_lines()

        inp = inputs[0]
        task = auto_scheduler.SearchTask(
            workload_key=inp.task.workload_key,
            target=inp.task.target,
            hardware_params=auto_scheduler.HardwareParams(
                100000, 16, 64, 1 << 30, 1 << 30, 1 << 30, 1 << 30, 1 << 30
            ),
        )

        # FIX: reuse `inp` consistently instead of re-indexing `inputs[0]`.
        state = task.compute_dag.infer_bound_from_state(inp.state)
        fea = auto_scheduler.feature.get_per_store_features_from_states([state], task)[0]
        names = auto_scheduler.feature.get_per_store_feature_names()

        # Build one {feature_name: value} dict per BufferStore.
        # (Replaces manual nested range(len(...)) index loops.)
        fea_dicts = [dict(zip(names, row)) for row in fea]

        """
        lowered IR:

        Placeholder: A, B
        blockIdx.x [email protected]@ (0,8)
          vthread [email protected]@ (0,4)
            threadIdx.x [email protected]@ (0,16)
              C.local auto_unroll: 1024
              for k.0 (0,256)
                for ax0@[email protected] (0,8)
                  threadIdx.x ax0@[email protected] (0,16)
                    B.shared = ...
                for ax0@[email protected] (0,64)
                  threadIdx.x ax0@[email protected] (0,16)
                    A.shared = ...
                for i_c.3 (0,32)
                  for k.2 (0,2)
                    for j_c.4 (0,16)
                      C.local = ...
              for i.3 (0,32)
                for j.3 (0,16)
                  C = ...
        """

        # check gpu-related features
        # Feature values are stored as log2(x + 1) of the raw extent.
        assert fequal(fea_dicts[0]["blockIdx_x_len"], math.log2(8 + 1))
        assert fequal(fea_dicts[0]["vthread_len"], math.log2(4 + 1))
        assert fequal(fea_dicts[1]["threadIdx_x_len"], math.log2(16 + 1))
        assert fequal(fea_dicts[0]["threadIdx_y_len"], math.log2(1 + 1))
        assert fequal(fea_dicts[2]["blockIdx_z_len"], math.log2(1 + 1))
        assert fequal(fea_dicts[0]["is_gpu"], 1.0)
def drive_tune(args):
    """Invoke auto-tuning with command line arguments

    Parameters
    ----------
    args: argparse.Namespace
        Arguments from command line parser.
    """
    # extra arguments validation before importing the model, so that obvious errors
    # are pointed in advance.
    if args.rpc_tracker:
        parsed_url = urlparse("//%s" % args.rpc_tracker)
        rpc_hostname = parsed_url.hostname
        rpc_port = parsed_url.port or 9090
        logger.info("RPC tracker hostname: %s", rpc_hostname)
        logger.info("RPC tracker port: %s", rpc_port)

        if not args.rpc_key:
            raise common.TVMCException(
                "need to provide an RPC tracker key (--rpc-key) for remote tuning"
            )

    target, extra_targets = common.target_from_cli(args.target)
    target_host = args.target_host
    target, target_host = Target.check_and_update_host_consist(
        target, target_host)
    mod, params = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes)

    # Partition the module for any extra (composite) codegen targets
    # requested on the command line before tuning.
    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(
            codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]
        mod = partition_function(mod, params, **codegen_from_cli["opts"])

    # min_repeat_ms should be:
    # a. the value provided by the user, if any, or
    # b. 0ms in case target is "cpu"; otherwise 1000ms
    if args.min_repeat_ms is not None:
        min_repeat_ms = args.min_repeat_ms
    else:
        min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000
        logger.debug("Default --min-repeat-ms for this target is %s", min_repeat_ms)

    # Pick the measurement runner: remote (tracker) or local, and the
    # auto-scheduler vs. autotvm flavour of each.
    if args.rpc_tracker:
        runner_ctor = auto_scheduler.RPCRunner if args.enable_autoscheduler else autotvm.RPCRunner
        runner = runner_ctor(
            key=args.rpc_key,
            host=rpc_hostname,
            port=rpc_port,
            number=args.number,
            repeat=args.repeat,
            n_parallel=args.parallel,
            timeout=args.timeout,
            min_repeat_ms=min_repeat_ms,
        )
    else:
        logger.info("starting localhost tuning")
        runner_ctor = (auto_scheduler.LocalRunner
                       if args.enable_autoscheduler else autotvm.LocalRunner)
        runner = runner_ctor(
            number=args.number,
            repeat=args.repeat,
            timeout=args.timeout,
            min_repeat_ms=min_repeat_ms,
        )

    if args.enable_autoscheduler:
        # Specify hardware parameters
        hardware_params = auto_scheduler.HardwareParams(
            args.num_cores,
            args.vector_unit_bytes,
            args.cache_line_bytes,
            args.max_shared_memory_per_block,
            args.max_local_memory_per_block,
            args.max_threads_per_block,
            args.max_vthread_extent,
            args.warp_size,
        )
        tasks, weights = autoscheduler_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=args.desired_layout,
            hardware_params=hardware_params,
            include_simple_tasks=args.include_simple_tasks,
        )

        # Create the autoscheduler tuning options
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=args.trials,
            measure_callbacks=[auto_scheduler.RecordToFile(args.output)],
            runner=runner,
            early_stopping=args.early_stopping,
        )

        # Schedule the tasks (i.e., produce a schedule for each task)
        schedule_tasks(tasks, weights, tuning_options, args.tuning_records,
                       args.log_estimated_latency)
    else:
        tasks = autotvm_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=args.desired_layout,
        )

        tuning_option = {
            "tuner": args.tuner,
            "trials": args.trials,
            "early_stopping": args.early_stopping,
            "measure_option": autotvm.measure_option(
                builder=autotvm.LocalBuilder(build_func="default"), runner=runner),
            "tuning_records": args.tuning_records,
        }
        logger.debug(" tuning options: %s", tuning_option)

        tune_tasks(tasks, args.output, **tuning_option)
def test_stage_order():
    """Test if the stage order is preserved when recovering a DAG."""
    N = 512
    A, B, C, D, E = parallel_matmul_auto_scheduler_test(N)
    # Build a TE schedule with cache_write/cache_read stages for both outputs.
    sch = te.create_schedule([D.op, E.op])
    (D_local, ) = sch.cache_write([D], "local")
    (E_local, ) = sch.cache_write([E], "local")
    sch.cache_read(A, "shared", [D_local])
    sch.cache_read(B, "shared", [D_local])
    sch.cache_read(A, "shared", [E_local])
    sch.cache_read(C, "shared", [E_local])

    dag = auto_scheduler.ComputeDAG(sch)
    stage_ops_1 = dag.get_init_state().stage_ops

    # 3 placeholder, 4 x.shared, 2 {D,E}.local, 2 {D,E} compute
    assert len(stage_ops_1) == 11

    # Cache read stage should follow the source stage
    for idx, op in enumerate(stage_ops_1):
        if op.name == "A":
            # A is read twice, so two shared-cache stages follow it.
            assert (stage_ops_1[idx + 1].name == "A.d.shared"
                    and stage_ops_1[idx + 2].name == "A.shared")
        elif op.name in ["B", "C"]:
            assert stage_ops_1[idx + 1].name == "%s.shared" % op.name

    # Serialize and deserialize the ComputeDAG constructed by a schedule.
    loaded_dag = pickle.loads(pickle.dumps(dag))
    assert str(loaded_dag.get_init_state()) == str(dag.get_init_state())
    assert len(loaded_dag.get_init_state().stage_ops) == len(
        dag.get_init_state().stage_ops)

    # Apply the same schedule to Ansor state and it should have the same stage order
    dag = auto_scheduler.ComputeDAG([A, B, C, D, E])
    state = dag.get_init_state()
    D_local = state.cache_write(D, "local")
    E_local = state.cache_write(E, "local")
    state.cache_read(A, "shared", [D_local])
    state.cache_read(B, "shared", [D_local])
    state.cache_read(A, "shared", [E_local])
    state.cache_read(C, "shared", [E_local])
    stage_ops_2 = state.stage_ops

    assert len(stage_ops_1) == len(stage_ops_2)

    # Cache read stage should follow the source stage
    for op1, op2 in zip(stage_ops_1, stage_ops_2):
        assert op1.name == op2.name

    # Serialize and deserialize the ComputeDAG constructed by a list of tensor ops.
    loaded_dag = pickle.loads(pickle.dumps(dag))
    assert str(loaded_dag.get_init_state()) == str(dag.get_init_state())
    assert len(loaded_dag.get_init_state().stage_ops) == len(
        dag.get_init_state().stage_ops)

    # Serialize and deserialize the search task.
    task = auto_scheduler.SearchTask(
        dag,
        json.dumps(("test-key", )),
        tvm.target.Target("llvm"),
        hardware_params=auto_scheduler.HardwareParams(100000, 16, 64, 0, 0, 0,
                                                      0, 0),
    )

    task2 = pickle.loads(pickle.dumps(task))
    # Unpickling must re-register the workload key.
    assert "test-key" in auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY
    assert str(task.dag.get_init_state()) == str(task2.dag.get_init_state())
    assert len(task.dag.get_init_state().stage_ops) == len(
        task2.dag.get_init_state().stage_ops)
    assert task.workload_key == task2.workload_key
    assert str(task.target) == str(task2.target)
    assert task.hardware_params.num_cores == task2.hardware_params.num_cores
    assert task.hardware_params.vector_unit_bytes == task2.hardware_params.vector_unit_bytes
    assert task.hardware_params.cache_line_bytes == task2.hardware_params.cache_line_bytes
def main():
    """Tune an ONNX model with auto-scheduler over RPC, build it with the
    tuned records, and benchmark it on the remote device.

    Configuration comes from the module-level ``ARGS`` namespace; tuning
    records are written to ``<work_dir>/<model_name>.json``.
    """
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    # Remote measurement runner behind an RPC tracker.
    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
        timeout=ARGS.rpc_config.session_timeout_sec,
    )
    # Derive hardware parameters from the target kind.
    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    # Import the ONNX model into Relay, collecting the input shapes.
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f" input_name : {item['name']}")
        print(f" input_shape: {item['shape']}")
        print(f" input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    input_data = {
        item["name"]: generate_input_data(item["shape"], item["dtype"])
        for item in ARGS.input_shape
    }

    # Time task extraction, tuning, and the final build.
    with ms.Profiler() as profiler:
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"],
            params,
            target=ARGS.target,
            hardware_params=hardware_params,
        )
        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
            print(f"==== Task {idx}: {task.desc} "
                  f"(weight {task_weight} key: {task.workload_key}) =====")
            print(task.compute_dag)
        # Skip tuning entirely when no trials were requested; any existing
        # records in log_file are still applied below.
        if ARGS.num_trials > 0:
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tuner.tune(
                auto_scheduler.TuningOptions(
                    num_measure_trials=ARGS.num_trials,
                    runner=runner,
                    measure_callbacks=[
                        auto_scheduler.RecordToFile(log_file),
                    ],
                ),
                adaptive_training=ARGS.adaptive_training,
            )
        # Build with the tuned records using the selected executor backend.
        relay_build = {
            "graph": relay.build,
            "vm": relay.vm.compile
        }[ARGS.backend]
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True},
            ):
                lib = relay_build(
                    mod,
                    target=ARGS.target,
                    params=params,
                )
    print("Tuning Time:")
    print(profiler.table())

    # Benchmark the built module on the remote device.
    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=create_timer(ARGS.backend),
        backend=ARGS.backend,
    )
def main():
    """Tune an ONNX model with auto-scheduler over RPC, then benchmark both
    end-to-end and per-layer runtimes on the remote device.

    Configuration comes from the module-level ``ARGS`` namespace; tuning
    records are written to ``<work_dir>/<model_name>.json``.
    """
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    # Remote measurement runner behind an RPC tracker.
    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )
    # Derive hardware parameters from the target kind.
    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    # Import the ONNX model into Relay, collecting the input shapes.
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f" input_name: {item['name']}")
        print(f" input_shape: {item['shape']}")
        print(f" input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)

    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"],
        params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
        print(
            f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) ====="
        )
        print(task.compute_dag)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(
        auto_scheduler.TuningOptions(
            num_measure_trials=ARGS.num_trials,
            runner=runner,
            measure_callbacks=[
                auto_scheduler.RecordToFile(log_file),
            ],
        ))

    # Build with the tuned records applied.
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True},
        ):
            lib = relay.build(
                mod,
                target=ARGS.target,
                params=params,
            )
    graph, rt_mod, params = lib.graph_json, lib.lib, lib.params

    # Generate random inputs matching each declared shape/dtype.
    input_data = {}
    for item in ARGS.input_shape:
        input_name, input_shape, input_dtype = item["name"], item[
            "shape"], item["dtype"]
        if input_dtype.startswith("float"):
            input_data[input_name] = np.random.uniform(
                size=input_shape).astype(input_dtype)
        else:
            input_data[input_name] = np.random.randint(low=0,
                                                       high=10000,
                                                       size=input_shape,
                                                       dtype=input_dtype)

    def f_timer(rt_mod, dev, input_data):
        """Remote continuation: measure end-to-end run time with time_evaluator."""
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.graph_executor import GraphModule

        # pylint: enable=import-outside-toplevel

        mod = GraphModule(rt_mod["default"](dev))
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        ftimer = mod.module.time_evaluator(
            "run",
            dev,
            min_repeat_ms=500,
            repeat=3,
        )
        # Convert seconds to milliseconds for reporting.
        results = list(np.array(ftimer().results) * 1000.0)  # type: ignore
        print("Running time in time_evaluator: ", results)

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_timer,
    )

    def f_per_layer(rt_mod, dev, input_data):
        """Remote continuation: measure per-node run times via the debug executor."""
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint: enable=import-outside-toplevel

        mod = create(graph, rt_mod, dev)
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
        graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
        print("|graph_nodes| = ", len(graph_nodes))
        print("|graph_time| = ", len(graph_time))
        graph_nodes_time = {
            k: float(v)
            for k, v in zip(graph_nodes, graph_time)
        }
        for k, v in graph_nodes_time.items():
            print(f"{k} : {v:.3f}")

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=rt_mod,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_per_layer,
    )
def main():
    """Tune one registered TE workload with auto-scheduler over RPC, timing
    the search with a profiler, and print the best schedule found.

    Configuration comes from the module-level ``ARGS`` namespace and the
    ``CONFIGS`` workload registry; tuning records are written to
    ``<work_dir>/<workload>.json``.
    """
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")

    # Remote measurement runner behind an RPC tracker.
    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
        timeout=ARGS.rpc_config.session_timeout_sec,
    )
    # Derive hardware parameters from the target kind.
    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.workload}")
    with ms.Profiler() as profiler:
        # Same as MetaSchedule Tune TE
        # Does not count ApplyHistoryBest time
        workload_func, params = CONFIGS[ARGS.workload]
        # CONFIGS stores a sequence of parameter sets; only the first is tuned here.
        params = params[0]  # type: ignore
        workload_func = auto_scheduler.register_workload(workload_func)

        task = auto_scheduler.SearchTask(
            func=workload_func,
            args=params,
            target=ARGS.target,
            hardware_params=hardware_params,
        )

        # Inspect the computational graph
        print("Computational DAG:")
        print(task.compute_dag)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=ARGS.num_trials,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
            verbose=2,
            runner=runner,
        )
        # Skip the search entirely when no trials were requested; any existing
        # records in log_file are still applied below.
        if ARGS.num_trials > 0:
            print("Running AutoTuning:")
            task.tune(tune_option, adaptive_training=ARGS.adaptive_training)
    print("Tuning Time:")
    print(profiler.table())
    print("History Best:")
    print(task.print_best(log_file))
    # Apply the best record and show the resulting lowered TIR.
    sch, args = task.apply_best(log_file)
    print("Lowered TIR:")
    print(tvm.lower(sch, args, simple_mode=True))