def auto_scheduler_tune(network, target, input_name, log_file):
    if os.path.exists(log_file):
        os.remove(log_file)

    mod, net_params, input_shape, output_shape = get_network(network)
    if network not in ["bert"]:
        # convert to NHWC layout
        desired_layouts = {'nn.conv2d': ['NHWC', 'default']}
        seq = tvm.transform.Sequential(
            [relay.transform.RemoveUnusedFunctions(),
             relay.transform.ConvertLayout(desired_layouts)]
        )
        with tvm.transform.PassContext(opt_level=3):
            mod = seq(mod)

    if "cpu" in target.keys:
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=20000,  # a large trial budget for the best performance; reduce for a quick run
            runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
    else:
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=20000,  # a large trial budget for the best performance; reduce for a quick run
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], net_params, target)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(tuning_opt)

def local_auto_scheduler(self, repeat=1, min_repeat_ms=300, timeout=10, num_measure_trials=200):
    # extract tasks
    tasks, task_weights = auto_scheduler.extract_tasks(self.mod["main"], self.params, self.target)
    for idx, task in enumerate(tasks):
        logger.debug("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
        logger.debug(task.compute_dag)

    # generate tuner
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)

    logger.info("Begin tuning...")
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(
        repeat=repeat, min_repeat_ms=min_repeat_ms, timeout=timeout)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=num_measure_trials,
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(self.log_file)],
    )
    tuner.tune(tune_option)

    # update self.lib
    with auto_scheduler.ApplyHistoryBest(self.log_file):
        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
            self._lib = relay.build(self.mod, target=self.target, params=self.params)
    logger.info(f"loaded optimized library from {self.log_file}")

def run_tuning():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    if use_sparse:
        from tvm.topi.sparse.utils import sparse_sketch_rules

        search_policy = [
            auto_scheduler.SketchPolicy(
                task,
                program_cost_model=auto_scheduler.XGBModel(),
                init_search_callbacks=sparse_sketch_rules(),
            )
            for task in tasks
        ]
        tuner.tune(tune_option, search_policy=search_policy)
    else:
        tuner.tune(tune_option)

def search_common(workload=matmul_auto_scheduler_test, target="llvm",
                  search_policy='empty', seed=random.randint(1, 1 << 30),
                  runner='local', cost_model=auto_scheduler.RandomModel(),
                  num_measure_trials=2, init_search_callbacks=None):
    print("Test %s schedule search with the %s search policy" % (target, search_policy))

    random.seed(seed)
    N = 128
    workload_key = auto_scheduler.make_workload_key(workload, (N, N, N))
    dag = auto_scheduler.ComputeDAG(workload_key)
    target = tvm.target.create(target)
    task = auto_scheduler.SearchTask(dag, workload_key, target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        init_search_callbacks = init_search_callbacks or []
        init_search_callbacks.append(auto_scheduler.PreloadMeasuredStates(log_file))

        if search_policy == 'empty':
            search_policy = auto_scheduler.EmptyPolicy(task)
        elif search_policy == 'sketch':
            search_policy = auto_scheduler.SketchPolicy(
                task, program_cost_model=cost_model,
                init_search_callbacks=init_search_callbacks)

        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=num_measure_trials,
            runner=runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options)
        inp, res = auto_scheduler.load_best(log_file, workload_key, target)

        print("==== Python Code ====")
        print(dag.print_python_code_from_state(inp.state))

        try:
            print("==== Lowered Stmt ====")
            print(tvm.lower(sch, args, simple_mode=True))
            mod = tvm.build(sch, args, target)

            ctx = tvm.context(str(target), 0)
            dtype = dag.tensors[0].dtype
            a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
            b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
            c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx)
            mod(a, b, c)
            tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
            print("==== Verification passed ====")
        except Exception:
            raise Exception("Error encountered with seed: %d" % (seed))
    print()

def tune_and_evaluate():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
        runner=auto_scheduler.RPCRunner(
            device_key,
            host=rpc_host,
            port=rpc_port,
            timeout=30,
            repeat=1,
            min_repeat_ms=200,
            enable_cpu_cache_flush=True,
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)

    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
            lib = relay.build(mod, target=target, params=params)

    # Export library
    tmp = tempdir()
    if use_ndk:
        from tvm.contrib import ndk

        filename = "net.so"
        lib.export_library(tmp.relpath(filename), ndk.create_shared)
    else:
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

    # Upload module to device
    print("Upload...")
    remote = auto_scheduler.utils.request_remote(device_key, rpc_host, rpc_port, timeout=10000)
    remote.upload(tmp.relpath(filename))
    rlib = remote.load_module(filename)

    # Create graph executor
    dev = remote.cpu()
    module = graph_executor.GraphModule(rlib["default"](dev))
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
    module.set_input("data", data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    print(module.benchmark(dev, repeat=3, min_repeat_ms=500))

def tune_network(network, target):
    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)

def test_tuning_cuda():
    auto_scheduler.enable_relay_integration()

    # Extract tasks
    mod, params = get_network("mlp")
    target = tvm.target.Target("cuda")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    objective = lambda costs: sum(c * w for c, w in zip(costs, task_weights))

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=100)
        tuner = auto_scheduler.TaskScheduler(tasks, objective)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            num_measures_per_round=1,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(mod, target=target, params=params)

    # Todo(merrymercy): compile without any history to test the fallback mechanism

    auto_scheduler.enable_relay_integration(False)

def _autoscheduler_test_helper(model, tmpdir_name, tasks_weights=None, early_stopping=1, tuning_records=None):
    tasks, weights = tasks_weights if tasks_weights else _get_tasks(model)
    log_file = os.path.join(tmpdir_name, "autoscheduler.json")

    tuning_options = auto_scheduler.TuningOptions(
        num_measure_trials=1,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        runner="local",
        builder="local",
        verbose=0,
        early_stopping=early_stopping,
    )

    tvmc.autotuner.schedule_tasks(tasks[:1], weights[:1], tuning_options, tuning_records)

    # testing whether the log file was produced
    assert os.path.exists(log_file), "autoscheduler log file should exist"

    with auto_scheduler.ApplyHistoryBest(log_file) as best:
        assert isinstance(
            best, auto_scheduler.dispatcher.ApplyHistoryBest
        ), "unable to load the best results of tuning"

    return log_file

def tune_and_evaluate():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,
        builder=auto_scheduler.LocalBuilder(build_func="ndk"),
        runner=auto_scheduler.RPCRunner(
            device_key,
            host=rpc_host,
            port=rpc_port,
            timeout=30,
            repeat=1,
            min_repeat_ms=200,
            enable_cpu_cache_flush=True,
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)

    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
            lib = relay.build(mod, target=target, params=params)

    # Export library
    tmp = tempdir()
    filename = "net.so"
    lib.export_library(tmp.relpath(filename), ndk.create_shared)

    # Upload module to device
    print("Upload...")
    remote = auto_scheduler.utils.request_remote(device_key, rpc_host, rpc_port, timeout=10000)
    remote.upload(tmp.relpath(filename))
    rlib = remote.load_module(filename)

    # Create graph executor
    dev = remote.cpu()
    module = graph_executor.GraphModule(rlib["default"](dev))
    for key, value in shape_dict.items():
        data_tvm = tvm.nd.array((np.random.uniform(size=value)).astype("float32"))
        module.set_input(key, data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500)
    prof_res = np.array(ftimer().results) * 1e3  # convert to milliseconds
    print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))

def test_correctness_layout_rewrite_insert_transform_stage():
    N = 128
    target = tvm.target.Target("llvm")
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), target)
    dag = task.compute_dag

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        search_policy = auto_scheduler.SketchPolicy(task)

        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            runner=measure_ctx.runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        auto_scheduler.auto_schedule(task, search_policy, tuning_options)
        inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target)
        s, bufs = dag.apply_steps_from_state(
            inp.state,
            layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.InsertTransformStage,
        )

        s_ref, bufs_ref = dag.apply_steps_from_state(inp.state)
        np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]

        func = tvm.build(s, bufs, target=target)
        func_ref = tvm.build(s_ref, bufs_ref, target=target)

        ctx = tvm.context(str(target))
        ctx_ref = tvm.cpu()

        args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
        args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args]
        ctx.sync()

        func(*args)
        func_ref(*args_ref)
        ctx.sync()

        tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3)
        tvm.testing.assert_allclose(args[1].asnumpy(), args_ref[1].asnumpy(), atol=1e-3, rtol=1e-3)
        tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), atol=1e-3, rtol=1e-3)
        del measure_ctx

def tune_network(network, target):
    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)

        # Compile without auto-scheduler and any other optimization for correctness check
        with tvm.transform.PassContext(opt_level=0):
            lib2 = relay.build(mod, target=target, params=params)

        # Check the correctness
        def get_output(data, lib):
            ctx = tvm.gpu()
            module = graph_runtime.GraphModule(lib["default"](ctx))
            module.set_input("data", data)
            module.run()
            return module.get_output(0).asnumpy()

        np.random.seed(0)
        if network == "mlp":
            data = np.random.uniform(size=(1, 32))
        elif network == "winograd-test":
            data = np.random.uniform(size=(1, 23, 40, 32))
        else:
            raise ValueError("Unknown network: " + network)

        actual_output = get_output(data, lib)
        expected_output = get_output(data, lib2)
        tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4, atol=1e-4)

def main():
    log_file = os.path.join(ARGS.log_dir, f"{ARGS.workload}.json")

    workload_func, params = CONFIGS[ARGS.workload]
    params = params[0]  # type: ignore
    workload_func = auto_scheduler.register_workload(workload_func)

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]),
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    task = auto_scheduler.SearchTask(
        func=workload_func,
        args=params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=ARGS.rpc_workers,
        number=3,
        repeat=1,
        min_repeat_ms=100,
        enable_cpu_cache_flush=False,
    )

    # Inspect the computational graph
    print("Computational DAG:")
    print(task.compute_dag)

    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=ARGS.num_trials,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
        runner=runner,
    )
    print("Running AutoTuning:")
    task.tune(tune_option)

    print("History Best:")
    print(task.print_best(log_file))

    sch, args = task.apply_best(log_file)
    print("Lowered TIR:")
    print(tvm.lower(sch, args, simple_mode=True))

def tune_and_evaluate():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
        runner=auto_scheduler.RPCRunner(device_key, host="0.0.0.0", port=9190, repeat=3, timeout=50),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)

    # Compile the whole network
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
            lib = relay.build(mod, target=target, target_host=target_host, params=params)

    # Create graph runtime
    print("=============== Request Remote ===============")
    from tvm.auto_scheduler.utils import request_remote

    remote = request_remote(device_key, "0.0.0.0", 9190)
    ctx = remote.cl()

    from tvm.contrib import utils, ndk

    temp = utils.tempdir()
    filename = "deploy_lib.so"
    path_lib = temp.relpath(filename)
    lib.export_library(path_lib, ndk.create_shared)
    remote.upload(path_lib)
    loaded_lib = remote.load_module(filename)
    module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
    data = (np.random.uniform(size=input_shape)).astype(dtype)
    data_tvm = tvm.nd.array(data)
    module.set_input("data", data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500)
    prof_res = np.array(ftimer().results) * 1e3  # convert to milliseconds
    print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))

def resume_search(task, log_file):
    print("Resume search:")
    cost_model = auto_scheduler.XGBModel()
    cost_model.update_from_file(log_file)
    search_policy = auto_scheduler.SketchPolicy(
        task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
    )
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]
    )
    task.tune(tune_option, search_policy=search_policy)

def auto_scheduler_tune(network, batch_size, dtype, target, log_file):
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    # if os.path.exists(log_file):
    #     os.remove(log_file)

    layout = "NHWC"
    mod, params, input_name, input_shape, output_shape = get_network(network, batch_size, dtype, layout)

    n_trials = network_to_n_trials[(network, batch_size, dtype, str(target.kind))]

    if "cpu" in target.keys:
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
    else:
        min_repeat_ms = 450 if network in ["bert"] else 300
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            repeat=1, min_repeat_ms=min_repeat_ms, timeout=10)
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    print(log_file)
    update_file(log_file, tasks)

    for idx, task in enumerate(tasks):
        print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
        print(task.compute_dag)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(tuning_opt)

def run_tuning():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)

def run_tuning():
    print("Begin tuning...")
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)

def test_task_scheduler_round_robin():
    tasks = []
    for n in [2, 4, 8]:
        tasks.append(auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n), "llvm"))

    def objective_func(costs):
        return sum(costs)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name
        num_trials_per_task = 2

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_trials_per_task * len(tasks),
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(tasks, objective_func, strategy="round-robin")
        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the result of round robin
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, res in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        for task in tasks:
            assert counters[task.workload_key] == num_trials_per_task

        # test continuous tuning (restoring the status)
        task_scheduler = auto_scheduler.TaskScheduler(
            tasks, objective_func, strategy="round-robin", load_log_file=log_file
        )
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=len(tasks),
            num_measures_per_round=1,
        )
        task_scheduler.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

def tune_and_check(mod, data, weight):
    # Extract tasks from a relay program
    target = tvm.target.Target("llvm")
    tasks, task_weights = auto_scheduler.extract_tasks(mod, target=target, params={"weight": weight})

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tune tasks
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=1,
            num_measures_per_round=1,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")

        # Compile
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True},
            ):
                lib = relay.build(mod, target=target, params={"weight": weight})

        # Compile without auto-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            lib2 = relay.build(mod, target=target, params={"weight": weight})

        def get_output(data, lib):
            dev = tvm.cpu()
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input("data", data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, lib)
        expected_output = get_output(data, lib2)
        tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)

def resume_search(task, logfile_name):
    cost_model = auto_scheduler.XGBModel()
    cost_model.update_from_file(logfile_name)
    search_policy = auto_scheduler.SketchPolicy(
        task,
        cost_model,
        init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(logfile_name)],
    )
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(logfile_name)]
    )
    sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option)

def run_tuning(tasks, task_weights, log_file):
    print("Begin tuning...")
    measure_runner = auto_scheduler.RPCRunner(
        "m1", "127.0.0.1", 9190, min_repeat_ms=300, timeout=30, repeat=2
    )
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=10000,
        runner=measure_runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
    )
    tuner.tune(tune_option)

def resume_search(task, log_file):
    print("Resume search:")
    cost_model = auto_scheduler.XGBModel()
    cost_model.update_from_file(log_file)
    search_policy = auto_scheduler.SketchPolicy(
        task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
    )
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=5,
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    task.tune(tune_option, search_policy=search_policy)

    # Kill the measurement process
    del measure_ctx

def test_task_scheduler_gradient():
    tasks = []
    for n in [2, 4]:
        tasks.append(
            auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm")
        )

    def objective_func(costs):
        return costs[0]

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name
        n_trials = 5

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(
            tasks, objective_func=objective_func, callbacks=[]
        )

        # Forcibly overwrite the initial best costs.
        # This makes the test more stable on slow CI machines.
        task_scheduler.best_costs = np.array([1e2, 1e-8])

        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the allocation results
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, _ in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        assert counters[tasks[0].workload_key] == n_trials - 1
        assert counters[tasks[1].workload_key] == 1
        del measure_ctx

def test_check_auto_schedule_tuning(host, port):  # pylint: disable=too-many-locals
    log_file = TEMPORARY_DIRECTORY.relpath("ios_tuning_stat.log")
    target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin")
    mod, params = relay.testing.mlp.get_workload(batch_size=4, image_shape=(1, 4, 4))

    try:
        status_ok = True
        measure_runner = auto_scheduler.RPCRunner(
            DEVICE_KEY,
            host,
            port,
            min_repeat_ms=1,
            timeout=10,
            n_parallel=multiprocessing.cpu_count(),
        )
        builder = auto_scheduler.LocalBuilder(timeout=10, build_func=ios_create_dylib)
        tune_option = auto_scheduler.TuningOptions(
            builder=builder,
            num_measure_trials=2,
            num_measures_per_round=1,
            runner=measure_runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
            verbose=0,
        )

        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
        tasks, task_weights = tasks[:2], task_weights[:2]
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tuner.tune(tune_option, search_policy="sketch.random")

        # Check tuning log
        tuning_statistic = list(load_records(log_file))
        for _, measure_result in tuning_statistic:
            if measure_result.error_no != MeasureErrorNo.NO_ERROR:
                raise ValueError(
                    f"Error for MeasureResult. Error code: {measure_result.error_no},"
                    f" for details see MeasureErrorNo."
                )
    except Exception as e:  # pylint: disable=broad-except
        status_ok = False
        print(e)

    assert status_ok, "Tuning failed, see logs."

def remote_auto_scheduler(self, device_key, rpc_host, rpc_port):
    # generate tasks
    tasks, task_weights = auto_scheduler.extract_tasks(self.mod["main"], self.params, self.target)
    for idx, task in enumerate(tasks):
        logger.debug("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
        logger.debug(task.compute_dag)

    # generate tuner
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,
        builder=auto_scheduler.LocalBuilder(),
        runner=auto_scheduler.RPCRunner(
            device_key,
            host=rpc_host,
            port=rpc_port,
            timeout=30,
            repeat=1,
            min_repeat_ms=200,
            enable_cpu_cache_flush=True,
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(self.log_file)],
    )
    tuner.tune(tune_option)

    # update self.lib
    with auto_scheduler.ApplyHistoryBest(self.log_file):
        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
            self._lib = relay.build(self.mod, target=self.target, params=self.params)
    logger.info(f"loaded optimized library from {self.log_file}")

def tune_network(network, target):
    auto_scheduler.enable_relay_integration()

    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(mod, target=target, params=params)

    # Todo(merrymercy): when the cpu backend is upstreamed, do the following things:
    # 1. compile without history to test the fallback mechanism
    # 2. check the correctness of layout rewrite / winograd pre-transform

    auto_scheduler.enable_relay_integration(False)

# ---------------------------------
# Next, we set parameters for the auto-scheduler.
#
# * :code:`num_measure_trials` is the number of measurement trials we can use
#   during the search. We only make 10 trials in this tutorial for a fast
#   demonstration. In practice, 1000 is a good value for the search to converge.
#   You can do more trials according to your time budget.
# * In addition, we use :code:`RecordToFile` to log measurement records into a
#   file `matmul.json`. The measurement records can be used to query the history
#   best, resume the search, and do more analyses later.
# * See :any:`auto_scheduler.TuningOptions` for more parameters.

log_file = "matmul.json"
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=10,
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    verbose=2,
)

################################################################################
# Run the search
# --------------
# Now we get all inputs ready. Pretty simple, isn't it? We can kick off the
# search and let the auto-scheduler do its magic. After some measurement
# trials, we can load the best schedule from the log file and apply it.

# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_file)
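
################################################################################
# A minimal sketch of such a later analysis (assuming `numpy` is imported as
# `np`, as in the rest of the tutorial): iterate over the saved records and
# print the measured cost of each trial. :code:`load_records` yields
# (MeasureInput, MeasureResult) pairs.

for inp, res in auto_scheduler.load_records(log_file):
    cost_us = np.mean([v.value for v in res.costs]) * 1e6  # seconds -> microseconds
    print("workload: %s  mean cost: %.2f us  error_no: %s"
          % (inp.task.workload_key, cost_us, res.error_no))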
def tune_model(
    tvmc_model: TVMCModel,
    target: str,
    tuning_records: Optional[str] = None,
    prior_records: Optional[str] = None,
    enable_autoscheduler: bool = False,
    rpc_key: Optional[str] = None,
    hostname: Optional[str] = None,
    port: Optional[Union[int, str]] = 9090,
    trials: int = 10000,
    target_host: Optional[str] = None,
    tuner: str = "xgb",
    min_repeat_ms: Optional[int] = None,
    early_stopping: Optional[int] = None,
    desired_layout: Optional[str] = None,
    timeout: int = 10,
    repeat: int = 1,
    number: int = 10,
    parallel: int = 4,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
    log_estimated_latency: bool = False,
    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """Use tuning to automatically optimize the functions in a model.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model to be optimized.
    target : str
        Compilation target as plain string, inline JSON or path to a JSON file.
    tuning_records : str, optional
        The path to a file that tuning results will be saved to. If not specified,
        a temporary file will be used.
    prior_records : str, optional
        A path to previous tuning results that will be used to hot-start the tuning
        cost model if provided.
    enable_autoscheduler : bool, optional
        When true, use autoscheduling rather than autotvm. This should produce
        faster kernels for compatible model-target pairs.
    rpc_key : str, optional
        The RPC tracker key of the target device. Required when hostname is provided.
    hostname : str, optional
        The IP address of an RPC tracker, used when benchmarking remotely.
    port : int or str, optional
        The port of the RPC tracker to connect to. Defaults to 9090.
    trials : int, optional
        The number of schedules to try out for the entire model. Note that the
        default value is chosen as a decent average for most models, but larger
        models may need more trials to reach a good result while smaller models
        will converge with fewer trials.
    tuner : str, optional
        The type of tuner to use when tuning with autotvm. Can be one of
        "ga", "gridsearch", "random", "xgb", "xgb_knob", and "xgb-rank".
    min_repeat_ms : int, optional
        Minimum time to run each trial. Defaults to 0 on x86 and 1000 on other targets.
    early_stopping : int, optional
        When specified, stop tuning after this number of trials if results aren't improving.
    desired_layout : str, optional
        Can be one of "NCHW" or "NHWC". When specified, compatible operations in the
        graph will have their layout set to this format. Tasks will then be tuned
        using this specified layout.
    timeout : int, optional
        If a kernel trial lasts longer than this duration in seconds, it will be
        considered a failure.
    repeat : int, optional
        How many times each measurement should be repeated.
    number : int, optional
        The number of runs a single repeat is made of.
    parallel : int, optional
        The maximum number of parallel devices to use when tuning.
    hardware_params : auto_scheduler.HardwareParams, optional
        When using the autoscheduler, this object defines the configuration of the
        target hardware.
    include_simple_tasks : bool, optional
        Whether to extract simple operations or only computationally intensive ones
        when using the autoscheduler.
    log_estimated_latency : bool, optional
        If using the autoscheduler, write the estimated latency at each step of
        tuning to file.
    additional_target_options : Optional[Dict[str, Dict[str, Any]]]
        Additional target options in a dictionary to combine with initial Target arguments.

    Returns
    -------
    tuning_records : str
        The path to the produced tuning log file.
""" target, extra_targets = common.target_from_cli(target, additional_target_options) target, target_host = Target.check_and_update_host_consist( target, target_host) # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source # model is fixed. For now, creating a clone avoids the issue. mod = deepcopy(tvmc_model.mod) params = tvmc_model.params if tuning_records is None: tuning_records = tvmc_model.default_tuning_records_path() for codegen_from_cli in extra_targets: codegen = composite_target.get_codegen_by_target( codegen_from_cli["name"]) partition_function = codegen["pass_pipeline"] mod = partition_function(mod, params, **codegen_from_cli["opts"]) # min_repeat_ms should be: # a. the value provided by the user, if any, or # b. 0ms in case target is "cpu"; otherwise 1000ms if min_repeat_ms is None: min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000 logger.info("Default --min-repeat-ms for this target is %s", min_repeat_ms) if rpc_key: if hostname is None or port is None: raise common.TVMCException( "You must provide a hostname and port to connect to a remote RPC device." ) if isinstance(port, str): port = int(port) logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key, hostname, port) runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner runner = runner_ctor( key=rpc_key, host=hostname, port=port, number=number, repeat=repeat, n_parallel=parallel, timeout=timeout, min_repeat_ms=min_repeat_ms, ) else: logger.info("Starting localhost tuning.") runner_ctor = (auto_scheduler.LocalRPCMeasureContext if enable_autoscheduler else autotvm.LocalRunner) local_server = runner_ctor( number=number, repeat=repeat, timeout=timeout, min_repeat_ms=min_repeat_ms, ) # For autoscheduling on some devices, we need to maintain a LocalRPCMeasureContext object. if enable_autoscheduler: runner = local_server.runner else: runner = local_server if enable_autoscheduler: tasks, weights = autoscheduler_get_tuning_tasks( mod=mod, params=params, target=target, alter_layout=desired_layout, hardware_params=hardware_params, include_simple_tasks=include_simple_tasks, ) # Create the autoscheduler tuning options tuning_options = auto_scheduler.TuningOptions( num_measure_trials=trials, measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)], runner=runner, early_stopping=early_stopping, ) logger.info("Autoscheduling with configuration: %s", tuning_options) # Schedule the tasks (i.e., produce a schedule for each task) schedule_tasks(tasks, weights, tuning_options, prior_records, log_estimated_latency) else: tasks = autotvm_get_tuning_tasks( mod=mod, params=params, target=target, alter_layout=desired_layout, ) # In autotvm, trials is specified per task. We can convert the per-model input # provided to per-task trials by dividing by the number of tasks. trials = int(trials / len(tasks)) logger.info("Autotuning with %d trials per task.", trials) tuning_options = { "tuner": tuner, "trials": trials, "early_stopping": early_stopping, "measure_option": autotvm.measure_option( builder=autotvm.LocalBuilder(build_func="default"), runner=runner), "tuning_records": prior_records, } logger.info("Autotuning with configuration: %s", tuning_options) tune_tasks(tasks, tuning_records, **tuning_options) return tuning_records
# This can warm up the GPU, which is necessary to get accurate measurement results.
# Typically, we recommend a value > 300 ms.
# * :code:`num_measure_trials` is the number of measurement trials we can use during the search.
#   We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a
#   good value for the search to converge. You can do more trials according to your time budget.
# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `conv2d.json`.
#   The measurement records can be used to query the history best, resume the search,
#   and do more analyses later.
# * See :any:`auto_scheduler.TuningOptions`,
#   :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters.

measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=10,
    runner=measure_ctx.runner,
    measure_callbacks=[auto_scheduler.RecordToFile("conv2d.json")],
)

######################################################################
# Run the search
# ^^^^^^^^^^^^^^
# Now we get all inputs ready. Pretty simple, isn't it?
# We can kick off the search and let the auto-scheduler do its magic.
# After some measurement trials, it will return the best schedule it found.

sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option)

# Kill the process for measurement
del measure_ctx

######################################################################
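# The records in `conv2d.json` can later be used to rebuild the best schedule
# without running the search again. A minimal sketch, assuming the same `task`
# object from above is still in scope (this mirrors the `load_best` +
# `apply_steps_from_state` pattern used in the other snippets here):

inp, res = auto_scheduler.load_best("conv2d.json", task.workload_key)
sch, args = task.compute_dag.apply_steps_from_state(inp.state)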
print(task.compute_dag)

######################################################################
# Next, we set parameters for the auto-scheduler.
#
# * `num_measure_trials` is the number of measurement trials we can use during the search.
#   We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a
#   good value for the search to converge. You can do more trials according to your time budget.
# * In addition, we use `RecordToFile` to dump measurement records into a file `matmul.json`.
#   The measurement records can be used to query the history best, resume the search,
#   and do more analyses later.
# * See :any:`auto_scheduler.TuningOptions` for more parameters.

tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile("matmul.json")]
)

######################################################################
# Run the search
# ^^^^^^^^^^^^^^
# Now we get all inputs ready. Pretty simple, isn't it?
# We can kick off the search and let the auto-scheduler do its magic.
# After some measurement trials, it will return the best schedule it found.

sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option)

######################################################################
# We can lower the schedule to see the IR after auto-scheduling.
# The auto-scheduler correctly performs optimizations including multi-level tiling,
# parallelization, vectorization, unrolling and fusion.
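# One way to print that IR, assuming `tvm` is imported at the top of the
# tutorial (the same call appears in other snippets above):

print(tvm.lower(sch, args, simple_mode=True))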