def main(): describe() print(f"Workload: {ARGS.workload}") runner = ms.runner.RPCRunner( rpc_config=ARGS.rpc_config, evaluator_config=ms.runner.EvaluatorConfig( number=ARGS.number, repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, ), alloc_repeat=1, ) with ms.Profiler() as profiler: sch: Optional[tir.Schedule] = ms.tune_tir( mod=create_te_workload(ARGS.workload, 0), target=ARGS.target, config=ms.TuneConfig( strategy="evolutionary", num_trials_per_iter=64, max_trials_per_task=ARGS.num_trials, max_trials_global=ARGS.num_trials, ), runner=runner, # type: ignore task_name=ARGS.workload, work_dir=ARGS.work_dir, num_threads=cpu_count(), ) print("Tuning Time:") print(profiler.table()) if sch is None: print("No valid schedule found!") else: print(sch.mod.script()) print(sch.trace)
def main(): describe() print(f"Workload: {ARGS.workload}") mod, params, (input_name, input_shape, input_dtype) = get_network( ARGS.workload, ARGS.input_shape, cache_dir=ARGS.cache_dir, ) input_info = {input_name: input_shape} input_data = { item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape } for input_name, input_shape in input_info.items(): print(f" input_name : {input_name}") print(f" input_shape: {input_shape}") print(f" input_dtype: {input_dtype}") runner = ms.runner.RPCRunner( rpc_config=ARGS.rpc_config, evaluator_config=ms.runner.EvaluatorConfig( number=ARGS.number, repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, ), alloc_repeat=1, ) with ms.Profiler() as profiler: lib = ms.tune_relay( mod=mod, target=ARGS.target, config=ms.TuneConfig( strategy="evolutionary", num_trials_per_iter=64, max_trials_per_task=ARGS.num_trials, max_trials_global=ARGS.num_trials, adaptive_training=ARGS.adaptive_training, ), runner=runner, # type: ignore work_dir=ARGS.work_dir, params=params, backend=ARGS.backend, ) print("Tuning Time:") print(profiler.table()) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, continuation=create_timer(ARGS.backend), backend=ARGS.backend, )
def test_cuda_tensor_core(model_name, input_shape):
    """Integration tests of auto tensorization with CUDA tensor core"""
    target = tvm.target.Target("nvidia/geforce-rtx-3070")
    dev = tvm.cuda()
    if model_name.startswith("bert"):
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
    else:
        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)

    mod, params, (input_name, _, _) = relay_workload.get_network(model_name, input_shape)
    seq = tvm.transform.Sequential(
        [
            relay.transform.ToMixedPrecision(),
        ]
    )
    with tvm.transform.PassContext(opt_level=3):
        mod = seq(mod)

    def convert_layout(mod):
        seq = tvm.transform.Sequential(
            [relay.transform.ConvertLayout({"nn.conv2d": ["NHWC", "OHWI"]})]
        )
        with tvm.transform.PassContext(opt_level=3):
            mod = seq(mod)
        return mod

    with tempfile.TemporaryDirectory() as work_dir:
        with ms.Profiler() as profiler:
            rt_mod1: tvm.runtime.Module = ms.tune_relay(
                mod=convert_layout(mod),
                params=params,
                target=target,
                config=ms.TuneConfig(
                    num_trials_per_iter=32,
                    max_trials_per_task=200,
                    max_trials_global=3000,
                ),
                sch_rules=ms.default_config._DefaultCUDATensorCore.schedule_rules,
                postprocs=ms.default_config._DefaultCUDATensorCore.postprocs,
                work_dir=work_dir,
            )
        print(profiler.table())
        # Compile without MetaSchedule for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod1)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-2, atol=2e-2)
def main(): describe() print(f"Workload: {ARGS.model_name}") onnx_model = onnx.load(ARGS.onnx_path) shape_dict = {} for item in ARGS.input_shape: print(f" input_name : {item['name']}") print(f" input_shape: {item['shape']}") print(f" input_dtype: {item['dtype']}") shape_dict[item["name"]] = item["shape"] mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) input_data = { item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape } runner = ms.runner.RPCRunner( rpc_config=ARGS.rpc_config, evaluator_config=ms.runner.EvaluatorConfig( number=ARGS.number, repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, ), alloc_repeat=1, ) with ms.Profiler() as profiler: lib = ms.tune_relay( mod=mod, target=ARGS.target, config=ms.TuneConfig( strategy="evolutionary", num_trials_per_iter=64, max_trials_per_task=ARGS.num_trials, max_trials_global=ARGS.num_trials, adaptive_training=ARGS.adaptive_training, ), runner=runner, # type: ignore work_dir=ARGS.work_dir, params=params, backend=ARGS.backend, ) print("Tuning Time:") print(profiler.table()) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, continuation=create_timer(ARGS.backend), backend=ARGS.backend, )
def test_meta_schedule_profiler_context_manager():
    with ms.Profiler() as profiler:
        time.sleep(1)
        with ms.Profiler.timeit("Level0"):
            time.sleep(1)
            with ms.Profiler.timeit("Level1"):
                time.sleep(2)
    # Note that the results are in seconds. Nested scopes are inclusive:
    # "Level0" covers its own sleep plus the enclosed "Level1" scope (~3s),
    # and "Total" covers everything inside the Profiler context (~4s).
    result = profiler.get()
    assert len(result) == 3
    assert 3.9 <= result["Total"] <= 4.1
    assert 2.9 <= result["Level0"] <= 3.1
    assert 1.9 <= result["Level1"] <= 2.1
def test_meta_schedule_tune_relay(
    model_name: str,
    input_shape: List[int],
    target: str,
):
    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
    if model_name.startswith("bert"):
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
    else:
        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)

    mod, params, (input_name, _, _) = get_network(name=model_name, input_shape=input_shape)
    target = Target(target)
    with tempfile.TemporaryDirectory() as work_dir:
        with ms.Profiler() as profiler:
            rt_mod1: tvm.runtime.Module = ms.tune_relay(
                mod=mod,
                params=params,
                target=target,
                config=ms.TuneConfig(
                    strategy="evolutionary",
                    num_trials_per_iter=32,
                    max_trials_per_task=20000,
                    max_trials_global=20000,
                ),
                work_dir=work_dir,
            )
        print(profiler.table())
        # Compile without meta-schedule for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod1)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
def main(): log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json") runner = auto_scheduler.RPCRunner( key=ARGS.rpc_key, host=ARGS.rpc_host, port=ARGS.rpc_port, n_parallel=cpu_count(logical=True), number=ARGS.number, repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, timeout=ARGS.rpc_config.session_timeout_sec, ) if ARGS.target.kind.name == "llvm": hardware_params = auto_scheduler.HardwareParams( num_cores=int(ARGS.target.attrs["num-cores"]), target=ARGS.target, ) elif ARGS.target.kind.name == "cuda": hardware_params = auto_scheduler.HardwareParams( num_cores=-1, vector_unit_bytes=16, cache_line_bytes=64, max_shared_memory_per_block=int( ARGS.target.attrs["max_shared_memory_per_block"]), max_threads_per_block=int( ARGS.target.attrs["max_threads_per_block"]), # The value `max_local_memory_per_block` is not used in AutoScheduler, # but is required by the API. max_local_memory_per_block=12345678, max_vthread_extent=8, warp_size=32, ) else: raise NotImplementedError(f"Unsupported target {ARGS.target}") describe() print(f"Workload: {ARGS.model_name}") onnx_model = onnx.load(ARGS.onnx_path) shape_dict = {} for item in ARGS.input_shape: print(f" input_name : {item['name']}") print(f" input_shape: {item['shape']}") print(f" input_dtype: {item['dtype']}") shape_dict[item["name"]] = item["shape"] mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) input_data = { item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape } with ms.Profiler() as profiler: tasks, task_weights = auto_scheduler.extract_tasks( mod["main"], params, target=ARGS.target, hardware_params=hardware_params, ) for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): print(f"==== Task {idx}: {task.desc} " f"(weight {task_weight} key: {task.workload_key}) =====") print(task.compute_dag) if ARGS.num_trials > 0: tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tuner.tune( auto_scheduler.TuningOptions( num_measure_trials=ARGS.num_trials, runner=runner, measure_callbacks=[ auto_scheduler.RecordToFile(log_file), ], ), adaptive_training=ARGS.adaptive_training, ) relay_build = { "graph": relay.build, "vm": relay.vm.compile }[ARGS.backend] with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True}, ): lib = relay_build( mod, target=ARGS.target, params=params, ) print("Tuning Time:") print(profiler.table()) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, continuation=create_timer(ARGS.backend), backend=ARGS.backend, )
def main(): describe() print(f"Workload: {ARGS.workload}") mod, params, (input_name, input_shape, input_dtype) = get_network( ARGS.workload, ARGS.input_shape, cache_dir=ARGS.cache_dir, ) input_info = {input_name: input_shape} input_data = {} for input_name, input_shape in input_info.items(): print(f" input_name: {input_name}") print(f" input_shape: {input_shape}") print(f" input_dtype: {input_dtype}") runner = ms.runner.RPCRunner( rpc_config=ARGS.rpc_config, evaluator_config=ms.runner.EvaluatorConfig( number=ARGS.number, repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, ), alloc_repeat=1, ) with ms.Profiler() as profiler: lib = ms.tune_relay( mod=mod, target=ARGS.target, config=ms.TuneConfig( strategy="evolutionary", num_trials_per_iter=64, max_trials_per_task=ARGS.num_trials, max_trials_global=ARGS.num_trials, ), runner=runner, # type: ignore work_dir=ARGS.work_dir, params=params, ) print("Tuning Time:") print(profiler.table()) graph, rt_mod, params = lib.graph_json, lib.lib, lib.params for input_name, input_shape in input_info.items(): if input_dtype.startswith("float"): input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) else: input_data[input_name] = np.random.randint( low=0, high=10000, size=input_shape, dtype=input_dtype ) def f_timer(rt_mod, dev, input_data): # pylint: disable=import-outside-toplevel from tvm.contrib.graph_executor import GraphModule # pylint: enable=import-outside-toplevel mod = GraphModule(rt_mod["default"](dev)) for input_name, input_value in input_data.items(): mod.set_input(input_name, input_value) ftimer = mod.module.time_evaluator( "run", dev, min_repeat_ms=500, repeat=3, ) results = list(np.array(ftimer().results) * 1000.0) # type: ignore print("Running time in time_evaluator: ", results) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, continuation=f_timer, ) def f_per_layer(rt_mod, dev, input_data): # pylint: disable=import-outside-toplevel from tvm.contrib.debugger.debug_executor import create # pylint: enable=import-outside-toplevel mod = create(graph, rt_mod, dev) for input_name, input_value in input_data.items(): mod.set_input(input_name, input_value) graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) print("|graph_nodes| = ", len(graph_nodes)) print("|graph_time| = ", len(graph_time)) graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)} for k, v in graph_nodes_time.items(): print(f"{k} : {v:.3f}") run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=rt_mod, dev_type=ARGS.target.kind.name, args=input_data, continuation=f_per_layer, )
def measure_candidates(database, builder, runner):
    """Send the candidates to builder and runner for distributed measurement,
    and save the results in a new json database.

    Parameters
    ----------
    database : JSONDatabase
        The database for candidates to be measured.
    builder : Builder
        The builder for building the candidates.
    runner : Runner
        The runner for measuring the candidates.

    Returns
    -------
    None
    """
    candidates, runner_results, build_fail_indices, run_fail_indices = [], [], [], []
    context = ms.TuneContext(target=Target(args.target))
    tuning_records = database.get_all_tuning_records()
    for record in tuning_records:
        candidates.append(record.as_measure_candidate())
    with ms.Profiler() as profiler:
        for idx in range(0, len(candidates), args.batch_size):
            batch_candidates = candidates[idx : idx + args.batch_size]
            context._set_measure_candidates(batch_candidates)  # pylint: disable=protected-access
            with ms.Profiler.timeit("build"):
                context._send_to_builder(builder)  # pylint: disable=protected-access
            with ms.Profiler.timeit("run"):
                context._send_to_runner(runner)  # pylint: disable=protected-access
                batch_runner_results = context._join()  # pylint: disable=protected-access
            runner_results.extend(batch_runner_results)
            for i, result in enumerate(context.builder_results):
                if result.error_msg is None:
                    ms.utils.remove_build_dir(result.artifact_path)
                else:
                    build_fail_indices.append(i + idx)
            context._clear_measure_state()  # pylint: disable=protected-access

    model_name, workload_name = database.path_workload.split("/")[-2:]
    record_name = database.path_tuning_record.split("/")[-1]
    new_database = ms.database.JSONDatabase(
        path_workload=os.path.join(args.result_cache_dir, model_name, workload_name),
        path_tuning_record=os.path.join(args.result_cache_dir, model_name, record_name),
    )
    workload = tuning_records[0].workload
    new_database.commit_workload(workload.mod)
    for i, (record, result) in enumerate(zip(tuning_records, runner_results)):
        if result.error_msg is None:
            new_database.commit_tuning_record(
                ms.database.TuningRecord(
                    trace=record.trace,
                    workload=workload,
                    run_secs=[v.value for v in result.run_secs],
                    target=Target(args.target),
                )
            )
        else:
            run_fail_indices.append(i)
    fail_indices_name = workload_name.replace("_workload.json", "_failed_indices.txt")
    with open(
        os.path.join(args.result_cache_dir, model_name, fail_indices_name),
        "w",
        encoding="utf8",
    ) as file:
        file.write(" ".join([str(n) for n in run_fail_indices]))
    print(
        f"Builder time: {profiler.get()['build']}, Runner time: {profiler.get()['run']}\n"
        f"Failed number of builds: {len(build_fail_indices)}, "
        f"Failed number of runs: {len(run_fail_indices)}"
    )
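# A minimal usage sketch of measure_candidates(), assuming a local (non-RPC)
# setup. ms.builder.LocalBuilder and ms.runner.LocalRunner are standard
# MetaSchedule components; the database paths below are hypothetical, and the
# module-level `args` namespace (args.target, args.batch_size,
# args.result_cache_dir) read by measure_candidates() is assumed to already be
# populated by this script's argument parser.
def _example_local_measurement():
    database = ms.database.JSONDatabase(
        path_workload="candidates/model/model_workload.json",  # hypothetical path
        path_tuning_record="candidates/model/model_candidates.json",  # hypothetical path
    )
    measure_candidates(
        database=database,
        builder=ms.builder.LocalBuilder(),
        runner=ms.runner.LocalRunner(),
    )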
def main(): log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json") runner = auto_scheduler.RPCRunner( key=ARGS.rpc_key, host=ARGS.rpc_host, port=ARGS.rpc_port, n_parallel=cpu_count(logical=True), number=ARGS.number, repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, timeout=ARGS.rpc_config.session_timeout_sec, ) if ARGS.target.kind.name == "llvm": hardware_params = auto_scheduler.HardwareParams( num_cores=int(ARGS.target.attrs["num-cores"]), target=ARGS.target, ) elif ARGS.target.kind.name == "cuda": hardware_params = auto_scheduler.HardwareParams( num_cores=-1, vector_unit_bytes=16, cache_line_bytes=64, max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]), max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]), # The value `max_local_memory_per_block` is not used in AutoScheduler, # but is required by the API. max_local_memory_per_block=12345678, max_vthread_extent=8, warp_size=32, ) else: raise NotImplementedError(f"Unsupported target {ARGS.target}") describe() print(f"Workload: {ARGS.workload}") with ms.Profiler() as profiler: # Same as MetaSchedule Tune TE # Does not count ApplyHistoryBest time workload_func, params = CONFIGS[ARGS.workload] params = params[0] # type: ignore workload_func = auto_scheduler.register_workload(workload_func) task = auto_scheduler.SearchTask( func=workload_func, args=params, target=ARGS.target, hardware_params=hardware_params, ) # Inspect the computational graph print("Computational DAG:") print(task.compute_dag) tune_option = auto_scheduler.TuningOptions( num_measure_trials=ARGS.num_trials, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], verbose=2, runner=runner, ) if ARGS.num_trials > 0: print("Running AutoTuning:") task.tune(tune_option, adaptive_training=ARGS.adaptive_training) print("Tuning Time:") print(profiler.table()) print("History Best:") print(task.print_best(log_file)) sch, args = task.apply_best(log_file) print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True))