def tune(self, config: Optional[TuneConfig] = None, target: Optional[Union[str, Target]] = None):
    """
    Tune the TVMScript code.

    Parameters
    ----------
    config : Optional[TuneConfig]
        The tuning configuration.
    target : Optional[Union[str, Target]]
        The target to tune for.
    """
    if config is None:
        config = TuneConfig(
            # Default setting
            strategy="replay_trace",
            num_trials_per_iter=32,
            max_trials_per_task=32,
            max_trials_global=32,
        )
    if target is None:
        target = Target("llvm --num-cores=16")
    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=self.ir_module,
            target=target,
            config=config,
            work_dir=work_dir,
        )
        self.ir_module = sch.mod
        self.build(target)
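# Usage sketch (hedged, not from the source): `loop_split` stands in for any
# TVMScript-backed wrapper exposing this `tune` method, as exercised in the
# torch test further below. Calling with no arguments falls back to the
# replay_trace defaults and the "llvm --num-cores=16" target defined above.
loop_split.tune()
loop_split.tune(
    config=TuneConfig(
        strategy="replay_trace",
        num_trials_per_iter=32,
        max_trials_per_task=32,
        max_trials_global=32,
    ),
    target=Target("nvidia/geforce-rtx-3070"),
)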
def test_meta_schedule_tune_relay(
    model_name: str,
    input_shape: List[int],
    target: str,
):
    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
    if model_name.startswith("bert"):
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
    else:
        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)
    mod, params, (input_name, _, _) = get_network(name=model_name, input_shape=input_shape)
    target = Target(target)
    with tempfile.TemporaryDirectory() as work_dir:
        database = DummyDatabase()
        rt_mod: tvm.runtime.Module = tune_relay(
            mod=mod,
            params=params,
            target=target,
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                num_trials_total=32,
            ),
            work_dir=work_dir,
            database=database,
        )
        # Compile without meta-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=Target("llvm"), params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod)
        expected_output = get_output(tvm.nd.array(data.numpy(), device=tvm.cpu()), rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
def test_tvmscript_torch_loop_split():
    x = torch.rand(128, 128).cuda()
    y = torch.zeros(128).cuda()

    result = torch.sum(x.cpu(), dim=1).numpy()

    # `config` is assumed to be a TuneConfig defined at module level in the
    # original test file; it is not shown in this snippet.
    loop_split.tune(config, Target("nvidia/geforce-rtx-3070"))
    loop_split(x, y)

    tvm.testing.assert_allclose(y.cpu().numpy(), result, atol=1e-5, rtol=1e-5)
def test_meta_schedule_tune_relay(
    model_name: str,
    input_shape: List[int],
    target: str,
):
    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
    if model_name.startswith("bert"):
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
    else:
        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)
    mod, params, (input_name, _, _) = get_network(name=model_name, input_shape=input_shape)
    target = Target(target)
    with tempfile.TemporaryDirectory() as work_dir:
        rt_mod1: tvm.runtime.Module = tune_relay(
            mod=mod,
            params=params,
            target=target,
            config=TuneConfig(
                strategy="evolutionary",
                num_trials_per_iter=32,
                max_trials_per_task=20000,
                max_trials_global=20000,
                search_strategy_config={
                    "genetic_num_iters": 10,
                },
            ),
            work_dir=work_dir,
            database=JSONDatabase(
                osp.join(work_dir, "workload.json"),
                osp.join(work_dir, "records.json"),
            ),
        )
        # Compile without meta-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod1)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
def test_tune_matmul_cuda():
    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=matmul,
            target=Target("nvidia/geforce-rtx-3070"),
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                num_trials_total=32,
            ),
            work_dir=work_dir,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)
def test_tune_matmul_cpu():
    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=matmul,
            target=Target("llvm --num-cores=16"),
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                max_trials_per_task=32,
                max_trials_global=32,
            ),
            work_dir=work_dir,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)
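# Follow-up sketch (hedged, not part of the tests): how a schedule returned by
# tune_tir could be compiled and checked. It assumes `matmul` is the usual
# 128x128x128 float32 TIR workload used in this test file; adjust the shapes
# to match the actual PrimFunc.
import numpy as np

def run_tuned_matmul(sch):
    func = tvm.build(sch.mod["main"], target="llvm --num-cores=16")
    dev = tvm.cpu()
    a = tvm.nd.array(np.random.rand(128, 128).astype("float32"), dev)
    b = tvm.nd.array(np.random.rand(128, 128).astype("float32"), dev)
    c = tvm.nd.array(np.zeros((128, 128), dtype="float32"), dev)
    func(a, b, c)
    np.testing.assert_allclose(c.numpy(), a.numpy() @ b.numpy(), rtol=1e-4, atol=1e-4)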
def test_tune_matmul():
    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_te(
            tensors=te_workload.batch_matmul_nkkm(B=1, N=128, M=128, K=128),
            target=Target("llvm --num-cores=16"),
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                max_trials_per_task=32,
                max_trials_global=32,
            ),
            work_dir=work_dir,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)
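# Alternative-route sketch (hedged): the same TE workload can be lowered to a
# PrimFunc via create_prim_func (as test_tune_matmul_cuda_tensor_core does
# below) and then tuned through tune_tir instead of tune_te.
with tempfile.TemporaryDirectory() as work_dir:
    sch = tune_tir(
        mod=create_prim_func(te_workload.batch_matmul_nkkm(B=1, N=128, M=128, K=128)),
        target=Target("llvm --num-cores=16"),
        config=ReplayTraceConfig(
            num_trials_per_iter=32,
            max_trials_per_task=32,
            max_trials_global=32,
        ),
        work_dir=work_dir,
    )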
def test_meta_schedule_relay_lowering():
    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(*data_shape).astype("float32")
    weight_sample = np.random.rand(*weight_shape).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    input_name = "data"
    dev = tvm.cpu()
    target = Target("llvm --num-cores=16")
    data = tvm.nd.array(data_sample, dev)

    with tempfile.TemporaryDirectory() as work_dir:
        database = JSONDatabase(
            osp.join(work_dir, "workload.json"),
            osp.join(work_dir, "records.json"),
        )
        # Commit an empty trace with zero run time so that ApplyHistoryBest
        # always selects this record for the conv2d workload.
        database.commit_tuning_record(
            TuningRecord(
                Trace([], {}),
                [0.0],
                database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc),
                target=target,
                args_info=[],
            )
        )
        with ApplyHistoryBest(database):
            with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_meta_schedule": True},
            ):
                rt_mod1 = relay.build(mod, target=target, params=params)

        # Compile without meta-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod1)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
def test_meta_schedule_te2primfunc_argument_order():
    @derived_object
    class TestDummyDatabase(PyDatabase):
        def __init__(self):
            super().__init__()
            self.records = []
            self.workload_reg = []

        def has_workload(self, mod: IRModule) -> bool:
            for workload in self.workload_reg:
                if tvm.ir.structural_equal(workload.mod, mod):
                    return True
            # The database has already been seeded with all correct workloads,
            # so a failed lookup means the TE-to-PrimFunc lowering produced a
            # structurally different (i.e. wrong) TIR.
            raise ValueError(
                "The workload searched for is not in given database!"
                " Incorrect TIR was generated from TE subgraph."
            )

        def commit_tuning_record(self, record: TuningRecord) -> None:
            self.records.append(record)

        def commit_workload(self, mod: IRModule) -> Workload:
            for workload in self.workload_reg:
                if tvm.ir.structural_equal(workload.mod, mod):
                    return workload
            workload = Workload(mod)
            self.workload_reg.append(workload)
            return workload

        def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]:
            return list(
                filter(
                    lambda x: x.workload == workload,
                    sorted(self.records, key=lambda x: sum(x.run_secs) / len(x.run_secs)),
                )
            )[: int(top_k)]

        def __len__(self) -> int:
            return len(self.records)

        def print_results(self) -> None:
            print("\n".join([str(r) for r in self.records]))

    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(*data_shape).astype("float32")
    weight_sample = np.random.rand(*weight_shape).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    input_name = "data"
    dev = tvm.cpu()
    target = Target("llvm --num-cores=16")
    data = tvm.nd.array(data_sample, dev)

    database = TestDummyDatabase()
    database.commit_workload(tvmgen_default_fused_layout_transform)
    database.commit_workload(tvmgen_default_fused_layout_transform_1)
    database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc)

    with ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            rt_mod1 = relay.build(mod, target=target, params=params)

    # Compile without meta-scheduler for correctness check
    with tvm.transform.PassContext(opt_level=0):
        rt_mod2 = relay.build(mod, target=target, params=params)

    def get_output(data, lib):
        module = graph_executor.GraphModule(lib["default"](dev))
        module.set_input(input_name, data)
        module.run()
        return module.get_output(0).numpy()

    # Check correctness
    actual_output = get_output(data, rt_mod1)
    expected_output = get_output(data, rt_mod2)
    assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
# under the License.
import pytest

import tvm
import tvm.testing
from tvm import relay
from tvm.target.target import Target
from tvm.relay.backend import Runtime, Executor, graph_executor_codegen
from tvm.relay.build_module import _reconstruct_from_deprecated_options


@pytest.mark.parametrize(
    "target,executor,runtime",
    [
        [Target("c"), None, None],
        [Target("c -runtime=c"), None, Runtime("crt")],
        [Target("c -system-lib"), None, Runtime("cpp", {"system-lib": True})],
        [Target("c -runtime=c -system-lib"), None, Runtime("crt", {"system-lib": True})],
        [Target("c -executor=aot"), Executor("aot"), None],
        [
            Target("c -executor=aot -interface-api=c"),
            Executor("aot", {"interface-api": "c"}),
            None,
        ],
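# The parametrize list above is truncated in this excerpt. A plausible test
# body consuming it might look like the sketch below; the exact return
# convention of _reconstruct_from_deprecated_options (an executor/runtime pair
# recovered from deprecated target flags) is an assumption here, not confirmed
# by this excerpt.
def test_deprecated_target_options(target, executor, runtime):
    actual_executor, actual_runtime = _reconstruct_from_deprecated_options(target)
    assert actual_executor == executor
    assert actual_runtime == runtime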
def test_meta_schedule_tune_relay(model_name: str, batch_size: int, target: str):
    if model_name == "inception_v3" and batch_size == 1:
        pytest.skip("inception_v3 does not handle batch_size of 1")

    input_shape: Tuple[int, ...]
    input_name = "input0"
    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
    if MODEL_TYPES[model_name] == MODEL_TYPE.TEXT_CLASSIFICATION:
        seq_length = 128
        input_name = "input_ids"
        input_shape = (batch_size, seq_length)
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
    else:
        if MODEL_TYPES[model_name] == MODEL_TYPE.IMAGE_CLASSIFICATION:
            input_shape = (batch_size, 3, 299, 299)
        elif MODEL_TYPES[model_name] == MODEL_TYPE.SEGMENTATION:
            input_shape = (batch_size, 3, 299, 299)
        elif MODEL_TYPES[model_name] == MODEL_TYPE.OBJECT_DETECTION:
            input_shape = (1, 3, 300, 300)
        elif MODEL_TYPES[model_name] == MODEL_TYPE.VIDEO_CLASSIFICATION:
            input_shape = (batch_size, 3, 3, 299, 299)
        else:
            raise ValueError("Unsupported model: " + model_name)
        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)

    output_shape: Tuple[int, int] = (batch_size, 1000)
    mod, params = get_torch_model(
        model_name=model_name,
        input_shape=input_shape,
        output_shape=output_shape,
        dtype="float32",
    )
    with tempfile.TemporaryDirectory() as work_dir:
        target = Target(target)
        database = DummyDatabase()
        rt_mod: tvm.runtime.Module = tune_relay(
            mod=mod,
            params=params,
            target=target,
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                num_trials_total=32,
            ),
            work_dir=work_dir,
            database=database,
        )
        # Compile without meta-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
def test_tune_matmul_cuda_tensor_core():
    n = 512
    mod = create_prim_func(te_workload.matmul_fp16(n, n, n))
    target = Target("nvidia/geforce-rtx-3070")
    config = ReplayTraceConfig(
        num_trials_per_iter=32,
        num_trials_total=320,
    )

    class DefaultTensorCore:
        @staticmethod
        def _sch_rules():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                schedule_rule as M,
            )

            return [
                M.AutoInline(
                    into_producer=False,
                    into_consumer=True,
                    # into_cache_only=False,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.MultiLevelTiling(
                    structure="SSSRRSRS",
                    tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
                    # use_tensor_core=True,
                    max_innermost_factor=64,
                    vector_load_lens=[1, 2, 3, 4],
                    # `M` is the local alias for schedule_rule; the original
                    # used the unimported name `schedule_rule` here.
                    reuse_read=M.ReuseType(
                        req="must",
                        levels=[4],
                        scope="shared",
                    ),
                    reuse_write=M.ReuseType(
                        req="no",
                        levels=[],
                        scope="",
                    ),
                ),
                M.AutoInline(
                    into_producer=True,
                    into_consumer=True,
                    # into_cache_only=True,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.ParallelizeVectorizeUnroll(
                    max_jobs_per_core=-1,  # disable parallelize
                    max_vectorize_extent=-1,  # disable vectorize
                    unroll_max_steps=[0, 16, 64, 512, 1024],
                    unroll_explicit=True,
                ),
            ]

        @staticmethod
        def _postproc():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                postproc as M,
            )

            return [
                # M.RewriteCooperativeFetch(),
                M.RewriteParallelVectorizeUnroll(),
                M.RewriteReductionBlock(),
                # M.RewriteTensorCore(),
                M.VerifyGPUCode(),
            ]

    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=mod,
            target=target,
            config=config,
            work_dir=work_dir,
            space=PostOrderApply(),
            sch_rules=DefaultTensorCore._sch_rules,
            postprocs=DefaultTensorCore._postproc,
            num_threads=None,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)

        from tvm.contrib import nvcc
        import numpy as np

        ctx = tvm.cuda(0)  # tvm.gpu(0) is deprecated in favor of tvm.cuda(0)
        if nvcc.have_tensorcore(ctx.compute_version):
            with tvm.transform.PassContext():
                func = tvm.build(sch.mod["main"], [], "cuda")
                print(sch.mod.script())
                print(func.imported_modules[0].get_source())
            a_np = np.random.uniform(size=(n, n)).astype("float16")
            b_np = np.random.uniform(size=(n, n)).astype("float16")
            a = tvm.nd.array(a_np, ctx)
            b = tvm.nd.array(b_np, ctx)
            c = tvm.nd.array(np.zeros((n, n), dtype="float32"), ctx)
            evaluator = func.time_evaluator(func.entry_name, ctx, number=3, repeat=1, min_repeat_ms=40)
            print("matmul with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3))
            np.testing.assert_allclose(
                c.numpy(),  # asnumpy() is deprecated in favor of numpy()
                np.matmul(a_np.astype("float32"), b_np.astype("float32")),
                rtol=1e-4,
                atol=1e-4,
            )