def main():
    """Tune the TE workload selected by ``ARGS.workload`` and print the outcome.

    An ``RPCRunner`` is built from ``ARGS.rpc_config`` with a fixed evaluator
    configuration, ``ms.tune_tir`` is run with the ``"evolutionary"``
    strategy, and then either the tuned module script and trace are printed
    or a message saying that no valid schedule was found.
    """
    # Measurement settings are hard-coded here rather than read from ARGS.
    evaluator_config = ms.runner.EvaluatorConfig(
        number=3,
        repeat=1,
        min_repeat_ms=100,
        enable_cpu_cache_flush=False,
    )
    rpc_runner = ms.runner.RPCRunner(
        rpc_config=ARGS.rpc_config,
        evaluator_config=evaluator_config,
        alloc_repeat=1,
        max_workers=ARGS.rpc_workers,
    )

    workload_mod = create_te_workload(ARGS.workload, 0)
    # Per-task and global trial budgets are both taken from ARGS.num_trials.
    tune_config = ms.TuneConfig(
        strategy="evolutionary",
        num_trials_per_iter=64,
        max_trials_per_task=ARGS.num_trials,
        max_trials_global=ARGS.num_trials,
    )
    best_sch: Optional[tir.Schedule] = ms.tune_tir(
        mod=workload_mod,
        target=ARGS.target,
        config=tune_config,
        runner=rpc_runner,  # type: ignore
        task_name=ARGS.workload,
        work_dir=ARGS.work_dir,
        num_threads=cpu_count(),
    )

    if best_sch is None:
        print("No valid schedule found!")
        return
    print(best_sch.mod.script())
    print(best_sch.trace)
def main():
    """Tune the TE workload selected by ``ARGS.workload`` under a profiler.

    Calls ``describe()``, prints the workload name, builds an ``RPCRunner``
    whose evaluator settings come from ``ARGS``, and runs ``ms.tune_tir``
    with the ``"evolutionary"`` strategy inside an ``ms.Profiler`` context.
    Afterwards the profiler table is printed, followed by either the tuned
    module script and trace or a message saying that no valid schedule was
    found.
    """
    describe()
    print(f"Workload: {ARGS.workload}")

    evaluator_config = ms.runner.EvaluatorConfig(
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )
    rpc_runner = ms.runner.RPCRunner(
        rpc_config=ARGS.rpc_config,
        evaluator_config=evaluator_config,
        alloc_repeat=1,
    )

    with ms.Profiler() as profiler:
        workload_mod = create_te_workload(ARGS.workload, 0)
        # Per-task and global trial budgets are both taken from ARGS.num_trials.
        tune_config = ms.TuneConfig(
            strategy="evolutionary",
            num_trials_per_iter=64,
            max_trials_per_task=ARGS.num_trials,
            max_trials_global=ARGS.num_trials,
        )
        best_sch: Optional[tir.Schedule] = ms.tune_tir(
            mod=workload_mod,
            target=ARGS.target,
            config=tune_config,
            runner=rpc_runner,  # type: ignore
            task_name=ARGS.workload,
            work_dir=ARGS.work_dir,
            num_threads=cpu_count(),
        )

    # The timing table is printed whether or not a schedule was found.
    print("Tuning Time:")
    print(profiler.table())

    if best_sch is None:
        print("No valid schedule found!")
        return
    print(best_sch.mod.script())
    print(best_sch.trace)
def test_tune_block_cpu():
    """Tune only block ``"A"`` of ``two_step`` on an LLVM target.

    A custom schedule rule is supplied that inlines the block it is applied
    to unless that block's name hint is ``"root"``. The test passes when
    ``tune_tir`` returns a schedule.
    """

    @derived_object
    class RemoveBlock(PyScheduleRule):
        """Schedule rule that inlines every non-root block it is applied to."""

        def _initialize_with_tune_context(self, context: TuneContext) -> None:
            """Nothing to set up for this rule."""

        def apply(self, sch: Schedule, block: BlockRV):
            """Return a one-element list holding the resulting schedule."""
            is_root = sch.get(block).name_hint == "root"
            if is_root:
                # The root block is left alone; hand back the input schedule.
                return [sch]
            # Work on a copy so the incoming schedule is not mutated.
            inlined = sch.copy()
            inlined.compute_inline(block)
            return [inlined]

    def make_rules(*args):
        """Build the rule list; any positional arguments are ignored."""
        return [RemoveBlock()]

    with tempfile.TemporaryDirectory() as work_dir:
        llvm_target = Target("llvm --num-cores=16")
        replay_config = TuneConfig(
            strategy="replay_trace",
            num_trials_per_iter=32,
            max_trials_per_task=32,
            max_trials_global=32,
        )
        tuned: Schedule = tune_tir(
            mod=two_step,
            target=llvm_target,
            config=replay_config,
            work_dir=work_dir,
            blocks=["A"],
            sch_rules=make_rules,
        )
        assert tuned is not None
def test_tune_matmul_cuda():
    """Tune ``matmul`` for the ``nvidia/geforce-rtx-3070`` target and print the result.

    Prints the tuned module script and trace, or a message when ``tune_tir``
    returns ``None``. Nothing is asserted.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        cuda_target = Target("nvidia/geforce-rtx-3070")
        # NOTE(review): the sibling CPU matmul test passes
        # max_trials_per_task / max_trials_global instead of num_trials_total;
        # confirm which keyword set the installed ReplayTraceConfig accepts.
        replay_config = ReplayTraceConfig(
            num_trials_per_iter=32,
            num_trials_total=32,
        )
        tuned: Schedule = tune_tir(
            mod=matmul,
            target=cuda_target,
            config=replay_config,
            work_dir=work_dir,
        )
        if tuned is None:
            print("No valid schedule found!")
            return
        print(tuned.mod.script())
        print(tuned.trace)
def test_tune_matmul_cpu():
    """Tune ``matmul`` for a 16-core LLVM target and print the result.

    Prints the tuned module script and trace, or a message when ``tune_tir``
    returns ``None``. Nothing is asserted.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        llvm_target = Target("llvm --num-cores=16")
        replay_config = ReplayTraceConfig(
            num_trials_per_iter=32,
            max_trials_per_task=32,
            max_trials_global=32,
        )
        tuned: Schedule = tune_tir(
            mod=matmul,
            target=llvm_target,
            config=replay_config,
            work_dir=work_dir,
        )
        if tuned is None:
            print("No valid schedule found!")
            return
        print(tuned.mod.script())
        print(tuned.trace)
def test_tune_matmul_cuda_tensor_core():
    """Tune a 512x512x512 fp16 matmul for CUDA, then build, time and check it.

    The search space is ``PostOrderApply`` with the schedule rules and
    postprocessors defined by the local ``DefaultTensorCore`` class. When a
    schedule is found and ``nvcc.have_tensorcore`` reports support for GPU 0,
    the schedule is built for ``"cuda"``, timed, and its output is compared
    with ``np.matmul`` on float32 copies of the inputs.
    """
    n = 512
    mod = create_prim_func(te_workload.matmul_fp16(n, n, n))
    target = Target("nvidia/geforce-rtx-3070")
    config = ReplayTraceConfig(
        num_trials_per_iter=32,
        num_trials_total=320,
    )

    class DefaultTensorCore:
        """Factories for the schedule rules and postprocessors used below."""

        @staticmethod
        def _sch_rules():
            """Return the schedule rules, in application order."""
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                schedule_rule as M,
            )

            # Keyword arguments shared by both AutoInline rules; only
            # ``into_producer`` differs between them.
            inline_kwargs = dict(
                into_consumer=True,
                inline_const_tensor=True,
                disallow_if_then_else=False,
                require_injective=False,
                require_ordered=False,
                disallow_op=None,
            )

            inline_before_tiling = M.AutoInline(
                into_producer=False,
                # into_cache_only=False,
                **inline_kwargs,
            )
            tiling = M.MultiLevelTiling(
                structure="SSSRRSRS",
                tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
                # use_tensor_core=True,
                max_innermost_factor=64,
                vector_load_lens=[1, 2, 3, 4],
                reuse_read=schedule_rule.ReuseType(
                    req="must",
                    levels=[4],
                    scope="shared",
                ),
                reuse_write=schedule_rule.ReuseType(
                    req="no",
                    levels=[],
                    scope="",
                ),
            )
            inline_after_tiling = M.AutoInline(
                into_producer=True,
                # into_cache_only=True,
                **inline_kwargs,
            )
            unroll = M.ParallelizeVectorizeUnroll(
                max_jobs_per_core=-1,  # disable parallelize
                max_vectorize_extent=-1,  # disable vectorize
                unroll_max_steps=[0, 16, 64, 512, 1024],
                unroll_explicit=True,
            )
            return [inline_before_tiling, tiling, inline_after_tiling, unroll]

        @staticmethod
        def _postproc():
            """Return the postprocessors, in application order."""
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                postproc as M,
            )

            return [
                # M.RewriteCooperativeFetch(),
                M.RewriteParallelVectorizeUnroll(),
                M.RewriteReductionBlock(),
                # M.RewriteTensorCore(),
                M.VerifyGPUCode(),
            ]

    with tempfile.TemporaryDirectory() as work_dir:
        tuned: Schedule = tune_tir(
            mod=mod,
            target=target,
            config=config,
            work_dir=work_dir,
            space=PostOrderApply(),
            sch_rules=DefaultTensorCore._sch_rules,
            postprocs=DefaultTensorCore._postproc,
            num_threads=None,
        )
        if tuned is None:
            print("No valid schedule found!")
            return

        print(tuned.mod.script())
        print(tuned.trace)

        from tvm.contrib import nvcc  # pylint: disable=import-outside-toplevel
        import numpy as np  # pylint: disable=import-outside-toplevel

        ctx = tvm.gpu(0)
        if not nvcc.have_tensorcore(ctx.compute_version):
            # Nothing is built or checked when the capability probe fails.
            return

        with tvm.transform.PassContext():
            func = tvm.build(tuned.mod["main"], [], "cuda")
            print(tuned.mod.script())
            print(func.imported_modules[0].get_source())

        # fp16 inputs, fp32 output buffer.
        shape = (n, n)
        a_np = np.random.uniform(size=shape).astype("float16")
        b_np = np.random.uniform(size=shape).astype("float16")
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(shape, dtype="float32"), ctx)

        evaluator = func.time_evaluator(
            func.entry_name, ctx, number=3, repeat=1, min_repeat_ms=40
        )
        mean_ms = evaluator(a, b, c).mean * 1e3
        print("matmul with tensor core: %f ms" % (mean_ms))

        actual = c.asnumpy()
        expected = np.matmul(a_np.astype("float32"), b_np.astype("float32"))
        np.testing.assert_allclose(actual, expected, rtol=1e-4, atol=1e-4)