def test_gpu_batch_norm_bmn():
    """Cross-thread reduction on batch-norm (B=1, M=N=512) yields exactly two spaces.

    Design space 0 leaves the reduction untouched; design space 1 binds the
    fused reduction loop to threadIdx.x and stages the result in shared memory.
    """
    # Expected trace for each design space, compared verbatim by check_trace.
    expected_traces = [
        [],
        [
            'b0 = sch.get_block(name="C", func_name="main")',
            "b1, = sch.get_consumers(block=b0)",
            "l2, = sch.get_loops(block=b1)",
            "v3 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
            "l4, l5 = sch.split(loop=l2, factors=[None, v3], preserve_unit_iters=True)",
            'sch.bind(loop=l5, thread_axis="threadIdx.x")',
            "sch.compute_at(block=b0, loop=l4, preserve_unit_loops=True)",
            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
            "l6, l7, l8, l9 = sch.get_loops(block=b0)",
            "l10 = sch.fuse(l8, l9, preserve_unit_iters=True)",
            "l11, l12 = sch.split(loop=l10, factors=[None, v3], preserve_unit_iters=True)",
            'sch.bind(loop=l12, thread_axis="threadIdx.x")',
        ],
    ]
    gpu_target = Target("nvidia/geforce-rtx-3090", host="llvm")
    tune_context = _create_context(
        create_prim_func(
            te_workload.norm_bmn(
                B=1,
                M=512,
                N=512,
            )
        ),
        target=gpu_target,
        rule=cross_thread_reduction(target=gpu_target),
    )
    design_spaces = tune_context.space_generator.generate_design_space(mod=tune_context.mod)
    assert len(design_spaces) == 2
    check_trace(design_spaces, expected_traces)
def test_cpu_matmul():
    """add_rfactor on a small matmul (n=m=4, k=512) yields exactly three spaces.

    Design space 0 applies no rfactor; spaces 1 and 2 rfactor on the inner
    (l7) and outer (l6) halves of the split reduction loop respectively.
    """
    # Expected trace for each design space, compared verbatim by check_trace.
    expected_traces = [
        [],
        [
            'b0 = sch.get_block(name="C", func_name="main")',
            "l1, l2, l3 = sch.get_loops(block=b0)",
            "v4, v5 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
            "l6, l7 = sch.split(loop=l3, factors=[v4, v5], preserve_unit_iters=True)",
            "b8 = sch.rfactor(loop=l7, factor_axis=2)",
            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.random_compute_producer", ann_val=1)',
        ],
        [
            'b0 = sch.get_block(name="C", func_name="main")',
            "l1, l2, l3 = sch.get_loops(block=b0)",
            "v4, v5 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)",
            "l6, l7 = sch.split(loop=l3, factors=[v4, v5], preserve_unit_iters=True)",
            "b8 = sch.rfactor(loop=l6, factor_axis=2)",
            'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.random_compute_producer", ann_val=1)',
        ],
    ]
    cpu_target = Target("llvm --num-cores=32")
    tune_context = _create_context(
        create_prim_func(
            te_workload.matmul(
                n=4,
                m=4,
                k=512,
            )
        ),
        target=cpu_target,
        rule=add_rfactor(target=cpu_target),
    )
    design_spaces = tune_context.space_generator.generate_design_space(mod=tune_context.mod)
    assert len(design_spaces) == 3
    check_trace(design_spaces, expected_traces)
def test_gpu_softmax_mn():
    """Cross-thread reduction on softmax (n=m=256) yields exactly four spaces.

    The workload has two reduction blocks (T_softmax_maxelem and
    T_softmax_expsum), so the rule enumerates all subsets of blocks to apply
    cross-thread reduction to:
      space 0: neither block;
      space 1: maxelem only;
      space 2: expsum only;
      space 3: both (expsum first, then maxelem).
    """
    # Expected trace for each design space, compared verbatim by check_trace.
    expected = [
        [],
        [
            'b0 = sch.get_block(name="T_softmax_maxelem", func_name="main")',
            "b1, = sch.get_consumers(block=b0)",
            "l2, l3 = sch.get_loops(block=b1)",
            "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
            "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
            'sch.bind(loop=l6, thread_axis="threadIdx.x")',
            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True)",
            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
            "l7, l8, l9 = sch.get_loops(block=b0)",
            "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)",
            'sch.bind(loop=l11, thread_axis="threadIdx.x")',
        ],
        [
            'b0 = sch.get_block(name="T_softmax_expsum", func_name="main")',
            "b1, = sch.get_consumers(block=b0)",
            "l2, l3 = sch.get_loops(block=b1)",
            "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
            "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)",
            'sch.bind(loop=l6, thread_axis="threadIdx.x")',
            "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True)",
            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
            "l7, l8, l9 = sch.get_loops(block=b0)",
            "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)",
            'sch.bind(loop=l11, thread_axis="threadIdx.x")',
        ],
        [
            # Both reductions rewritten: expsum (b1) first, then maxelem (b0).
            'b0 = sch.get_block(name="T_softmax_maxelem", func_name="main")',
            'b1 = sch.get_block(name="T_softmax_expsum", func_name="main")',
            "b2, = sch.get_consumers(block=b1)",
            "l3, l4 = sch.get_loops(block=b2)",
            "v5 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
            "l6, l7 = sch.split(loop=l4, factors=[None, v5], preserve_unit_iters=True)",
            'sch.bind(loop=l7, thread_axis="threadIdx.x")',
            "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True)",
            'sch.set_scope(block=b1, buffer_index=0, storage_scope="shared")',
            "l8, l9, l10 = sch.get_loops(block=b1)",
            "l11, l12 = sch.split(loop=l10, factors=[None, v5], preserve_unit_iters=True)",
            'sch.bind(loop=l12, thread_axis="threadIdx.x")',
            "b13, = sch.get_consumers(block=b0)",
            "l14, l15 = sch.get_loops(block=b13)",
            "v16 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])",
            "l17, l18 = sch.split(loop=l15, factors=[None, v16], preserve_unit_iters=True)",
            'sch.bind(loop=l18, thread_axis="threadIdx.x")',
            "sch.compute_at(block=b0, loop=l14, preserve_unit_loops=True)",
            'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")',
            "l19, l20, l21 = sch.get_loops(block=b0)",
            "l22, l23 = sch.split(loop=l21, factors=[None, v16], preserve_unit_iters=True)",
            'sch.bind(loop=l23, thread_axis="threadIdx.x")',
        ],
    ]
    target = Target("nvidia/geforce-rtx-3090", host="llvm")
    ctx = _create_context(
        create_prim_func(
            te_workload.softmax_mn(
                n=256,
                m=256,
            )
        ),
        target=target,
        rule=cross_thread_reduction(target=target),
    )
    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
    assert len(spaces) == 4
    check_trace(spaces, expected)
def test_tune_matmul_cuda_tensor_core():
    """End-to-end tune_tir run on a 512x512x512 fp16 matmul with a custom
    tensor-core-oriented rule set, then (hardware permitting) build, time,
    and numerically validate the best schedule found.

    NOTE(review): requires an NVIDIA GPU with tensor cores at runtime; the
    numeric check is skipped when nvcc.have_tensorcore reports otherwise.
    """
    n = 512
    mod = create_prim_func(te_workload.matmul_fp16(n, n, n))
    target = Target("nvidia/geforce-rtx-3070")
    # Replay-trace search: 32 candidates per iteration, 320 trials total.
    config = ReplayTraceConfig(
        num_trials_per_iter=32,
        num_trials_total=320,
    )

    class DefaultTensorCore:
        """Bundles the schedule rules and postprocessors passed to tune_tir."""

        @staticmethod
        def _sch_rules():
            # Rules applied in order: inline consumers, multi-level tiling
            # bound to CUDA block/thread axes, inline the rest, then
            # unroll annotation (parallelize/vectorize disabled via -1).
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                schedule_rule as M,
            )

            return [
                M.AutoInline(
                    into_producer=False,
                    into_consumer=True,
                    # into_cache_only=False,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.MultiLevelTiling(
                    structure="SSSRRSRS",
                    tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
                    # use_tensor_core=True,
                    max_innermost_factor=64,
                    vector_load_lens=[1, 2, 3, 4],
                    # Reads must be staged through shared memory at level 4.
                    reuse_read=schedule_rule.ReuseType(
                        req="must",
                        levels=[4],
                        scope="shared",
                    ),
                    # Write-back reuse disabled.
                    reuse_write=schedule_rule.ReuseType(
                        req="no",
                        levels=[],
                        scope="",
                    ),
                ),
                M.AutoInline(
                    into_producer=True,
                    into_consumer=True,
                    # into_cache_only=True,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.ParallelizeVectorizeUnroll(
                    max_jobs_per_core=-1,  # disable parallelize
                    max_vectorize_extent=-1,  # disable vectorize
                    unroll_max_steps=[0, 16, 64, 512, 1024],
                    unroll_explicit=True,
                ),
            ]

        @staticmethod
        def _postproc():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                postproc as M,
            )

            return [
                # M.RewriteCooperativeFetch(),
                M.RewriteParallelVectorizeUnroll(),
                M.RewriteReductionBlock(),
                # M.RewriteTensorCore(),
                M.VerifyGPUCode(),
            ]

    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=mod,
            target=target,
            config=config,
            work_dir=work_dir,
            space=PostOrderApply(),
            sch_rules=DefaultTensorCore._sch_rules,
            postprocs=DefaultTensorCore._postproc,
            num_threads=None,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)
            from tvm.contrib import nvcc
            import numpy as np

            # NOTE(review): nesting below is reconstructed from a
            # whitespace-flattened source; statement order is preserved.
            ctx = tvm.gpu(0)
            if nvcc.have_tensorcore(ctx.compute_version):
                with tvm.transform.PassContext():
                    func = tvm.build(sch.mod["main"], [], "cuda")
                    print(sch.mod.script())
                    print(func.imported_modules[0].get_source())
                # Random fp16 inputs, fp32 accumulate; compare against NumPy.
                a_np = np.random.uniform(size=(n, n)).astype("float16")
                b_np = np.random.uniform(size=(n, n)).astype("float16")
                a = tvm.nd.array(a_np, ctx)
                b = tvm.nd.array(b_np, ctx)
                c = tvm.nd.array(np.zeros((n, n), dtype="float32"), ctx)
                evaluator = func.time_evaluator(func.entry_name, ctx, number=3, repeat=1, min_repeat_ms=40)
                print("matmul with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3))
                np.testing.assert_allclose(
                    c.asnumpy(),
                    np.matmul(a_np.astype("float32"), b_np.astype("float32")),
                    rtol=1e-4,
                    atol=1e-4,
                )