def _sch_rules() -> List[ScheduleRule]:
    """Assemble the ordered list of meta-schedule rules.

    The rules are, in order: ``AutoInline`` (into consumers only),
    ``AddRFactor``, ``MultiLevelTiling`` with structure ``"SSRSRS"`` and no
    thread bindings, ``ParallelizeVectorizeUnroll`` and
    ``RandomComputeLocation``.

    Returns
    -------
    rules : List[ScheduleRule]
        Freshly constructed rule objects, in the order listed above.
    """
    # Imported lazily, inside the function, exactly as the caller expects.
    from tvm.meta_schedule import schedule_rule as M

    inline_rule = M.AutoInline(
        into_producer=False,
        into_consumer=True,
        inline_const_tensor=True,
        disallow_if_then_else=True,
        require_injective=True,
        require_ordered=True,
        disallow_op=["tir.exp"],
    )
    rfactor_rule = M.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64)

    # Optional ("may") write cache in global scope at tiling levels 1 and 2.
    write_reuse = M.ReuseType(req="may", levels=[1, 2], scope="global")
    tiling_rule = M.MultiLevelTiling(
        structure="SSRSRS",
        tile_binds=None,
        max_innermost_factor=64,
        vector_load_lens=None,
        reuse_read=None,
        reuse_write=write_reuse,
    )

    parallel_rule = M.ParallelizeVectorizeUnroll(
        max_jobs_per_core=16,
        max_vectorize_extent=64,
        unroll_max_steps=[0, 16, 64, 512],
        unroll_explicit=True,
    )
    location_rule = M.RandomComputeLocation()

    return [
        inline_rule,
        rfactor_rule,
        tiling_rule,
        parallel_rule,
        location_rule,
    ]
def _sch_rules():
    """Build the ordered list of meta-schedule rules.

    The rules are, in order: ``AutoInline`` (into consumers only),
    ``MultiLevelTiling`` with structure ``"SSSRRSRS"`` bound to
    ``blockIdx.x`` / ``blockIdx.y`` / ``threadIdx.y``, a second
    ``AutoInline`` (into both producers and consumers), and
    ``ParallelizeVectorizeUnroll`` with parallelization and vectorization
    disabled.

    Returns
    -------
    rules : list
        Freshly constructed schedule-rule objects, in the order listed above.
    """
    from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
        schedule_rule as M,
    )

    return [
        M.AutoInline(
            into_producer=False,
            into_consumer=True,
            # into_cache_only=False,
            inline_const_tensor=True,
            disallow_if_then_else=False,
            require_injective=False,
            require_ordered=False,
            disallow_op=None,
        ),
        M.MultiLevelTiling(
            structure="SSSRRSRS",
            tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
            # use_tensor_core=True,
            max_innermost_factor=64,
            vector_load_lens=[1, 2, 3, 4],
            # ReuseType is reached through the locally imported alias `M`;
            # the bare name `schedule_rule` is not bound by this function's
            # import, so it must not be relied on here.
            reuse_read=M.ReuseType(
                req="must",
                levels=[4],
                scope="shared",
            ),
            reuse_write=M.ReuseType(
                req="no",
                levels=[],
                scope="",
            ),
        ),
        M.AutoInline(
            into_producer=True,
            into_consumer=True,
            # into_cache_only=True,
            inline_const_tensor=True,
            disallow_if_then_else=False,
            require_injective=False,
            require_ordered=False,
            disallow_op=None,
        ),
        M.ParallelizeVectorizeUnroll(
            max_jobs_per_core=-1,  # disable parallelize
            max_vectorize_extent=-1,  # disable vectorize
            unroll_max_steps=[0, 16, 64, 512, 1024],
            unroll_explicit=True,
        ),
    ]
def _sch_rules() -> List[ScheduleRule]:
    """Assemble the ordered list of meta-schedule rules.

    The rules are, in order: ``MultiLevelTiling`` with structure
    ``"SSSRRSRS"`` bound to ``blockIdx.x`` / ``vthread.x`` / ``threadIdx.x``,
    ``AutoInline`` (into both producers and consumers),
    ``CrossThreadReduction``, ``ParallelizeVectorizeUnroll`` and ``AutoBind``.

    Returns
    -------
    rules : List[ScheduleRule]
        Freshly constructed rule objects, in the order listed above.
    """
    # Imported lazily, inside the function, exactly as the caller expects.
    from tvm.meta_schedule import schedule_rule as M

    # Mandatory ("must") read cache in "shared" scope at level 4 and
    # mandatory write cache in "local" scope at level 3.
    read_reuse = M.ReuseType(req="must", levels=[4], scope="shared")
    write_reuse = M.ReuseType(req="must", levels=[3], scope="local")
    tiling_rule = M.MultiLevelTiling(
        structure="SSSRRSRS",
        tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
        max_innermost_factor=64,
        vector_load_lens=[1, 2, 3, 4],
        reuse_read=read_reuse,
        reuse_write=write_reuse,
    )

    inline_rule = M.AutoInline(
        into_producer=True,
        into_consumer=True,
        inline_const_tensor=True,
        disallow_if_then_else=False,
        require_injective=False,
        require_ordered=False,
        disallow_op=None,
    )
    reduction_rule = M.CrossThreadReduction(
        thread_extents=[4, 8, 16, 32, 64, 128, 256, 512],
    )
    unroll_rule = M.ParallelizeVectorizeUnroll(
        max_jobs_per_core=-1,  # disable parallelize
        max_vectorize_extent=-1,  # disable vectorize
        unroll_max_steps=[0, 16, 64, 512, 1024],
        unroll_explicit=True,
    )
    bind_rule = M.AutoBind(
        max_threadblocks=256,
        thread_extents=[32, 64, 128, 256, 512, 1024],
    )

    return [
        tiling_rule,
        inline_rule,
        reduction_rule,
        unroll_rule,
        bind_rule,
    ]
tile_binds=None, max_innermost_factor=64, vector_load_lens=None, reuse_read=None, reuse_write=schedule_rule.ReuseType( req="may", levels=[1, 2], scope="global", ), ), schedule_rule.MultiLevelTiling( structure="SSRSRS", tile_binds=None, max_innermost_factor=64, vector_load_lens=None, reuse_read=None, reuse_write=schedule_rule.ReuseType( req="may", levels=[1, 2], scope="global", ), ), schedule_rule.ParallelizeVectorizeUnroll( max_jobs_per_core=16, max_vectorize_extent=64, unroll_max_steps=[0, 16, 64, 512], unroll_explicit=True, ), schedule_rule.RandomComputeLocation(), ]