Code Example #1
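A minimal MetaSchedule tuning entry point: it constructs an ms.runner.RPCRunner with a fixed evaluator configuration and passes it to ms.tune_tir, tuning a TE workload with the evolutionary search strategy. ARGS is the script's parsed command-line arguments (RPC settings, workload name, target, trial budget, working directory), defined elsewhere in the same file.
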
# Imports assumed from the surrounding script; ARGS is the argparse namespace
# parsed elsewhere in the same file.
from os import cpu_count
from typing import Optional

from tvm import meta_schedule as ms
from tvm import tir
from tvm.meta_schedule.testing.te_workload import create_te_workload


def main():
    alloc_repeat = 1
    runner = ms.runner.RPCRunner(
        rpc_config=ARGS.rpc_config,
        evaluator_config=ms.runner.EvaluatorConfig(
            number=3,
            repeat=1,
            min_repeat_ms=100,
            enable_cpu_cache_flush=False,
        ),
        alloc_repeat=alloc_repeat,
        max_workers=ARGS.rpc_workers,
    )
    sch: Optional[tir.Schedule] = ms.tune_tir(
        mod=create_te_workload(ARGS.workload, 0),
        target=ARGS.target,
        config=ms.TuneConfig(
            strategy="evolutionary",
            num_trials_per_iter=64,
            max_trials_per_task=ARGS.num_trials,
            max_trials_global=ARGS.num_trials,
        ),
        runner=runner,  # type: ignore
        task_name=ARGS.workload,
        work_dir=ARGS.work_dir,
        num_threads=cpu_count(),
    )
    if sch is None:
        print("No valid schedule found!")
    else:
        print(sch.mod.script())
        print(sch.trace)
Code Example #2
File: tune_te.py  Project: junrushao1994/tvm
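A variant of the same entry point: it prints environment and workload information, takes the evaluator settings (number, repeat, min_repeat_ms, cpu_flush) from the command line, and wraps the ms.tune_tir call in an ms.Profiler so the tuning-time breakdown can be printed afterwards.
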
# Imports assumed from the surrounding script; ARGS is the argparse namespace
# parsed elsewhere in the same file.
from os import cpu_count
from typing import Optional

from tvm import meta_schedule as ms
from tvm import tir
from tvm.meta_schedule.testing.te_workload import create_te_workload
from tvm.support import describe


def main():
    describe()
    print(f"Workload: {ARGS.workload}")
    runner = ms.runner.RPCRunner(
        rpc_config=ARGS.rpc_config,
        evaluator_config=ms.runner.EvaluatorConfig(
            number=ARGS.number,
            repeat=ARGS.repeat,
            min_repeat_ms=ARGS.min_repeat_ms,
            enable_cpu_cache_flush=ARGS.cpu_flush,
        ),
        alloc_repeat=1,
    )
    with ms.Profiler() as profiler:
        sch: Optional[tir.Schedule] = ms.tune_tir(
            mod=create_te_workload(ARGS.workload, 0),
            target=ARGS.target,
            config=ms.TuneConfig(
                strategy="evolutionary",
                num_trials_per_iter=64,
                max_trials_per_task=ARGS.num_trials,
                max_trials_global=ARGS.num_trials,
            ),
            runner=runner,  # type: ignore
            task_name=ARGS.workload,
            work_dir=ARGS.work_dir,
            num_threads=cpu_count(),
        )
    print("Tuning Time:")
    print(profiler.table())
    if sch is None:
        print("No valid schedule found!")
    else:
        print(sch.mod.script())
        print(sch.trace)
Code Example #3
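A CPU test that restricts tuning to the block named "A" of the two_step workload and plugs in a custom schedule rule: RemoveBlock is a PyScheduleRule that copies the schedule and inlines every block except the root. The two_step module and the imports (tune_tir, TuneConfig, Target, Schedule, BlockRV, TuneContext, derived_object, PyScheduleRule, tempfile) are assumed to come from the surrounding test file.
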
def test_tune_block_cpu():
    @derived_object
    class RemoveBlock(PyScheduleRule):
        def _initialize_with_tune_context(self, context: TuneContext) -> None:
            pass

        def apply(self, sch: Schedule, block: BlockRV):
            if sch.get(block).name_hint == "root":
                return [sch]
            sch = sch.copy()
            sch.compute_inline(block)
            return [sch]

    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=two_step,
            target=Target("llvm --num-cores=16"),
            config=TuneConfig(
                strategy="replay_trace",
                num_trials_per_iter=32,
                max_trials_per_task=32,
                max_trials_global=32,
            ),
            work_dir=work_dir,
            blocks=["A"],
            sch_rules=lambda *args: [RemoveBlock()],
        )
        assert sch is not None
Code Example #4
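A CUDA test that tunes the matmul workload for an RTX 3070 target under a 32-trial replay-trace budget and prints the resulting schedule, if one is found. The matmul workload and the imports are assumed to come from the surrounding test file.
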
def test_tune_matmul_cuda():
    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=matmul,
            target=Target("nvidia/geforce-rtx-3070"),
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                num_trials_total=32,
            ),
            work_dir=work_dir,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)
Code Example #5
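The CPU counterpart of the previous test: the same matmul workload tuned for an llvm --num-cores=16 target under a 32-trial replay-trace budget.
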
def test_tune_matmul_cpu():
    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=matmul,
            target=Target("llvm --num-cores=16"),
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                max_trials_per_task=32,
                max_trials_global=32,
            ),
            work_dir=work_dir,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)
Code Example #6
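A more involved CUDA test targeting tensor cores: it creates a 512x512x512 fp16 matmul workload, defines a custom set of schedule rules (AutoInline, MultiLevelTiling with shared-memory read reuse, ParallelizeVectorizeUnroll) and postprocessors, and tunes with PostOrderApply as the search space. If the GPU reports tensor-core support, the tuned module is built, benchmarked with time_evaluator, and checked against a NumPy reference matmul.
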
def test_tune_matmul_cuda_tensor_core():
    n = 512
    mod = create_prim_func(te_workload.matmul_fp16(n, n, n))
    target = Target("nvidia/geforce-rtx-3070")
    config = ReplayTraceConfig(
        num_trials_per_iter=32,
        num_trials_total=320,
    )

    class DefaultTensorCore:
        @staticmethod
        def _sch_rules():
            from tvm.meta_schedule import schedule_rule as M  # pylint: disable=import-outside-toplevel

            return [
                M.AutoInline(
                    into_producer=False,
                    into_consumer=True,
                    # into_cache_only=False,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.MultiLevelTiling(
                    structure="SSSRRSRS",
                    tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
                    # use_tensor_core=True,
                    max_innermost_factor=64,
                    vector_load_lens=[1, 2, 3, 4],
                    reuse_read=M.ReuseType(
                        req="must",
                        levels=[4],
                        scope="shared",
                    ),
                    reuse_write=M.ReuseType(
                        req="no",
                        levels=[],
                        scope="",
                    ),
                ),
                M.AutoInline(
                    into_producer=True,
                    into_consumer=True,
                    # into_cache_only=True,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.ParallelizeVectorizeUnroll(
                    max_jobs_per_core=-1,  # disable parallelize
                    max_vectorize_extent=-1,  # disable vectorize
                    unroll_max_steps=[0, 16, 64, 512, 1024],
                    unroll_explicit=True,
                ),
            ]

        @staticmethod
        def _postproc():
            from tvm.meta_schedule import postproc as M  # pylint: disable=import-outside-toplevel

            return [
                # M.RewriteCooperativeFetch(),
                M.RewriteParallelVectorizeUnroll(),
                M.RewriteReductionBlock(),
                # M.RewriteTensorCore(),
                M.VerifyGPUCode(),
            ]

    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=mod,
            target=target,
            config=config,
            work_dir=work_dir,
            space=PostOrderApply(),
            sch_rules=DefaultTensorCore._sch_rules,
            postprocs=DefaultTensorCore._postproc,
            num_threads=None,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)

            from tvm.contrib import nvcc
            import numpy as np

            ctx = tvm.gpu(0)
            if nvcc.have_tensorcore(ctx.compute_version):
                with tvm.transform.PassContext():
                    func = tvm.build(sch.mod["main"], [], "cuda")
                    print(sch.mod.script())
                    print(func.imported_modules[0].get_source())
                a_np = np.random.uniform(size=(n, n)).astype("float16")
                b_np = np.random.uniform(size=(n, n)).astype("float16")
                a = tvm.nd.array(a_np, ctx)
                b = tvm.nd.array(b_np, ctx)
                c = tvm.nd.array(np.zeros((n, n), dtype="float32"), ctx)
                evaluator = func.time_evaluator(func.entry_name,
                                                ctx,
                                                number=3,
                                                repeat=1,
                                                min_repeat_ms=40)
                print("matmul with tensor core: %f ms" %
                      (evaluator(a, b, c).mean * 1e3))

                np.testing.assert_allclose(
                    c.asnumpy(),
                    np.matmul(a_np.astype("float32"), b_np.astype("float32")),
                    rtol=1e-4,
                    atol=1e-4,
                )