Beispiel #1
0
def test_meta_schedule_post_order_apply():
    mod = Matmul
    context = TuneContext(
        mod=mod,
        target=Target("llvm"),
        task_name="Test Task",
        sch_rules=[WowSoFancyScheduleRule()],
    )
    post_order_apply = PostOrderApply()
    post_order_apply.initialize_with_tune_context(context)
    schs = post_order_apply.generate_design_space(mod)
    assert len(schs) == 1
    assert not tvm.ir.structural_equal(schs[0].mod, mod)
    _check_correct(schs[0])
Beispiel #2
0
def test_meta_schedule_custom_search_space():
    mod = MatmulCustomized
    context = TuneContext(
        mod=mod,
        target=Target("llvm"),
        task_name="Custom Search Space Task",
        sch_rules=[],
    )
    post_order_apply = PostOrderApply()
    post_order_apply.initialize_with_tune_context(context)

    post_order_apply.generate_design_space(mod)

    called = False

    def custom_search_space_func(sch: Schedule, _: BlockRV) -> List[Schedule]:
        nonlocal called
        called = True
        return [sch]

    register_func("tvm.meta_schedule.test.custom_search_space",
                  custom_search_space_func)

    post_order_apply.generate_design_space(mod)
    assert called
Beispiel #3
0
def test_meta_schedule_post_order_apply_multiple():
    mod = Matmul
    context = TuneContext(
        mod=mod,
        target=Target("llvm"),
        task_name="Double Rules Task",
        sch_rules=[DoubleScheduleRule(), ReorderScheduleRule()],
    )
    post_order_apply = PostOrderApply()
    post_order_apply.initialize_with_tune_context(context)
    schs = post_order_apply.generate_design_space(mod)
    assert len(schs) == 4
    for sch in schs:
        assert not tvm.ir.structural_equal(sch.mod, mod)
        _check_correct(sch)
def test_conv2d_winograd_cuda():
    mod = conv2d_winograd_cuda
    mod = IRModule({"main": mod})
    context = TuneContext(
        mod=mod,
        target=Target("nvidia/geforce-rtx-3090", host="llvm"),
        task_name="Custom Search Space Task",
        sch_rules=DefaultCUDA._sch_rules(),  # pylint: disable=protected-access
    )
    for sch_rule in context.sch_rules:
        sch_rule.initialize_with_tune_context(context)
    post_order_apply = PostOrderApply()
    post_order_apply.initialize_with_tune_context(context)
    (sch,) = post_order_apply.generate_design_space(mod)
    decisions = dict(
        zip(
            [i for i in sch.trace.insts if i.kind.name.startswith("Sample")],
            [
                # data_pack
                [3, 3],
                [64, 2],
                2,
                # inverse
                [3, 3],
                [2, 64],
                2,
                # bgemm
                [1, 1, 1, 1, 6],
                [1, 1, 1, 3, 2],
                [3, 1, 1, 1, 3],
                [4, 2, 1, 4, 4],
                [32, 1, 4],
                1,
                1,
                # root anno
                2,
                # conv2d
                2,
            ],
        )
    )
    trace = Trace(sch.trace.insts, decisions=decisions)
    sch = Schedule(mod=mod)
    trace.apply_to_schedule(sch, remove_postproc=False)
    answer = sch.mod
    expected = _get_mod()
    tvm.ir.assert_structural_equal(answer, expected)
Beispiel #5
0
def test_conv2d_winograd_cpu():
    mod = conv2d_winograd_cpu
    mod = IRModule({"main": mod})
    context = TuneContext(
        mod=mod,
        target=Target("llvm"),
        task_name="Custom Search Space Task",
        sch_rules=DefaultLLVM._sch_rules(),  # pylint: disable=protected-access
    )
    post_order_apply = PostOrderApply()
    post_order_apply.initialize_with_tune_context(context)
    (sch, ) = post_order_apply.generate_design_space(mod)

    decisions = dict(
        zip(
            [
                i for i in sch.trace.insts[:-4]
                if i.kind.name.startswith("Sample")
            ],
            [
                # data_pack
                [9, 1],
                [32, 4],
                # input_tile
                4,
                # data_pad
                -2,
                # inverse
                [1, 9],
                [2, 64],
                # bgemm
                [1, 2, 3, 1],
                [1, 1, 1, 6],
                [1, 1, 1, 9],
                [2, 1, 16, 4],
                [16, 8],
            ],
        ))
    trace = Trace(sch.trace.insts[:-4], decisions=decisions)
    sch = Schedule(mod=mod)
    trace.apply_to_schedule(sch, remove_postproc=False)
    answer = sch.mod
    expected = _get_mod()
    tvm.ir.assert_structural_equal(answer, expected)
Beispiel #6
0
def test_meta_schedule_post_order_apply_double():
    mod = Matmul
    context = TuneContext(
        mod=mod,
        target=Target("llvm"),
        task_name="Double Rules Task",
        space_generator=PostOrderApply(),
        sch_rules=[DoubleScheduleRule()],
    )
    post_order_apply = context.space_generator
    schs = post_order_apply.generate_design_space(mod)
    assert len(schs) == 2
    for sch in schs:
        assert not tvm.ir.structural_equal(sch.mod, mod)
        _check_correct(sch)
Beispiel #7
0
def test_meta_schedule_post_order_apply_duplicate_matmul():
    mod = DuplicateMatmul
    context = TuneContext(
        mod=mod,
        target=Target("llvm"),
        task_name="Duplicate Matmul Task",
        space_generator=PostOrderApply(),
        sch_rules=[WowSoFancyScheduleRule()],
    )
    post_order_apply = context.space_generator
    with pytest.raises(
            TVMError,
            match=
            r".*TVMError: Check failed: \(block_names_.count\(block->name_hint\) == 0\)"
            r" is false: Duplicated block name matmul in function main not supported!",
    ):
        post_order_apply.generate_design_space(mod)
Beispiel #8
0
def test_meta_schedule_post_order_apply_remove_block():
    @derived_object
    class TrinityDouble(PyScheduleRule):
        def initialize_with_tune_context(self, context: "TuneContext") -> None:
            pass

        def apply(self, sch: Schedule, block: BlockRV) -> List[Schedule]:
            if _is_root(sch, block):
                return [sch]
            new_sch = sch.copy()
            i, j = new_sch.get_loops(block=block)
            i_0, i_1 = new_sch.split(loop=i, factors=[16, 64])
            j_0, j_1 = new_sch.split(loop=j, factors=[64, 16])
            new_sch.reorder(i_0, j_0, i_1, j_1)
            result = [new_sch]
            new_sch = sch.copy()
            i, j = new_sch.get_loops(block=block)
            i_0, i_1 = new_sch.split(loop=i, factors=[2, 512])
            j_0, j_1 = new_sch.split(loop=j, factors=[2, 512])
            new_sch.reorder(i_0, j_0, i_1, j_1)
            result.append(new_sch)
            return result

    @derived_object
    class RemoveBlock(PyScheduleRule):
        def initialize_with_tune_context(self, context: "TuneContext") -> None:
            pass

        def apply(self, sch: Schedule, block: BlockRV) -> List[Schedule]:
            if _is_root(sch, block):
                return [sch]
            sch = sch.copy()
            if sch.get(block).name_hint == "B":
                sch.compute_inline(block)
            return [sch]

    def correct_trace(a, b, c, d):
        return "\n".join([
            'b0 = sch.get_block(name="A", func_name="main")',
            'b1 = sch.get_block(name="B", func_name="main")',
            'b2 = sch.get_block(name="C", func_name="main")',
            "sch.compute_inline(block=b1)",
            "l3, l4 = sch.get_loops(block=b2)",
            "l5, l6 = sch.split(loop=l3, factors=" + str(a) + ")",
            "l7, l8 = sch.split(loop=l4, factors=" + str(b) + ")",
            "sch.reorder(l5, l7, l6, l8)",
            "l9, l10 = sch.get_loops(block=b0)",
            "l11, l12 = sch.split(loop=l9, factors=" + str(c) + ")",
            "l13, l14 = sch.split(loop=l10, factors=" + str(d) + ")",
            "sch.reorder(l11, l13, l12, l14)",
        ])

    mod = TrinityMatmul
    context = TuneContext(
        mod=mod,
        target=Target("llvm"),
        task_name="Remove Block Task",
        sch_rules=[RemoveBlock(), TrinityDouble()],
    )
    post_order_apply = PostOrderApply()
    post_order_apply.initialize_with_tune_context(context)
    schs = post_order_apply.generate_design_space(mod)
    assert len(schs) == 4
    for sch in schs:
        with pytest.raises(
                tvm.tir.schedule.schedule.ScheduleError,
                match=
                "ScheduleError: An error occurred in the schedule primitive 'get-block'.",
        ):
            sch.get_block("B", "main")
        sch_trace = sch.trace.simplified(True)
        assert (str(sch_trace) == correct_trace([16, 64], [64, 16], [2, 512],
                                                [2, 512]) or str(sch_trace)
                == correct_trace([2, 512], [2, 512], [2, 512], [2, 512])
                or str(sch_trace) == correct_trace(
                    [16, 64], [64, 16], [16, 64], [64, 16]) or str(sch_trace)
                == correct_trace([2, 512], [2, 512], [16, 64], [64, 16]))
def test_tune_matmul_cuda_tensor_core():
    n = 512
    mod = create_prim_func(te_workload.matmul_fp16(n, n, n))
    target = Target("nvidia/geforce-rtx-3070")
    config = ReplayTraceConfig(
        num_trials_per_iter=32,
        num_trials_total=320,
    )

    class DefaultTensorCore:
        @staticmethod
        def _sch_rules():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                schedule_rule as M, )

            return [
                M.AutoInline(
                    into_producer=False,
                    into_consumer=True,
                    # into_cache_only=False,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.MultiLevelTiling(
                    structure="SSSRRSRS",
                    tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
                    # use_tensor_core=True,
                    max_innermost_factor=64,
                    vector_load_lens=[1, 2, 3, 4],
                    reuse_read=schedule_rule.ReuseType(
                        req="must",
                        levels=[4],
                        scope="shared",
                    ),
                    reuse_write=schedule_rule.ReuseType(
                        req="no",
                        levels=[],
                        scope="",
                    ),
                ),
                M.AutoInline(
                    into_producer=True,
                    into_consumer=True,
                    # into_cache_only=True,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.ParallelizeVectorizeUnroll(
                    max_jobs_per_core=-1,  # disable parallelize
                    max_vectorize_extent=-1,  # disable vectorize
                    unroll_max_steps=[0, 16, 64, 512, 1024],
                    unroll_explicit=True,
                ),
            ]

        @staticmethod
        def _postproc():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                postproc as M, )

            return [
                # M.RewriteCooperativeFetch(),
                M.RewriteParallelVectorizeUnroll(),
                M.RewriteReductionBlock(),
                # M.RewriteTensorCore(),
                M.VerifyGPUCode(),
            ]

    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=mod,
            target=target,
            config=config,
            work_dir=work_dir,
            space=PostOrderApply(),
            sch_rules=DefaultTensorCore._sch_rules,
            postprocs=DefaultTensorCore._postproc,
            num_threads=None,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)

            from tvm.contrib import nvcc
            import numpy as np

            ctx = tvm.gpu(0)
            if nvcc.have_tensorcore(ctx.compute_version):
                with tvm.transform.PassContext():
                    func = tvm.build(sch.mod["main"], [], "cuda")
                    print(sch.mod.script())
                    print(func.imported_modules[0].get_source())
                a_np = np.random.uniform(size=(n, n)).astype("float16")
                b_np = np.random.uniform(size=(n, n)).astype("float16")
                a = tvm.nd.array(a_np, ctx)
                b = tvm.nd.array(b_np, ctx)
                c = tvm.nd.array(np.zeros((n, n), dtype="float32"), ctx)
                evaluator = func.time_evaluator(func.entry_name,
                                                ctx,
                                                number=3,
                                                repeat=1,
                                                min_repeat_ms=40)
                print("matmul with tensor core: %f ms" %
                      (evaluator(a, b, c).mean * 1e3))

                np.testing.assert_allclose(
                    c.asnumpy(),
                    np.matmul(a_np.astype("float32"), b_np.astype("float32")),
                    rtol=1e-4,
                    atol=1e-4,
                )