import math
import tempfile

import tvm
from tvm import te, topi, auto_scheduler

# Helpers referenced below (matmul_auto_scheduler_test, fequal) are defined
# in TVM's auto_scheduler test utilities.


def record_common(dag, s):
    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag,
                                     workload_key="test",
                                     target=target)

    inp = auto_scheduler.measure.MeasureInput(task, s)
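    # MeasureResult arguments: (costs, error_no, error_msg, all_cost, timestamp).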
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    # Test in-memory record processing.
    record_str = auto_scheduler.measure_record.dump_record_to_string(inp, res)
    r_inp, r_res = auto_scheduler.measure_record.load_record_from_string(
        record_str)
    # Only check the workload_key, for simplicity.
    assert inp.task.workload_key == r_inp.task.workload_key
    assert str(res) == str(r_res)

    # Test file-based record processing.
    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, _ = log_reader.read_lines()
        assert len(inputs) == 1

        s1 = dag.infer_bound_from_state(s)
        s2 = dag.infer_bound_from_state(inputs[0].state)

        assert s1 == s2
        assert not (s1 == dag.get_init_state())
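
# A hypothetical driver for record_common, not part of the original test:
# it builds a trivial single-op DAG and runs the round-trip checks above.
# The shape and the elementwise op are illustrative assumptions.
def example_record_common_driver():
    A = te.placeholder((64,), name="A")
    B = te.compute((64,), lambda i: A[i] + 1.0, name="B")
    dag = auto_scheduler.ComputeDAG([A, B])
    record_common(dag, dag.get_init_state())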
Example 2
def test_record():
    if not tvm.runtime.enabled("llvm"):
        return

    A = te.placeholder((512, 512), name='A')
    B = te.placeholder((512, 512), name='B')
    k = te.reduce_axis((0, 512), name='k')
    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name='C')
    D = topi.nn.relu(C)
    k = te.reduce_axis((0, 512), name='k')
    E = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * D[k][j], axis=[k]), name='E')
    F = topi.nn.relu(E)

    dag = auto_scheduler.ComputeDAG([A, B, F])
    s = dag.get_init_state()

    # Split
    its0 = s.split(C, s[C].iters[0], [4, 8, 8])
    its1 = s.split(C, s[C].iters[4], [8, 4, 4])
    # Reorder
    s.reorder(C, [its0[0], its1[0], its0[1], its1[1], its0[2], its1[2], its0[3], s[C].iters[8],
                  its1[3]])
    # Fuse
    s.fuse(C, [s[C].iters[0], s[C].iters[1], s[C].iters[2]])
    # Compute at
    s.split(F, s[F].iters[0], [2])
    s.compute_at(E, F, s[F].iters[0])
    # Compute inline
    s.compute_inline(D)
    # Compute root
    s.compute_root(D)
    # Parallel
    s.parallel(C, s[C].iters[0])
    # Thread bind (blockIdx/threadIdx/vthread are GPU bindings; applied here only to exercise record serialization)
    s.bind(C, s[C].iters[1], "blockIdx.x")
    s.bind(C, s[C].iters[2], "threadIdx.z")
    s.bind(C, s[C].iters[3], "vthread")
    # Unroll
    s.unroll(C, s[C].iters[4])
    # Vectorize
    s.vectorize(C, s[C].iters[6])

    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)

    inp = auto_scheduler.measure.MeasureInput(task, s)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, results = log_reader.read_lines()
        assert len(inputs) == 1

        s1 = dag.infer_bound_from_state(s)
        s2 = dag.infer_bound_from_state(inputs[0].state)

        assert s1 == s2
        assert not (s1 == dag.get_init_state())
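Example 3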
def record_common(dag, s):
    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)

    inp = auto_scheduler.measure.MeasureInput(task, s)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, results = log_reader.read_lines()
        assert len(inputs) == 1

        s1 = dag.infer_bound_from_state(s)
        s2 = dag.infer_bound_from_state(inputs[0].state)

        assert s1 == s2
        assert not (s1 == dag.get_init_state())
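Example 4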
def test_measure_target_host():
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test,
        args=(512, 512, 512),
        target=tvm.target.Target("llvm", "llvm -mtriple=aarch64-linux-gnu"),
    )

    inp = auto_scheduler.measure.MeasureInput(task,
                                              task.compute_dag.init_state)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, _ = log_reader.read_lines()
        assert len(inputs) == 1

        raw_inp = inputs[0]

        recovered_inp = auto_scheduler.measure.recover_measure_input(raw_inp)
        assert str(recovered_inp.task.target.host) == str(inp.task.target.host)
Example 5
def test_recover_measure_input():
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(512, 512, 512), target=tvm.target.Target("llvm")
    )

    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, results = log_reader.read_lines()
        assert len(inputs) == 1

        raw_inp = inputs[0]

        correct_inp = auto_scheduler.measure.recover_measure_input(raw_inp)
        assert str(correct_inp.task.compute_dag) == str(inp.task.compute_dag)

        correct_inp = auto_scheduler.measure.recover_measure_input(
            raw_inp, rebuild_state=True
        )
        assert str(correct_inp.state) == str(inp.state)
Example 6
def test_gpu_feature():
    # Use records to build a complicated GPU program
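    # Log line format: "i" carries the serialized MeasureInput (workload key,
    # target, and transform steps); "r" carries the MeasureResult as
    # ([costs], error_no, all_cost, timestamp); "v" is the log version.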
    json_records = "\n".join(
        (
            """{"i": [["[\\"matmul_auto_scheduler_test\\", 512, 512, 512]", "cuda"], [[], [["CHW", 2, "local"], ["SP", 2, 0, 512, [1, 16, 32, 1], 1], ["SP", 2, 5, 512, [4, 1, 1, 16], 1], ["SP", 2, 10, 512, [1, 2], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 3, 0, 1, 3], ["FSP", 3, 4, 2, 3], ["RE", 3, [0, 4, 1, 5, 2, 6, 3, 7]], ["FU", 2, [0, 1]], ["FU", 3, [0, 1]], ["FU", 2, [1, 2]], ["FU", 3, [1, 2]], ["FU", 2, [2, 3]], ["FU", 3, [2, 3]], ["CA", 2, 3, 2], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 3], ["FU", 2, [0, 1]], ["FFSP", 2, 0, [1, 2], 1, 1], ["AN", 2, 1, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 3], ["FU", 1, [0, 1]], ["FFSP", 1, 0, [1, 2], 1, 1], ["AN", 1, 1, 6], ["AN", 5, 0, 5], ["AN", 5, 1, 4], ["AN", 5, 2, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00536798], 0, 2.49277, 1585564852], "v": "v0.1"}""",
        )
    )

    # load states
    with tempfile.NamedTemporaryFile(mode="w") as f:
        f.write(json_records)
        f.flush()
        inputs, _ = auto_scheduler.RecordReader(f.name).read_lines()

        inp = inputs[0]
        task = auto_scheduler.SearchTask(
            workload_key=inp.task.workload_key,
            target=inp.task.target,
            hardware_params=auto_scheduler.HardwareParams(
                100000, 16, 64, 1 << 30, 1 << 30, 1 << 30, 1 << 30, 1 << 30
            ),
        )

        state = task.compute_dag.infer_bound_from_state(inputs[0].state)
        fea = auto_scheduler.feature.get_per_store_features_from_states([state], task)[0]
        names = auto_scheduler.feature.get_per_store_feature_names()

        # build feature dict
        fea_dicts = [dict(zip(names, f)) for f in fea]

        """
        lowered IR:

        Placeholder: A, B
        blockIdx.x i.0@j.0@ (0,8)
          vthread i.1@j.1@ (0,4)
            threadIdx.x i.2@j.2@ (0,16)
              C.local auto_unroll: 1024
              for k.0 (0,256)
                for ax0@ax1@.0 (0,8)
                  threadIdx.x ax0@ax1@.1 (0,16)
                    B.shared = ...
                for ax0@ax1@.0 (0,64)
                  threadIdx.x ax0@ax1@.1 (0,16)
                    A.shared = ...
                for i_c.3 (0,32)
                  for k.2 (0,2)
                    for j_c.4 (0,16)
                      C.local = ...
              for i.3 (0,32)
                for j.3 (0,16)
                  C = ...
        """

        # check gpu-related features
        assert fequal(fea_dicts[0]["blockIdx_x_len"], math.log2(8 + 1))
        assert fequal(fea_dicts[0]["vthread_len"], math.log2(4 + 1))
        assert fequal(fea_dicts[1]["threadIdx_x_len"], math.log2(16 + 1))
        assert fequal(fea_dicts[0]["threadIdx_y_len"], math.log2(1 + 1))
        assert fequal(fea_dicts[2]["blockIdx_z_len"], math.log2(1 + 1))
        assert fequal(fea_dicts[0]["is_gpu"], 1.0)
def test_record():
    if not tvm.runtime.enabled("llvm"):
        return

    A = te.placeholder((512, 512), name='A')
    B = te.placeholder((512, 512), name='B')
    k = te.reduce_axis((0, 512), name='k')
    C = te.compute((512, 512),
                   lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]),
                   name='C')
    D = topi.nn.relu(C)
    k = te.reduce_axis((0, 512), name='k')
    E = te.compute((512, 512),
                   lambda i, j: te.sum(A[i][k] * D[k][j], axis=[k]),
                   name='E')
    F = topi.nn.relu(E)
    k = te.reduce_axis((0, 512), name='k')
    G = te.compute((512, 512),
                   lambda i, j: te.sum(A[i][k] * F[k][j], axis=[k]),
                   name='G')
    H = topi.nn.relu(G)
    I = topi.nn.relu(H)

    dag = auto_scheduler.ComputeDAG([A, B, I])
    s = dag.get_init_state()

    # Split
    its0 = s.split(C, s[C].iters[0], [4, 8, 8])
    its1 = s.split(C, s[C].iters[4], [8, 4, 4])
    # Reorder
    s.reorder(C, [
        its0[0], its1[0], its0[1], its1[1], its0[2], its1[2], its0[3],
        s[C].iters[8], its1[3]
    ])
    # Fuse
    s.fuse(C, [s[C].iters[0], s[C].iters[1], s[C].iters[2]])
    # Compute at
    s.split(F, s[F].iters[0], [2])
    s.compute_at(E, F, s[F].iters[0])
    # Compute inline
    s.compute_inline(D)
    # Compute root
    s.compute_root(D)
    # Parallel
    s.parallel(C, s[C].iters[0])
    # Thread bind (blockIdx/threadIdx/vthread are GPU bindings; applied here only to exercise record serialization)
    s.bind(C, s[C].iters[1], "blockIdx.x")
    s.bind(C, s[C].iters[2], "threadIdx.z")
    s.bind(C, s[C].iters[3], "vthread")
    # Unroll
    s.unroll(C, s[C].iters[4])
    # Vectorize
    s.vectorize(C, s[C].iters[6])
    # Cache Read
    D_global = s.cache_read(D, "global", [E])
    s.compute_at(D_global, E, s[E].iters[2])
    # Cache Write
    s.cache_write(D, "shared")
    # follow_split
    its2 = s.split(G, s[G].iters[0], [4, 2, 8, 4], True)
    split_step0 = len(s.transform_steps) - 1
    s.follow_split(G, s[G].iters[5], split_step0, 4)
    # follow_fused_split
    its2 = s.split(H, s[H].iters[0], [4, 2, 8, 4], True)
    split_step1 = len(s.transform_steps) - 1
    its3 = s.split(H, s[H].iters[5], [2, 4, 2, 4], True)
    split_step2 = len(s.transform_steps) - 1
    its = []
    for i0, i1 in zip(its2, its3):
        its.append(i0)
        its.append(i1)
    s.reorder(H, its)
    for i in range(0, 5):
        s.fuse(H, [s[H].iters[i], s[H].iters[i + 1]])
    s.follow_fused_split(I, s[I].iters[0], [split_step1, split_step2], 0,
                         False)

    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)

    inp = auto_scheduler.measure.MeasureInput(task, s)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, results = log_reader.read_lines()
        assert len(inputs) == 1

        s1 = dag.infer_bound_from_state(s)
        s2 = dag.infer_bound_from_state(inputs[0].state)

        assert s1 == s2
        assert not (s1 == dag.get_init_state())
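
# A hedged sketch, not from the original tests, of how saved records are
# typically consumed: read the log back and apply the recorded state to the
# task's ComputeDAG to get a schedulable (sch, args) pair. The function name
# and log_file path are placeholders.
def example_apply_recorded_state(task, log_file):
    inputs, _ = auto_scheduler.RecordReader(log_file).read_lines()
    sch, args = task.compute_dag.apply_steps_from_state(inputs[0].state)
    # Build with the task's own target (e.g. "llvm").
    return tvm.build(sch, args, target=task.target)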