コード例 #1
0
def test_sketch_search_policy_zero_rank():
    """Run the common search routine on the zero-rank workloads.

    For each of the "llvm" and "cuda" targets, the compute workload and then
    the reduce workload are wrapped in a ``SearchTask`` (with args ``(10,)``)
    and handed to ``search_common`` together with the runner of one shared
    local RPC measure context.
    """
    rpc_ctx = auto_scheduler.LocalRPCMeasureContext()
    workloads = (
        zero_rank_compute_auto_scheduler_test,
        zero_rank_reduce_auto_scheduler_test,
    )
    for tgt in ("llvm", "cuda"):
        # Same order as before: compute first, then reduce, per target.
        for workload in workloads:
            zero_rank_task = auto_scheduler.SearchTask(
                func=workload, args=(10,), target=tgt
            )
            search_common(zero_rank_task, runner=rpc_ctx.runner)
コード例 #2
0
def test_measure_special_inputs_map_by_name_rpc_runner():
    """Build and run a workload whose "Index" input is supplied via task_inputs.

    The registered workload reads ``X`` at a position taken from the ``Index``
    tensor, so ``Index`` is given a fixed value (5) through ``task_inputs``.
    The initial state is then built locally and measured with the runner of a
    local RPC measure context, once with CPU cache flushing enabled and once
    without; both stages must report ``error_no == 0``.
    """

    @auto_scheduler.register_workload
    def foo():
        data = te.placeholder(shape=[10], dtype="int32")
        index = te.placeholder(shape=[1], dtype="int32", name="Index")
        picked = te.compute((1,), lambda i: data[index[i]])
        return [data, index, picked]

    # This workload cannot use random input for the `Index` input
    fixed_inputs = {"Index": tvm.nd.array(np.array([5], dtype="int32"))}
    task = auto_scheduler.SearchTask(func=foo, target="llvm", task_inputs=fixed_inputs)

    for flush_cache in (True, False):
        measure_input = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        builder = auto_scheduler.LocalBuilder()
        rpc_ctx = auto_scheduler.LocalRPCMeasureContext(
            timeout=60, enable_cpu_cache_flush=flush_cache
        )
        runner = rpc_ctx.runner

        build_results = builder.build([measure_input])
        assert build_results[0].error_no == 0
        measure_results = runner.run([measure_input], build_results)
        assert measure_results[0].error_no == 0
コード例 #3
0
def generate_sketches(workload_func, args, target, print_for_debug=False):
    """Return the sketches a SketchPolicy generates for the given workload.

    A workload key is made from ``workload_func`` and ``args``; a ComputeDAG
    and a SearchTask (for ``tvm.target.create(target)``) are built from it,
    and ``print_for_debug`` is forwarded to ``generate_sketches``.
    """
    key = auto_scheduler.make_workload_key(workload_func, args)
    compute_dag = auto_scheduler.ComputeDAG(key)
    search_task = auto_scheduler.SearchTask(
        compute_dag, key, tvm.target.create(target)
    )
    sketch_policy = auto_scheduler.SketchPolicy(search_task, verbose=0)
    sketches = sketch_policy.generate_sketches(print_for_debug)
    return sketches
コード例 #4
0
def record_common(dag, s):
    """Round-trip one measure record through a string and through a file.

    Parameters
    ----------
    dag :
        Compute DAG the state ``s`` belongs to.
    s :
        Scheduled state to record; it must differ from the DAG's initial
        state after bound inference (asserted at the end).
    """
    llvm_task = auto_scheduler.SearchTask(
        compute_dag=dag, workload_key="test", target=tvm.target.Target("llvm")
    )

    measure_inp = auto_scheduler.measure.MeasureInput(llvm_task, s)
    measure_res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    # In-memory path: dump to a string and load it back.
    serialized = auto_scheduler.measure_record.dump_record_to_string(
        measure_inp, measure_res
    )
    loaded_inp, loaded_res = auto_scheduler.measure_record.load_record_from_string(
        serialized
    )
    # Comparing only the workload_key keeps the check simple.
    assert measure_inp.task.workload_key == loaded_inp.task.workload_key
    assert str(measure_res) == str(loaded_res)

    # File path: save to a temporary file and read it back.
    with tempfile.NamedTemporaryFile() as tmp:
        auto_scheduler.save_records(tmp.name, [measure_inp], [measure_res])

        reader = auto_scheduler.RecordReader(tmp.name)
        loaded_inputs, _ = reader.read_lines()
        assert len(loaded_inputs) == 1

        expected_state = dag.infer_bound_from_state(s)
        restored_state = dag.infer_bound_from_state(loaded_inputs[0].state)

        assert expected_state == restored_state
        assert not (expected_state == dag.get_init_state())
コード例 #5
0
def search_common(workload=matmul_auto_scheduler_test,
                  target="llvm",
                  search_policy='empty',
                  seed=random.randint(1, 1 << 30),
                  runner='local',
                  cost_model=auto_scheduler.RandomModel(),
                  num_measure_trials=2,
                  init_search_callbacks=None):
    """Tune a workload, then build the best schedule and verify it numerically.

    Parameters
    ----------
    workload : callable
        Workload function; its key is created with the args ``(128, 128, 128)``.
    target : str
        Target string passed to ``tvm.target.create``.
    search_policy : str or policy object
        ``'empty'`` or ``'sketch'`` selects the corresponding policy class; any
        other value is forwarded to ``auto_schedule`` unchanged.
    seed : int
        Seed for Python's ``random`` module. NOTE: the default value is drawn
        once, when this function is defined, so all calls that rely on the
        default share the same seed.
    runner : str or runner object
        Forwarded to ``TuningOptions``.
    cost_model : object
        Not used in this function body; kept so existing callers still work.
    num_measure_trials : int
        Forwarded to ``TuningOptions``.
    init_search_callbacks : sequence or None
        Extra callbacks for the sketch policy. The sequence is copied, so the
        caller's object is never modified.

    Raises
    ------
    Exception
        If lowering, building, running or the numerical check fails. The
        message carries the seed and the original error is chained as the
        cause.
    """
    print("Test %s schedule search with the default search policy" % (target))

    random.seed(seed)
    N = 128
    workload_key = auto_scheduler.make_workload_key(workload, (N, N, N))
    dag = auto_scheduler.ComputeDAG(workload_key)
    target = tvm.target.create(target)
    task = auto_scheduler.SearchTask(dag, workload_key, target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Work on a private copy: appending to the caller's list would leak a
        # callback bound to this (soon deleted) log file into later calls.
        callbacks = list(init_search_callbacks or [])
        callbacks.append(auto_scheduler.PreloadMeasuredStates(log_file))

        if search_policy == 'empty':
            search_policy = auto_scheduler.EmptyPolicy(task)
        elif search_policy == 'sketch':
            search_policy = auto_scheduler.SketchPolicy(
                task, init_search_callbacks=callbacks)

        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=num_measure_trials,
            runner=runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)])
        sch, args = auto_scheduler.auto_schedule(task, search_policy,
                                                 tuning_options)
        inp, res = auto_scheduler.load_best(log_file, workload_key, target)

        print("==== Python Code ====")
        print(dag.print_python_code_from_state(inp.state))

        try:
            print("==== Lowered Stmt ====")
            print(tvm.lower(sch, args, simple_mode=True))
            mod = tvm.build(sch, args, target)

            ctx = tvm.context(str(target), 0)
            dtype = dag.tensors[0].dtype
            a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
            b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
            c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx)
            mod(a, b, c)
            tvm.testing.assert_allclose(c.asnumpy(),
                                        np.dot(a.asnumpy(), b.asnumpy()),
                                        rtol=1e-5)
            print("==== Verification passed ====")
        except Exception as err:
            # Report the seed so the failure can be reproduced, and keep the
            # original error attached as the explicit cause.
            raise Exception("Error encountered with seed: %d" % (seed)) from err
    print()
コード例 #6
0
def test_record():
    """Apply one of each schedule primitive to a state and round-trip it via a record file.

    Skipped (returns early) when the "llvm" runtime is not enabled. After the
    state is saved and read back, bound inference on the original and on the
    restored state must give equal results, and that result must differ from
    the DAG's initial state.
    """
    if not tvm.runtime.enabled("llvm"):
        return

    A = te.placeholder((512, 512), name="A")
    B = te.placeholder((512, 512), name="B")
    k = te.reduce_axis((0, 512), name="k")
    C = te.compute(
        (512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C"
    )
    D = topi.nn.relu(C)
    k = te.reduce_axis((0, 512), name="k")
    E = te.compute(
        (512, 512), lambda i, j: te.sum(A[i][k] * D[k][j], axis=[k]), name="C"
    )
    F = topi.nn.relu(E)

    dag = auto_scheduler.ComputeDAG([A, B, F])
    state = dag.get_init_state()

    # Split
    first_split = state.split(C, state[C].iters[0], [4, 8, 8])
    second_split = state.split(C, state[C].iters[4], [8, 4, 4])
    # Reorder
    new_order = [
        first_split[0],
        second_split[0],
        first_split[1],
        second_split[1],
        first_split[2],
        second_split[2],
        first_split[3],
        state[C].iters[8],
        second_split[3],
    ]
    state.reorder(C, new_order)
    # Fuse
    state.fuse(C, [state[C].iters[0], state[C].iters[1], state[C].iters[2]])
    # Compute at
    state.split(F, state[F].iters[0], [2])
    state.compute_at(E, F, state[F].iters[0])
    # Compute inline
    state.compute_inline(D)
    # Compute root
    state.compute_root(D)
    # Parallel
    state.parallel(C, state[C].iters[0])
    # Thread bind (blockIdx & threadIdx are GPU concepts; used here only to exercise recording)
    state.bind(C, state[C].iters[1], "blockIdx.x")
    state.bind(C, state[C].iters[2], "threadIdx.z")
    state.bind(C, state[C].iters[3], "vthread")
    # Unroll
    state.unroll(C, state[C].iters[4])
    # Vectorize
    state.vectorize(C, state[C].iters[6])

    llvm_target = tvm.target.create("llvm")
    task = auto_scheduler.SearchTask(dag, "test", llvm_target)

    measure_inp = auto_scheduler.measure.MeasureInput(task, state)
    measure_res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as tmp:
        auto_scheduler.save_records(tmp.name, [measure_inp], [measure_res])

        reader = auto_scheduler.RecordReader(tmp.name)
        loaded_inputs, _ = reader.read_lines()
        assert len(loaded_inputs) == 1

        expected_state = dag.infer_bound_from_state(state)
        restored_state = dag.infer_bound_from_state(loaded_inputs[0].state)

        assert expected_state == restored_state
        assert not (expected_state == dag.get_init_state())
コード例 #7
0
def test_cpu_matmul():
    """Check per-store features extracted from a hand-scheduled 512x512x512 matmul.

    The matmul stage is split, reordered, vectorized, parallelized and
    unrolled; the features of the first store are then compared against
    hand-computed values (all compared through ``fequal``).
    """
    dag = auto_scheduler.ComputeDAG(matmul_auto_scheduler_test(512, 512, 512))
    state = dag.get_init_state()
    matmul_op = state.stage_ops[2]

    i, j, k = state[matmul_op].iters
    io, ii = state.split(matmul_op, i, [16])
    jo, ji = state.split(matmul_op, j, [8])
    state.reorder(matmul_op, [io, jo, k, ji, ii])
    state.vectorize(matmul_op, ji)
    state.parallel(matmul_op, io)
    state.parallel(matmul_op, jo)
    state.unroll(matmul_op, k)

    task = auto_scheduler.SearchTask(
        compute_dag=dag, workload_key="test", target=tvm.target.Target("llvm")
    )
    names = auto_scheduler.feature.get_per_store_feature_names()
    fea = auto_scheduler.feature.get_per_store_features_from_states([state], task)[0]

    stage_0 = fea[0]
    assert len(stage_0) == len(names), "%d vs %d" % (len(stage_0), len(names))
    fea_dict = dict(zip(names, stage_0))

    # Work out which of the feature slots B0/B1/B2 describes which buffer:
    # the read-write one is taken as C; of the read-only ones, the one with
    # zero stride is taken as B and the other as A.
    for buf in ("B0", "B1", "B2"):
        if fequal(fea_dict[buf + ".acc_type.kReadWrite"], 1.0):
            c_name = buf
        if not fequal(fea_dict[buf + ".acc_type.kRead"], 1.0):
            continue
        if fequal(fea_dict[buf + ".stride"], 0.0):
            b_name = buf
        else:
            a_name = buf

    """
    lowered IR:
    
    Placeholder: A, B
    parallel i.0 (0,32)
      parallel j.0 (0,64)
        unroll k (0,512)
          vectorize j.1 (0,8)
            for i.1 (0,16)
              C...] = A[...] * B[...]
    """

    # Touched bytes, unique touched bytes, reuse distance and reuse count.
    expected_memory = (
        (c_name + ".bytes", math.log2(512 ** 3 * 4 + 1)),
        (b_name + ".unique_bytes", math.log2(512 ** 2 * 4 + 1)),
        (c_name + ".reuse_dis_iter", math.log2(8 * 16 + 1)),
        (c_name + ".reuse_dis_bytes", math.log2((8 * 16 + 8 + 16) * 4 + 1)),
        (c_name + ".reuse_ct", math.log2(512 + 1)),
    )
    for key, expected in expected_memory:
        assert fequal(fea_dict[key], expected)

    # Annotation-related features.
    assert fequal(fea_dict["unroll_num"], math.log2(1 + 1))
    # assert fequal(fea_dict["unroll_type.kPosInnerReduce"], 1.0)
    assert fequal(fea_dict["vec_num"], math.log2(1 + 1))
    assert fequal(fea_dict["parallel_num"], math.log2(2 + 1))
    assert fequal(fea_dict["parallel_prod"], math.log2((512 * 512 / 16 / 8) + 1))
コード例 #8
0
def generate_sketches(
    workload_func, args, target, print_for_debug=False, init_search_callbacks=None
):
    """Return the sketches a SketchPolicy generates for ``workload_func(*args)``.

    ``init_search_callbacks`` is forwarded to the policy and
    ``print_for_debug`` to its ``generate_sketches`` call.
    """
    search_task = auto_scheduler.SearchTask(
        func=workload_func, args=args, target=target
    )
    sketch_policy = auto_scheduler.SketchPolicy(
        search_task, verbose=0, init_search_callbacks=init_search_callbacks
    )
    sketches = sketch_policy.generate_sketches(print_for_debug)
    return sketches
コード例 #9
0
def main():
    """Tune the workload selected by ``ARGS`` through an RPC runner and print the result.

    Reads the module-level ``ARGS`` and ``CONFIGS``. Records are written to
    ``<log_dir>/<workload>.json``; after tuning, the best record and the
    lowered TIR of the best schedule are printed.

    Raises
    ------
    NotImplementedError
        If the target kind is neither "llvm" nor "cuda".
    """
    log_file = os.path.join(ARGS.log_dir, f"{ARGS.workload}.json")
    workload_func, params = CONFIGS[ARGS.workload]
    params = params[0]  # type: ignore
    workload_func = auto_scheduler.register_workload(workload_func)

    target_kind = ARGS.target.kind.name
    if target_kind == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif target_kind == "cuda":
        target_attrs = ARGS.target.attrs
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(target_attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(target_attrs["max_threads_per_block"]),
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    task = auto_scheduler.SearchTask(
        func=workload_func,
        args=params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    rpc_runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=ARGS.rpc_workers,
        number=3,
        repeat=1,
        min_repeat_ms=100,
        enable_cpu_cache_flush=False,
    )

    # Show the computational graph before tuning starts.
    print("Computational DAG:")
    print(task.compute_dag)
    tuning_options = auto_scheduler.TuningOptions(
        num_measure_trials=ARGS.num_trials,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
        runner=rpc_runner,
    )
    print("Running AutoTuning:")
    task.tune(tuning_options)
    print("History Best:")
    print(task.print_best(log_file))
    best_sch, best_args = task.apply_best(log_file)
    print("Lowered TIR:")
    print(tvm.lower(best_sch, best_args, simple_mode=True))
コード例 #10
0
def test_workload_registry_empty_policy():
    """Run the empty search policy on the default task and on two tasks named by string.

    The first call uses ``search_common``'s default task; the next two pass
    64x64x64 "llvm" tasks whose ``func`` is a workload name given as a string.
    """
    search_common(search_policy="empty", num_measure_trials=2)

    N = 64
    target = "llvm"
    workload_names = (
        "matmul_auto_scheduler_test",
        "matmul_auto_scheduler_test_rename_1",
    )
    for workload_name in workload_names:
        named_task = auto_scheduler.SearchTask(
            func=workload_name, args=(N, N, N), target=target
        )
        search_common(task=named_task, num_measure_trials=2, search_policy="empty")
コード例 #11
0
def test_random_model():
    """Check that RandomModel.predict returns one score per input state.

    Skipped (returns early) when the "llvm" runtime is not enabled.
    """
    if not tvm.runtime.enabled("llvm"):
        return

    size = 128
    key = auto_scheduler.make_workload_key(
        matmul_auto_scheduler_test, (size, size, size)
    )
    compute_dag = auto_scheduler.ComputeDAG(key)
    task = auto_scheduler.SearchTask(compute_dag, key, tvm.target.create("llvm"))

    random_model = auto_scheduler.RandomModel()
    random_model.update([], [])
    # The same initial state is passed twice; two scores are expected back.
    predicted = random_model.predict(
        task, [compute_dag.init_state, compute_dag.init_state]
    )
    assert len(predicted) == 2
コード例 #12
0
def get_sample_records(number):
    """Create sample MeasureInput / MeasureResult pairs for a 128x128x128 matmul.

    At most ``number`` states are taken from the sketch policy's initial
    population; each gets a result whose single cost is drawn uniformly from
    [0.5, 1.0).

    Returns
    -------
    tuple
        ``(task, inputs, results)``.
    """
    size = 128
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(size, size, size), target="llvm"
    )
    sketch_policy = auto_scheduler.SketchPolicy(task, verbose=0)
    sampled_states = sketch_policy.sample_initial_population()[:number]

    inputs = [auto_scheduler.MeasureInput(task, state) for state in sampled_states]
    results = []
    for _ in inputs:
        cost = np.random.uniform(0.5, 1.0)
        results.append(auto_scheduler.MeasureResult([cost], 0, "", 0.1, 0))

    return task, inputs, results
コード例 #13
0
def generate_sketches(
    workload_func, args, target, print_for_debug=False, init_search_callbacks=None
):
    """Return the sketches a SketchPolicy generates, with hardware fixed at 4 cores.

    ``init_search_callbacks`` is forwarded to the policy and
    ``print_for_debug`` to its ``generate_sketches`` call.
    """
    # NOTE: test_cpu_matmul_sketch and test_cpu_max_pool2d_sketch rely on
    # num_cores=4 so that every possible sketch generation is triggered.
    four_core_params = auto_scheduler.HardwareParams(num_cores=4, target=target)
    search_task = auto_scheduler.SearchTask(
        func=workload_func,
        args=args,
        target=target,
        hardware_params=four_core_params,
    )
    sketch_policy = auto_scheduler.SketchPolicy(
        search_task, verbose=0, init_search_callbacks=init_search_callbacks
    )
    sketches = sketch_policy.generate_sketches(print_for_debug)
    return sketches
コード例 #14
0
def get_sample_records(number):
    """Generate a list of random MeasureInput and MeasureResult pairs.

    ``number`` is passed to the sketch policy's ``sample_initial_population``;
    each sampled state gets a result whose single cost is drawn uniformly
    from [0.5, 1.0).

    Returns
    -------
    tuple
        ``(task, dag, inputs, results)``.
    """
    size = 128
    key = auto_scheduler.make_workload_key(
        matmul_auto_scheduler_test, (size, size, size)
    )
    dag = auto_scheduler.ComputeDAG(key)
    task = auto_scheduler.SearchTask(dag, key, tvm.target.create("llvm"))
    sketch_policy = auto_scheduler.SketchPolicy(task, verbose=0)
    sampled_states = sketch_policy.sample_initial_population(number)

    inputs = [auto_scheduler.MeasureInput(task, state) for state in sampled_states]
    results = []
    for _ in inputs:
        cost = np.random.uniform(0.5, 1.0)
        results.append(auto_scheduler.MeasureResult([cost], 0, "", 0.1, 0))

    return task, dag, inputs, results
コード例 #15
0
def test_measure_local_builder_runner():
    """Build and run the tiled matmul state with the local builder and runner.

    Skipped (returns early) when the "llvm" runtime is not enabled. Both the
    build result and the measure result must report ``error_no == 0``.
    """
    if not tvm.runtime.enabled("llvm"):
        return

    dag, tiled_state = get_tiled_matmul()
    task = auto_scheduler.SearchTask(dag, "test", tvm.target.create("llvm"))

    measure_input = auto_scheduler.MeasureInput(task, tiled_state)
    builder = auto_scheduler.LocalBuilder()
    runner = auto_scheduler.LocalRunner(timeout=60)

    build_results = builder.build([measure_input])
    assert build_results[0].error_no == 0
    measure_results = runner.run([measure_input], build_results)
    assert measure_results[0].error_no == 0
コード例 #16
0
def test_task_scheduler_round_robin():
    """Check that the round-robin strategy gives every task the same number of trials.

    Three matmul tasks (n = 2, 4, 8) are tuned with two trials per task; the
    log file must then hold exactly two records per workload key. A second
    scheduler is afterwards created from that log file and tuned again to
    exercise resuming.
    """
    tasks = [
        auto_scheduler.SearchTask(
            func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm"
        )
        for n in [2, 4, 8]
    ]

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name
        num_trials_per_task = 2

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_trials_per_task * len(tasks),
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        scheduler = auto_scheduler.TaskScheduler(
            tasks, strategy="round-robin", callbacks=[]
        )
        scheduler.tune(tune_option, search_policy="sketch.random")

        # Count the records written for each workload key.
        records_per_key = {}
        for task in tasks:
            records_per_key[task.workload_key] = 0
        for record_input, _ in auto_scheduler.load_records(log_file):
            records_per_key[record_input.task.workload_key] += 1

        for task in tasks:
            assert records_per_key[task.workload_key] == num_trials_per_task

        # Continue tuning from the existing log (restoring the status).
        scheduler = auto_scheduler.TaskScheduler(
            tasks, strategy="round-robin", load_log_file=log_file, callbacks=[]
        )
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=len(tasks),
            num_measures_per_round=1,
        )
        scheduler.tune(tune_option, search_policy="sketch.random")
        del measure_ctx
コード例 #17
0
def test_measure_local_builder_runner(enable_cpu_cache_flush=False):
    """Build and run the tiled matmul state with the local builder and runner.

    Skipped (returns early) when the "llvm" device is not enabled.

    Parameters
    ----------
    enable_cpu_cache_flush : bool
        Forwarded to ``LocalRunner``.
    """
    if not tvm.testing.device_enabled("llvm"):
        return

    dag, tiled_state = get_tiled_matmul()
    task = auto_scheduler.SearchTask(dag, "test", tvm.target.Target("llvm"))

    measure_input = auto_scheduler.MeasureInput(task, tiled_state)
    builder = auto_scheduler.LocalBuilder()
    runner = auto_scheduler.LocalRunner(
        timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
    )

    build_results = builder.build([measure_input])
    assert build_results[0].error_no == 0
    measure_results = runner.run([measure_input], build_results)
    assert measure_results[0].error_no == 0
コード例 #18
0
def test_mutate_parallel():
    """
    The test case initializes evo search with a batch of "bad" states and checks whether
    the search algorithm can find "good" states by mutating the "bad" states.

    A state counts as "good" when its printed form contains a parallel loop over
    ``i.0``, over the fused ``i.0@j.0``, or over the fused ``i.0@j.0@i.1`` iterator.
    Up to 10 attempts are made; the test passes as soon as one attempt yields a
    good state.
    """
    # Substrings of a printed state that mark a "good" state. The two fused
    # patterns had been corrupted into "[email protected]" by an e-mail
    # obfuscation filter, which made them impossible to match.
    good_patterns = (
        "parallel i.0@ (0",
        "parallel i.0@j.0@ (0",
        "parallel i.0@j.0@i.1@ (0",
    )

    class MockCostModel(PythonBasedModel):
        """A mock cost model that scores good states 1 and all others 0."""

        @staticmethod
        def is_good_state(state):
            for line in str(state).split("\n"):
                if any(pattern in line for pattern in good_patterns):
                    return True
            return False

        def predict(self, task, states):
            return [1 if self.is_good_state(state) else 0 for state in states]

    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(1024, 1024, 1024), target="llvm"
    )
    policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0)

    found = False
    retry_ct = 0
    while retry_ct < 10 and not found:
        states = policy.sample_initial_population()[:100]
        # Seed the evolutionary search only with states that are not good yet.
        bad_states = [state for state in states if not MockCostModel.is_good_state(state)]

        new_states = policy.evolutionary_search(bad_states, 50)
        found = any(MockCostModel.is_good_state(state) for state in new_states)
        retry_ct += 1

    assert found
コード例 #19
0
def test_task_scheduler_gradient():
    """Check how the default task-scheduler strategy splits trials between two tasks.

    Two matmul tasks (n = 2, 4) are tuned for 5 trials with the objective set
    to the first task's cost and ``best_costs`` preset to ``[1e2, 1e-8]``.
    The log must then hold 4 records for the first task and 1 for the second.
    """
    tasks = [
        auto_scheduler.SearchTask(
            func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm"
        )
        for n in [2, 4]
    ]

    def objective_func(costs):
        return costs[0]

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        n_trials = 5

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        scheduler = auto_scheduler.TaskScheduler(
            tasks, objective_func=objective_func, callbacks=[]
        )

        # Overwrite the initial best costs by hand; this makes the test more
        # stable on slow CI machines.
        scheduler.best_costs = np.array([1e2, 1e-8])

        scheduler.tune(tune_option, search_policy="sketch.random")

        # Count the records written for each workload key.
        records_per_key = {}
        for task in tasks:
            records_per_key[task.workload_key] = 0
        for record_input, _ in auto_scheduler.load_records(log_file):
            records_per_key[record_input.task.workload_key] += 1

        assert records_per_key[tasks[0].workload_key] == n_trials - 1
        assert records_per_key[tasks[1].workload_key] == 1
        del measure_ctx
コード例 #20
0
def test_mutate_tile_size():
    """
    Seed the evolutionary search with "bad" states only and check that mutation
    produces at least one "good" state.

    According to the original authors, this unit test was run 1,000 times with no
    failures, i.e. the failure rate is below 0.1%.
    """

    class MockCostModel(PythonBasedModel):
        """A mock cost model that rates 1 only for the states with tile_k=2."""

        @staticmethod
        def is_good_state(state):
            # A good state has a printed line mentioning both "k.1" and "(0,2)".
            return any(
                "k.1" in line and "(0,2)" in line for line in str(state).split("\n")
            )

        def predict(self, task, states):
            return [1 if self.is_good_state(state) else 0 for state in states]

    key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (10, 10, 4))
    dag = auto_scheduler.ComputeDAG(key)
    task = auto_scheduler.SearchTask(dag, key, tvm.target.Target("llvm"))
    policy = auto_scheduler.SketchPolicy(
        task, program_cost_model=MockCostModel(), verbose=0
    )
    sampled = policy.sample_initial_population()[:50]

    bad_states = [state for state in sampled if not MockCostModel.is_good_state(state)]

    evolved = policy.evolutionary_search(bad_states, 50)
    found = any(MockCostModel.is_good_state(state) for state in evolved)
    assert found
コード例 #21
0
def test_correctness_layout_rewrite_insert_transform_stage():
    """Compare a schedule applied with INSERT_TRANSFORM_STAGE against the plain one.

    A 128x128x128 matmul task is tuned for two trials; the best recorded state
    is applied once with the layout-rewrite option and once without. Both
    builds run on the same random inputs and their three buffers must agree
    within ``atol=1e-3, rtol=1e-3``.
    """
    N = 128
    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(N, N, N), target=target
    )
    dag = task.compute_dag

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        search_policy = auto_scheduler.SketchPolicy(task)

        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            runner=measure_ctx.runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task.tune(tuning_options, search_policy=search_policy)
        best_inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key, target)

        rewrite_option = auto_scheduler.LayoutRewriteOption.INSERT_TRANSFORM_STAGE
        sch, bufs = dag.apply_steps_from_state(best_inp.state, layout_rewrite=rewrite_option)
        sch_ref, bufs_ref = dag.apply_steps_from_state(best_inp.state)

        np_args = []
        for buf in bufs:
            shape = topi.get_const_tuple(buf.shape)
            np_args.append(np.random.randn(*shape).astype(buf.dtype))

        func = tvm.build(sch, bufs, target=target)
        func_ref = tvm.build(sch_ref, bufs_ref, target=target)

        ctx = tvm.context(str(target))
        ctx_ref = tvm.cpu()

        args = [tvm.nd.array(arr, ctx=ctx) for arr in np_args]
        args_ref = [tvm.nd.array(arr, ctx=ctx_ref) for arr in np_args]
        ctx.sync()

        func(*args)
        func_ref(*args_ref)
        ctx.sync()

        # Compare the first three buffers one by one.
        for idx in range(3):
            tvm.testing.assert_allclose(
                args[idx].asnumpy(), args_ref[idx].asnumpy(), atol=1e-3, rtol=1e-3
            )
        del measure_ctx
コード例 #22
0
def test_measure_local_builder_runner():
    """Build and run a 512x512x512 matmul's initial state with the local builder and runner.

    Skipped (returns early) when the "llvm" device is not enabled. The
    measurement is done once with CPU cache flushing enabled and once
    without; both stages must report ``error_no == 0`` each time.
    """
    if not tvm.testing.device_enabled("llvm"):
        return

    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
    )

    for flush_cache in (True, False):
        measure_input = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        builder = auto_scheduler.LocalBuilder()
        runner = auto_scheduler.LocalRunner(
            timeout=60, enable_cpu_cache_flush=flush_cache
        )

        build_results = builder.build([measure_input])
        assert build_results[0].error_no == 0
        measure_results = runner.run([measure_input], build_results)
        assert measure_results[0].error_no == 0
コード例 #23
0
def record_common(dag, s):
    """Save one measure record for state ``s`` to a file and read it back.

    Bound inference on ``s`` and on the restored state must give equal
    results, and that result must differ from the DAG's initial state.
    """
    llvm_target = tvm.target.create("llvm")
    task = auto_scheduler.SearchTask(dag, "test", llvm_target)

    measure_inp = auto_scheduler.measure.MeasureInput(task, s)
    measure_res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as tmp:
        auto_scheduler.save_records(tmp.name, [measure_inp], [measure_res])

        reader = auto_scheduler.RecordReader(tmp.name)
        loaded_inputs, _ = reader.read_lines()
        assert len(loaded_inputs) == 1

        expected_state = dag.infer_bound_from_state(s)
        restored_state = dag.infer_bound_from_state(loaded_inputs[0].state)

        assert expected_state == restored_state
        assert not (expected_state == dag.get_init_state())
コード例 #24
0
def test_cpu_fusion():
    """Check reuse features after computing stage B at the inner loop of stage C.

    At least one store must report the ``kSerialMultipleReadWrite`` reuse
    type; for each such feature, the values two and three slots later are
    checked against 1.0 and ``log2(16 + 1)``.
    """

    def fusion_test(N, M):
        A = te.placeholder((N, M), name="A")
        B = te.compute((N, M), lambda i, j: A[i][j], name="B")
        C = te.compute((N, M), lambda i, j: B[i][j], name="C")
        return [A, B, C]

    dag = auto_scheduler.ComputeDAG(fusion_test(64, 32))
    state = dag.get_init_state()
    state.compute_at(1, 2, state.stages[2].iters[1])

    task = auto_scheduler.SearchTask(
        compute_dag=dag, workload_key="test", target=tvm.target.Target("llvm")
    )
    names = auto_scheduler.feature.get_per_store_feature_names()
    fea = auto_scheduler.feature.get_per_store_features_from_states([state], task)[0]
    """
    lowered IR:

    Placeholder: A
    for i (0,64)
        for j (0,32)
            for ii (1)
                for jj (1)
                    B[...] = A[...]
            C[...] = B[...]
    """

    # Look for the reuse type produced by the fusion and check what follows it.
    found = False
    for stage_fea in fea:
        for pos, (feature_name, feature_value) in enumerate(zip(names, stage_fea)):
            if "reuse_type.kSerialMultipleReadWrite" not in feature_name:
                continue
            if not feature_value > 0.5:
                continue
            # reuse distance in #iter
            assert fequal(stage_fea[pos + 2], 1.0)
            # reuse distance in bytes
            assert fequal(stage_fea[pos + 3], math.log2(16 + 1))
            found = True
    assert found
コード例 #25
0
def test_measure_target_host():
    """Check that the target host survives a save / load / recover round trip.

    A task whose target is created as
    ``Target("llvm", "llvm -mtriple=aarch64-linux-gnu")`` is recorded to a
    file; after reading the record back and recovering the measure input,
    the string form of its target host must equal the original's.
    """
    host_target = tvm.target.Target("llvm", "llvm -mtriple=aarch64-linux-gnu")
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test,
        args=(512, 512, 512),
        target=host_target,
    )

    measure_inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    measure_res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as tmp:
        auto_scheduler.save_records(tmp.name, [measure_inp], [measure_res])

        reader = auto_scheduler.RecordReader(tmp.name)
        loaded_inputs, _ = reader.read_lines()
        assert len(loaded_inputs) == 1

        recovered = auto_scheduler.measure.recover_measure_input(loaded_inputs[0])
        assert str(recovered.task.target.host) == str(measure_inp.task.target.host)
def test_evo_search():
    """Test evolutionary search.

    The random number generator cannot be mocked, so a mocked cost model is used
    to guide the search instead. All sampled states with tile_k=2 are removed
    before the search; if the search works as expected it should produce at least
    one such state again. According to the original authors, this unit test was
    run 1,000 times with no failures, i.e. the failure rate is below 0.1%.
    """

    def has_tile_k_of_two(state):
        # True when a printed line mentions both "k.1" and "(0,2)".
        return any(
            "k.1" in line and "(0,2)" in line for line in str(state).split("\n")
        )

    key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (10, 10, 4))
    dag = auto_scheduler.ComputeDAG(key)
    task = auto_scheduler.SearchTask(dag, key, tvm.target.Target("llvm"))
    policy = auto_scheduler.SketchPolicy(
        task, schedule_cost_model=MockCostModel(), verbose=0
    )
    sampled = policy.sample_initial_population(50)

    # Remove all tile_k=2 states and expect evo search to find them again.
    pruned_states = [state for state in sampled if not has_tile_k_of_two(state)]

    evolved = policy.evolutionary_search(pruned_states, 50)
    # At least one evolved state must have tile_k=2.
    found = any(has_tile_k_of_two(state) for state in evolved)
    assert found
コード例 #27
0
def test_dag_measure_local_builder_runner():
    """Measure a tensor-registered workload after a pickle round trip of its registry entry.

    Skipped (returns early) when the "llvm" device is not enabled. The
    workload (matmul followed by two relus) is registered from its tensors;
    its registry entry is serialized, pickled, unpickled, removed from the
    registry and deserialized back. The task's initial state is then built
    and run locally, with and without CPU cache flushing.
    """
    if not tvm.testing.device_enabled("llvm"):
        return

    A = te.placeholder((512, 512), name="A")
    B = te.placeholder((512, 512), name="B")
    k = te.reduce_axis((0, 512), name="k")
    C = te.compute(
        (512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C"
    )
    D = topi.nn.relu(C)
    E = topi.nn.relu(D)

    tensors = [A, B, E]
    dag = auto_scheduler.ComputeDAG(tensors)
    key = workload_registry.register_workload_tensors(dag.workload_key(), tensors)

    # Serialize the entry, pickle it and restore it after deleting the original.
    entry = workload_registry.serialize_workload_registry_entry(key)
    pickled_entry = pickle.dumps(entry)
    restored_entry = pickle.loads(pickled_entry)
    del workload_registry.WORKLOAD_FUNC_REGISTRY[key]
    workload_registry.deserialize_workload_registry_entry(restored_entry)

    task = auto_scheduler.SearchTask(
        compute_dag=dag, workload_key=key, target=tvm.target.Target("llvm")
    )

    for flush_cache in (True, False):
        measure_input = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        builder = auto_scheduler.LocalBuilder()
        runner = auto_scheduler.LocalRunner(
            timeout=60, enable_cpu_cache_flush=flush_cache
        )

        build_results = builder.build([measure_input])
        assert build_results[0].error_no == 0
        measure_results = runner.run([measure_input], build_results)
        assert measure_results[0].error_no == 0
コード例 #28
0
def test_search_task_add_task_input():
    """Check that task inputs passed to SearchTask show up in ``task_input_names`` in order.

    The global task-input buffer table is cleared first; three empty arrays
    are then registered under the names ``test_input_0`` .. ``test_input_2``.
    """
    auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
    N = 64
    input_shapes = {
        "test_input_0": (64, 64),
        "test_input_1": (10, 20),
        "test_input_2": (30, 40, 50),
    }
    # Dict order is kept, so the arrays are created and passed in name order.
    named_inputs = {
        name: tvm.runtime.ndarray.empty(shape) for name, shape in input_shapes.items()
    }
    task = auto_scheduler.SearchTask(
        func="matmul_auto_scheduler_test",
        args=(N, N, N),
        target="llvm",
        task_inputs=named_inputs,
        task_inputs_overwrite=True,
    )

    assert len(task.task_input_names) == 3
    for position, expected_name in enumerate(named_inputs):
        assert task.task_input_names[position] == expected_name
コード例 #29
0
def test_recover_measure_input():
    """Check that a measure input read from a record file can be recovered.

    After the round trip, the recovered input's compute DAG must print the
    same as the original's; with ``rebuild_state=True`` the recovered state
    must print the same as the original state.
    """
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
    )

    measure_inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    measure_res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as tmp:
        auto_scheduler.save_records(tmp.name, [measure_inp], [measure_res])

        reader = auto_scheduler.RecordReader(tmp.name)
        loaded_inputs, _ = reader.read_lines()
        assert len(loaded_inputs) == 1

        raw_inp = loaded_inputs[0]

        recovered = auto_scheduler.measure.recover_measure_input(raw_inp)
        assert str(recovered.task.compute_dag) == str(measure_inp.task.compute_dag)

        recovered = auto_scheduler.measure.recover_measure_input(
            raw_inp, rebuild_state=True
        )
        assert str(recovered.state) == str(measure_inp.state)
コード例 #30
0
ファイル: gemm.py プロジェクト: octoml/Apple-M1-BERT
        x_plus_3 = te.compute(x.shape, lambda i, j: x[i, j] + 3.0)
        relu6 = tvm.topi.clip(x_plus_3, 0., 6.)
        return te.compute(x.shape,
                          lambda i, j: relu6[i, j] * x[i, j] * 0.1666667)

    d = hard_swish(c)
    return [a, b, d]


if __name__ == "__main__":
    target = tvm.target.Target("metal --max_num_threads=1024")
    train_flag = True
    #target = tvm.target.Target("llvm -mcpu=apple-latest -mtriple=arm64-apple-darwin20.1.0")

    M, N, K = 128, 3072, 768
    task = auto_scheduler.SearchTask(func=gemm, args=(M, N, K), target=target)
    log_file = "gemm_{M}_{N}_{K}.json".format(M=M, N=N, K=K)
    # Inspect the computational graph
    print(task.compute_dag)

    if train_flag:
        measure_runner = auto_scheduler.RPCRunner("m1",
                                                  "127.0.0.1",
                                                  9190,
                                                  min_repeat_ms=300,
                                                  timeout=30,
                                                  repeat=3)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=1000,
            check_correctness=True,
            builder_n_parallel=1,