def test_sketch_search_policy_zero_rank():
    measure_ctx = auto_scheduler.LocalRPCMeasureContext()
    for target in ["llvm", "cuda"]:
        task = auto_scheduler.SearchTask(
            func=zero_rank_compute_auto_scheduler_test, args=(10,), target=target
        )
        search_common(task, runner=measure_ctx.runner)

        task = auto_scheduler.SearchTask(
            func=zero_rank_reduce_auto_scheduler_test, args=(10,), target=target
        )
        search_common(task, runner=measure_ctx.runner)
def test_measure_special_inputs_map_by_name_rpc_runner():
    @auto_scheduler.register_workload
    def foo():
        X = te.placeholder(shape=[10], dtype="int32")
        Index = te.placeholder(shape=[1], dtype="int32", name="Index")
        Y = te.compute((1,), lambda i: X[Index[i]])
        return [X, Index, Y]

    # This workload cannot use random data for the `Index` input
    task = auto_scheduler.SearchTask(
        func=foo,
        target="llvm",
        task_inputs={
            "Index": tvm.nd.array(np.array([5], dtype="int32")),
        },
    )

    for enable_cpu_cache_flush in [True, False]:
        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        local_builder = auto_scheduler.LocalBuilder()
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
        )
        rpc_runner = measure_ctx.runner

        bress = local_builder.build([minp])
        assert bress[0].error_no == 0
        mress = rpc_runner.run([minp], bress)
        assert mress[0].error_no == 0
def generate_sketches(workload_func, args, target, print_for_debug=False):
    workload_key = auto_scheduler.make_workload_key(workload_func, args)
    dag = auto_scheduler.ComputeDAG(workload_key)
    task = auto_scheduler.SearchTask(dag, workload_key, tvm.target.create(target))
    policy = auto_scheduler.SketchPolicy(task, verbose=0)
    return policy.generate_sketches(print_for_debug)
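# Example use of the helper above (an illustrative sketch; it assumes
# matmul_auto_scheduler_test is a registered workload, as in the other tests here):
def example_generate_sketches():
    sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "llvm")
    # The sketch policy should propose at least one sketch for a dense matmul.
    assert len(sketches) > 0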
def record_common(dag, s):
    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)

    inp = auto_scheduler.measure.MeasureInput(task, s)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    # Test in-memory record processing.
    record_str = auto_scheduler.measure_record.dump_record_to_string(inp, res)
    r_inp, r_res = auto_scheduler.measure_record.load_record_from_string(record_str)
    # Only check the workload_key for simplification.
    assert inp.task.workload_key == r_inp.task.workload_key
    assert str(res) == str(r_res)

    # Test file-based record processing.
    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])
        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, _ = log_reader.read_lines()
        assert len(inputs) == 1

        s1 = dag.infer_bound_from_state(s)
        s2 = dag.infer_bound_from_state(inputs[0].state)
        assert s1 == s2
        assert not (s1 == dag.get_init_state())
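# Illustrative caller for record_common (a minimal sketch; the real tests apply
# schedule steps such as split/reorder/fuse to `s` before recording it):
def example_record_common_caller():
    dag = auto_scheduler.ComputeDAG(matmul_auto_scheduler_test(512, 512, 512))
    s = dag.get_init_state()
    record_common(dag, s)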
def search_common(workload=matmul_auto_scheduler_test, target="llvm", search_policy='empty', seed=random.randint(1, 1 << 30), runner='local', cost_model=auto_scheduler.RandomModel(), num_measure_trials=2, init_search_callbacks=None): print("Test %s schedule search with the default search policy" % (target)) random.seed(seed) N = 128 workload_key = auto_scheduler.make_workload_key(workload, (N, N, N)) dag = auto_scheduler.ComputeDAG(workload_key) target = tvm.target.create(target) task = auto_scheduler.SearchTask(dag, workload_key, target) with tempfile.NamedTemporaryFile() as fp: log_file = fp.name init_search_callbacks = init_search_callbacks or [] init_search_callbacks.append( auto_scheduler.PreloadMeasuredStates(log_file)) if search_policy == 'empty': search_policy = auto_scheduler.EmptyPolicy(task) elif search_policy == 'sketch': search_policy = auto_scheduler.SketchPolicy( task, init_search_callbacks=init_search_callbacks) tuning_options = auto_scheduler.TuningOptions( num_measure_trials=num_measure_trials, runner=runner, verbose=1, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]) sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options) inp, res = auto_scheduler.load_best(log_file, workload_key, target) print("==== Python Code ====") print(dag.print_python_code_from_state(inp.state)) try: print("==== Lowered Stmt ====") print(tvm.lower(sch, args, simple_mode=True)) mod = tvm.build(sch, args, target) ctx = tvm.context(str(target), 0) dtype = dag.tensors[0].dtype a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx) c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx) mod(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) print("==== Verification passed ====") except Exception: raise Exception("Error encountered with seed: %d" % (seed)) print()
def test_record():
    if not tvm.runtime.enabled("llvm"):
        return

    A = te.placeholder((512, 512), name='A')
    B = te.placeholder((512, 512), name='B')
    k = te.reduce_axis((0, 512), name='k')
    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name='C')
    D = topi.nn.relu(C)
    k = te.reduce_axis((0, 512), name='k')
    E = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * D[k][j], axis=[k]), name='E')
    F = topi.nn.relu(E)

    dag = auto_scheduler.ComputeDAG([A, B, F])
    s = dag.get_init_state()

    # Split
    its0 = s.split(C, s[C].iters[0], [4, 8, 8])
    its1 = s.split(C, s[C].iters[4], [8, 4, 4])
    # Reorder
    s.reorder(C, [its0[0], its1[0], its0[1], its1[1], its0[2], its1[2], its0[3],
                  s[C].iters[8], its1[3]])
    # Fuse
    s.fuse(C, [s[C].iters[0], s[C].iters[1], s[C].iters[2]])
    # Compute at
    s.split(F, s[F].iters[0], [2])
    s.compute_at(E, F, s[F].iters[0])
    # Compute inline
    s.compute_inline(D)
    # Compute root
    s.compute_root(D)
    # Parallel
    s.parallel(C, s[C].iters[0])
    # Thread bind (blockIdx/threadIdx/vthread are GPU annotations; they are applied
    # here only to exercise record serialization)
    s.bind(C, s[C].iters[1], "blockIdx.x")
    s.bind(C, s[C].iters[2], "threadIdx.z")
    s.bind(C, s[C].iters[3], "vthread")
    # Unroll
    s.unroll(C, s[C].iters[4])
    # Vectorize
    s.vectorize(C, s[C].iters[6])

    target = tvm.target.create("llvm")
    task = auto_scheduler.SearchTask(dag, "test", target)

    inp = auto_scheduler.measure.MeasureInput(task, s)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])
        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, results = log_reader.read_lines()
        assert len(inputs) == 1

        s1 = dag.infer_bound_from_state(s)
        s2 = dag.infer_bound_from_state(inputs[0].state)
        assert s1 == s2
        assert not (s1 == dag.get_init_state())
def test_cpu_matmul():
    dag = auto_scheduler.ComputeDAG(matmul_auto_scheduler_test(512, 512, 512))
    s = dag.get_init_state()
    C = s.stage_ops[2]

    i, j, k = s[C].iters
    io, ii = s.split(C, i, [16])
    jo, ji = s.split(C, j, [8])
    s.reorder(C, [io, jo, k, ji, ii])
    s.vectorize(C, ji)
    s.parallel(C, io)
    s.parallel(C, jo)
    s.unroll(C, k)

    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)
    names = auto_scheduler.feature.get_per_store_feature_names()
    fea = auto_scheduler.feature.get_per_store_features_from_states([s], task)[0]

    stage_0 = fea[0]
    assert len(stage_0) == len(names), "%d vs %d" % (len(stage_0), len(names))
    fea_dict = {}
    for name, value in zip(names, stage_0):
        fea_dict[name] = value

    for name in ["B0", "B1", "B2"]:
        if fequal(fea_dict[name + ".acc_type.kReadWrite"], 1.0):
            c_name = name
        if fequal(fea_dict[name + ".acc_type.kRead"], 1.0):
            if fequal(fea_dict[name + ".stride"], 0.0):
                b_name = name
            else:
                a_name = name

    """
    lowered IR:

    Placeholder: A, B
    parallel i.0 (0,32)
      parallel j.0 (0,64)
        unroll k (0,512)
          vectorize j.1 (0,8)
            for i.1 (0,16)
              C[...] = A[...] * B[...]
    """

    # check touched memory in bytes, touched unique memory in bytes, reuse distance, etc.
    assert fequal(fea_dict[c_name + ".bytes"], math.log2(512 ** 3 * 4 + 1))
    assert fequal(fea_dict[b_name + ".unique_bytes"], math.log2(512 ** 2 * 4 + 1))
    assert fequal(fea_dict[c_name + ".reuse_dis_iter"], math.log2(8 * 16 + 1))
    assert fequal(fea_dict[c_name + ".reuse_dis_bytes"], math.log2((8 * 16 + 8 + 16) * 4 + 1))
    assert fequal(fea_dict[c_name + ".reuse_ct"], math.log2(512 + 1))

    # check annotations
    assert fequal(fea_dict["unroll_num"], math.log2(1 + 1))
    # assert fequal(fea_dict["unroll_type.kPosInnerReduce"], 1.0)
    assert fequal(fea_dict["vec_num"], math.log2(1 + 1))
    assert fequal(fea_dict["parallel_num"], math.log2(2 + 1))
    assert fequal(fea_dict["parallel_prod"], math.log2((512 * 512 / 16 / 8) + 1))
def generate_sketches(
    workload_func, args, target, print_for_debug=False, init_search_callbacks=None
):
    task = auto_scheduler.SearchTask(func=workload_func, args=args, target=target)
    policy = auto_scheduler.SketchPolicy(
        task, verbose=0, init_search_callbacks=init_search_callbacks
    )
    return policy.generate_sketches(print_for_debug)
def main():
    log_file = os.path.join(ARGS.log_dir, f"{ARGS.workload}.json")
    workload_func, params = CONFIGS[ARGS.workload]
    params = params[0]  # type: ignore
    workload_func = auto_scheduler.register_workload(workload_func)

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]),
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    task = auto_scheduler.SearchTask(
        func=workload_func,
        args=params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=ARGS.rpc_workers,
        number=3,
        repeat=1,
        min_repeat_ms=100,
        enable_cpu_cache_flush=False,
    )

    # Inspect the computational graph
    print("Computational DAG:")
    print(task.compute_dag)

    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=ARGS.num_trials,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
        runner=runner,
    )
    print("Running AutoTuning:")
    task.tune(tune_option)
    print("History Best:")
    print(task.print_best(log_file))
    sch, args = task.apply_best(log_file)
    print("Lowered TIR:")
    print(tvm.lower(sch, args, simple_mode=True))
def test_workload_registry_empty_policy():
    search_common(search_policy="empty", num_measure_trials=2)

    N = 64
    target = "llvm"
    search_common(
        task=auto_scheduler.SearchTask(
            func="matmul_auto_scheduler_test", args=(N, N, N), target=target
        ),
        num_measure_trials=2,
        search_policy="empty",
    )
    search_common(
        task=auto_scheduler.SearchTask(
            func="matmul_auto_scheduler_test_rename_1", args=(N, N, N), target=target
        ),
        num_measure_trials=2,
        search_policy="empty",
    )
def test_random_model():
    if not tvm.runtime.enabled("llvm"):
        return
    N = 128
    workload_key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (N, N, N))
    dag = auto_scheduler.ComputeDAG(workload_key)
    target = tvm.target.create('llvm')
    task = auto_scheduler.SearchTask(dag, workload_key, target)

    model = auto_scheduler.RandomModel()
    model.update([], [])
    scores = model.predict(task, [dag.init_state, dag.init_state])
    assert len(scores) == 2
def get_sample_records(number):
    """Generate a list of random MeasureInput and MeasureResult pairs"""
    N = 128
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(N, N, N), target="llvm"
    )
    policy = auto_scheduler.SketchPolicy(task, verbose=0)
    states = policy.sample_initial_population()[:number]

    inputs = [auto_scheduler.MeasureInput(task, s) for s in states]
    results = [
        auto_scheduler.MeasureResult([np.random.uniform(0.5, 1.0)], 0, "", 0.1, 0)
        for _ in range(len(inputs))
    ]

    return task, inputs, results
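# Illustrative consumer of get_sample_records (a minimal sketch; it assumes the
# XGBoost-backed cost model is installed, as in the cost-model tests):
def example_update_cost_model():
    task, inputs, results = get_sample_records(32)
    model = auto_scheduler.XGBModel(num_warmup_sample=-1)
    model.update(inputs, results)
    preds = model.predict(task, [inp.state for inp in inputs])
    assert len(preds) == len(inputs)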
def generate_sketches(
    workload_func, args, target, print_for_debug=False, init_search_callbacks=None
):
    # NOTE: test_cpu_matmul_sketch and test_cpu_max_pool2d_sketch assume 4 cores to trigger all
    # possible sketch generations.
    task = auto_scheduler.SearchTask(
        func=workload_func,
        args=args,
        target=target,
        hardware_params=auto_scheduler.HardwareParams(num_cores=4, target=target),
    )
    policy = auto_scheduler.SketchPolicy(
        task, verbose=0, init_search_callbacks=init_search_callbacks
    )
    return policy.generate_sketches(print_for_debug)
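# Example usage (an illustrative sketch): with num_cores=4 set above, a CPU matmul
# should produce the full set of sketch variants; print_for_debug=True dumps each
# generated sketch to stdout.
def example_print_cpu_matmul_sketches():
    sketches = generate_sketches(
        matmul_auto_scheduler_test, (512, 512, 512), "llvm", print_for_debug=True
    )
    assert len(sketches) > 0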
def get_sample_records(number):
    """Generate a list of random MeasureInput and MeasureResult pairs"""
    N = 128
    workload_key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (N, N, N))
    dag = auto_scheduler.ComputeDAG(workload_key)
    target = tvm.target.create('llvm')
    task = auto_scheduler.SearchTask(dag, workload_key, target)
    policy = auto_scheduler.SketchPolicy(task, verbose=0)
    states = policy.sample_initial_population(number)

    inputs = [auto_scheduler.MeasureInput(task, s) for s in states]
    results = [auto_scheduler.MeasureResult([np.random.uniform(0.5, 1.0)], 0, "", 0.1, 0)
               for _ in range(len(inputs))]

    return task, dag, inputs, results
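# Illustrative consumer of the sampled records (a minimal sketch): the pairs can
# feed the batched per-store feature extractor used to train learned cost models.
def example_features_from_sample_records():
    _, _, inputs, results = get_sample_records(16)
    features, throughputs, task_ids = \
        auto_scheduler.feature.get_per_store_features_from_measure_pairs(inputs, results)
    assert len(features) == len(inputs)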
def test_measure_local_builder_runner():
    if not tvm.runtime.enabled("llvm"):
        return

    dag, s0 = get_tiled_matmul()
    tgt = tvm.target.create("llvm")
    task = auto_scheduler.SearchTask(dag, "test", tgt)

    minp = auto_scheduler.MeasureInput(task, s0)
    local_builder = auto_scheduler.LocalBuilder()
    local_runner = auto_scheduler.LocalRunner(timeout=60)

    bress = local_builder.build([minp])
    assert bress[0].error_no == 0
    mress = local_runner.run([minp], bress)
    assert mress[0].error_no == 0
def test_task_scheduler_round_robin():
    tasks = []
    for n in [2, 4, 8]:
        tasks.append(
            auto_scheduler.SearchTask(
                func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm"
            )
        )

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name
        num_trials_per_task = 2

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_trials_per_task * len(tasks),
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(tasks, strategy="round-robin", callbacks=[])
        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the result of round robin
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, _ in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        for task in tasks:
            assert counters[task.workload_key] == num_trials_per_task

        # Test continuous tuning (restoring the status)
        task_scheduler = auto_scheduler.TaskScheduler(
            tasks, strategy="round-robin", load_log_file=log_file, callbacks=[]
        )
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=len(tasks),
            num_measures_per_round=1,
        )
        task_scheduler.tune(tune_option, search_policy="sketch.random")
        del measure_ctx
def test_measure_local_builder_runner(enable_cpu_cache_flush=False):
    if not tvm.testing.device_enabled("llvm"):
        return

    dag, s0 = get_tiled_matmul()
    tgt = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(dag, "test", tgt)

    minp = auto_scheduler.MeasureInput(task, s0)
    local_builder = auto_scheduler.LocalBuilder()
    local_runner = auto_scheduler.LocalRunner(
        timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
    )

    bress = local_builder.build([minp])
    assert bress[0].error_no == 0
    mress = local_runner.run([minp], bress)
    assert mress[0].error_no == 0
def test_mutate_parallel():
    """
    The test case initializes evo search with a batch of "bad" states and checks
    whether the search algorithm can find "good" states by mutating the "bad" states.
    """

    class MockCostModel(PythonBasedModel):
        @staticmethod
        def is_good_state(state):
            for line in str(state).split("\n"):
                if (
                    line.find("parallel i.0@ (0") != -1
                    or line.find("parallel [email protected]@ (0") != -1
                    or line.find("parallel [email protected]@i.1@ (0") != -1
                ):
                    return True
            return False

        def predict(self, task, states):
            scores = []
            for state in states:
                scores.append(1 if self.is_good_state(state) else 0)
            return scores

    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(1024, 1024, 1024), target="llvm"
    )
    policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0)

    found = False
    retry_ct = 0
    while retry_ct < 10 and not found:
        states = policy.sample_initial_population()[:100]
        bad_states = []
        for state in states:
            if not MockCostModel.is_good_state(state):
                bad_states.append(state)

        new_states = policy.evolutionary_search(bad_states, 50)
        for state in new_states:
            if MockCostModel.is_good_state(state):
                found = True
                break
        retry_ct += 1

    assert found
def test_task_scheduler_gradient():
    tasks = []
    for n in [2, 4]:
        tasks.append(
            auto_scheduler.SearchTask(
                func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm"
            )
        )

    def objective_func(costs):
        return costs[0]

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        n_trials = 5

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(
            tasks, objective_func=objective_func, callbacks=[]
        )

        # Forcibly rewrite the initial best costs.
        # This makes the test more stable on slow CI machines.
        task_scheduler.best_costs = np.array([1e2, 1e-8])

        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the allocation results
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, _ in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        assert counters[tasks[0].workload_key] == n_trials - 1
        assert counters[tasks[1].workload_key] == 1
        del measure_ctx
def test_mutate_tile_size():
    """
    The test case initializes evo search with a batch of "bad" states and checks
    whether the search algorithm can find "good" states by mutating the "bad" states.

    This unit test has been tested with 1,000 runs with no failures, meaning that
    the failure rate is less than 0.1%.
    """

    class MockCostModel(PythonBasedModel):
        """A mock cost model that rates 1 only for the states with tile_k=2."""

        @staticmethod
        def is_good_state(state):
            for line in str(state).split("\n"):
                if line.find("k.1") != -1 and line.find("(0,2)") != -1:
                    return True
            return False

        def predict(self, task, states):
            scores = []
            for state in states:
                scores.append(1 if self.is_good_state(state) else 0)
            return scores

    workload_key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (10, 10, 4))
    dag = auto_scheduler.ComputeDAG(workload_key)
    task = auto_scheduler.SearchTask(dag, workload_key, tvm.target.Target("llvm"))
    policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0)
    states = policy.sample_initial_population()[:50]

    bad_states = []
    for state in states:
        if not MockCostModel.is_good_state(state):
            bad_states.append(state)

    new_states = policy.evolutionary_search(bad_states, 50)
    found = False
    for state in new_states:
        if MockCostModel.is_good_state(state):
            found = True
            break
    assert found
def test_correctness_layout_rewrite_insert_transform_stage():
    N = 128
    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(N, N, N), target=target
    )
    dag = task.compute_dag

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        search_policy = auto_scheduler.SketchPolicy(task)

        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            runner=measure_ctx.runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task.tune(tuning_options, search_policy=search_policy)
        inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key, target)
        s, bufs = dag.apply_steps_from_state(
            inp.state, layout_rewrite=auto_scheduler.LayoutRewriteOption.INSERT_TRANSFORM_STAGE
        )

        s_ref, bufs_ref = dag.apply_steps_from_state(inp.state)
        np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]

        func = tvm.build(s, bufs, target=target)
        func_ref = tvm.build(s_ref, bufs_ref, target=target)

        ctx = tvm.context(str(target))
        ctx_ref = tvm.cpu()

        args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
        args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args]
        ctx.sync()

        func(*args)
        func_ref(*args_ref)
        ctx.sync()

        tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3)
        tvm.testing.assert_allclose(args[1].asnumpy(), args_ref[1].asnumpy(), atol=1e-3, rtol=1e-3)
        tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), atol=1e-3, rtol=1e-3)
        del measure_ctx
def test_measure_local_builder_runner():
    if not tvm.testing.device_enabled("llvm"):
        return

    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
    )

    for enable_cpu_cache_flush in [True, False]:
        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        local_builder = auto_scheduler.LocalBuilder()
        local_runner = auto_scheduler.LocalRunner(
            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
        )

        bress = local_builder.build([minp])
        assert bress[0].error_no == 0
        mress = local_runner.run([minp], bress)
        assert mress[0].error_no == 0
def record_common(dag, s):
    target = tvm.target.create("llvm")
    task = auto_scheduler.SearchTask(dag, "test", target)

    inp = auto_scheduler.measure.MeasureInput(task, s)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])
        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, results = log_reader.read_lines()
        assert len(inputs) == 1

        s1 = dag.infer_bound_from_state(s)
        s2 = dag.infer_bound_from_state(inputs[0].state)
        assert s1 == s2
        assert not (s1 == dag.get_init_state())
def test_cpu_fusion():
    def fusion_test(N, M):
        A = te.placeholder((N, M), name="A")
        B = te.compute((N, M), lambda i, j: A[i][j], name="B")
        C = te.compute((N, M), lambda i, j: B[i][j], name="C")
        return [A, B, C]

    dag = auto_scheduler.ComputeDAG(fusion_test(64, 32))
    s = dag.get_init_state()
    s.compute_at(1, 2, s.stages[2].iters[1])

    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)
    names = auto_scheduler.feature.get_per_store_feature_names()
    fea = auto_scheduler.feature.get_per_store_features_from_states([s], task)[0]

    """
    lowered IR:

    Placeholder: A
    for i (0,64)
      for j (0,32)
        for ii (1)
          for jj (1)
            B[...] = A[...]
        C[...] = B[...]
    """

    # check reuse distance and reuse type after fusion
    found = False
    for stage_fea in fea:
        for i, (name, value) in enumerate(zip(names, stage_fea)):
            if "reuse_type.kSerialMultipleReadWrite" in name and value > 0.5:
                # reuse distance in #iter
                assert fequal(stage_fea[i + 2], 1.0)
                # reuse distance in bytes
                assert fequal(stage_fea[i + 3], math.log2(16 + 1))
                found = True
    assert found
def test_measure_target_host():
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test,
        args=(512, 512, 512),
        target=tvm.target.Target("llvm", "llvm -mtriple=aarch64-linux-gnu"),
    )

    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, _ = log_reader.read_lines()
        assert len(inputs) == 1

        raw_inp = inputs[0]

        recovered_inp = auto_scheduler.measure.recover_measure_input(raw_inp)
        assert str(recovered_inp.task.target.host) == str(inp.task.target.host)
def test_evo_search():
    """Test evolutionary search. Since we cannot mock the random number generator,
    we mocked the cost model to manually guide the evo search. If evo search works
    as expected, it should find the target state after a sufficient number of iterations.

    This unit test has been tested with 1,000 runs with no failures, meaning that
    the failure rate is less than 0.1%.
    """
    workload_key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (10, 10, 4))
    dag = auto_scheduler.ComputeDAG(workload_key)
    task = auto_scheduler.SearchTask(dag, workload_key, tvm.target.Target("llvm"))
    policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0)
    states = policy.sample_initial_population(50)

    pruned_states = []
    for state in states:
        found = False
        for line in str(state).split("\n"):
            # Remove all tile_k=2 states and expect evo search to find them.
            if line.find("k.1") != -1 and line.find("(0,2)") != -1:
                found = True
                break
        if not found:
            pruned_states.append(state)

    new_states = policy.evolutionary_search(pruned_states, 50)
    found = False
    for state in new_states:
        for line in str(state).split("\n"):
            # Check if evo search found at least one state with tile_k=2.
            if line.find("k.1") != -1 and line.find("(0,2)") != -1:
                found = True
                break
        if found:
            break
    assert found
def test_dag_measure_local_builder_runner():
    if not tvm.testing.device_enabled("llvm"):
        return

    A = te.placeholder((512, 512), name="A")
    B = te.placeholder((512, 512), name="B")
    k = te.reduce_axis((0, 512), name="k")
    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
    D = topi.nn.relu(C)
    E = topi.nn.relu(D)

    tensors = [A, B, E]
    dag = auto_scheduler.ComputeDAG(tensors)
    key = workload_registry.register_workload_tensors(dag.workload_key(), tensors)
    transfer_data = workload_registry.serialize_workload_registry_entry(key)
    f_data = pickle.dumps(transfer_data)
    f_new = pickle.loads(f_data)
    del workload_registry.WORKLOAD_FUNC_REGISTRY[key]
    workload_registry.deserialize_workload_registry_entry(f_new)

    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key=key, target=target)

    for enable_cpu_cache_flush in [True, False]:
        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        local_builder = auto_scheduler.LocalBuilder()
        local_runner = auto_scheduler.LocalRunner(
            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
        )

        bress = local_builder.build([minp])
        assert bress[0].error_no == 0
        mress = local_runner.run([minp], bress)
        assert mress[0].error_no == 0
def test_search_task_add_task_input():
    auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
    N = 64
    target = "llvm"
    test_input_0 = tvm.runtime.ndarray.empty((64, 64))
    test_input_1 = tvm.runtime.ndarray.empty((10, 20))
    test_input_2 = tvm.runtime.ndarray.empty((30, 40, 50))
    task = auto_scheduler.SearchTask(
        func="matmul_auto_scheduler_test",
        args=(N, N, N),
        target=target,
        task_inputs={
            "test_input_0": test_input_0,
            "test_input_1": test_input_1,
            "test_input_2": test_input_2,
        },
        task_inputs_overwrite=True,
    )

    assert len(task.task_input_names) == 3
    assert task.task_input_names[0] == "test_input_0"
    assert task.task_input_names[1] == "test_input_1"
    assert task.task_input_names[2] == "test_input_2"
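# Follow-up check (an illustrative sketch): a registered task input can be looked up
# from the global buffer table by workload key and input name.
def example_lookup_registered_task_input():
    task = auto_scheduler.SearchTask(
        func="matmul_auto_scheduler_test",
        args=(64, 64, 64),
        target="llvm",
        task_inputs={"test_input_0": tvm.runtime.ndarray.empty((64, 64))},
        task_inputs_overwrite=True,
    )
    buf = auto_scheduler.search_task.get_task_input_buffer(task.workload_key, "test_input_0")
    assert tuple(buf.shape) == (64, 64)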
def test_recover_measure_input():
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
    )

    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, _ = log_reader.read_lines()
        assert len(inputs) == 1

        raw_inp = inputs[0]

        correct_inp = auto_scheduler.measure.recover_measure_input(raw_inp)
        assert str(correct_inp.task.compute_dag) == str(inp.task.compute_dag)

        correct_inp = auto_scheduler.measure.recover_measure_input(raw_inp, rebuild_state=True)
        assert str(correct_inp.state) == str(inp.state)
    def hard_swish(x):
        x_plus_3 = te.compute(x.shape, lambda i, j: x[i, j] + 3.0)
        relu6 = tvm.topi.clip(x_plus_3, 0.0, 6.0)
        return te.compute(x.shape, lambda i, j: relu6[i, j] * x[i, j] * 0.1666667)

    d = hard_swish(c)
    return [a, b, d]


if __name__ == "__main__":
    target = tvm.target.Target("metal --max_num_threads=1024")
    train_flag = True
    # target = tvm.target.Target("llvm -mcpu=apple-latest -mtriple=arm64-apple-darwin20.1.0")

    M, N, K = 128, 3072, 768
    task = auto_scheduler.SearchTask(func=gemm, args=(M, N, K), target=target)
    log_file = "gemm_{M}_{N}_{K}.json".format(M=M, N=N, K=K)

    # Inspect the computational graph
    print(task.compute_dag)

    if train_flag:
        measure_runner = auto_scheduler.RPCRunner(
            "m1", "127.0.0.1", 9190, min_repeat_ms=300, timeout=30, repeat=3
        )
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=1000,
            check_correctness=True,
            builder_n_parallel=1,