def run_tuning():
    """Tune all extracted tasks, appending measurement records to `log_file`.

    Sparse workloads get a custom sketch policy; dense ones use the default.
    """
    print("Begin tuning...")
    task_scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
    options = auto_scheduler.TuningOptions(
        # 200 trials keeps the run short; raise to 20000 for best performance.
        num_measure_trials=200,
        runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    if use_sparse:
        from tvm.topi.sparse.utils import sparse_sketch_rules

        # Sparse kernels need extra sketch rules layered on the XGBoost cost model.
        policies = [
            auto_scheduler.SketchPolicy(
                task,
                program_cost_model=auto_scheduler.XGBModel(),
                init_search_callbacks=sparse_sketch_rules(),
            )
            for task in tasks
        ]
        task_scheduler.tune(options, search_policy=policies)
    else:
        task_scheduler.tune(options)
def test_measure_special_inputs_map_by_name_local_runner():
    """Check build+run succeeds when a named input is pinned via `task_inputs`."""

    @auto_scheduler.register_workload
    def foo():
        X = te.placeholder(shape=[10], dtype="int32")
        Index = te.placeholder(shape=[1], dtype="int32", name="Index")
        Y = te.compute((1,), lambda i: X[Index[i]])
        return [X, Index, Y]

    # `Index` is used as an indirect index, so a random value could read out of
    # bounds — supply a fixed, in-range value instead of random input.
    task = auto_scheduler.SearchTask(
        func=foo,
        target="llvm",
        task_inputs={"Index": tvm.nd.array(np.array([5], dtype="int32"))},
    )

    measure_input = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
    builder = auto_scheduler.LocalBuilder()
    runner = auto_scheduler.LocalRunner(timeout=10)

    build_results = builder.build([measure_input])
    assert build_results[0].error_no == 0
    run_results = runner.run([measure_input], build_results)
    assert run_results[0].error_no == 0
def auto_scheduler_tune(network, target, input_name, log_file):
    """Extract tasks from `network` and tune them, writing records to `log_file`.

    Any pre-existing log is removed first so the run starts from scratch.
    """
    if os.path.exists(log_file):
        os.remove(log_file)

    mod, net_params, input_shape, output_shape = get_network(network)

    if network not in ["bert"]:
        # Convert conv2d ops to NHWC layout (BERT has none to convert).
        desired_layouts = {"nn.conv2d": ["NHWC", "default"]}
        seq = tvm.transform.Sequential(
            [
                relay.transform.RemoveUnusedFunctions(),
                relay.transform.ConvertLayout(desired_layouts),
            ]
        )
        with tvm.transform.PassContext(opt_level=3):
            mod = seq(mod)

    if "cpu" in target.keys:
        options = auto_scheduler.TuningOptions(
            num_measure_trials=20000,
            runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
    else:
        # Non-CPU targets are measured through a local RPC measurement context.
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            repeat=1, min_repeat_ms=300, timeout=10
        )
        options = auto_scheduler.TuningOptions(
            num_measure_trials=20000,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], net_params, target)
    scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
    scheduler.tune(options)
def run_tuning():
    """Run the task scheduler over all extracted tasks, logging to `log_file`."""
    print("Begin tuning...")
    scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
    options = auto_scheduler.TuningOptions(
        # 200 trials keeps the walkthrough quick; use 20000 for best performance.
        num_measure_trials=200,
        runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    scheduler.tune(options)
def test_measure_local_builder_runner():
    """Build and run a tiled matmul state locally; expect no measurement errors."""
    if not tvm.runtime.enabled("llvm"):
        return

    dag, state = get_tiled_matmul()
    target = tvm.target.create("llvm")
    task = auto_scheduler.SearchTask(dag, "test", target)
    measure_input = auto_scheduler.MeasureInput(task, state)

    builder = auto_scheduler.LocalBuilder()
    build_results = builder.build([measure_input])
    assert build_results[0].error_no == 0

    runner = auto_scheduler.LocalRunner(timeout=60)
    run_results = runner.run([measure_input], build_results)
    assert run_results[0].error_no == 0
def test_measure_local_builder_runner():
    """Measure a matmul task locally under both cache-flush modes."""
    if not tvm.testing.device_enabled("llvm"):
        return

    task = auto_scheduler.create_task(
        matmul_auto_scheduler_test, [512, 512, 512], "llvm"
    )

    # Exercise the runner with CPU cache flushing both on and off.
    for flush in (True, False):
        measure_input = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        builder = auto_scheduler.LocalBuilder()
        runner = auto_scheduler.LocalRunner(timeout=60, enable_cpu_cache_flush=flush)

        build_results = builder.build([measure_input])
        assert build_results[0].error_no == 0
        run_results = runner.run([measure_input], build_results)
        assert run_results[0].error_no == 0
def test_measure_local_builder_runner(enable_cpu_cache_flush=False):
    """Build and run a tiled matmul locally with the given cache-flush setting."""
    if not tvm.testing.device_enabled("llvm"):
        return

    dag, state = get_tiled_matmul()
    task = auto_scheduler.SearchTask(dag, "test", tvm.target.Target("llvm"))
    measure_input = auto_scheduler.MeasureInput(task, state)

    builder = auto_scheduler.LocalBuilder()
    build_results = builder.build([measure_input])
    assert build_results[0].error_no == 0

    runner = auto_scheduler.LocalRunner(
        timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
    )
    run_results = runner.run([measure_input], build_results)
    assert run_results[0].error_no == 0
def auto_scheduler_tune(network, batch_size, dtype, target, log_file): os.makedirs(os.path.dirname(log_file), exist_ok=True) #if os.path.exists(log_file): # os.remove(log_file) layout = "NHWC" mod, params, input_name, input_shape, output_shape = get_network( network, batch_size, dtype, layout) n_trials = network_to_n_trials[(network, batch_size, dtype, str(target.kind))] if "cpu" in target.keys: tuning_opt = auto_scheduler.TuningOptions( num_measure_trials=n_trials, runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) else: min_repeat_ms = 450 if network in ["bert"] else 300 measure_ctx = auto_scheduler.LocalRPCMeasureContext( repeat=1, min_repeat_ms=min_repeat_ms, timeout=10) tuning_opt = auto_scheduler.TuningOptions( num_measure_trials=n_trials, runner=measure_ctx.runner, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) print(log_file) update_file(log_file, tasks) return for idx, task in enumerate(tasks): print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) print(task.compute_dag) tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tuner.tune(tuning_opt)
def test_dag_measure_local_builder_runner():
    """Round-trip a workload-registry entry through pickle, then build and run."""
    if not tvm.testing.device_enabled("llvm"):
        return

    # Build a matmul -> relu -> relu compute DAG.
    A = te.placeholder((512, 512), name="A")
    B = te.placeholder((512, 512), name="B")
    k = te.reduce_axis((0, 512), name="k")
    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
    D = topi.nn.relu(C)
    E = topi.nn.relu(D)

    tensors = [A, B, E]
    dag = auto_scheduler.ComputeDAG(tensors)
    key = workload_registry.register_workload_tensors(dag.workload_key(), tensors)

    # Serialize the registry entry, drop it, and restore it from the pickle —
    # mimicking a transfer to another process.
    entry = workload_registry.serialize_workload_registry_entry(key)
    restored = pickle.loads(pickle.dumps(entry))
    del workload_registry.WORKLOAD_FUNC_REGISTRY[key]
    workload_registry.deserialize_workload_registry_entry(restored)

    task = auto_scheduler.SearchTask(
        compute_dag=dag, workload_key=key, target=tvm.target.Target("llvm")
    )

    for flush in (True, False):
        measure_input = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        builder = auto_scheduler.LocalBuilder()
        runner = auto_scheduler.LocalRunner(timeout=60, enable_cpu_cache_flush=flush)

        build_results = builder.build([measure_input])
        assert build_results[0].error_no == 0
        run_results = runner.run([measure_input], build_results)
        assert run_results[0].error_no == 0