def _create_data(target, dshape, dtype, layout):
    """Build a small 3-conv relay network plus matching autotvm fixtures.

    Returns a tuple ``(net, records, ltf_records, ltf_keys, tasks)``:
    the relay function, one tuning record per conv task, one
    layout-transform benchmarking record, three layout-transform
    workload keys, and the extracted autotvm tasks.
    """
    data = relay.var("data", shape=dshape, dtype=dtype)
    w0 = relay.var("w0_weight")
    conv0 = relay.nn.conv2d(data, w0, channels=16, kernel_size=(3, 3), padding=(1, 1))
    w1 = relay.var("w1_weight")
    conv1 = relay.nn.conv2d(conv0, w1, channels=32, kernel_size=(1, 1))
    w2 = relay.var("w2_weight")
    conv2 = relay.nn.conv2d(conv1, w2, channels=32, kernel_size=(3, 3), padding=(1, 1))
    out = relay.add(conv1, conv2)
    net = relay.Function(relay.ir_pass.free_vars(out), out)
    net, params = relay.testing.create_workload(net)
    tasks = autotvm.task.extract_from_program(net,
                                              target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d,))

    # One spec per conv layer: (input_shape, kernel_shape, padding).
    conv_specs = [
        ((1, 3, 8, 8), (16, 3, 3, 3), (1, 1)),
        ((1, 16, 8, 8), (32, 16, 1, 1), (0, 0)),
        ((1, 32, 8, 8), (32, 32, 3, 3), (1, 1)),
    ]
    wkl_list = [
        create_workload(in_shape, k_shape, (1, 1), pad, (1, 1),
                        layout, layout, dtype, dtype)
        for in_shape, k_shape, pad in conv_specs
    ]
    costs = [0.04, 0.012, 0.03]

    # Schedule config entities, one per conv workload above.
    entity_lists = [
        [["tile_ic", "sp", [3, 1]], ["tile_oc", "sp", [4, 4]],
         ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]],
        [["tile_ic", "sp", [2, 8]], ["tile_oc", "sp", [1, 32]],
         ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]],
        [["tile_ic", "sp", [8, 4]], ["tile_oc", "sp", [4, 8]],
         ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]],
    ]
    config_list = [
        ConfigEntity.from_json_dict({"i": -1, "c": None, "e": entity, "t": ""})
        for entity in entity_lists
    ]

    records = []
    for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks):
        task.workload = wkl
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
        records.append((ms_input, ms_output))

    def _make_ltf_workload(shape, src_layout, dst_layout):
        # Serialize layout_transform args into the canonical workload tuple.
        ltf_arg = [tvm.placeholder(shape, dtype=dtype), src_layout, dst_layout]
        ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
        return ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg)

    ltf_records = []
    ltf_wkl = _make_ltf_workload((1, 64, 16, 16, 8), "NCHW8c", "NCHW512c")
    ltf_task = copy.deepcopy(tasks[0])
    ltf_task.workload = ltf_wkl
    ms_input = MeasureInput(target=target, task=ltf_task, config=None)
    ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0,
                              all_cost=-1, timestamp=-1)
    ltf_records.append((ms_input, ms_output))

    ltf_keys = [
        _make_ltf_workload((1, 4, 8, 8, 4), "NCHW4c", "NCHW8c"),
        _make_ltf_workload((1, 1, 8, 8, 32), "NCHW32c", "NCHW4c"),
        _make_ltf_workload((1, 4, 8, 8, 8), "NCHW8c", "NCHW32c"),
    ]

    return net, records, ltf_records, ltf_keys, tasks
def test_DPTuner_run():
    """End-to-end DPTuner check: tune the fixture graph, then verify the
    chosen optimal records and that the log file was written."""
    log_file = "%s/test_tuner.log" % (os.getcwd())
    target = "llvm"
    dtype = "float32"
    layout = "NCHW"
    dshape = (1, 3, 8, 8)
    conv2d = relay.op.get("nn.conv2d")
    target_ops = [conv2d]

    g, records, ltf_records, ltf_keys, tasks = _create_data(
        target, dshape, dtype, layout)
    mod = tvm.IRModule()
    mod["main"] = g

    # Extra candidate configs (one per conv task) appended to the fixture
    # records so the tuner has alternatives to choose between.
    costs = [0.02, 0.02, 0.045]
    entity_lists = [
        [["tile_ic", "sp", [1, 3]], ["tile_oc", "sp", [2, 8]],
         ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]],
        [["tile_ic", "sp", [4, 4]], ["tile_oc", "sp", [2, 16]],
         ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]],
        [["tile_ic", "sp", [16, 2]], ["tile_oc", "sp", [8, 4]],
         ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]],
    ]
    config_list = [
        ConfigEntity.from_json_dict({
            "index": -1,
            "code_hash": None,
            "entity": entity,
        })
        for entity in entity_lists
    ]
    for cost, config, task in zip(costs, config_list, tasks):
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost, ), error_no=0,
                                  all_cost=-1, timestamp=-1)
        records.append((ms_input, ms_output))

    executor = DPTuner(mod, {"data": dshape}, records, target_ops, target,
                       log_file=log_file)
    executor.benchmark_layout_transform(layout_records=ltf_records,
                                        infer_layout=True)
    executor.run()

    out = [record[0].config for record in executor.get_optimal_records()]
    expected_out = [
        records[3][0].config, records[1][0].config, records[2][0].config
    ]
    assert expected_out == out, "Output mismatch: expecting %s but got %s" % (
        str(expected_out),
        str(out),
    )
    assert os.path.isfile(
        log_file), "No log file with name %s exists." % log_file
def benchmark_layout_transform(self,
                               min_exec_num=100,
                               timeout=10,
                               use_rpc=False,
                               device_key=None,
                               host="localhost",
                               port=9190,
                               n_parallel=1,
                               build_func='default',
                               layout_records=None,
                               target_host=None,
                               infer_layout=False):
    """Benchmark all possible layout transformation in the graph,
    given a set of schedule candidates for each workload of target operator.

    Parameters
    ----------
    min_exec_num : int, optional
        Minimum number of execution. Final execution time is the average of
        all execution time.

    timeout : int, optional
        Time out for each execution.

    use_rpc : boolean, optional
        Whether to use rpc mode for benchmarking.

    device_key : str, optional
        Remote device key which can be queried by
        python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190

    host : str, optional
        IP address used to create RPC tracker on host machine.

    port : int, optional
        Port number used to create RPC tracker on host machine.

    n_parallel: int, optional
        The number of measurement task that can run in parallel.
        Set this according to the number of cpu cores (for compilation) and
        the number of devices you have (for measuring generate code).

    build_func: str or callable, optional
        'default': call default builder. This works for normal target (llvm, cuda)

        'ndk': use Android NDK to create shared library. Use this for android target.

        callable: customized build function for other backends (e.g. VTA).
                  See autotvm/measure/measure_methods.py::default_build_func for example.

    layout_records : str or iterator of (MeasureInput, MeasureResult). optional
        Collection of layout_transform benchmarking records.
        If is str, then it should be the filename of a records log file.
        Each row of this file is an encoded record pair. Otherwise, it is an iterator.

        If this argument is set, graph tuner will first check whether layout_transform
        workload already exists in records and skip benchmarking if possible.

    target_host : str, optional
        str or :any:`tvm.target.Target` optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host(CPU) side code to interact with the driver
        setup the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm intepreter is used.

    infer_layout : bool, optional
        Whether to infer layout transformation time if it doesn't exist in records, instead
        of benchmarking on target device.

        This might bring performance loss comparing to benchmarking layout transformation.
    """
    self._logger.info("Start to benchmark layout transformation...")
    if layout_records is None and infer_layout:
        raise RuntimeError(
            "Requires some records to infer layout transformation time.")

    # FIX: the original performed this str -> records conversion twice;
    # after the first conversion layout_records is no longer a str, so the
    # second `isinstance` check was dead code and has been removed.
    if isinstance(layout_records, str):
        layout_records = load_from_file(layout_records)
        if not layout_records and infer_layout:
            raise RuntimeError(
                "Records must be non-empty to infer layout transformation time."
            )

    # Seed the perf-record cache from the supplied records and accumulate
    # total flops/time so an average per-element cost can be inferred.
    num_flops, total_time = 0, 0
    if layout_records is not None:
        for record in layout_records:
            ltf_wkl = record[0].task.workload
            self._layout_transform_perf_records[ltf_wkl] = record
            # ltf_wkl[1][1] is the serialized input tensor shape.
            input_shape = ltf_wkl[1][1]
            flops = np.prod(input_shape)
            num_flops += flops
            total_time += record[1].costs[0]
    avg_time = total_time / num_flops if num_flops > 0 else 0

    args_list = []

    def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx,
                             to_sch_idx, args):
        """Callback function to fetch layout transform args"""
        _, in_layout, out_layout = args
        # Only transformations that actually change the layout need timing.
        if in_layout != out_layout:
            args_list.append(args)

    self._iterate_layout_transform(_fetch_args_callback)

    def _log_to_list(record_list):
        """Callback to log result to a list."""
        def _callback(_, inputs, results):
            """Callback implementation"""
            record_list.append((inputs[0], results[0]))
        return _callback

    builder = autotvm.LocalBuilder(n_parallel=n_parallel,
                                   build_func=build_func)
    runner = autotvm.LocalRunner(number=min_exec_num, repeat=1,
                                 timeout=timeout)
    if use_rpc:
        if device_key is None:
            raise RuntimeError(
                "device_key need to be set to use rpc tracker mode.")
        runner = autotvm.measure.RPCRunner(device_key, host, port,
                                           n_parallel=n_parallel,
                                           number=min_exec_num, repeat=1,
                                           timeout=timeout)
    measure_option = autotvm.measure_option(builder=builder, runner=runner)

    for args in args_list:
        args = serialize_args(args)
        ltf_workload = (
            'layout_transform', ) + autotvm.task.args_to_workload(args)
        if ltf_workload in self._layout_transform_perf_records:
            continue  # already benchmarked or supplied by caller

        if infer_layout:
            # Estimate cost as (number of elements) * (average per-element
            # time from the supplied records) instead of running on device.
            input_shape = ltf_workload[1][1]
            flops = 1
            for i in input_shape:
                flops *= i
            inferred_time = flops * avg_time
            record_input = MeasureInput(target=self._target, task=None,
                                        config=None)
            record_output = MeasureResult(costs=(inferred_time, ),
                                          error_no=0, all_cost=-1,
                                          timestamp=-1)
            self._layout_transform_perf_records[ltf_workload] = (
                record_input, record_output)
            continue

        # Benchmark on target device with a single-trial grid search.
        records = []
        task = autotvm.task.create(layout_transform, args=args,
                                   target=self._target,
                                   target_host=target_host)
        task.workload = ltf_workload
        tuner = autotvm.tuner.GridSearchTuner(task)
        tuner.tune(n_trial=1, measure_option=measure_option,
                   callbacks=[_log_to_list(records)])
        # A non-float cost means the measurement failed; record a sentinel
        # so the tuner treats this transformation as prohibitively slow.
        if not isinstance(records[0][1].costs[0], float):
            records[0] = (records[0][0], records[0][1]._replace(
                costs=(INVALID_LAYOUT_TIME, )))
        self._layout_transform_perf_records[ltf_workload] = records[0]

    self._iterate_layout_transform(self._create_matrix_callback)
    self._logger.info("Benchmarking layout transformation successful.")