def test_check_correctness():
    """Exercise ``LocalRunner(check_correctness=True)`` on a good and a bad template.

    The sample task must measure without errors; the ``bad_matmul`` template
    must be reported with ``MeasureErrorNo.WRONG_ANSWER``.
    """
    task, target = get_sample_task()

    local_builder = autotvm.LocalBuilder()
    checking_runner = autotvm.LocalRunner(check_correctness=True)
    measure_option = autotvm.measure_option(builder=local_builder, runner=checking_runner)

    def _callback_correct(tuner, measure_inputs, measure_results):
        # Every measured result of the sample task has to be error free.
        for _, result in zip(measure_inputs, measure_results):
            assert result.error_no == 0

    good_tuner = autotvm.tuner.RandomTuner(task)
    good_tuner.tune(n_trial=2, measure_option=measure_option, callbacks=[_callback_correct])

    # a bad template
    n = 128
    target = tvm.target.create("llvm -device=bad_device")
    task = autotvm.task.create(bad_matmul, args=(n, n, n, 'float32'), target=target)

    def _callback_wrong(tuner, measure_inputs, measure_results):
        # Every measured result of the bad template has to be flagged as wrong.
        for _, result in zip(measure_inputs, measure_results):
            assert result.error_no == MeasureErrorNo.WRONG_ANSWER

    bad_tuner = autotvm.tuner.RandomTuner(task)
    bad_tuner.tune(n_trial=2, measure_option=measure_option, callbacks=[_callback_wrong])
def test_task_tuner_without_measurement():
    """Run every tuner against a runner that fabricates results instead of measuring."""
    task, target = get_sample_task()

    class DummyRunner(Runner):
        """Runner that returns one random-cost, error-free result per input."""

        def __init__(self):
            super(DummyRunner, self).__init__(1, 1)

        def run(self, measure_inputs, build_results):
            fake_results = []
            for _ in range(len(measure_inputs)):
                fake_results.append(MeasureResult((np.random.random(),), 0, 0.2, time.time()))
            return fake_results

        def get_build_kwargs(self):
            return {}

    local_builder = autotvm.LocalBuilder()
    dummy_runner = DummyRunner()
    measure_option = autotvm.measure_option(builder=local_builder, runner=dummy_runner)

    logging.info("%s", task.config_space)

    tuner_classes = [
        autotvm.tuner.RandomTuner,
        autotvm.tuner.GridSearchTuner,
        autotvm.tuner.GATuner,
        autotvm.tuner.XGBTuner,
    ]
    for tuner_class in tuner_classes:
        tuner = tuner_class(task)
        tuner.tune(n_trial=10, measure_option=measure_option)
        assert tuner.best_flops > 1
def check(target, target_host):
    """Tune the sample task for 20 random trials on ``target`` if that device exists."""
    ctx = tvm.context(target, 0)
    if not ctx.exist:
        skip_message = "Skip test because %s is not available" % target
        logging.info(skip_message)
        return

    # init task
    task, target = get_sample_task(target, target_host)
    logging.info("%s", task.config_space)

    local_builder = autotvm.LocalBuilder()
    local_runner = autotvm.LocalRunner()
    measure_option = autotvm.measure_option(local_builder, local_runner)

    random_tuner = RandomTuner(task)
    random_tuner.tune(n_trial=20, measure_option=measure_option)
def test_min_repeat_ms():
    """Check that successful measurements satisfy the runner's ``min_repeat_ms=100``."""
    task, target = get_sample_task()

    local_builder = autotvm.LocalBuilder()
    local_runner = autotvm.LocalRunner(number=1, min_repeat_ms=100)
    measure_option = autotvm.measure_option(builder=local_builder, runner=local_runner)

    def _callback(tuner, measure_inputs, measure_results):
        for _, result in zip(measure_inputs, measure_results):
            # Failed measurements carry no meaningful cost, so only check successes.
            if result.error_no == 0:
                assert 1000 * np.mean(result.costs) * \
                    measure_option['runner'].cur_number >= 100

    random_tuner = autotvm.tuner.RandomTuner(task)
    random_tuner.tune(n_trial=5, measure_option=measure_option, callbacks=[_callback])
def autoTVM_conv(bs, oc, ic, nh, nw, kh, kw, ph=0, pw=0, sh=1, sw=1):
    """Tune a packed-layout convolution with AutoTVM and build the best schedule.

    A template named "leslie/second_conv" is registered with three knobs
    (``tile_w``, ``tile_oc``, ``tile_ic``). An ``XGBTuner`` then tries
    ``len(task.config_space)`` trials, logging to ``leslie_tune.log``, and the
    template is rebuilt under ``autotvm.apply_history_best``.

    Parameters
    ----------
    bs, oc, ic : int
        Used as the batch and channel extents of the ``X`` (bs, ic, nh, nw)
        and ``K`` (oc, ic, kh, kw) placeholders.
    nh, nw : int
        Spatial extents of ``X``.
    kh, kw : int
        Spatial extents of ``K``.
    ph, pw : int, optional
        Padding amounts passed to ``padding``.
    sh, sw : int, optional
        Strides used when indexing the packed input.

    Returns
    -------
    mod
        The result of ``tvm.build`` on the tuned schedule.

    Notes
    -----
    ``target`` is not a parameter; it is read from the enclosing scope.
    Any existing ``leslie_tune.log`` is removed before tuning.
    """
    @autotvm.template("leslie/second_conv")
    def my_tune(bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw):
        """Template: build the packed convolution and its schedule for one config."""
        # toc = 16
        # tic = 16
        # tw = 4
        cfg = autotvm.get_config()
        cfg.define_knob("tile_w", [1, 2, 4])
        cfg.define_knob("tile_oc", [16, 32, 64])
        cfg.define_knob("tile_ic", [16, 32, 64])
        tic = cfg["tile_ic"].val
        toc = cfg["tile_oc"].val
        tw = cfg["tile_w"].val
        X = te.placeholder((bs, ic, nh, nw), name='X')
        K = te.placeholder((oc, ic, kh, kw), name='K')
        # NOTE(review): padding is applied only when BOTH ph and pw are non-zero,
        # while PackedX below always uses nh + ph * 2 and nw + pw * 2 — confirm
        # that padding in a single dimension is never requested.
        PaddedX = padding(X, ph, pw) if ph * pw != 0 else X
        # pack X and K
        assert ic % tic == 0 and oc % toc == 0
        PackedX = te.compute(
            (bs, ic // tic, nh + ph * 2, nw + pw * 2, tic),
            lambda b, ic_out, x, y, ic_in: PaddedX[b, ic_out * tic + ic_in, x, y],
            name='PackedX')
        PackedK = te.compute(
            (oc // toc, ic // tic, kh, kw, tic, toc),
            lambda oc_out, ic_out, x, y, ic_in, oc_in: K[
                oc_out * toc + oc_in, ic_out * tic + ic_in, x, y],
            name='PackedK')
        # reduction axes
        ric_in = te.reduce_axis((0, tic), name='ric_in')
        ric_out = te.reduce_axis((0, ic // tic), name='ric_out')
        rkh = te.reduce_axis((0, kh), name='rkh')
        rkw = te.reduce_axis((0, kw), name='rkw')
        # output height and width
        oh = conv_out_size(nh, kh, ph, sh)
        ow = conv_out_size(nw, kw, pw, sw)
        # Computed Y in the packed layout
        # (this stage and the unpacked result below are both named 'Y')
        PackedY = te.compute(
            (bs, oc // toc, oh, ow, toc),
            lambda b, oc_out, x, y, oc_in: te.sum(
                PackedX[b, ric_out, x * sh + rkh, y * sw + rkw, ric_in] *
                PackedK[oc_out, ric_out, rkh, rkw, ric_in, oc_in],
                axis=[ric_out, rkh, rkw, ric_in]),
            name='Y')
        # Unpack the result
        Y = te.compute((bs, oc, oh, ow),
                       lambda b, oc, x, y: PackedY[b, oc // toc, x, y, oc % toc],
                       name='Y')

        s = te.create_schedule(Y.op)
        CachedY = s.cache_write(PackedY, 'local')
        # self test by leslie
        bso, oc_out, h, w, oc_in = s[PackedY].op.axis
        s[PackedY].reorder(bso, h, w, oc_out, oc_in)
        #w_out, w_in = s[PackedY].split(w, cfg["tile_w"].val)
        # Split the columns
        w_out, w_in = s[PackedY].split(w, tw)
        bso_h_w_out = s[PackedY].fuse(bso, h, w_out)
        s[PackedY].parallel(bso_h_w_out)
        # CachedY = s.cache_write(PackedY, 'local')
        s[CachedY].compute_at(s[PackedY], bso_h_w_out)
        c_bso, c_oc_out, ch, cw, c_oc_in = CachedY.op.axis
        # Rebinds the reduce-axis names to the axes of the cached stage.
        ric_out, rkh, rkw, ric_in = CachedY.op.reduce_axis
        s[CachedY].reorder(ric_out, rkh, rkw, ric_in, c_oc_out, cw, c_oc_in)
        s[CachedY].unroll(cw)
        # s[CachedY].unroll(c_oc_out)
        s[CachedY].vectorize(c_oc_in)

        # Schedule the padding by adding thread-level parallelism
        if PaddedX != X:
            s[PaddedX].parallel(PaddedX.op.axis[0])
        # Optimize the packing of X and K
        s[PackedX].parallel(s[PackedX].fuse(*PackedX.op.axis[0:2]))
        s[PackedX].unroll(PackedX.op.axis[-1])
        s[PackedK].parallel(s[PackedK].fuse(*PackedK.op.axis[0:2]))
        s[PackedK].unroll(PackedK.op.axis[-1])
        # Optimize the unpacking of Y
        s[Y].parallel(s[Y].fuse(*Y.op.axis[0:2]))
        s[Y].unroll(Y.op.axis[-1])
        return s, [X, K, Y]

    #param = (bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw)
    task = autotvm.task.create("leslie/second_conv", args=(bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw), target=target)
    #task = autotvm.task.create(bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw)
    print(task.config_space)
    print(len(task.config_space))

    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(timeout=10000), runner=autotvm.LocalRunner(repeat=1, number=10, min_repeat_ms=1000, timeout=1000))
    logfile = 'leslie_tune.log'
    # Start from a clean log so apply_history_best only sees this run's records.
    os.system("rm -rf {}".format(logfile))
    tuner = autotvm.tuner.XGBTuner(task)
    #n_trial = len(task.config_space)
    # One trial per entry of the config space.
    n_trial = len(task.config_space)
    prefix = "[Task]"
    tuner.tune(n_trial=n_trial,
               measure_option=measure_option,
               callbacks=[autotvm.callback.progress_bar(n_trial, prefix=prefix),
                          autotvm.callback.log_to_file(logfile)])

    # evaluate task
    with autotvm.apply_history_best(logfile):
        print("Compiling")
        with tvm.target.create(target):
            s, arg_bufs = my_tune(bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw)
            mod = tvm.build(s, arg_bufs, target=target)
    return mod
# logging config (for printing tuning log to screen) logging.getLogger("autotvm").setLevel(logging.DEBUG) logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) # the last layer in resnet N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) task = autotvm.task.create( "tutorial/conv2d_no_batching", args=(N, H, W, CO, CI, KH, KW, strides, padding), target="cuda" ) print(task.config_space) # Use local gpu, measure 10 times for every config to reduce variance # The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds measure_option = autotvm.measure_option( builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4), ) # Begin tuning, log records to file `conv2d.log` # During tuning we will also try many invalid configs, so you are expected to # see many error reports. As long as you can see non-zero GFLOPS, it is okay. tuner = autotvm.tuner.XGBTuner(task) tuner.tune( n_trial=20, measure_option=measure_option, callbacks=[autotvm.callback.log_to_file("conv2d.log")], ) ######################################################################### # Finally we can inspect the best config from log file, check correctness, # and measure running time.
def benchmark_layout_transform(
    self,
    min_exec_num=100,
    timeout=10,
    use_rpc=False,
    device_key=None,
    host="localhost",
    port=9190,
    n_parallel=1,
    build_func="default",
    layout_records=None,
    target_host=None,
    infer_layout=False,
):
    """Benchmark all possible layout transformation in the graph,
    given a set of schedule candidates for each workload of target operator.

    Parameters
    ----------
    min_exec_num : int, optional
        Minimum number of execution. Final execution time is the average of
        all execution time.

    timeout : int, optional
        Time out for each execution.

    use_rpc : boolean, optional
        Whether to use rpc mode for benchmarking.

    device_key : str, optional
        Remote device key which can be queried by
        python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190

    host : str, optional
        IP address used to create RPC tracker on host machine.

    port : int, optional
        Port number used to create RPC tracker on host machine.

    n_parallel: int, optional
        The number of measurement task that can run in parallel.
        Set this according to the number of cpu cores (for compilation) and
        the number of devices you have (for measuring generate code).

    build_func: str or callable, optional
        'default': call default builder. This works for normal target (llvm, cuda)

        'ndk': use Android NDK to create shared library. Use this for android target.

        callable: customized build function for other backends (e.g. VTA).
                  See autotvm/measure/measure_methods.py::default_build_func for example.

    layout_records : str or iterator of (MeasureInput, MeasureResult), optional
        Collection of layout_transform benchmarking records.
        If is str, then it should be the filename of a records log file.
        Each row of this file is an encoded record pair.
        Otherwise, it is an iterator. The records are materialized into a
        list, so a one-shot iterator is consumed by this call.

        If this argument is set, graph tuner will first check whether layout_transform
        workload already exists in records and skip benchmarking if possible.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host(CPU) side code to interact with the driver
        setup the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    infer_layout : bool, optional
        Whether to infer layout transformation time if it doesn't exist in records, instead
        of benchmarking on target device.

        This might bring performance loss comparing to benchmarking layout transformation.

    Raises
    ------
    RuntimeError
        If ``infer_layout`` is set but ``layout_records`` is missing or empty,
        or if ``use_rpc`` is set without a ``device_key``.
    """
    self._logger.info("Start to benchmark layout transformation...")
    if layout_records is None and infer_layout:
        raise RuntimeError("Requires some records to infer layout transformation time.")

    if isinstance(layout_records, str):
        layout_records = load_from_file(layout_records)
    if layout_records is not None:
        # Records loaded from a file (or passed as an iterator) are always
        # truthy even when they yield nothing; materialize them so that the
        # emptiness check below is meaningful.
        layout_records = list(layout_records)
    if not layout_records and infer_layout:
        raise RuntimeError("Records must be non-empty to infer layout transformation time.")

    # Accumulate the recorded cost per input element; used as the unit cost
    # when layout transformation times are inferred instead of measured.
    num_flops, total_time = 0, 0
    if layout_records is not None:
        for record in layout_records:
            ltf_wkl = record[0].task.workload
            self._layout_transform_perf_records[ltf_wkl] = record
            input_shape = ltf_wkl[1][1]
            flops = np.prod(input_shape)
            num_flops += flops
            total_time += record[1].costs[0]
    avg_time = total_time / num_flops if num_flops > 0 else 0

    args_list = []

    def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx, to_sch_idx, args):
        """Callback function to fetch layout transform args"""
        _, in_layout, out_layout = args
        # Only genuine transformations (differing layouts) need a benchmark.
        if in_layout != out_layout:
            args_list.append(args)

    self._iterate_layout_transform(_fetch_args_callback)

    def _log_to_list(record_list):
        """Callback to log result to a list."""

        def _callback(_, inputs, results):
            """Callback implementation"""
            record_list.append((inputs[0], results[0]))

        return _callback

    builder = autotvm.LocalBuilder(n_parallel=n_parallel, build_func=build_func)
    runner = autotvm.LocalRunner(number=min_exec_num, repeat=1, timeout=timeout)
    if use_rpc:
        if device_key is None:
            raise RuntimeError("device_key need to be set to use rpc tracker mode.")
        runner = autotvm.measure.RPCRunner(
            device_key,
            host,
            port,
            n_parallel=n_parallel,
            number=min_exec_num,
            repeat=1,
            timeout=timeout,
        )
    measure_option = autotvm.measure_option(builder=builder, runner=runner)
    for args in args_list:
        data, in_layout, out_layout = args
        ltf_workload = autotvm.task.args_to_workload(args, "layout_transform")
        # Skip workloads already covered by the supplied records or an earlier iteration.
        if ltf_workload in self._layout_transform_perf_records:
            continue

        if infer_layout:
            input_shape = ltf_workload[1][1]
            flops = 1
            for i in input_shape:
                flops *= i

            # Rule out invalid layout transformations
            out = topi.layout_transform(data, in_layout, out_layout)
            out_flops = 1
            for i in topi.utils.get_const_tuple(out.shape):
                out_flops *= i

            if flops != out_flops:
                inferred_time = INVALID_LAYOUT_TIME
            else:
                inferred_time = flops * avg_time

            record_input = MeasureInput(target=self._target, task=None, config=None)
            record_output = MeasureResult(
                costs=(inferred_time,), error_no=0, all_cost=-1, timestamp=-1
            )
            self._layout_transform_perf_records[ltf_workload] = (record_input, record_output)
            continue

        records = []
        task = autotvm.task.create(
            "layout_transform", args=args, target=self._target, target_host=target_host
        )
        tuner = autotvm.tuner.GridSearchTuner(task)
        tuner.tune(n_trial=1, measure_option=measure_option, callbacks=[_log_to_list(records)])
        # A non-float cost means the measurement did not produce a time;
        # record the workload as an invalid layout transformation instead.
        if not isinstance(records[0][1].costs[0], float):
            records[0] = (records[0][0], records[0][1]._replace(costs=(INVALID_LAYOUT_TIME,)))
        self._layout_transform_perf_records[ltf_workload] = records[0]

    self._iterate_layout_transform(self._create_matrix_callback)
    self._logger.info("Benchmarking layout transformation successful.")
network = 'resnet-18'
log_file = "%s.log" % network
dtype = 'float32'

# XGB tuner, up to 2000 trials with early stopping at 600; candidates are
# built locally (timeout=10) and measured locally (number=20, repeat=3, timeout=4).
tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 2000,
    'early_stopping': 600,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#
#   If you have a large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
#   which makes the tuning run longer.
#
#   If you have multiple devices, you can use all of them for measurement to
#   accelerate the tuning process. (see the 'Scale up measurement' section below).
#
dtype = "float32"

# XGB tuner, up to 2000 trials with early stopping at 600; the local runner
# additionally sets min_repeat_ms=150.
tuning_option = {
    "log_filename": log_file,
    "tuner": "xgb",
    "n_trial": 2000,
    "early_stopping": 600,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#
#   If you have a large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
#   which makes the tuning run longer.
#
#   If you have multiple devices, you can use all of them for measurement to
# XGB tuner, up to 1000 trials with early stopping at 450. Candidates are built
# locally (with the 'ndk' build function when use_android is set) and measured
# through an RPC runner registered under device_key.
tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 450,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func='ndk' if use_android else 'default'),
        runner=autotvm.RPCRunner(
            device_key, host='localhost', port=9190,
            number=10,
            timeout=5,
        ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
#   which makes the tuning run longer.
#   If your device runs very slow or your conv2d operators have many GFLOPs, considering to
#### TUNING OPTION #### network = "resnet-18" log_file = "%s.%s.log" % (device_key, network) dtype = "float32" tuning_option = { "log_filename": log_file, "tuner": "xgb", "n_trial": 1500, "early_stopping": 800, "measure_option": autotvm.measure_option( builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"), runner=autotvm.RPCRunner( device_key, host="0.0.0.0", port=9190, number=5, timeout=10, ), ), } #################################################################### # # .. note:: How to set tuning options # # In general, the default values provided here work well. # If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger, # which makes the tuning run longer. # If your device runs very slow or your conv2d operators have many GFLOPs, considering to # set timeout larger.
# "NCHW" to "NCHWc". To deal with this situation, we define # conv2d_NCHWc operator in topi. We will tune this operator # instead of plain conv2d. # # We will use local mode for tuning configuration. RPC tracker # mode can be setup similarly to the approach in # :ref:`tune_nnvm_arm` tutorial. tuning_option = { 'log_filename': log_file, 'tuner': 'random', 'early_stopping': None, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000), ), } # You can skip the implementation of this function for this tutorial. def tune_kernels(tasks, measure_option, tuner='gridsearch', early_stopping=None, log_filename='tuning.log'): for i, tsk in enumerate(tasks): prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) # converting conv2d tasks to conv2d_NCHWc tasks
target = tvm.target.cuda() #### TUNING OPTION #### network = 'resnet-18' log_file = "%s.log" % network dtype = 'float32' tuning_option = { 'log_filename': log_file, 'tuner': 'xgb', 'n_trial': 2000, 'early_stopping': 600, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(timeout=10), runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4), ), } #################################################################### # # .. note:: How to set tuning options # # In general, the default value provided here works well. # # If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger, # which makes the tuning runs longer. # # If you have multiple devices, you can use all of them for measurement to # accelerate the tuning process. (see the 'Scale up measurement` section below). #
dtype = 'float32'

# XGB tuner, up to 1000 trials with early stopping at 250. Here measure_option
# takes an RPC measure function positionally, followed by the measurement
# settings (number, n_parallel, timeout, build_func) as keyword arguments.
tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 250,

    'measure_option': autotvm.measure_option(
        autotvm.measure.rpc(device_key, host='localhost', port=9190),
        number=4,
        n_parallel=1,
        timeout=10,
        build_func='ndk' if use_android else 'default',
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well. They are the same
#   values that we used to generate pre-tuned parameters.
#   If you have multiple devices, you can set :code:`n_parallel` to
#   the number of devices you have. (e.g. set it to 3 if you register 3 rk3399
#   boards to the tracker).
#   If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
#### TUNING OPTION #### #network = 'onnx' network = 'resnet-50' log_file = "%s.log" % network log_file = 'gtx-1060.log' dtype = 'float32' tuning_option = { 'log_filename': log_file, 'tuner': 'xgb', 'n_trial': 2000, 'early_stopping': 600, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(timeout=10), runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4), ), } # You can skip the implementation of this function for this tutorial. def tune_tasks(tasks, measure_option, tuner='xgb', n_trial=1000, early_stopping=None, log_filename='tuning.log', use_transfer_learning=True, try_winograd=True): if try_winograd: for i in range(len(tasks)):
network = 'sample'
log_file = 'gpu.log'
dtype = 'float32'

# XGB tuner, up to 1000 trials with early stopping at 600. Candidates are built
# locally (timeout=10) and measured through an RPC runner; the local runner
# alternative is left commented out.
tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 600,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        #runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
        runner=autotvm.RPCRunner(
            '1080ti',  # change the device key to your key
            '0.0.0.0', 9090,
            number=1, repeat=3, timeout=100, min_repeat_ms=150)
    ),
}

def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True,
               try_winograd=True):
    if try_winograd:
        for i in range(len(tasks)):
def run_tuning():
    """Interactively configure and run an AutoTVM kernel + graph tuning session.

    The platform, model, kernel tuner, graph tuner, batch size and core count
    are obtained through ``get_menu``. The selected network is then tuned with
    ``tune_kernels`` and ``tune_graph``, compiled with the graph-level best
    records, and its inference time is printed.

    ``get_menu``, ``supportedPlatforms`` and ``supportedModels`` come from the
    enclosing module. The menu results are compared against integers and used
    as list indices below — presumably ``get_menu`` returns the chosen index;
    verify against its definition.
    """
    import os
    import numpy as np
    from tvm import autotvm
    from tvm.relay import testing
    from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
    from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
    import tvm.contrib.graph_runtime as runtime
    from datetime import datetime

    tunemods = ["Resnet", "VGG", "MobileNet", "Squeezenet", "Inception", "MXNet"]
    tuners = ["XGBoost", "Genetic Algorithm", "Random", "Grid Search"]
    gtuners = ["DPTuner", "PBQPTuner"]

    pat = get_menu("Which platform do you want to tune?", supportedPlatforms)
    model = get_menu("Which model do you want to tune?", tunemods)
    # The sub-model menu is only shown for the "MXNet" entry (index 5).
    if model == 5:
        submod = get_menu("Which submodel do you want to tune?", supportedModels)
    tunes = get_menu("Which kernel tuner do you want to use?", tuners)
    gtuner = get_menu("Which graph tuner do you want to use?", gtuners)
    # NOTE(review): these two calls pass no option list, unlike the menus above.
    batch = get_menu("How many pictures should be run at a time?")
    core = get_menu("How many cores should be used at a time?")

    print("\n──────────────────────────── TVMUI ────────────────────────────\n")
    print("Started on " + str(datetime.now().strftime("%m/%d/%Y at %H:%M:%S")))

    from tvm import relay
    import tvm

    def get_network(name, batch_size):
        """Get the symbol definition and random weight of a network"""
        # ``dtype``, ``submod`` and ``input_name`` are read from the enclosing
        # function; they are assigned further down, before this is called.
        input_shape = (batch_size, 3, 224, 224)
        output_shape = (batch_size, 1000)

        if "resnet" in name:
            n_layer = int(name.split("-")[1])
            mod, params = relay.testing.resnet.get_workload(
                num_layers=n_layer, batch_size=batch_size, dtype=dtype
            )
            print("Tuning ResNet")
        elif "vgg" in name:
            n_layer = int(name.split("-")[1])
            mod, params = relay.testing.vgg.get_workload(
                num_layers=n_layer, batch_size=batch_size, dtype=dtype
            )
            print("Tuning VGG")
        elif name == "mobilenet":
            mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
            print("Tuning MobileNet")
        elif name == "squeezenet_v1.1":
            mod, params = relay.testing.squeezenet.get_workload(
                batch_size=batch_size, version="1.1", dtype=dtype
            )
            print("Tuning SqueezeNet")
        elif name == "inception_v3":
            # NOTE(review): the batch dimension is hard-coded to 1 here while
            # the workload is created with batch_size — confirm this is intended.
            input_shape = (1, 3, 299, 299)
            mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
            print("Tuning Inception")
        elif name == "mxnet":
            # an example for mxnet model
            from mxnet.gluon.model_zoo.vision import get_model

            if submod == 0:
                modn = "resnet18_v1"
                print("Tuning MXNet's ResNet")
            elif submod == 1:
                modn = "inceptionv3"
                print("Tuning MXNet's Inception")
            elif submod == 2:
                modn = "mobilenetv2_1.0"
                print("Tuning MXNet's MobileNet")
            else:
                raise Exception("Not Supported!")
            block = get_model(modn, pretrained=True)
            mod, params = relay.frontend.from_mxnet(block, shape={input_name: input_shape}, dtype=dtype)
            # Wrap the converted network's body in a softmax.
            net = mod["main"]
            net = relay.Function(
                net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
            )
            mod = tvm.IRModule.from_expr(net)
        else:
            raise ValueError("Unsupported network: " + name)

        return mod, params, input_shape, output_shape

    # NOTE(review): ``target`` stays unassigned when pat is neither 0 nor 1.
    if pat == 0:
        target = "llvm"
        print("Using LLVM")
    if pat == 1:
        target = "metal"
        print("Using metal")

    batch_size = batch
    # Map the model menu index onto the network name understood by get_network.
    if model == 0:
        dtype = "float32"
        model_name = "resnet-18"
    elif model == 1:
        dtype = "float32"
        model_name = "vgg-18"
    elif model == 2:
        dtype = "float32"
        model_name = "mobilenet"
    elif model == 3:
        dtype = "float32"
        model_name = "squeezenet_v1.1"
    elif model == 4:
        dtype = "float32"
        model_name = "inception_v3"
    elif model == 5:
        dtype = "float32"
        model_name = "mxnet"
    else:
        raise Exception('Not Supported!')

    # Build the log file stem: platform, model, optional sub-model, batch
    # size, then two-letter codes for the kernel tuner and the graph tuner.
    filename = "TVMTune_" + supportedPlatforms[pat] + "_" + tunemods[model]
    if model == 5:
        filename = filename + "_" + supportedModels[submod]
    filename = filename + "_" + str(batch)
    if tunes == 0:
        filename = filename + "_XG"
    elif tunes == 1:
        filename = filename + "_GA"
    elif tunes == 2:
        filename = filename + "_RD"
    elif tunes == 3:
        filename = filename + "_GS"
    if gtuner == 0:
        filename = filename + "DP"
    elif gtuner == 1:
        filename = filename + "PB"
    log_file = "logs/" + filename + ".log"
    graph_opt_sch_file = "tunings/" + filename + "_graph_opt.log"

    # Set the input name of the graph
    # For ONNX models, it is typically "0".
    input_name = "data"

    # Set number of threads used for tuning based on the number of
    # physical CPU cores on your machine.
    num_threads = core
    os.environ["TVM_NUM_THREADS"] = str(num_threads)

    #################################################################
    # Configure tensor tuning settings and create tasks
    # -------------------------------------------------
    # To get better kernel execution performance on x86 CPU,
    # we need to change data layout of convolution kernel from
    # "NCHW" to "NCHWc". To deal with this situation, we define
    # conv2d_NCHWc operator in topi. We will tune this operator
    # instead of plain conv2d.
    #
    # We will use local mode for tuning configuration. RPC tracker
    # mode can be setup similarly to the approach in
    # :ref:`tune_relay_arm` tutorial.
    #
    # To perform a precise measurement, we should repeat the measurement several
    # times and use the average of results. In addition, we need to flush the cache
    # for the weight tensors between repeated measurements. This can make the measured
    # latency of one operator closer to its actual latency during end-to-end inference.

    tuning_option = {
        "log_filename": log_file,
        "tuner": "random",
        "early_stopping": None,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(),
            runner=autotvm.LocalRunner(
                number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True
            ),
        ),
    }

    # You can skip the implementation of this function for this tutorial.
    def tune_kernels(
        tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="logs/tuning.log"
    ):
        """Tune every task with the tuner selected by the ``tunes`` menu index.

        NOTE(review): the ``tuner`` argument does not choose the tuner; it is
        only used in the error message below.
        """
        for i, task in enumerate(tasks):
            prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

            # create tuner
            if tunes == 0:
                tuner_obj = XGBTuner(task, loss_type="rank")
                # print("Using XGBTuner")
            elif tunes == 1:
                tuner_obj = GATuner(task, pop_size=50)
                # print("Using GATuner")
            elif tunes == 2:
                tuner_obj = RandomTuner(task)
                # print("Using Random")
            elif tunes == 3:
                tuner_obj = GridSearchTuner(task)
                # print("Using GridSearch")
            else:
                raise ValueError("Invalid tuner: " + tuner)

            # do tuning
            # One trial per entry of the task's config space.
            n_trial = len(task.config_space)
            tuner_obj.tune(
                n_trial=n_trial,
                early_stopping=early_stopping,
                measure_option=measure_option,
                callbacks=[
                    autotvm.callback.progress_bar(n_trial, prefix=prefix),
                    autotvm.callback.log_to_file(log_filename),
                ],
            )

    # Use graph tuner to achieve graph level optimal schedules
    # Set use_DP=False if it takes too long to finish.
    def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True):
        """Run the graph tuner selected by ``gtuner`` and write its schedules.

        NOTE(review): ``use_DP`` is not read here; the tuner class is chosen
        from the ``gtuner`` menu index instead.
        """
        target_op = [
            relay.op.get("nn.conv2d"),
        ]
        if gtuner == 0:
            Tuner = DPTuner
            # print("Using DPTuner")
        else:
            Tuner = PBQPTuner
            # print("Using PBQPTuner")
        executor = Tuner(graph, {input_name: dshape}, records, target_op, target)
        executor.benchmark_layout_transform(min_exec_num=2000)
        executor.run()
        executor.write_opt_sch2record_file(opt_sch_file)

    ########################################################################
    # Finally, we launch tuning jobs and evaluate the end-to-end performance.

    def tune_and_evaluate(tuning_opt):
        """Extract conv2d tasks, tune them, build the module and time inference."""
        # extract workloads from relay program
        print("Extract tasks...")
        mod, params, data_shape, out_shape = get_network(model_name, batch_size)
        tasks = autotvm.task.extract_from_program(
            mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
        )

        # run tuning tasks
        tune_kernels(tasks, **tuning_opt)
        tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

        # compile kernels with graph-level best records
        with autotvm.apply_graph_best(graph_opt_sch_file):
            print("Compile...")
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build_module.build(mod, target=target, params=params)

            # upload parameters to device
            # NOTE(review): ``ctx`` stays unassigned when pat is neither 0 nor 1.
            if pat == 0:
                ctx = tvm.cpu()
            if pat == 1:
                ctx = tvm.metal()
            data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
            module = runtime.GraphModule(lib["default"](ctx))
            module.set_input(input_name, data_tvm)

            # evaluate
            print("Evaluate inference time cost...")
            ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
            prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
            print(
                "Mean inference time (std dev): %.2f ms (%.2f ms)"
                % (np.mean(prof_res), np.std(prof_res))
            )

    # Launch the tuning and evaluation with the options configured above.
    tune_and_evaluate(tuning_option)
    return
# instead of plain conv2d. # # We will use local mode for tuning configuration. RPC tracker # mode can be setup similarly to the approach in # :ref:`tune_relay_arm` tutorial. tuning_option = { 'log_filename': log_file, 'tuner': 'random', 'early_stopping': None, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000), ), } # You can skip the implementation of this function for this tutorial. def tune_kernels(tasks, measure_option, tuner='gridsearch', early_stopping=None, log_filename='tuning.log'): for i, tsk in enumerate(tasks): prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) # converting conv2d tasks to conv2d_NCHWc tasks
# XGB tuner limited to 10 trials (early stopping at 450). Candidates are built
# locally (with the 'ndk' build function when use_android is set) and measured
# through an RPC runner reached via tracker_host / tracker_port.
tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 10,
    'early_stopping': 450,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func='ndk' if use_android else 'default'),
        runner=autotvm.RPCRunner(
            device_key, host=tracker_host, port=tracker_port,
            number=1,
            timeout=5,
        ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
#   which makes the tuning run longer.
#   If your device runs very slow or your conv2d operators have many GFLOPs, considering to
# Since our space is small, a random tuner is just okay. # # We only make 10 trials in this tutorial for demonstration. In practice, # you can do more trials according to your time budget. # We will log the tuning results into a log file. This file can be # used to get the best config later. # logging config (for printing tuning log to the screen) logging.getLogger('autotvm').setLevel(logging.DEBUG) logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout)) # There are two steps for measuring a config: build and run. # By default, we use all CPU cores to compile program. Then measure them sequentially. # We measure 5 times and take average to reduce variance. measure_option = autotvm.measure_option( builder='local', runner=autotvm.LocalRunner(number=5)) # begin tuning, log records to file `matmul.log` tuner = autotvm.tuner.RandomTuner(task) tuner.tune(n_trial=10, measure_option=measure_option, callbacks=[autotvm.callback.log_to_file('matmul.log')]) ######################################################################### # Finally we apply history best from the cache file and check its correctness. # We can call the function :code:`matmul` directly under the # :any:`autotvm.apply_history_best` context. When we call this function, # it will query the dispatch context with its argument and get the best config # with the same argument.
def search_op_config(code_only=False):
    """Choose a schedule config for the op template named by ``$OP``, then build and time it.

    The config comes from one of four places, checked in this order:

    * ``$CONFIG`` - a JSON object of knob values, used as-is;
    * ``$NNI_TRIAL_JOB_ID`` - one parameter set is fetched from NNI, measured
      with ``run_config_entity``, reported back, and the process exits;
    * ``$STEP`` > 0 - an XGBTuner search of that many trials (``$BATCH``
      sets the builder/thread-pool parallelism, default 16);
    * otherwise ``task.config_space`` itself is applied.

    The lowered source is then checked against the device's thread and
    shared-memory limits, the kernel is built for CUDA and, unless
    ``code_only`` is set, timed on GPU 0.

    Parameters
    ----------
    code_only : bool
        When true, return the translated device source right after the build
        and skip the timing run.

    Returns
    -------
    str or None
        The translated device source when ``code_only`` is true, else ``None``
        (the mean time is written to ``result.txt`` instead).

    Raises
    ------
    RuntimeError
        If NNI hands back no parameters for the trial.
    AssertionError, Exception
        If the lowered source fails one of the validity checks.
    """
    tvm_target = 'cuda'
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module('templates.' + (os.environ['OP']))
    print(' >> Backend = %s, Python PID = %s, Task = %s;' %
          (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create(default_tune_op.get_template_op,
                               args=(),
                               target=tvm_target)
    op_attributes = default_tune_op.op_attributes
    op_summary = '_'.join([k + str(op_attributes[k]) for k in op_attributes])

    def json_to_config(json_dict):
        # Wrap a list of [name, kind, value] knobs into a ConfigEntity.
        config = ConfigEntity.from_json_dict({
            "i": -1,
            "t": "",
            "c": None,
            "e": json_dict
        })
        return config

    def config_to_json(config):
        # Inverse of json_to_config: flatten the knob list to {name: value}.
        jobj = config.to_json_dict()['e']
        json_dict = dict()
        for entry in jobj:
            assert (entry[1] in ['sp', 'ot'])
            json_dict[entry[0]] = entry[2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    if 'CONFIG' in os.environ:
        params_given = json.loads(os.environ['CONFIG'])
        print("====>> [Current Config Option]", os.environ['CONFIG'])

        # List-valued knobs are tagged "sp", everything else "ot".
        trial_config = []
        for key in params_given:
            trial_config.append([
                key,
                "sp" if isinstance(params_given[key], list) else "ot",
                params_given[key]
            ])
        best_config = json_to_config(trial_config)

    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        show_search_space(task.config_space,
                          os.environ['NNI_TRIAL_JOB_ID'] == '@')
        import nni
        params_given = nni.get_next_parameter()
        if params_given is None:
            # The original bare `raise` surfaced as an opaque RuntimeError
            # ("No active exception to reraise"); keep the type, add a reason.
            raise RuntimeError(
                "NNI returned no parameters for this trial")
        local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[TVM-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except Exception:
            # Best effort: reporting may fail outside a real NNI run.
            # Do not swallow KeyboardInterrupt/SystemExit.
            print('[TVM-engine] (not reporting final result to NNI.)')
        sys.exit(0)

    elif num_trials > 0:
        n_parallel = 16 if 'BATCH' not in os.environ else int(
            os.environ['BATCH'])
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(n_parallel=n_parallel),
            runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4))
        # if DO_TUNING:
        tuner = autotvm.tuner.XGBTuner(task, num_threads=8)

        from concurrent.futures import ThreadPoolExecutor
        thread_pool = ThreadPoolExecutor(max_workers=n_parallel)

        dev_num = get_tuning_parallism()

        def parse_configs(task, configs):
            # Measure every config in the batch concurrently, spreading the
            # jobs round-robin over dev_num slots, and track the fastest one.
            results = []
            futures = []
            expected_timecost = 'inf'
            for i, config in enumerate(configs):
                futures.append(
                    thread_pool.submit(run_config_entity,
                                       config_to_json(config), i,
                                       expected_timecost, i % dev_num))
            for i, config in enumerate(configs):
                t = futures[i].result()
                if t < tuner.task.best_config[0]:
                    tuner.task.best_config = (t, config)
                results.append(
                    autotvm.measure.MeasureResult(costs=(t, ),
                                                  error_no=0,
                                                  all_cost=i,
                                                  timestamp=time.time()))
            return results

        tuner.task.best_config = (float('inf'), None)
        tuner.parse_configs = parse_configs

        tuner.tune(n_trial=num_trials,
                   measure_option=measure_option,
                   callbacks=[])
        assert (not math.isinf(tuner.task.best_config[0]))
        best_config = tuner.task.best_config[1]
        print('\n[Best Config]', json.dumps(config_to_json(best_config)))
    else:
        best_config = task.config_space

    with ApplyConfig(best_config):
        with tvm.target.create(tvm_target):
            s, arg_bufs = default_tune_op.get_template_op()
            lower_source = str(tvm.lower(s, arg_bufs, simple_mode=True))

            # Verify Source Code
            assert (len(('\n' + lower_source).split('\nproduce ')) == 2)
            lower_file = local_get_dir_file('my_kernel.lower')
            with open(lower_file, 'w') as fp:
                fp.write(lower_source)

            max_threads_per_block = tvm.ndarray.gpu(0).max_threads_per_block
            max_shared_memory_per_block = tvm.ndarray.gpu(
                0).max_shared_memory_per_block

            # Every thread axis must be bound to exactly one extent.
            thread_extents = subprocess.getoutput(
                "cat '%s' | grep '^ *// attr.*iter_var.*thread_extent'" %
                (lower_file)).split('\n')
            reserved_axes = dict({
                'threadIdx.x': None,
                'threadIdx.y': None,
                'threadIdx.z': None,
                'blockIdx.x': None,
                'blockIdx.y': None,
                'blockIdx.z': None
            })
            for line in thread_extents:
                thread_name = line.split('[iter_var(')[-1].split(',')[0]
                if thread_name in reserved_axes:
                    thread_val = int(line.split('thread_extent = ')[-1])
                    if reserved_axes[thread_name] is not None:
                        if reserved_axes[thread_name] != thread_val:
                            # Explicit raise so the check survives `python -O`.
                            raise AssertionError(
                                "Conflicting thread_extent for %s: %d vs %d" %
                                (thread_name, reserved_axes[thread_name],
                                 thread_val))
                    else:
                        reserved_axes[thread_name] = thread_val
                else:
                    raise Exception("Invalid thread_axis name: %s" %
                                    thread_name)

            num_threads = 1
            for thread_name in ['threadIdx.x', 'threadIdx.y', 'threadIdx.z']:
                if reserved_axes[thread_name] is not None:
                    num_threads *= reserved_axes[thread_name]
            if num_threads > max_threads_per_block:
                raise Exception(
                    "Invalid kernel code: using num_threads %d > max_threads_per_block %d"
                    % (num_threads, max_threads_per_block))

            # Sum the shared-memory allocations found in the lowered source.
            # Raw string: the backslashes belong to the grep pattern.
            allocate_shared = subprocess.getoutput(
                r"cat '%s' | grep 'allocate .*shared\[.*\]'" %
                (lower_file)).split('\n')
            shared_memory_in_bytes = 0
            for line in allocate_shared:
                if not line:
                    continue
                parts = line.split('[')
                assert (len(parts) == 2)
                parts = parts[1].split(' * ')
                assert (len(parts) == 2)
                assert (parts[1][-1] == ']')
                allocate_type = parts[0]
                allocate_val = int(parts[1][:-1])
                if allocate_type in ['float32']:
                    shared_memory_in_bytes += allocate_val * 4
                else:
                    raise Exception(
                        "Unrecognized shared memory data type: %s" %
                        allocate_type)
            if shared_memory_in_bytes > max_shared_memory_per_block:
                raise Exception(
                    "Invalid kernel code: using shared_memory_in_bytes %d > max_shared_memory_per_block %d"
                    % (shared_memory_in_bytes, max_shared_memory_per_block))

            func = tvm.build(s, arg_bufs, tvm_target, name='template_op')

    assert (len(func.imported_modules) == 1)
    device_source = translate_code(func.imported_modules[0].get_source())

    if code_only:
        return device_source

    if lower_source and device_source:
        tune_slot_id = 0 if 'CUDA_VISIBLE_DEVICES' not in os.environ else int(
            os.environ['CUDA_VISIBLE_DEVICES'])

        exec_fd, _ = system_lock([tune_slot_id])
        gpu_id = 0
        ctx = tvm.context(tvm_target, gpu_id)

        from tvm._ffi.ndarray import empty
        tensors, outs = [], []
        for arg in arg_bufs:
            shape = [int(x) for x in arg.shape]
            is_output = arg.op.__class__ != tvm.tensor.PlaceholderOp
            td = empty(shape, arg.dtype, ctx)
            if is_output:
                outs.append(td)
            tensors.append(td)

        def timeout_handler():
            # Hard-kill the process: a hung kernel cannot be interrupted.
            print("Error: Timeout during Kernel warmup")
            os._exit(1)

        my_timer = Timer(10, timeout_handler, [])
        my_timer.start()
        try:
            # Warmup
            func(*tensors)
            tvm.ndarray.gpu(gpu_id).sync()
            # Estimate
            t_start = time.time()
            func(*tensors)
            tvm.ndarray.gpu(gpu_id).sync()
            t_diff = time.time() - t_start
        finally:
            # Always disarm the watchdog; otherwise a Python-level failure
            # here would still be followed by os._exit(1) seconds later.
            my_timer.cancel()

        # Aim for about one second of measurement, between 3 and 100 runs.
        num_runs = max(3, min(100, math.floor(1.0 / t_diff)))
        timeout_seconds = math.ceil((num_runs + 5) * t_diff)
        my_timer = Timer(timeout_seconds, timeout_handler, [])
        my_timer.start()
        try:
            timer_f = func.time_evaluator(func.entry_name,
                                          ctx,
                                          number=num_runs)
            t = timer_f(*tensors).mean
        finally:
            my_timer.cancel()
        exec_fd()

        gflops = compute_gflops(task.flop, t)
        print("[TVM-engine] Average time cost of %d runs = %g ms, %g gflops." %
              (num_runs, t * 1e3, gflops))

        with open(local_get_dir_file('result.txt'), 'w') as fp:
            fp.write(str(t))
def tune_model(
    tvmc_model: TVMCModel,
    target: str,
    tuning_records: Optional[str] = None,
    prior_records: Optional[str] = None,
    enable_autoscheduler: bool = False,
    rpc_key: Optional[str] = None,
    hostname: Optional[str] = None,
    port: Optional[Union[int, str]] = 9090,
    trials: int = 10000,
    target_host: Optional[str] = None,
    tuner: str = "xgb",
    min_repeat_ms: Optional[int] = None,
    early_stopping: Optional[int] = None,
    desired_layout: Optional[str] = None,
    timeout: int = 10,
    repeat: int = 1,
    number: int = 10,
    parallel: int = 4,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
    log_estimated_latency: bool = False,
):
    """Use tuning to automatically optimize the functions in a model.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model to be optimized.
    target : str
        Compilation target as plain string, inline JSON or path to a JSON file.
    tuning_records: str, optional
        The path to a file that tuning results will be saved to. If not specified,
        a temporary file will be used.
    prior_records: str, optional
        A path to previous tuning results that will be used to hot-start the tuning
        cost model if provided.
    enable_autoscheduler : bool, optional
        When true, use autoscheduling rather than autotvm. This should produce
        faster kernels for compatible model-target pairs.
    rpc_key : str, optional
        The RPC tracker key of the target device. Required when rpc_tracker is provided.
    hostname : str, optional
        The IP address of an RPC tracker, used when benchmarking remotely.
    port : int or str, optional
        The port of the RPC tracker to connect to. Defaults to 9090.
    trials : int, optional
        The number of schedules to try out for the entire model. Note that the default
        value is chosen as a decent average for most models, but larger models may need
        more trials to reach a good result while smaller models will converge with fewer
        trials.
    target_host : str, optional
        Host target; reconciled with ``target`` through
        ``Target.check_and_update_host_consist``.
    tuner : str, optional
        The type of tuner to use when tuning with autotvm. Can be one of
        "ga", "gridsearch", "random", "xgb", "xgb_knob", and "xgb-rank".
    min_repeat_ms : int, optional
        Minimum time to run each trial. Defaults to 0 on x86 and 1000 on other targets.
    early_stopping : int, optional
        When specified, stop tuning after this number of trials if results aren't improving.
    desired_layout : str, optional
        Can be one of "NCHW" or "NHWC". When specified, compatible operations in the graph
        will have their layout set to this format. Tasks will then be tuned using this
        specified layout.
    timeout : int, optional
        If a kernel trial lasts longer than this duration in seconds, it will be
        considered a failure.
    repeat : int, optional
        How many times each measurement should be repeated.
    number : int, optional
        The number of runs a single repeat is made of.
    parallel : int, optional
        The maximum number of parallel devices to use when tuning.
    hardware_params : auto_scheduler.HardwareParams, optional
        When using the autoscheduler, this object defines the configuration of the target hardware.
    include_simple_tasks : bool, optional
        Whether to extract simple operations or only computationally intensive ones when using
        the autoscheduler.
    log_estimated_latency : bool, optional
        If using the autoscheduler, write the estimated latency at each step of tuning to file.

    Returns
    -------
    tuning_records : str
        The path to the produced tuning log file.

    Raises
    ------
    common.TVMCException
        If ``rpc_key`` is given without both a hostname and a port.
    """
    target, extra_targets = common.target_from_cli(target)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host)
    # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source
    # model is fixed. For now, creating a clone avoids the issue.
    mod = deepcopy(tvmc_model.mod)
    params = tvmc_model.params
    if tuning_records is None:
        tuning_records = tvmc_model.default_tuning_records_path()

    # Apply the partitioning pipeline of each extra codegen target.
    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(
            codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]
        mod = partition_function(mod, params, **codegen_from_cli["opts"])

    # min_repeat_ms should be:
    # a. the value provided by the user, if any, or
    # b. 0ms in case target is "cpu"; otherwise 1000ms
    if min_repeat_ms is None:
        min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000
        logger.info("Default --min-repeat-ms for this target is %s",
                    min_repeat_ms)

    if rpc_key:
        if hostname is None or port is None:
            raise common.TVMCException(
                "You must provide a hostname and port to connect to a remote RPC device."
            )
        if isinstance(port, str):
            port = int(port)

        logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key,
                    hostname, port)

        runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner
        runner = runner_ctor(
            key=rpc_key,
            host=hostname,
            port=port,
            number=number,
            repeat=repeat,
            n_parallel=parallel,
            timeout=timeout,
            min_repeat_ms=min_repeat_ms,
        )
    else:
        logger.info("Starting localhost tuning.")
        runner_ctor = (auto_scheduler.LocalRPCMeasureContext
                       if enable_autoscheduler else autotvm.LocalRunner)
        local_server = runner_ctor(
            number=number,
            repeat=repeat,
            timeout=timeout,
            min_repeat_ms=min_repeat_ms,
        )

        # For autoscheduling on some devices, we need to maintain a LocalRPCMeasureContext object.
        if enable_autoscheduler:
            runner = local_server.runner
        else:
            runner = local_server

    if enable_autoscheduler:
        tasks, weights = autoscheduler_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=desired_layout,
            hardware_params=hardware_params,
            include_simple_tasks=include_simple_tasks,
        )

        # Create the autoscheduler tuning options
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=trials,
            measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)],
            runner=runner,
            early_stopping=early_stopping,
        )

        logger.info("Autoscheduling with configuration: %s", tuning_options)

        # Schedule the tasks (i.e., produce a schedule for each task)
        schedule_tasks(tasks, weights, tuning_options, prior_records,
                       log_estimated_latency)
    else:
        tasks = autotvm_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=desired_layout,
        )

        # In autotvm, trials is specified per task. We can convert the per-model input
        # provided to per-task trials by dividing by the number of tasks.
        # Clamp both sides: an empty task list must not divide by zero, and
        # more tasks than trials must still give each task at least one trial.
        trials = int(max(1, trials / max(len(tasks), 1)))
        logger.info("Autotuning with %d trials per task.", trials)

        tuning_options = {
            "tuner": tuner,
            "trials": trials,
            "early_stopping": early_stopping,
            "measure_option": autotvm.measure_option(
                builder=autotvm.LocalBuilder(build_func="default"),
                runner=runner),
            "tuning_records": prior_records,
        }
        logger.info("Autotuning with configuration: %s", tuning_options)

        tune_tasks(tasks, tuning_records, **tuning_options)

    return tuning_records
print(task.config_space) ################################################################ # Then we need to define how to measure the generated code and pick a tuner. # Since our space is small, a random tuner is just okay. # # We only make 10 trials in this tutorial for demonstration. In practice, # you can do more trials according to your time budget. # We will log the tuning results into a cache file. This file can be # used to get the best config later. # logging config (for printing tuning log to screen) logging.basicConfig(level=logging.INFO, stream=sys.stdout) # use local cpu, measure 5 times for every config to reduce variance measure_option = autotvm.measure_option(mode='local', number=5) # begin tuning, log records to file `cache.tsv` tuner = autotvm.tuner.RandomTuner(task) tuner.tune(n_trial=10, measure_option=measure_option, callbacks=[autotvm.callback.log_to_file('cache.tsv')]) ######################################################################### # Finally we apply history best from the cache file and check its correctness. # We can call the function :code:`matmul` directly under the # :any:`autotvm.apply_history_best` context. When we call this function, # it will query the dispatch context with its argument and get the best config # with the same argument. # apply history best from log file
# Tuning settings gathered into a single dict; measurements go through an
# RPC runner keyed by env.TARGET with check_correctness enabled.
tuning_option = {
    "log_filename": log_file,
    "tuner": "random",
    "n_trial": 1000,
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.RPCRunner(
            env.TARGET,
            host=tracker_host,
            port=tracker_port,
            number=5,
            timeout=60,
            check_correctness=True,
        ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping`
#   to larger values, which makes the tuning run for longer.
#   If your device is under-powered or your conv2d operators are large, consider
import tvm.contrib.graph_runtime as runtime import cv2 network = 'peleenet_1d_float16_nano' log_file = "%s.log" % network dtype = 'float32' tuning_option = { 'log_filename': log_file, 'tuner': 'xgb', 'n_trial': 600, 'early_stopping': 600, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(timeout=10), runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150)), } tuning_rpc_option = { 'log_filename': log_file, 'tuner': 'xgb', 'n_trial': 1000, 'early_stopping': 1000, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(timeout=1000), runner=autotvm.RPCRunner( 'nano', host='0.0.0.0', port=9190, number=2, repeat = 3,
# num flop NH, NW = [e.value for e in output.shape[2:4]] cfg.add_flop(N * CO * NH * NW * (CI * KH * KW * 2)) return s, [raw_data, kernel, output] logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) N, H, W, CO, CI, KH, KW, strides, padding, scaling_factor = 1, 14, 14, 512, 512, 1, 1, 2, 0, 1.0 task = autotvm.task.create(conv2d, args=(N, H, W, CO, CI, KH, KW, strides, padding, scaling_factor), target='cuda') measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(number=10, timeout=4)) if DO_TUNING: tuner = autotvm.tuner.XGBTuner(task) tuner.tune(n_trial=2000, measure_option=measure_option, callbacks=[autotvm.callback.log_to_file('conv2d.log')]) dispatch_context = autotvm.apply_history_best("conv2d.log") best_config = dispatch_context.query(task.target, task.workload) print("\nBest config:") print(best_config) else: config = task.config_space.get(PRETUNED_INDEX) dispatch_context = autotvm.task.ApplyConfig(config)
cfg.define_reorder("reorder", [ko, xc, ki, yc], "all") cfg["reorder"].apply(s, CC, [ko, xc, ki, yc]) cfg.define_annotate('ann', [ko, xc, ki, yc], policy='try_unroll_vec') cfg['ann'].apply(s, CC, [ko, xc, ki, yc]) x, y, z = s[packedB].op.axis s[packedB].vectorize(z) s[packedB].parallel(x) return s, [A, B, C] task = autotvm.task.create('matmul', args=[], target=target) measure_option = autotvm.measure_option( #builder='local', builder=autotvm.LocalBuilder(n_parallel=56), runner=autotvm.LocalRunner(number=3)) # begin tuning, log records to file `matmul.log` #tuner = autotvm.tuner.XGBTuner(task, argsDict=None) #tuner = autotvm.tuner.XGBTuner(task) #tuner = autotvm.tuner.RandomTuner(task) tuner = autotvm.tuner.GridSearchTuner(task) n_trial = 4000 early_stopping = None if os.path.exists('matmul_skx.log.tmp'): os.remove('matmul_skx.log.tmp') tuner.tune(n_trial=n_trial, early_stopping=early_stopping, measure_option=measure_option, callbacks=[
#### TUNING OPTION #### network = 'resnet-18' log_file = "%s.log" % network dtype = 'float32' tuning_option = { 'log_filename': log_file, 'tuner': 'xgb', 'n_trial': 2000, 'early_stopping': 600, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder(timeout=10), #runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150), runner=autotvm.RPCRunner( '1080ti', # change the device key to your key '0.0.0.0', 9190, number=20, repeat=3, timeout=4, min_repeat_ms=150) ), } #################################################################### # # .. note:: How to set tuning options # # In general, the default value provided here works well. # # If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger, # which makes the tuning runs longer. # # If you have multiple devices, you can use all of them for measurement to
log_file = "%s.%s.log" % (device_key, network) dtype = 'float32' tuning_option = { 'log_filename': log_file, 'tuner': 'xgb_knob', 'n_trial': 1500, 'early_stopping': 800, 'measure_option': autotvm.measure_option( builder=autotvm.LocalBuilder( build_func='ndk' if use_android else 'default'), runner=autotvm.RPCRunner( device_key, host='0.0.0.0', port=9000, number=5, timeout=10, ), #runner=autotvm.LocalRunner() ), } def get_val_data(image_path): filenames = os.listdir(image_path) images = [] imgs = [] for filename in filenames: image = cv2.imread(image_path + filename) images.append(image) img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
def tune_and_compile(graph: Graph, batch_size, target, target_host, device=None):
    """Tune the conv2d tasks of ``graph`` with AutoTVM (unless a log exists) and build it.

    Adopted and modified from the TVM tutorial.

    Parameters
    ----------
    graph : Graph
        Network to convert through ``graph2relay``.
    batch_size
        Batch size passed to ``graph2relay``; also part of the log file name.
    target, target_host
        Forwarded to task extraction and to ``relay.build_module.build``.
    device : optional
        Only used to name the log file.

    Returns
    -------
    tuple
        The ``(graph, lib, params)`` triple returned by ``relay.build_module.build``.
    """
    log_dir = "./tvm_schedule_configs"
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, f"{graph.name}_{device}_{batch_size}.log")
    tuning_option = {
        'log_filename': log_file,
        'tuner': 'ga',
        'n_trial': 2000,
        'early_stopping': 600,
        'measure_option': autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=10),
            # runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
            runner=autotvm.RPCRunner(
                'v100',  # change the device key to your key
                '0.0.0.0', 9190,
                number=20, repeat=3, timeout=4),
        )
    }

    # You can skip the implementation of this function for this tutorial.
    def tune_tasks(tasks,
                   measure_option,
                   tuner,
                   n_trial,
                   early_stopping,
                   log_filename,
                   use_transfer_learning=True):
        # Records accumulate in a temporary log; the best ones are distilled
        # into log_filename once every task has been tuned.
        tmp_log_file = log_filename + ".tmp"

        for i, tsk in enumerate(reversed(tasks)):
            prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

            # create tuner
            if tuner == 'xgb' or tuner == 'xgb-rank':
                tuner_obj = XGBTuner(tsk, loss_type='rank')
            elif tuner == 'ga':
                tuner_obj = GATuner(tsk, pop_size=100)
            elif tuner == 'random':
                tuner_obj = RandomTuner(tsk)
            elif tuner == 'gridsearch':
                tuner_obj = GridSearchTuner(tsk)
            else:
                raise ValueError("Invalid tuner: " + tuner)

            # Seed the tuner with the records of the tasks tuned so far.
            if use_transfer_learning:
                if os.path.isfile(tmp_log_file):
                    tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

            # do tuning
            # The config space may hold fewer than n_trial entries; size the
            # progress bar by the number of trials actually run so it can
            # reach 100%.
            tsk_trial = min(n_trial, len(tsk.config_space))
            tuner_obj.tune(n_trial=tsk_trial,
                           early_stopping=early_stopping,
                           measure_option=measure_option,
                           callbacks=[
                               autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
                               autotvm.callback.log_to_file(tmp_log_file)])

        # pick best records to a cache file
        autotvm.record.pick_best(tmp_log_file, log_filename)
        os.remove(tmp_log_file)

    mod, params = graph2relay(graph, batch_size)
    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
                                              target_host=target_host,
                                              params=params,
                                              ops=(relay.op.nn.conv2d,))

    # run tuning tasks, unless a previous run already left a log behind
    if os.path.exists(log_file):
        print(f"Tuned config found, use {log_file} as config")
    else:
        print("Tuning...")
        tune_tasks(tasks, **tuning_option)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        with relay.build_config(opt_level=3):  # opt_level = 3 has problem
            graph_json, lib, params = relay.build_module.build(mod,
                                                               target=target,
                                                               target_host=target_host,
                                                               params=params)
    return graph_json, lib, params
def test_db_filter():
    """Check that a replay database returns saved results and only those.

    A set of measurement batches is first run without a database. Then, for
    every prefix length ``i`` of the recorded inputs, exactly the first ``i``
    (input, result) pairs are saved and every batch is measured again: inputs
    inside the prefix must encode identically to the recorded result, the
    others must differ.
    """
    logging.info("test db filter ...")

    # Pick a GPU target because there are more likely to be failures/invalid configs
    task, target = get_sample_task()

    ctx = tvm.context(str(target))
    if not ctx.exist:
        logging.warning(
            "Skip this test because there is no supported device for test")
        # The message promises a skip; without this return the test ran anyway.
        return

    batch_size = 2

    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2)
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)

    # Reference run: record every input and its result.
    ct = 0
    all_inputs = list()
    all_results = list()
    batches = list()
    tuner = autotvm.tuner.RandomTuner(task)
    while ct < TRIAL_LIMIT:
        inputs = list()
        for _ in range(batch_size):
            cfg = tuner.next_batch(1)[0]
            inputs.append((MeasureInput(target, task, cfg)))
            all_inputs.append(inputs[-1])
        batches.append(inputs)
        results = measure_batch(inputs)
        all_results += results
        ct += 1
    del measure_batch

    db = database.DummyDatabase()
    db.flush()

    # First setting, memoize one input at a time, check that each is saved and replayed
    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2, replay_db=db)
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
    for i in range(len(all_inputs) + 1):
        db.flush()
        for j in range(i):
            db.save(all_inputs[j], all_results[j])

        for k, batch in enumerate(batches):
            batch_result = measure_batch(batch)
            for slot in range(batch_size):
                all_idx = k * batch_size + slot
                assert batch_result[slot] is not None
                if all_idx < i:
                    assert encode(batch[slot], batch_result[slot]) == encode(batch[slot], all_results[all_idx]), \
                        "(no retry) EXPECTED MATCH, GOT MISMATCH"
                else:
                    assert encode(batch[slot], batch_result[slot]) != encode(batch[slot], all_results[all_idx]), \
                        "(no retry) EXPECTED MISMATCH, GOT MATCH"

    del measure_batch
task = autotvm.task.create( group_conv2d, args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation, groups), target=tvm.target.vta(), target_host=env.target_host, template_key="direct", ) print(task.config_space) # Tune measure_option = autotvm.measure_option( builder=autotvm.LocalBuilder(), runner=autotvm.RPCRunner( env.TARGET, host=tracker_host, port=int(tracker_port), number=5, timeout=60, # check_correctness=True, # TODO: re-enable when check_correctness works again. ), ) # Run Tuner tuner = autotvm.tuner.RandomTuner(task) tuner.tune( n_trial=len(task.config_space), early_stopping=None, measure_option=measure_option, callbacks=[ autotvm.callback.progress_bar(len(task.config_space), prefix=prefix),
ctx = tvm.context(target, 0)
src = str(M) + "*" + str(K) + "*" + str(N)
print(src)
# NOTE(review): self-assignment; a no-op if this runs at module scope - confirm it can be dropped.
matmul = matmul
space_len = 16
early_stopping = 8
task = autotvm.task.create(matmul,args=(M,K,N,dtype),target=target)
print(task.config_space)
testwithnumpy()

# logging config (for printing tuning log to the screen)
# logging.getLogger('autotvm').setLevel(logging.DEBUG)
# logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

# There are two steps for measuring a config: build and run.
# By default, we use all CPU cores to compile the programs, then measure them sequentially.
# We measure 5 times and take the average to reduce variance.
measure_option = autotvm.measure_option(builder='local',runner=autotvm.LocalRunner(number=5))

# Begin tuning with XGBTuner; records are logged to `XGBtuner_matmul.log`.
# At most space_len trials are run, with early_stopping passed to the tuner.
print("XGBoost:")
XGBtuner = autotvm.tuner.XGBTuner(task)
XGBtuner.tune(n_trial=space_len,early_stopping=early_stopping,
              measure_option=measure_option,
              callbacks=[autotvm.callback.progress_bar(space_len),autotvm.callback.log_to_file('XGBtuner_matmul.log')])
print("###############################")
#testwithNoneopt('XGBtuner_matmul.log',ctx,matmul)
testwithnumpy()
# Dump tuner state after the search.
print(XGBtuner.flops_max)
print(XGBtuner.task)
print(XGBtuner.xs)
# times and use the average of results. In addition, we need to flush the cache # for the weight tensors between repeated measurements. This can make the measured # latency of one operator closer to its actual latency during end-to-end inference. tuning_option = { "log_filename": log_file, "tuner": "random", "early_stopping": None, "measure_option": autotvm.measure_option( builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True), ), } # You can skip the implementation of this function for this tutorial. def tune_kernels(tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="tuning.log"): for i, task in enumerate(tasks): prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
def tune_and_evaluate(M, N, L, dtype, layout):
    """Tune the ``test_gemm`` template, build the best schedule, verify and time it.

    Runs an XGBTuner search (1000 trials, logged to ``matmul.log``), rebuilds
    the kernel under the best recorded config, compares its output with a
    NumPy reference and prints the mean time over 100 runs.

    Parameters
    ----------
    M, N, L : int
        GEMM sizes; the operand shapes derived from them depend on ``layout``.
    dtype : str
        One of 'float16', 'int8', 'int4', 'int1' (the branches handled below).
    layout : str
        One of "NN", "NT", "TN", "TT". The 'int4' and 'int1' branches build
        their reference as ``A . B^T`` regardless of this value.

    Notes
    -----
    ``ctx`` is not defined in this function; it is read from an enclosing scope.
    """
    task = autotvm.task.create("tutorial/auto_tensorcore/test_gemm",
                               args=(N, L, M, dtype, layout), target='cuda')
    print(task.config_space)

    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    measure_option = autotvm.measure_option(
        builder='local',
        runner=autotvm.LocalRunner(number=5))

    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('matmul.log')])

    # Report the best config found for this task.
    dispatch_context = autotvm.apply_history_best("matmul.log")
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # Rebuild the schedule under the best recorded config.
    with autotvm.apply_history_best('matmul.log'):
        with tvm.target.Target("cuda"):
            s, arg_bufs = test_gemm(N, L, M, dtype, layout)
            print(tvm.lower(s, arg_bufs, simple_mode=True))
            func = tvm.build(s, arg_bufs)
    dev_module = func.imported_modules[0]
    print(dev_module.get_source())

    # check correctness
    if (layout == "NN"):
        shape_a = (N, L)
        shape_b = (L, M)
    elif (layout == "NT"):
        shape_a = (L, N)
        shape_b = (L, M)
    elif (layout == "TN"):
        shape_a = (N, L)
        shape_b = (M, L)
    elif (layout == "TT"):
        shape_a = (L, N)
        shape_b = (M, L)

    a_np = None
    b_np = None
    c_np = None
    c_np_type = None
    if dtype == 'float16':
        c_np_type = np.float32
        a_np = np.random.uniform(size=shape_a).astype(np.float16)
        b_np = np.random.uniform(size=shape_b).astype(np.float16)
        if (layout == "NN"):
            c_np = np.dot(a_np, b_np)
        elif (layout == "NT"):
            c_np = np.dot(a_np.T, b_np)
        elif (layout == "TN"):
            c_np = np.dot(a_np, b_np.T)
        elif (layout == "TT"):
            c_np = np.dot(a_np.T, b_np.T)
    elif dtype == 'int8':
        c_np_type = np.int32
        # NOTE(review): randint's `high` is exclusive, so 127 is never drawn - confirm intent.
        a_np = np.random.randint(low=-128, high=127, size=shape_a).astype(np.int8)
        b_np = np.random.randint(low=-128, high=127, size=shape_b).astype(np.int8)
        if (layout == "NN"):
            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
        elif (layout == "NT"):
            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
        elif (layout == "TN"):
            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
        elif (layout == "TT"):
            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)
    elif dtype == 'int4':
        c_np_type = np.int32
        # NOTE(review): `high` is exclusive, so 7 is never drawn - confirm intent.
        a_np_int = np.random.randint(low=-8, high=7, size=shape_a).astype(np.int32)
        b_np_int = np.random.randint(low=-8, high=7, size=shape_b).astype(np.int32)
        # "TN"
        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
        # Pack eight 4-bit values into each int32 word, first value in the
        # highest nibble.
        a_np = np.zeros(shape=(N, int(L / 8)), dtype=np.int32)
        b_np = np.zeros(shape=(M, int(L / 8)), dtype=np.int32)
        # a_np --> col_major
        for i in range(N):
            for j in range(int(L / 8)):
                for k in range(8):
                    a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))

        # b_np --> row_major
        for i in range(M):
            for j in range(int(L / 8)):
                for k in range(8):
                    b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))
    elif dtype == 'int1':
        c_np_type = np.int32
        # NOTE(review): with low=0 and exclusive high=1 every element is 0, so
        # the comparison below only ever checks an all-zero result - confirm
        # whether this is intentional before changing it.
        a_np_int = np.random.randint(low=0, high=1, size=shape_a).astype(np.int32)
        b_np_int = np.random.randint(low=0, high=1, size=shape_b).astype(np.int32)
        # "TN"
        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
        # Pack 32 one-bit values into each int32 word, first value in the
        # highest bit.
        a_np = np.zeros(shape=(N, int(L / 32)), dtype=np.int32)
        b_np = np.zeros(shape=(M, int(L / 32)), dtype=np.int32)
        for i in range(N):
            for j in range(int(L / 32)):
                for k in range(32):
                    a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 32 + k] & 0xf) << (31 - k))

        for i in range(M):
            for j in range(int(L / 32)):
                for k in range(32):
                    b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0xf) << (31 - k))

    # Run the built kernel and compare against the NumPy reference.
    c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx)
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    b_tvm = tvm.nd.array(b_np, ctx=ctx)
    func(a_tvm, b_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)

    evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
    print('Time cost of this operator: %f' % evaluator(a_tvm, b_tvm, c_tvm).mean)
batch_size = 1
model_name = 'resnet18v2'
# Tuning log, named after the model and batch size.
log_file = "%s-batchsize%d-optimization.log" % (model_name, batch_size)
input_filename = "kitten.jpg"

# Tuning settings gathered into a single dict; build and measurement both run locally.
tuning_options = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 2000,
    'early_stopping': 600,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4),
    ),
}
# NOTE(review): `tuneable` is not used in the code visible here - confirm it is needed.
tuneable = tvm.autotvm.task.get_config()

#### DEVICE CONFIG ####
target = tvm.target.cuda()
input_shape = (batch_size, 3, 224, 224)
output_shape = (batch_size, 1000)

# The ONNX file is expected next to the script as "<model_name>.onnx".
sym, params = load_onnx_model(model_name + ".onnx")
tune_and_evaluate(sym, params, input_shape, output_shape, tuning_options)
tuning_opt = { 'log_filename': opt.log_filename, 'tuner': opt.tuner, 'n_trial': 1e9, 'early_stopping': None, 'measure_option': autotvm.measure_option(builder=autotvm.LocalBuilder( build_func=vta.vta_autotvm_build_func), runner=autotvm.RPCRunner( env.TARGET, tracker_host, tracker_port, number=4, min_repeat_ms=150, repeat=opt.measurements, timeout=60, check_correctness=True)) } tune_tasks(tasks, **tuning_opt) # Compile kernels with history best records with autotvm.tophub.context(target, extra_files=[opt.log_filename]): # Compile network print("Compiling network with best tuning parameters...") with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta":
# small. The ``early_stopping`` parameter is the minimum number of trails to # run before a condition that stops the search early can be applied. The # measure option indicates where trial code will be built, and where it will be # run. In this case, we're using the ``LocalRunner`` we just created and a # ``LocalBuilder``. The ``tuning_records`` option specifies a file to write # the tuning data to. tuning_option = { "tuner": "xgb", "trials": 10, "early_stopping": 100, "measure_option": autotvm.measure_option(builder=autotvm.LocalBuilder(build_func="default"), runner=runner), "tuning_records": "resnet-50-v2-autotuning.json", } ################################################################################ # .. admonition:: Defining the Tuning Search Algorithm # # By default this search is guided using an `XGBoost Grid` algorithm. # Depending on your model complexity and amount of time available, you might # want to choose a different algorithm. ################################################################################ # .. admonition:: Setting Tuning Parameters # # In this example, in the interest of time, we set the number of trials and
def compile_via_tvm(sym, arg_params, aux_params, symbol_file, data_shape, tune):
    """Import an MXNet symbol into Relay, optionally tune it, then build and export it.

    The shape dictionary given to ``relay.frontend.from_mxnet`` is hard-coded
    to three inputs (``data0``/``data1`` of shape ``(1, 128)`` and ``data2`` of
    shape ``(1,)``); ``data_shape`` only contributes to the ``input_shape``
    that is forwarded to ``tune_and_evaluate``.

    Parameters
    ----------
    sym, arg_params, aux_params
        MXNet symbol and parameter dictionaries, passed straight through to
        ``relay.frontend.from_mxnet``.
    symbol_file : str
        Path of the symbol JSON file; its last ``/``-separated component with
        ``.json`` removed names the log files and the exported artifacts.
    data_shape : iterable
        Per-sample input shape; a leading batch dimension of 1 is prepended.
    tune : bool
        When true, run ``tune_and_evaluate`` before building.

    Notes
    -----
    * Reads the build ``target`` from module scope.
    * Creates ``tuned_logs_c5/`` and ``compiled_models/`` under the current
      working directory and writes ``*_deploy_lib.tar``,
      ``*_deploy_graph.json`` and ``*_deploy_params.params`` into the latter.
    * Returns nothing.
    """
    input_shape = [1] + list(data_shape)
    input_name = 'data'

    # Fixed input signature handed to the frontend.
    batch = 1
    seq_length = 128
    input_dict = {
        'data0': (batch, seq_length),
        'data1': (batch, seq_length),
        'data2': (batch,),
    }
    mod, params = relay.frontend.from_mxnet(
        sym,
        dtype={},
        shape=input_dict,
        arg_params=arg_params,
        aux_params=aux_params,
    )

    # Derived once and reused for both the tuning logs and the exported files.
    model_name = symbol_file.split('/')[-1].replace('.json', '')

    log_dir = os.getcwd() + "/tuned_logs_c5"
    pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True)
    log_file = log_dir + "/" + "%s.log" % model_name
    graph_opt_sch_file = log_dir + "/" + "%s_graph_opt.log" % model_name
    # Both log files are created up front, so log_file exists even when
    # ``tune`` is false (presumably so apply_history_best below can open it).
    pathlib.Path(log_file).touch()
    pathlib.Path(graph_opt_sch_file).touch()

    if tune:
        tuning_option = {
            'log_filename': log_file,
            'tuner': 'random',
            'early_stopping': None,
            'measure_option': autotvm.measure_option(
                builder=autotvm.LocalBuilder(),
                runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
            ),
        }
        tune_and_evaluate(tuning_option, mod, params, input_shape, log_file,
                          graph_opt_sch_file, input_name)

    # with autotvm.apply_graph_best(graph_opt_sch_file):
    with autotvm.apply_history_best(log_file):
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params)

    base_dir = os.getcwd() + "/compiled_models"
    pathlib.Path(base_dir).mkdir(parents=True, exist_ok=True)
    base = base_dir + '/tvm_' + model_name
    path_lib = base + '_deploy_lib.tar'
    path_graph = base + '_deploy_graph.json'
    path_params = base + '_deploy_params.params'

    lib.export_library(path_lib)
    with open(path_graph, 'w') as fo:
        fo.write(graph)
    with open(path_params, 'wb') as fo:
        fo.write(relay.save_param_dict(params))
# Network to tune; the log file name combines the device key and the network.
network = 'resnet-18'
log_file = "%s.%s.log" % (device_key, network)
dtype = 'float32'

# The builder runs locally (with the 'ndk' build function when use_android is
# set); measurements go through an RPC runner addressed by device_key.
tuning_option = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 2000,
    'early_stopping': 800,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func='ndk' if use_android else 'default',
        ),
        runner=autotvm.RPCRunner(
            device_key,
            host='localhost',
            port=9190,
            number=5,
            timeout=4,
        ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
#   which makes the tuning run longer.
#   If your device runs very slowly or your conv2d operators have many GFLOPs, consider
#   setting a larger timeout.
# logging config (for printing tuning log to screen)
autotvm_logger = logging.getLogger('autotvm')
autotvm_logger.setLevel(logging.DEBUG)
autotvm_logger.addHandler(logging.StreamHandler(sys.stdout))

# the last layer in resnet
N, H, W = 1, 7, 7
CO, CI = 512, 512
KH, KW = 3, 3
strides, padding = (1, 1), (1, 1)
task = autotvm.task.create(
    conv2d_no_batching,
    args=(N, H, W, CO, CI, KH, KW, strides, padding),
    target='cuda',
)
print(task.config_space)

# Build and measure on the local machine. The runner is configured with
# repeat=3 and min_repeat_ms=100 to reduce variance, and a run timeout of 4
# (seconds, per the original note). LocalBuilder() is left at its defaults;
# the original note gives its compile timeout as 10 seconds -- confirm
# against the installed TVM version.
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4),
)

# Begin tuning, log records to file `conv2d.log`
# During tuning we will also try many invalid configs, so you are expected to
# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(
    n_trial=20,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file('conv2d.log')],
)

#########################################################################
# Finally we can inspect the best config from log file, check correctness,
# and measure running time.

# inspect the best config