Example #1
def test_check_correctness():
    task, target = get_sample_task()

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(check_correctness=True)
    )

    def _callback_correct(tuner, measure_inputs, measure_results):
        for inp, res in zip(measure_inputs, measure_results):
            assert res.error_no == 0

    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=2, measure_option=measure_option,
               callbacks=[_callback_correct])

    # a bad template
    n = 128
    target = tvm.target.create("llvm -device=bad_device")
    task = autotvm.task.create(bad_matmul, args=(n, n, n, 'float32'), target=target)

    def _callback_wrong(tuner, measure_inputs, measure_results):
        for inp, res in zip(measure_inputs, measure_results):
            assert res.error_no == MeasureErrorNo.WRONG_ANSWER

    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=2, measure_option=measure_option,
               callbacks=[_callback_wrong])
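
# `bad_matmul` is imported from the shared test helpers and is not shown above.
# A minimal sketch of such a deliberately wrong template (an assumption, not the
# exact helper): the reduction axis stops at L - 1, so the result misses one
# summand and the runner reports MeasureErrorNo.WRONG_ANSWER.
@autotvm.template
def bad_matmul_sketch(N, L, M, dtype):
    A = te.placeholder((N, L), name='A', dtype=dtype)
    B = te.placeholder((L, M), name='B', dtype=dtype)
    k = te.reduce_axis((0, L - 1), name='k')  # off by one: should be (0, L)
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = te.create_schedule(C.op)
    y, x = s[C].op.axis
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    return s, [A, B, C]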
Example #2
def test_task_tuner_without_measurement():
    """test task and tuner without measurement"""
    task, target = get_sample_task()

    class DummyRunner(Runner):
        def __init__(self):
            super(DummyRunner, self).__init__(1, 1)

        def run(self, measure_inputs, build_results):
            return [MeasureResult((np.random.random(),), 0, 0.2, time.time())
                    for _ in range(len(measure_inputs))]

        def get_build_kwargs(self):
            return {}

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=DummyRunner()
    )

    logging.info("%s", task.config_space)

    for tuner_class in [autotvm.tuner.RandomTuner,
                        autotvm.tuner.GridSearchTuner,
                        autotvm.tuner.GATuner,
                        autotvm.tuner.XGBTuner]:
        tuner = tuner_class(task)
        tuner.tune(n_trial=10, measure_option=measure_option)
        assert tuner.best_flops > 1
Example #3
    def check(target, target_host):
        ctx = tvm.context(target, 0)
        if not ctx.exist:
            logging.info("Skip test because %s is not available" % target)
            return

        # init task
        task, target = get_sample_task(target, target_host)
        logging.info("%s", task.config_space)

        measure_option = autotvm.measure_option(
            autotvm.LocalBuilder(),
            autotvm.LocalRunner())

        tuner = RandomTuner(task)
        tuner.tune(n_trial=20, measure_option=measure_option)
Example #4
def test_min_repeat_ms():
    task, target = get_sample_task()

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=1, min_repeat_ms=100)
    )

    def _callback(tuner, measure_inputs, measure_results):
        for inp, res in zip(measure_inputs, measure_results):
            if res.error_no != 0:
                continue

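            # With min_repeat_ms=100 the runner adaptively raises its run count
            # (cur_number) until one measurement batch takes at least 100 ms:
            # mean cost in seconds * 1000 * cur_number >= min_repeat_ms.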
            assert 1000 * np.mean(res.costs) * \
                   measure_option['runner'].cur_number >= 100

    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=5, measure_option=measure_option,
               callbacks=[_callback])
Example #5
def autoTVM_conv(bs, oc, ic, nh, nw, kh, kw, ph=0, pw=0, sh=1, sw=1):
    """
        use autoTVM to get the best throughput
    """
    @autotvm.template("leslie/second_conv")
    def my_tune(bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw):
        # toc = 16
        # tic = 16
        # tw = 4

        cfg = autotvm.get_config()
        cfg.define_knob("tile_w", [1, 2, 4])
        cfg.define_knob("tile_oc", [16, 32, 64])
        cfg.define_knob("tile_ic", [16, 32, 64])

        tic = cfg["tile_ic"].val
        toc = cfg["tile_oc"].val
        tw = cfg["tile_w"].val

        X = te.placeholder((bs, ic, nh, nw), name='X')
        K = te.placeholder((oc, ic, kh, kw), name='K')
        PaddedX = padding(X, ph, pw) if ph * pw != 0 else X
        # pack X and K
        assert ic % tic == 0 and oc % toc == 0

        PackedX = te.compute(
            (bs, ic // tic, nh + ph * 2, nw + pw * 2, tic),
            lambda b, ic_out, x, y, ic_in: PaddedX[b, ic_out * tic + ic_in, x, y],
            name='PackedX')

        PackedK = te.compute(
            (oc // toc, ic // tic, kh, kw, tic, toc),
            lambda oc_out, ic_out, x, y, ic_in, oc_in: K[
                oc_out * toc + oc_in, ic_out * tic + ic_in, x, y],
            name='PackedK')

        # reduction axes
        ric_in = te.reduce_axis((0, tic), name='ric_in')
        ric_out = te.reduce_axis((0, ic // tic), name='ric_out')
        rkh = te.reduce_axis((0, kh), name='rkh')
        rkw = te.reduce_axis((0, kw), name='rkw')
        # output height and width
        oh = conv_out_size(nh, kh, ph, sh)
        ow = conv_out_size(nw, kw, pw, sw)

        # Compute Y in the packed layout
        PackedY = te.compute(
            (bs, oc // toc, oh, ow, toc),
            lambda b, oc_out, x, y, oc_in: te.sum(
                PackedX[b, ric_out, x * sh + rkh, y * sw + rkw, ric_in] *
                PackedK[oc_out, ric_out, rkh, rkw, ric_in, oc_in],
                axis=[ric_out, rkh, rkw, ric_in]), name='Y')

        # Unpack the result
        Y = te.compute((bs, oc, oh, ow),
                       lambda b, oc, x, y: PackedY[b, oc // toc, x, y, oc % toc],
                       name='Y')

        s = te.create_schedule(Y.op)

        CachedY = s.cache_write(PackedY, 'local')

        # self test by leslie
        bso, oc_out, h, w, oc_in = s[PackedY].op.axis

        s[PackedY].reorder(bso, h, w, oc_out, oc_in)

        #w_out, w_in = s[PackedY].split(w, cfg["tile_w"].val)  # Split the columns
        w_out, w_in = s[PackedY].split(w, tw)

        bso_h_w_out = s[PackedY].fuse(bso, h, w_out)
        s[PackedY].parallel(bso_h_w_out)

        # CachedY = s.cache_write(PackedY, 'local')

        s[CachedY].compute_at(s[PackedY], bso_h_w_out)

        c_bso, c_oc_out, ch, cw, c_oc_in = CachedY.op.axis

        ric_out, rkh, rkw, ric_in = CachedY.op.reduce_axis

        s[CachedY].reorder(ric_out, rkh, rkw, ric_in, c_oc_out, cw, c_oc_in)

        s[CachedY].unroll(cw)
        # s[CachedY].unroll(c_oc_out)
        s[CachedY].vectorize(c_oc_in)

        # Schedule the padding by adding thread-level parallelism
        if PaddedX != X:
            s[PaddedX].parallel(PaddedX.op.axis[0])
        # Optimize the packing of X and K
        s[PackedX].parallel(s[PackedX].fuse(*PackedX.op.axis[0:2]))
        s[PackedX].unroll(PackedX.op.axis[-1])
        s[PackedK].parallel(s[PackedK].fuse(*PackedK.op.axis[0:2]))
        s[PackedK].unroll(PackedK.op.axis[-1])
        # Optimize the unpacking of Y
        s[Y].parallel(s[Y].fuse(*Y.op.axis[0:2]))
        s[Y].unroll(Y.op.axis[-1])

        return s, [X, K, Y]

    #param = (bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw)
    task = autotvm.task.create("leslie/second_conv", args=(bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw), target=target)
    #task = autotvm.task.create(bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw)
    print(task.config_space)
    print(len(task.config_space))
    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(timeout=10000),
                                            runner=autotvm.LocalRunner(repeat=1, number=10, min_repeat_ms=1000, timeout=1000))

    logfile = 'leslie_tune.log'
    os.system("rm -rf {}".format(logfile))
    tuner = autotvm.tuner.XGBTuner(task)
    n_trial = len(task.config_space)
    prefix = "[Task]"
    tuner.tune(n_trial=n_trial, measure_option=measure_option, callbacks=[autotvm.callback.progress_bar(n_trial, prefix=prefix),
                                                                          autotvm.callback.log_to_file(logfile)])

    # evaluate the tuned schedule
    with autotvm.apply_history_best(logfile):
        print("Compiling")
        with tvm.target.create(target):
            s, arg_bufs = my_tune(bs, oc, ic, nh, nw, kh, kw, ph, pw, sh, sw)
            mod = tvm.build(s, arg_bufs, target=target)

    return mod
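
# Hypothetical usage sketch (an illustration, not part of the original script):
# `padding`, `conv_out_size` and the module-level `target` used inside
# autoTVM_conv are assumed to be defined elsewhere in this file.
if __name__ == "__main__":
    target = 'llvm'
    bs, oc, ic, nh, nw, kh, kw, ph, pw = 1, 64, 64, 56, 56, 3, 3, 1, 1
    mod = autoTVM_conv(bs, oc, ic, nh, nw, kh, kw, ph, pw)
    oh = nh + 2 * ph - kh + 1  # output size for stride 1
    x = tvm.nd.array(np.random.uniform(size=(bs, ic, nh, nw)).astype('float32'))
    k = tvm.nd.array(np.random.uniform(size=(oc, ic, kh, kw)).astype('float32'))
    y = tvm.nd.array(np.zeros((bs, oc, oh, oh), dtype='float32'))
    mod(x, k, y)  # run the tuned kernel once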
Example #6
# logging config (for printing tuning log to screen)
logging.getLogger("autotvm").setLevel(logging.DEBUG)
logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))

# the last layer in resnet
N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
task = autotvm.task.create(
    "tutorial/conv2d_no_batching", args=(N, H, W, CO, CI, KH, KW, strides, padding), target="cuda"
)
print(task.config_space)

# Use the local gpu, measure each config multiple times (repeat=3) to reduce variance
# The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4),
)

# Begin tuning, log records to file `conv2d.log`
# During tuning we will also try many invalid configs, so you are expected to
# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(
    n_trial=20,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("conv2d.log")],
)

#########################################################################
# Finally we can inspect the best config from log file, check correctness,
# and measure running time.
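
# A sketch of that inspection step (as in the corresponding TVM tutorial):
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)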
Example #7
    def benchmark_layout_transform(
        self,
        min_exec_num=100,
        timeout=10,
        use_rpc=False,
        device_key=None,
        host="localhost",
        port=9190,
        n_parallel=1,
        build_func="default",
        layout_records=None,
        target_host=None,
        infer_layout=False,
    ):
        """Benchmark all possible layout transformation in the graph,
        given a set of schedule candidates for each workload of target operator.

        Parameters
        ----------
        min_exec_num : int, optional
            Minimum number of executions. The final execution time is the
            average over all executions.

        timeout : int, optional
            Timeout for each execution.

        use_rpc : boolean, optional
            Whether to use rpc mode for benchmarking.

        device_key : str, optional
            Remote device key which can be queried by
            python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190

        host : str, optional
            IP address used to create RPC tracker on host machine.

        port : int, optional
            Port number used to create RPC tracker on host machine.

        n_parallel: int, optional
            The number of measurement tasks that can run in parallel.
            Set this according to the number of cpu cores (for compilation) and
            the number of devices you have (for measuring the generated code).

        build_func: str or callable, optional
            'default': call the default builder. This works for normal targets (llvm, cuda).

            'ndk': use Android NDK to create a shared library. Use this for Android targets.

            callable: customized build function for other backends (e.g. VTA).
                      See autotvm/measure/measure_methods.py::default_build_func for an example.

        layout_records : str or iterator of (MeasureInput, MeasureResult), optional
            Collection of layout_transform benchmarking records.
            If it is a str, it should be the filename of a records log file;
            each row of the file is an encoded record pair.
            Otherwise, it is an iterator.

            If this argument is set, graph tuner will first check whether layout_transform
            workload already exists in records and skip benchmarking if possible.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if target is a device.
            When TVM compiles a device-specific program such as CUDA,
            we also need host (CPU) side code to interact with the driver
            and set up the dimensions and parameters correctly.
            target_host is used to specify the host-side codegen target.
            By default, llvm is used if it is enabled;
            otherwise a stackvm interpreter is used.

        infer_layout : bool, optional
            Whether to infer layout transformation time if it doesn't exist in the
            records, instead of benchmarking on the target device.

            This may be less accurate than actually benchmarking the layout
            transformation.
        """
        self._logger.info("Start to benchmark layout transformation...")
        if layout_records is None and infer_layout:
            raise RuntimeError("Requires some records to infer layout transformation time.")

        if isinstance(layout_records, str):
            layout_records = load_from_file(layout_records)
            if not layout_records and infer_layout:
                raise RuntimeError("Records must be non-empty to infer layout transformation time.")

        num_flops, total_time = 0, 0
        if layout_records is not None:
            for record in layout_records:
                ltf_wkl = record[0].task.workload
                self._layout_transform_perf_records[ltf_wkl] = record
                input_shape = ltf_wkl[1][1]
                flops = np.prod(input_shape)
                num_flops += flops
                total_time += record[1].costs[0]
        avg_time = total_time / num_flops if num_flops > 0 else 0

        args_list = []

        def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx, to_sch_idx, args):
            """Callback function to fetch layout transform args"""
            _, in_layout, out_layout = args
            if in_layout != out_layout:
                args_list.append(args)

        self._iterate_layout_transform(_fetch_args_callback)

        def _log_to_list(record_list):
            """Callback to log result to a list."""

            def _callback(_, inputs, results):
                """Callback implementation"""
                record_list.append((inputs[0], results[0]))

            return _callback

        builder = autotvm.LocalBuilder(n_parallel=n_parallel, build_func=build_func)
        runner = autotvm.LocalRunner(number=min_exec_num, repeat=1, timeout=timeout)
        if use_rpc:
            if device_key is None:
                raise RuntimeError("device_key need to be set to use rpc tracker mode.")
            runner = autotvm.measure.RPCRunner(
                device_key,
                host,
                port,
                n_parallel=n_parallel,
                number=min_exec_num,
                repeat=1,
                timeout=timeout,
            )
        measure_option = autotvm.measure_option(builder=builder, runner=runner)
        for args in args_list:
            data, in_layout, out_layout = args
            ltf_workload = autotvm.task.args_to_workload(args, "layout_transform")
            if ltf_workload in self._layout_transform_perf_records:
                continue

            if infer_layout:
                input_shape = ltf_workload[1][1]
                flops = 1
                for i in input_shape:
                    flops *= i

                # Rule out invalid layout transformations
                out = topi.layout_transform(data, in_layout, out_layout)
                out_flops = 1
                for i in topi.utils.get_const_tuple(out.shape):
                    out_flops *= i

                if flops != out_flops:
                    inferred_time = INVALID_LAYOUT_TIME
                else:
                    inferred_time = flops * avg_time

                record_input = MeasureInput(target=self._target, task=None, config=None)
                record_output = MeasureResult(
                    costs=(inferred_time,), error_no=0, all_cost=-1, timestamp=-1
                )
                self._layout_transform_perf_records[ltf_workload] = (record_input, record_output)
                continue

            records = []
            task = autotvm.task.create(
                "layout_transform", args=args, target=self._target, target_host=target_host
            )
            tuner = autotvm.tuner.GridSearchTuner(task)
            tuner.tune(n_trial=1, measure_option=measure_option, callbacks=[_log_to_list(records)])
            if not isinstance(records[0][1].costs[0], float):
                records[0] = (records[0][0], records[0][1]._replace(costs=(INVALID_LAYOUT_TIME,)))
            self._layout_transform_perf_records[ltf_workload] = records[0]

        self._iterate_layout_transform(self._create_matrix_callback)
        self._logger.info("Benchmarking layout transformation successful.")
Example #8
network = 'resnet-18'
log_file = "%s.log" % network
dtype = 'float32'

tuning_option = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 2000,
    'early_stopping': 600,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#
#   If you have a large time budget, you can set :code:`n_trial` and
#   :code:`early_stopping` larger, which makes the tuning run longer.
#
#   If you have multiple devices, you can use all of them for measurement to
#   accelerate the tuning process. (See the 'Scale up measurement' section below.)
#
Example #9
dtype = "float32"

tuning_option = {
    "log_filename": log_file,
    "tuner": "xgb",
    "n_trial": 2000,
    "early_stopping": 600,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#
#   If you have a large time budget, you can set :code:`n_trial` and
#   :code:`early_stopping` larger, which makes the tuning run longer.
#
#   If you have multiple devices, you can use all of them for measurement to
Example #10
tuning_option = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 450,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func='ndk' if use_android else 'default'),
        runner=autotvm.RPCRunner(
            device_key,
            host='localhost',
            port=9190,
            number=10,
            timeout=5,
        ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial` and :code:`early_stopping`
#   larger, which makes the tuning run longer.
#   If your device runs very slowly or your conv2d operators have many GFLOPs,
#   consider setting a larger timeout.
Example #11
#### TUNING OPTION ####
network = "resnet-18"
log_file = "%s.%s.log" % (device_key, network)
dtype = "float32"

tuning_option = {
    "log_filename": log_file,
    "tuner": "xgb",
    "n_trial": 1500,
    "early_stopping": 800,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"),
        runner=autotvm.RPCRunner(
            device_key,
            host="0.0.0.0",
            port=9190,
            number=5,
            timeout=10,
        ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial` and :code:`early_stopping`
#   larger, which makes the tuning run longer.
#   If your device runs very slowly or your conv2d operators have many GFLOPs,
#   consider setting a larger timeout.
Example #12
# "NCHW" to "NCHWc". To deal with this situation, we define
# conv2d_NCHWc operator in topi. We will tune this operator
# instead of plain conv2d.
#
# We will use local mode for tuning configuration. RPC tracker
# mode can be setup similarly to the approach in
# :ref:`tune_nnvm_arm` tutorial.

tuning_option = {
    'log_filename': log_file,
    'tuner': 'random',
    'early_stopping': None,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=10, repeat=1,
                                   min_repeat_ms=1000),
    ),
}

# You can skip the implementation of this function for this tutorial.
def tune_kernels(tasks,
                 measure_option,
                 tuner='gridsearch',
                 early_stopping=None,
                 log_filename='tuning.log'):

    for i, tsk in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))

        # converting conv2d tasks to conv2d_NCHWc tasks
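        # The snippet is cut off here. In the original tune_relay_x86 tutorial
        # (older, template_key-based AutoTVM API) the loop body continued
        # roughly like this:
        op_name = tsk.workload[0]
        if op_name == 'conv2d':
            func_create = 'topi_x86_conv2d_NCHWc'
        elif op_name == 'depthwise_conv2d_nchw':
            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
        else:
            raise ValueError("Tuning {} is not supported on x86".format(op_name))

        task = autotvm.task.create(func_create, args=tsk.args,
                                   target=target, template_key='direct')
        task.workload = tsk.workload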
Example #13
target = tvm.target.cuda()

#### TUNING OPTION ####
network = 'resnet-18'
log_file = "%s.log" % network
dtype = 'float32'

tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 2000,
    'early_stopping': 600,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#
#   If you have a large time budget, you can set :code:`n_trial` and
#   :code:`early_stopping` larger, which makes the tuning run longer.
#
#   If you have multiple devices, you can use all of them for measurement to
#   accelerate the tuning process. (See the 'Scale up measurement' section below.)
#
Example #14
dtype = 'float32'

tuning_option = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 250,
    'measure_option': autotvm.measure_option(
        autotvm.measure.rpc(device_key, host='localhost', port=9190),
        number=4,
        n_parallel=1,
        timeout=10,
        build_func='ndk' if use_android else 'default',
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well. They are the same
#   values that we used to generate the pre-tuned parameters.
#   If you have multiple devices, you can set :code:`n_parallel` to
#   the number of devices you have (e.g. set it to 3 if you register 3 rk3399
#   boards to the tracker).
#   If you have a large time budget, you can set :code:`n_trial` and
#   :code:`early_stopping` larger, which makes the tuning run longer.
Example #15
#### TUNING OPTION ####
#network = 'onnx'
network = 'resnet-50'
log_file = "%s.log" % network
log_file = 'gtx-1060.log'
dtype = 'float32'

tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 2000,
    'early_stopping': 600,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4),
    ),
}


# You can skip the implementation of this function for this tutorial.
def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True,
               try_winograd=True):
    if try_winograd:
        for i in range(len(tasks)):
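            # The snippet is cut off here. In the original tune_relay_cuda
            # tutorial the loop body tried to add a winograd variant of each
            # conv2d task, roughly:
            try:
                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
                                          tasks[i].target, tasks[i].target_host,
                                          'winograd')
                input_channel = tsk.workload[1][1]
                if input_channel >= 64:
                    tasks.append(tsk)
            except Exception:
                pass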
Example #16
network = 'sample'
log_file = 'gpu.log'
dtype = 'float32'

tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 600,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        #runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
        runner=autotvm.RPCRunner(
            '1080ti',  # change the device key to your key
            '0.0.0.0', 9090,
            number=1, repeat=3, timeout=100, min_repeat_ms=150)
    ),
}

def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True,
               try_winograd=True):
    if try_winograd:
        for i in range(len(tasks)):
Example #17
def run_tuning():
    import os
    import numpy as np
    from tvm import autotvm
    from tvm.relay import testing
    from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
    from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
    import tvm.contrib.graph_runtime as runtime
    from datetime import datetime

    tunemods = ["Resnet", "VGG", "MobileNet", "Squeezenet", "Inception", "MXNet"]
    tuners = ["XGBoost", "Genetic Algorithm", "Random", "Grid Search"]
    gtuners = ["DPTuner", "PBQPTuner"]

    pat = get_menu("Which platform do you want to tune?", supportedPlatforms)
    model = get_menu("Which model do you want to tune?", tunemods)
    if model == 5:
        submod = get_menu("Which submodel do you want to tune?", supportedModels)
    tunes = get_menu("Which kernel tuner do you want to use?", tuners)
    gtuner = get_menu("Which graph tuner do you want to use?", gtuners)
    batch = get_menu("How many pictures should be run at a time?")
    core = get_menu("How many cores should be used at a time?")
    print("\n──────────────────────────── TVMUI ────────────────────────────\n")
    print("Started on " + str(datetime.now().strftime("%m/%d/%Y at %H:%M:%S")))
    from tvm import relay
    import tvm
    def get_network(name, batch_size):
        """Get the symbol definition and random weight of a network"""
        input_shape = (batch_size, 3, 224, 224)
        output_shape = (batch_size, 1000)

        if "resnet" in name:
            n_layer = int(name.split("-")[1])
            mod, params = relay.testing.resnet.get_workload(
                num_layers=n_layer, batch_size=batch_size, dtype=dtype
            )
            print("Tuning ResNet")
        elif "vgg" in name:
            n_layer = int(name.split("-")[1])
            mod, params = relay.testing.vgg.get_workload(
                num_layers=n_layer, batch_size=batch_size, dtype=dtype
            )
            print("Tuning VGG")
        elif name == "mobilenet":
            mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
            print("Tuning MobileNet")
        elif name == "squeezenet_v1.1":
            mod, params = relay.testing.squeezenet.get_workload(
                batch_size=batch_size, version="1.1", dtype=dtype
            )
            print("Tuning SqueezeNet")
        elif name == "inception_v3":
            input_shape = (1, 3, 299, 299)
            mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
            print("Tuning Inception")
        elif name == "mxnet":
            # an example for mxnet model
            from mxnet.gluon.model_zoo.vision import get_model
            if submod == 0:
                modn = "resnet18_v1"
                print("Tuning MXNet's ResNet")
            elif submod == 1:
                modn = "inceptionv3"
                print("Tuning MXNet's Inception")
            elif submod == 2:
                modn = "mobilenetv2_1.0"
                print("Tuning MXNet's MobileNet")
            else:
                raise Exception("Not Supported!")
            block = get_model(modn, pretrained=True)
            mod, params = relay.frontend.from_mxnet(block, shape={input_name: input_shape}, dtype=dtype)
            net = mod["main"]
            net = relay.Function(
                net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
            )
            mod = tvm.IRModule.from_expr(net)
        else:
            raise ValueError("Unsupported network: " + name)

        return mod, params, input_shape, output_shape

    if pat == 0:
        target = "llvm"
        print("Using LLVM")
    if pat == 1:
        target = "metal"
        print("Using metal")
    batch_size = batch
    if model == 0:
        dtype = "float32"
        model_name = "resnet-18"
    elif model == 1:
        dtype = "float32"
        model_name = "vgg-16"  # relay.testing.vgg supports 11/13/16/19 layers
    elif model == 2:
        dtype = "float32"
        model_name = "mobilenet"
    elif model == 3:
        dtype = "float32"
        model_name = "squeezenet_v1.1"
    elif model == 4:
        dtype = "float32"
        model_name = "inception_v3"
    elif model == 5:
        dtype = "float32"
        model_name = "mxnet"
    else:
        raise Exception('Not Supported!')
    filename = "TVMTune_" + supportedPlatforms[pat] + "_" + tunemods[model]
    if model == 5:
        filename = filename + "_" + supportedModels[submod]
    filename = filename + "_" + str(batch)
    if tunes == 0:
        filename = filename + "_XG"
    elif tunes == 1:
        filename = filename + "_GA"
    elif tunes == 2:
        filename = filename + "_RD"
    elif tunes == 3:
        filename = filename + "_GS"
    if gtuner == 0:
        filename = filename + "DP"
    elif gtuner == 1:
        filename = filename + "PB"
    log_file = "logs/" + filename + ".log"
    graph_opt_sch_file = "tunings/" + filename + "_graph_opt.log"

    # Set the input name of the graph
    # For ONNX models, it is typically "0".
    input_name = "data"

    # Set number of threads used for tuning based on the number of
    # physical CPU cores on your machine.
    num_threads = core
    os.environ["TVM_NUM_THREADS"] = str(num_threads)

    #################################################################
    # Configure tensor tuning settings and create tasks
    # -------------------------------------------------
    # To get better kernel execution performance on x86 CPU,
    # we need to change data layout of convolution kernel from
    # "NCHW" to "NCHWc". To deal with this situation, we define
    # conv2d_NCHWc operator in topi. We will tune this operator
    # instead of plain conv2d.
    #
    # We will use local mode for tuning configuration. RPC tracker
    # mode can be setup similarly to the approach in
    # :ref:`tune_relay_arm` tutorial.
    #
    # To perform a precise measurement, we should repeat the measurement several
    # times and use the average of results. In addition, we need to flush the cache
    # for the weight tensors between repeated measurements. This can make the measured
    # latency of one operator closer to its actual latency during end-to-end inference.

    tuning_option = {
        "log_filename": log_file,
        "tuner": "random",
        "early_stopping": None,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(),
            runner=autotvm.LocalRunner(
                number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True
            ),
        ),
    }

    # You can skip the implementation of this function for this tutorial.
    def tune_kernels(
            tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="logs/tuning.log"
    ):

        for i, task in enumerate(tasks):
            prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

            # create tuner
            if tunes == 0:
                tuner_obj = XGBTuner(task, loss_type="rank")
                # print("Using XGBTuner")
            elif tunes == 1:
                tuner_obj = GATuner(task, pop_size=50)
                # print("Using GATuner")
            elif tunes == 2:
                tuner_obj = RandomTuner(task)
                # print("Using Random")
            elif tunes == 3:
                tuner_obj = GridSearchTuner(task)
                # print("Using GridSearch")
            else:
                raise ValueError("Invalid tuner: " + tuner)

            # do tuning
            n_trial = len(task.config_space)
            tuner_obj.tune(
                n_trial=n_trial,
                early_stopping=early_stopping,
                measure_option=measure_option,
                callbacks=[
                    autotvm.callback.progress_bar(n_trial, prefix=prefix),
                    autotvm.callback.log_to_file(log_filename),
                ],
            )

    # Use graph tuner to achieve graph level optimal schedules
    # Set use_DP=False if it takes too long to finish.
    def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True):
        target_op = [
            relay.op.get("nn.conv2d"),
        ]
        if gtuner == 0:
            Tuner = DPTuner
            # print("Using DPTuner")
        else:
            Tuner = PBQPTuner
            # print("Using PBQPTuner")
        executor = Tuner(graph, {input_name: dshape}, records, target_op, target)
        executor.benchmark_layout_transform(min_exec_num=2000)
        executor.run()
        executor.write_opt_sch2record_file(opt_sch_file)

    ########################################################################
    # Finally, we launch tuning jobs and evaluate the end-to-end performance.

    def tune_and_evaluate(tuning_opt):
        # extract workloads from relay program
        print("Extract tasks...")
        mod, params, data_shape, out_shape = get_network(model_name, batch_size)
        tasks = autotvm.task.extract_from_program(
            mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
        )

        # run tuning tasks
        tune_kernels(tasks, **tuning_opt)
        tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

        # compile kernels with graph-level best records
        with autotvm.apply_graph_best(graph_opt_sch_file):
            print("Compile...")
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build_module.build(mod, target=target, params=params)

            # upload parameters to device
            if pat == 0:
                ctx = tvm.cpu()
            if pat == 1:
                ctx = tvm.metal()
            data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
            module = runtime.GraphModule(lib["default"](ctx))
            module.set_input(input_name, data_tvm)

            # evaluate
            print("Evaluate inference time cost...")
            ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
            prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
            print(
                "Mean inference time (std dev): %.2f ms (%.2f ms)"
                % (np.mean(prof_res), np.std(prof_res))
            )

    # Launch tuning and end-to-end evaluation. This can take a long time;
    # comment out the following line if you only want a dry run.

    tune_and_evaluate(tuning_option)

    return
Example #18
# instead of plain conv2d.
#
# We will use local mode for tuning configuration. RPC tracker
# mode can be setup similarly to the approach in
# :ref:`tune_relay_arm` tutorial.

tuning_option = {
    'log_filename': log_file,
    'tuner': 'random',
    'early_stopping': None,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
    ),
}


# You can skip the implementation of this function for this tutorial.
def tune_kernels(tasks,
                 measure_option,
                 tuner='gridsearch',
                 early_stopping=None,
                 log_filename='tuning.log'):

    for i, tsk in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # converting conv2d tasks to conv2d_NCHWc tasks
Example #19
tuning_option = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 10,
    'early_stopping': 450,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func='ndk' if use_android else 'default'),
        runner=autotvm.RPCRunner(
            device_key,
            host=tracker_host,
            port=tracker_port,
            number=1,
            timeout=5,
        ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial` and :code:`early_stopping`
#   larger, which makes the tuning run longer.
#   If your device runs very slowly or your conv2d operators have many GFLOPs,
#   consider setting a larger timeout.
Example #20
# Since our space is small, a random tuner is just okay.
#
# We only make 10 trials in this tutorial for demonstration. In practice,
# you can do more trials according to your time budget.
# We will log the tuning results into a log file. This file can be
# used to get the best config later.

# logging config (for printing tuning log to the screen)
logging.getLogger('autotvm').setLevel(logging.DEBUG)
logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

# There are two steps for measuring a config: build and run.
# By default, we use all CPU cores to compile the program, then measure the
# configs sequentially. We measure each config 5 times and take the average
# to reduce variance.
measure_option = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(number=5))

# begin tuning, log records to file `matmul.log`
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul.log')])

#########################################################################
# Finally we apply history best from the cache file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function,
# it will query the dispatch context with its argument and get the best config
# with the same argument.
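
# A sketch of that final step, assuming the earlier (not shown) part of the
# tutorial defined matmul(N, L, M, dtype) and the sizes N, L, M:
with autotvm.apply_history_best('matmul.log'):
    with tvm.target.create("llvm"):
        s, arg_bufs = matmul(N, L, M, 'float32')
        func = tvm.build(s, arg_bufs)

# check correctness against numpy
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_tvm = tvm.nd.empty((N, M))
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)
tvm.testing.assert_allclose(a_np.dot(b_np), c_tvm.asnumpy(), rtol=1e-2)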
Example #21
def search_op_config(code_only=False):
    tvm_target = 'cuda'
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module('templates.' +
                                              (os.environ['OP']))
    print('  >> Backend = %s, Python PID = %s, Task = %s;' %
          (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create(default_tune_op.get_template_op,
                               args=(),
                               target=tvm_target)
    op_attributes = default_tune_op.op_attributes
    op_summary = '_'.join([k + str(op_attributes[k]) for k in op_attributes])

    def json_to_config(json_dict):
        config = ConfigEntity.from_json_dict({
            "i": -1,
            "t": "",
            "c": None,
            "e": json_dict
        })
        return config

    def config_to_json(config):
        jobj = config.to_json_dict()['e']
        json_dict = dict()
        for i in range(len(jobj)):
            assert (jobj[i][1] in ['sp', 'ot'])
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    if 'CONFIG' in os.environ:
        params_given = json.loads(os.environ['CONFIG'])
        print("====>> [Current Config Option]", os.environ['CONFIG'])

        trial_config = []
        for key in params_given:
            trial_config.append([
                key, "sp" if type(params_given[key]) is list else "ot",
                params_given[key]
            ])
        best_config = json_to_config(trial_config)

    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        show_search_space(task.config_space,
                          os.environ['NNI_TRIAL_JOB_ID'] == '@')
        import nni
        params_given = nni.get_next_parameter()
        if params_given is None:
            raise RuntimeError('NNI returned no parameters.')
        local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[TVM-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except Exception:
            print('[TVM-engine] (not reporting final result to NNI.)')
        exit(0)

    elif num_trials > 0:
        n_parallel = 16 if 'BATCH' not in os.environ else int(
            os.environ['BATCH'])
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(n_parallel=n_parallel),
            runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4))
        # if DO_TUNING:
        tuner = autotvm.tuner.XGBTuner(task, num_threads=8)

        from concurrent.futures import ThreadPoolExecutor
        thread_pool = ThreadPoolExecutor(max_workers=n_parallel)

        dev_num = get_tuning_parallism()

        def parse_configs(task, configs):
            results = []
            futures = []
            expected_timecost = 'inf'
            for i in range(len(configs)):
                futures.append(
                    thread_pool.submit(run_config_entity,
                                       config_to_json(configs[i]), i,
                                       expected_timecost, i % dev_num))
            for i in range(len(configs)):
                t = futures[i].result()
                if t < tuner.task.best_config[0]:
                    tuner.task.best_config = (t, configs[i])
                results.append(
                    autotvm.measure.MeasureResult(costs=(t, ),
                                                  error_no=0,
                                                  all_cost=i,
                                                  timestamp=time.time()))
            return results

        tuner.task.best_config = (float('inf'), None)
        tuner.parse_configs = parse_configs

        tuner.tune(n_trial=num_trials,
                   measure_option=measure_option,
                   callbacks=[])
        assert (not math.isinf(tuner.task.best_config[0]))
        best_config = tuner.task.best_config[1]
        print('\n[Best Config]', json.dumps(config_to_json(best_config)))
    else:
        best_config = task.config_space

    with ApplyConfig(best_config):
        with tvm.target.create(tvm_target):
            s, arg_bufs = default_tune_op.get_template_op()
            lower_source = str(tvm.lower(s, arg_bufs, simple_mode=True))

            # Verify Source Code
            assert (len(('\n' + lower_source).split('\nproduce ')) == 2)
            lower_file = local_get_dir_file('my_kernel.lower')
            with open(lower_file, 'w') as fp:
                fp.write(lower_source)

            max_threads_per_block = tvm.ndarray.gpu(0).max_threads_per_block
            max_shared_memory_per_block = tvm.ndarray.gpu(
                0).max_shared_memory_per_block

            thread_extents = subprocess.getoutput(
                "cat '%s' | grep '^ *// attr.*iter_var.*thread_extent'" %
                (lower_file)).split('\n')
            reserved_axes = dict({
                'threadIdx.x': None,
                'threadIdx.y': None,
                'threadIdx.z': None,
                'blockIdx.x': None,
                'blockIdx.y': None,
                'blockIdx.z': None
            })
            for line in thread_extents:
                thread_name = line.split('[iter_var(')[-1].split(',')[0]
                if thread_name in reserved_axes:
                    thread_val = int(line.split('thread_extent = ')[-1])
                    if reserved_axes[thread_name] is not None:
                        if reserved_axes[thread_name] != thread_val:
                            assert (False)
                    else:
                        reserved_axes[thread_name] = thread_val
                else:
                    raise Exception("Invalid thread_axis name: %s" %
                                    thread_name)

            num_threads = 1
            for thread_name in ['threadIdx.x', 'threadIdx.y', 'threadIdx.z']:
                if reserved_axes[thread_name] is not None:
                    num_threads *= reserved_axes[thread_name]
            if num_threads > max_threads_per_block:
                raise Exception(
                    "Invalid kernel code: using num_threads %d > max_threads_per_block %d"
                    % (num_threads, max_threads_per_block))

            allocate_shared = subprocess.getoutput(
                r"cat '%s' | grep 'allocate .*shared\[.*\]'" %
                (lower_file)).split('\n')
            shared_memory_in_bytes = 0
            for line in allocate_shared:
                if not line:
                    continue
                parts = line.split('[')
                assert (len(parts) == 2)
                parts = parts[1].split(' * ')
                assert (len(parts) == 2)
                assert (parts[1][-1] == ']')
                allocate_type = parts[0]
                allocate_val = int(parts[1][:-1])
                if allocate_type in ['float32']:
                    shared_memory_in_bytes += allocate_val * 4
                else:
                    raise Exception(
                        "Unrecognized shared memory data type: %s" %
                        allocate_type)
            if shared_memory_in_bytes > max_shared_memory_per_block:
                raise Exception(
                    "Invalid kernel code: using shared_memory_in_bytes %d > max_shared_memory_per_block %d"
                    % (shared_memory_in_bytes, max_shared_memory_per_block))

            func = tvm.build(s, arg_bufs, tvm_target, name='template_op')

    assert (len(func.imported_modules) == 1)
    device_source = translate_code(func.imported_modules[0].get_source())

    if code_only:
        return device_source

    if lower_source and device_source:
        tune_slot_id = 0 if 'CUDA_VISIBLE_DEVICES' not in os.environ else int(
            os.environ['CUDA_VISIBLE_DEVICES'])
        exec_fd, _ = system_lock([tune_slot_id])
        gpu_id = 0
        ctx = tvm.context(tvm_target, gpu_id)
        tensors, outs = [], []
        for arg in arg_bufs:
            shape = [int(x) for x in arg.shape]
            is_output = arg.op.__class__ != tvm.tensor.PlaceholderOp
            from tvm._ffi.ndarray import empty
            td = empty(shape, arg.dtype, ctx)
            if is_output:
                outs.append(td)
            tensors.append(td)

        def timeout_handler():
            print("Error: Timeout during Kernel warmup")
            os._exit(1)

        my_timer = Timer(10, timeout_handler, [])
        my_timer.start()
        # Warmup
        func(*tensors)
        tvm.ndarray.gpu(gpu_id).sync()
        # Estimate
        t_start = time.time()
        func(*tensors)
        tvm.ndarray.gpu(gpu_id).sync()
        t_diff = time.time() - t_start
        my_timer.cancel()
        del my_timer

        num_runs = max(3, min(100, math.floor(1.0 / t_diff)))
        timeout_seconds = math.ceil((num_runs + 5) * t_diff)
        my_timer = Timer(timeout_seconds, timeout_handler, [])
        my_timer.start()
        timer_f = func.time_evaluator(func.entry_name, ctx, number=num_runs)
        t = timer_f(*tensors).mean
        my_timer.cancel()
        exec_fd()

        gflops = compute_gflops(task.flop, t)
        print("[TVM-engine] Average time cost of %d runs = %g ms, %g gflops." %
              (num_runs, t * 1e3, gflops))

        with open(local_get_dir_file('result.txt'), 'w') as fp:
            fp.write(str(t))
Example #22
def tune_model(
    tvmc_model: TVMCModel,
    target: str,
    tuning_records: Optional[str] = None,
    prior_records: Optional[str] = None,
    enable_autoscheduler: bool = False,
    rpc_key: Optional[str] = None,
    hostname: Optional[str] = None,
    port: Optional[Union[int, str]] = 9090,
    trials: int = 10000,
    target_host: Optional[str] = None,
    tuner: str = "xgb",
    min_repeat_ms: Optional[int] = None,
    early_stopping: Optional[int] = None,
    desired_layout: Optional[str] = None,
    timeout: int = 10,
    repeat: int = 1,
    number: int = 10,
    parallel: int = 4,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
    log_estimated_latency: bool = False,
):
    """Use tuning to automatically optimize the functions in a model.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model to be optimized.
    target : str
        Compilation target as plain string, inline JSON or path to a JSON file.
    tuning_records: str, optional
        The path to a file that tuning results will be saved to. If not specified,
        a temporary file will be used.
    prior_records: str, optional
        A path to previous tuning results that will be used to hot-start the tuning
        cost model if provided.
    enable_autoscheduler : bool, optional
        When true, use autoscheduling rather than autotvm. This should produce
        faster kernels for compatible model-target pairs.
    rpc_key : str, optional
        The RPC tracker key of the target device. Required when rpc_tracker is provided.
    hostname : str, optional
        The IP address of an RPC tracker, used when benchmarking remotely.
    port : int or str, optional
        The port of the RPC tracker to connect to. Defaults to 9090.
    trials : int, optional
        The number of schedules to try out for the entire model. Note that the default
        value is chosen as a decent average for most models, but larger models may need
        more trials to reach a good result while smaller models will converge with fewer
        trials.
    tuner : str, optional
        The type of tuner to use when tuning with autotvm. Can be one of
        "ga", "gridsearch", "random", "xgb", "xgb_knob", and "xgb-rank".
    min_repeat_ms : int, optional
        Minimum time to run each trial. Defaults to 0 on x86 and 1000 on other targets.
    early_stopping : int, optional
        When specified, stop tuning after this number of trials if results aren't improving.
    desired_layout : str, optional
        Can be one of "NCHW" or "NHWC". When specified, compatible operations in the graph
        will have their layout set to this format. Tasks will then be tuned using this
        specified layout.
    timeout : int, optional
        If a kernel trial lasts longer than this duration in seconds, it will be
        considered a failure.
    repeat : int, optional
        How many times each measurement should be repeated.
    number : int, optional
        The number of runs a single repeat is made of.
    parallel : int, optional
        The maximum number of parallel devices to use when tuning.
    hardware_params : auto_scheduler.HardwareParams, optional
        When using the autoscheduler, this object defines the configuration of the target hardware.
    include_simple_tasks : bool, optional
        Whether to extract simple operations or only computationally intensive ones when using
        the autoscheduler.
    log_estimated_latency : bool, optional
        If using the autoscheduler, write the estimated latency at each step of tuning to file.

    Returns
    -------
    tuning_records : str
        The path to the produced tuning log file.
    """
    target, extra_targets = common.target_from_cli(target)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host)
    # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source
    # model is fixed. For now, creating a clone avoids the issue.
    mod = deepcopy(tvmc_model.mod)
    params = tvmc_model.params
    if tuning_records is None:
        tuning_records = tvmc_model.default_tuning_records_path()

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(
            codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]
        mod = partition_function(mod, params, **codegen_from_cli["opts"])

    # min_repeat_ms should be:
    # a. the value provided by the user, if any, or
    # b. 0ms in case target is "cpu"; otherwise 1000ms
    if min_repeat_ms is None:
        min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000
        logger.info("Default --min-repeat-ms for this target is %s",
                    min_repeat_ms)

    if rpc_key:
        if hostname is None or port is None:
            raise common.TVMCException(
                "You must provide a hostname and port to connect to a remote RPC device."
            )
        if isinstance(port, str):
            port = int(port)

        logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key,
                    hostname, port)

        runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner
        runner = runner_ctor(
            key=rpc_key,
            host=hostname,
            port=port,
            number=number,
            repeat=repeat,
            n_parallel=parallel,
            timeout=timeout,
            min_repeat_ms=min_repeat_ms,
        )
    else:
        logger.info("Starting localhost tuning.")
        runner_ctor = (auto_scheduler.LocalRPCMeasureContext
                       if enable_autoscheduler else autotvm.LocalRunner)
        local_server = runner_ctor(
            number=number,
            repeat=repeat,
            timeout=timeout,
            min_repeat_ms=min_repeat_ms,
        )

        # For autoscheduling on some devices, we need to maintain a LocalRPCMeasureContext object.
        if enable_autoscheduler:
            runner = local_server.runner
        else:
            runner = local_server

    if enable_autoscheduler:

        tasks, weights = autoscheduler_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=desired_layout,
            hardware_params=hardware_params,
            include_simple_tasks=include_simple_tasks,
        )

        # Create the autoscheduler tuning options
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=trials,
            measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)],
            runner=runner,
            early_stopping=early_stopping,
        )

        logger.info("Autoscheduling with configuration: %s", tuning_options)

        # Schedule the tasks (i.e., produce a schedule for each task)
        schedule_tasks(tasks, weights, tuning_options, prior_records,
                       log_estimated_latency)
    else:
        tasks = autotvm_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=desired_layout,
        )

        # In autotvm, trials is specified per task. We can convert the per-model input
        # provided to per-task trials by dividing by the number of tasks.
        trials = int(trials / len(tasks))
        logger.info("Autotuning with %d trials per task.", trials)

        tuning_options = {
            "tuner": tuner,
            "trials": trials,
            "early_stopping": early_stopping,
            "measure_option": autotvm.measure_option(
                builder=autotvm.LocalBuilder(build_func="default"), runner=runner
            ),
            "tuning_records": prior_records,
        }
        logger.info("Autotuning with configuration: %s", tuning_options)

        tune_tasks(tasks, tuning_records, **tuning_options)

    return tuning_records
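
# Hypothetical usage sketch: load a model through the TVMC python API (an
# assumption; any TVMCModel works) and tune it locally.
from tvm.driver import tvmc

model = tvmc.load("resnet50-v2-7.onnx")  # returns a TVMCModel
records = tune_model(model, target="llvm", trials=200)
print("tuning records written to", records)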
Example #23
print(task.config_space)

################################################################
# Then we need to define how to measure the generated code and pick a tuner.
# Since our space is small, a random tuner is sufficient.
#
# We run only 10 trials in this tutorial for demonstration. In practice,
# you can run more trials according to your time budget.
# We will log the tuning results to a cache file, which can later be used
# to retrieve the best config.

# logging config (for printing tuning log to screen)
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# use the local CPU, measure 5 times for every config to reduce variance
measure_option = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(number=5))

# begin tuning, log records to file `cache.tsv`
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('cache.tsv')])

#########################################################################
# Finally we apply the history best from the cache file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function,
# it will query the dispatch context with its workload and fetch the best
# config for that workload.

# apply history best from log file
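# A minimal sketch of the rest, assuming the tutorial's matmul template takes
# (N, L, M, dtype), returns (s, arg_bufs), and that N, L, M are defined above:
with autotvm.apply_history_best('cache.tsv'):
    with tvm.target.create("llvm"):
        s, arg_bufs = matmul(N, L, M, 'float32')
        func = tvm.build(s, arg_bufs)

# verify correctness against numpy
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)
tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)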
Ejemplo n.º 24
0
tuning_option = {
    "log_filename": log_file,
    "tuner": "random",
    "n_trial": 1000,
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.RPCRunner(
            env.TARGET,
            host=tracker_host,
            port=tracker_port,
            number=5,
            timeout=60,
            check_correctness=True,
        ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial` and :code:`early_stopping`
#   to larger values, which makes the tuning run longer.
#   If your device is under-powered or your conv2d operators are large, consider
#   setting a larger timeout.
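#
#   For instance, a hypothetical override of the defaults above for a longer
#   tuning run (the keys follow the ``tuning_option`` dict defined earlier):

long_tuning_option = dict(tuning_option)
long_tuning_option["n_trial"] = 4000
long_tuning_option["early_stopping"] = 800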
Ejemplo n.º 25
0
import tvm.contrib.graph_runtime as runtime
import cv2

network = 'peleenet_1d_float16_nano'
log_file = "%s.log" % network
dtype = 'float32'

tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 600,
    'early_stopping': 600,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150)),
}
tuning_rpc_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 1000,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=1000),
        runner=autotvm.RPCRunner(
            'nano', host='0.0.0.0', port=9190,
            number=2,
            repeat=3,
Ejemplo n.º 26
0
    # number of FLOPs: 2 ops (multiply + add) per MAC over the output volume
    NH, NW = [e.value for e in output.shape[2:4]]
    cfg.add_flop(N * CO * NH * NW * (CI * KH * KW * 2))
    return s, [raw_data, kernel, output]


logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

N, H, W, CO, CI, KH, KW, strides, padding, scaling_factor = 1, 14, 14, 512, 512, 1, 1, 2, 0, 1.0
task = autotvm.task.create(conv2d,
                           args=(N, H, W, CO, CI, KH, KW, strides, padding,
                                 scaling_factor),
                           target='cuda')

measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(),
                                        runner=autotvm.LocalRunner(number=10,
                                                                   timeout=4))

if DO_TUNING:
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=2000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('conv2d.log')])

    dispatch_context = autotvm.apply_history_best("conv2d.log")
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)
else:
    config = task.config_space.get(PRETUNED_INDEX)
    dispatch_context = autotvm.task.ApplyConfig(config)
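# Either way, the selected config can be applied when instantiating the
# template (a sketch; the conv2d template above returns
# s, [raw_data, kernel, output]):
with dispatch_context:
    with tvm.target.create("cuda"):
        s, arg_bufs = conv2d(N, H, W, CO, CI, KH, KW, strides, padding,
                             scaling_factor)
        func = tvm.build(s, arg_bufs)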
Ejemplo n.º 27
0
    cfg.define_reorder("reorder", [ko, xc, ki, yc], "all")
    cfg["reorder"].apply(s, CC, [ko, xc, ki, yc])
    cfg.define_annotate('ann', [ko, xc, ki, yc], policy='try_unroll_vec')
    cfg['ann'].apply(s, CC, [ko, xc, ki, yc])

    x, y, z = s[packedB].op.axis
    s[packedB].vectorize(z)
    s[packedB].parallel(x)

    return s, [A, B, C]


task = autotvm.task.create('matmul', args=[], target=target)

measure_option = autotvm.measure_option(
    #builder='local',
    builder=autotvm.LocalBuilder(n_parallel=56),
    runner=autotvm.LocalRunner(number=3))

# begin tuning, log records to file `matmul.log`
#tuner = autotvm.tuner.XGBTuner(task, argsDict=None)
#tuner = autotvm.tuner.XGBTuner(task)
#tuner = autotvm.tuner.RandomTuner(task)
tuner = autotvm.tuner.GridSearchTuner(task)
n_trial = 4000
early_stopping = None
if os.path.exists('matmul_skx.log.tmp'):
    os.remove('matmul_skx.log.tmp')
tuner.tune(n_trial=n_trial,
           early_stopping=early_stopping,
           measure_option=measure_option,
           callbacks=[
               # log records to the tmp file prepared above
               autotvm.callback.progress_bar(n_trial),
               autotvm.callback.log_to_file('matmul_skx.log.tmp')])
Ejemplo n.º 28
0
#### TUNING OPTION ####
network = 'resnet-18'
log_file = "%s.log" % network
dtype = 'float32'

tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 2000,
    'early_stopping': 600,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        #runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
        runner=autotvm.RPCRunner(
            '1080ti',  # change the device key to your key
            '0.0.0.0', 9190,
            number=20, repeat=3, timeout=4, min_repeat_ms=150)
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#
#   If you have a large time budget, you can set :code:`n_trial` and :code:`early_stopping`
#   to larger values, which makes the tuning run longer.
#
#   If you have multiple devices, you can use all of them for measurement to
#   accelerate the tuning process.
Ejemplo n.º 29
0
log_file = "%s.%s.log" % (device_key, network)
dtype = 'float32'


tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb_knob',
    'n_trial': 1500,
    'early_stopping': 800,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func='ndk' if use_android else 'default'),
        runner=autotvm.RPCRunner(
            device_key, host='0.0.0.0', port=9000,
            number=5,
            timeout=10,
        ),
        #runner=autotvm.LocalRunner()
    ),
}


def get_val_data(image_path):
    filenames = os.listdir(image_path)
    images = []
    imgs = []
    for filename in filenames:
        image = cv2.imread(image_path + filename)
        images.append(image)
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
Ejemplo n.º 30
0
def tune_and_compile(graph: Graph, batch_size, target, target_host, device=None):
    #
    # this function is adapted and modified from the TVM tutorials
    #
    log_dir = "./tvm_schedule_configs"
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, f"{graph.name}_{device}_{batch_size}.log")
    tuning_option = {
        'log_filename': log_file,
        'tuner': 'ga',
        'n_trial': 2000,
        'early_stopping': 600,
        'measure_option': autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=10),
#            runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
            runner=autotvm.RPCRunner(
                'v100',  # change the device key to your key
                '0.0.0.0', 9190,
                number=20, repeat=3, timeout=4),
        )
    }

    # You can skip the implementation of this function for this tutorial.
    def tune_tasks(tasks,
                   measure_option,
                   tuner,
                   n_trial,
                   early_stopping,
                   log_filename,
                   use_transfer_learning=True):
        # create tmp log file
        tmp_log_file = log_filename + ".tmp"
        for i, tsk in enumerate(reversed(tasks)):
            prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

            # create tuner
            if tuner == 'xgb' or tuner == 'xgb-rank':
                tuner_obj = XGBTuner(tsk, loss_type='rank')
            elif tuner == 'ga':
                tuner_obj = GATuner(tsk, pop_size=100)
            elif tuner == 'random':
                tuner_obj = RandomTuner(tsk)
            elif tuner == 'gridsearch':
                tuner_obj = GridSearchTuner(tsk)
            else:
                raise ValueError("Invalid tuner: " + tuner)

            if use_transfer_learning:
                if os.path.isfile(tmp_log_file):
                    tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

            # do tuning
            # print(f"tsk.config_space {tsk.config_space}")
            tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
                           early_stopping=early_stopping,
                           measure_option=measure_option,
                           callbacks=[
                               autotvm.callback.progress_bar(n_trial, prefix=prefix),
                               autotvm.callback.log_to_file(tmp_log_file)])

        # pick best records to a cache file
        autotvm.record.pick_best(tmp_log_file, log_filename)
        os.remove(tmp_log_file)

    mod, params = graph2relay(graph, batch_size)
    input_shape = (batch_size,) + tuple(graph.enter_node.output_shape)
    out_shape = (batch_size,) + tuple(graph.blocks[-1].exit_node.output_shape)
    # print(input_shape, out_shape)

    tasks = autotvm.task.extract_from_program(mod["main"], target=target, target_host=target_host,
                                              params=params, ops=(relay.op.nn.conv2d,))

    # run tuning tasks
    if os.path.exists(log_file):
        print(f"Tuned config found, use {log_file} as config")
    else:
        print("Tuning...")
        tune_tasks(tasks, **tuning_option)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        # print("Compile...")
        with relay.build_config(opt_level=3):  # note: opt_level=3 can be problematic for some graphs
            graph, lib, params = relay.build_module.build(mod, target=target, target_host=target_host, params=params)

        return graph, lib, params
Ejemplo n.º 31
0
def test_db_filter():
    logging.info("test db filter ...")

    # Pick a GPU target because there are more likely to be failures/invalid configs
    task, target = get_sample_task()

    ctx = tvm.context(str(target))
    if not ctx.exist:
        logging.warning(
            "Skip this test because there is no supported device for test")
        return

    batch_size = 2

    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2)
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)

    ct = 0
    all_inputs = list()
    all_results = list()
    batches = list()
    tuner = autotvm.tuner.RandomTuner(task)
    while ct < TRIAL_LIMIT:
        inputs = list()
        for i in range(batch_size):
            cfg = tuner.next_batch(1)[0]
            inputs.append(MeasureInput(target, task, cfg))
            all_inputs.append(inputs[-1])
        batches.append(inputs)
        results = measure_batch(inputs)
        all_results += results
        ct += 1

    del measure_batch

    db = database.DummyDatabase()
    db.flush()

    # First setting, memoize one input at a time, check that each is saved and replayed
    measure_option = autotvm.measure_option('local',
                                            do_fork=False,
                                            timeout=2,
                                            replay_db=db)
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)

    for i in range(len(all_inputs) + 1):
        db.flush()
        for j in range(i):
            db.save(all_inputs[j], all_results[j])

        for k in range(len(batches)):
            batch = batches[k]
            batch_result = measure_batch(batch)
            for l in range(batch_size):
                all_idx = k * batch_size + l
                assert batch_result[l] is not None
                if all_idx < i:
                    assert encode(batch[l], batch_result[l]) == encode(batch[l], all_results[all_idx]), \
                        "(no retry) EXPECTED MATCH, GOT MISMATCH"
                else:
                    assert encode(batch[l], batch_result[l]) != encode(batch[l], all_results[all_idx]), \
                        "(no retry) EXPECTED MISMATCH, GOT MATCH"

    del measure_batch
Ejemplo n.º 32
0
        task = autotvm.task.create(
            group_conv2d,
            args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation, groups),
            target=tvm.target.vta(),
            target_host=env.target_host,
            template_key="direct",
        )
        print(task.config_space)

        # Tune
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(),
            runner=autotvm.RPCRunner(
                env.TARGET,
                host=tracker_host,
                port=int(tracker_port),
                number=5,
                timeout=60,
                # check_correctness=True, # TODO: re-enable when check_correctness works again.
            ),
        )

        # Run Tuner
        tuner = autotvm.tuner.RandomTuner(task)
        tuner.tune(
            n_trial=len(task.config_space),
            early_stopping=None,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(len(task.config_space),
                                              prefix=prefix),
Ejemplo n.º 33
0
    ctx = tvm.context(target, 0)
    src = "%d*%d*%d" % (M, K, N)
    print(src)
    space_len = 16
    early_stopping = 8
    task = autotvm.task.create(matmul, args=(M, K, N, dtype), target=target)
    print(task.config_space)
    testwithnumpy()
    # logging config (for printing tuning log to the screen)
    # logging.getLogger('autotvm').setLevel(logging.DEBUG)
    # logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
    # There are two steps for measuring a config: build and run.
    # By default, we use all CPU cores to compile the program, then measure the
    # candidates sequentially.
    # We measure 5 times and take the average to reduce variance.
    measure_option = autotvm.measure_option(builder='local',
                                            runner=autotvm.LocalRunner(number=5))
    # Begin tuning with RandomTuner, log records to file `matmul.log`
    # You can use alternatives like XGBTuner.
    print("XGBoost:")
    XGBtuner = autotvm.tuner.XGBTuner(task)
    XGBtuner.tune(n_trial=space_len,
                  early_stopping=early_stopping,
                  measure_option=measure_option,
                  callbacks=[
                      autotvm.callback.progress_bar(space_len),
                      autotvm.callback.log_to_file('XGBtuner_matmul.log')])

    print("###############################")
    #testwithNoneopt('XGBtuner_matmul.log',ctx,matmul)
    testwithnumpy()
    print(XGBtuner.flops_max)
    print(XGBtuner.task)
    print(XGBtuner.xs)


Ejemplo n.º 34
0
# times and use the average of results. In addition, we need to flush the cache
# for the weight tensors between repeated measurements. This can make the measured
# latency of one operator closer to its actual latency during end-to-end inference.

tuning_option = {
    "log_filename": log_file,
    "tuner": "random",
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=1,
                                   repeat=10,
                                   min_repeat_ms=0,
                                   enable_cpu_cache_flush=True),
    ),
}
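
# The same measurement policy can be reproduced with a raw time evaluator;
# a sketch under the assumption (based on recent TVM versions) that
# ``f_preproc="cache_flush_cpu_non_first_arg"`` is what
# ``enable_cpu_cache_flush=True`` selects internally:
def measure_with_cache_flush(func, dev, *args):
    # reproduce the runner's policy above: number=1, repeat=10, flush the CPU
    # cache before each repeat so weights are not served from cache
    evaluator = func.time_evaluator(
        func.entry_name, dev, number=1, repeat=10, min_repeat_ms=0,
        f_preproc="cache_flush_cpu_non_first_arg")
    return np.mean(evaluator(*args).results)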


# You can skip the implementation of this function for this tutorial.
def tune_kernels(tasks,
                 measure_option,
                 tuner="gridsearch",
                 early_stopping=None,
                 log_filename="tuning.log"):

    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
Ejemplo n.º 35
0
def tune_and_evaluate(M, N, L, dtype, layout):
    task = autotvm.task.create("tutorial/auto_tensorcore/test_gemm",
                               args=(N, L, M, dtype, layout),
                               target='cuda')
    print(task.config_space)

    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    measure_option = autotvm.measure_option(
        builder='local', runner=autotvm.LocalRunner(number=5))

    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('matmul.log')])

    dispatch_context = autotvm.apply_history_best("matmul.log")
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)
    with autotvm.apply_history_best('matmul.log'):
        with tvm.target.Target("cuda"):
            s, arg_bufs = test_gemm(N, L, M, dtype, layout)
            print(tvm.lower(s, arg_bufs, simple_mode=True))
            func = tvm.build(s, arg_bufs)
    dev_module = func.imported_modules[0]
    print(dev_module.get_source())

    # check correctness
    if (layout == "NN"):
        shape_a = (N, L)
        shape_b = (L, M)
    elif (layout == "NT"):
        shape_a = (L, N)
        shape_b = (L, M)
    elif (layout == "TN"):
        shape_a = (N, L)
        shape_b = (M, L)
    elif (layout == "TT"):
        shape_a = (L, N)
        shape_b = (M, L)

    a_np = None
    b_np = None
    c_np = None
    c_np_type = None
    if dtype == 'float16':
        c_np_type = np.float32
        a_np = np.random.uniform(size=shape_a).astype(np.float16)
        b_np = np.random.uniform(size=shape_b).astype(np.float16)
        if (layout == "NN"):
            c_np = np.dot(a_np, b_np)
        elif (layout == "NT"):
            c_np = np.dot(a_np.T, b_np)
        elif (layout == "TN"):
            c_np = np.dot(a_np, b_np.T)
        elif (layout == "TT"):
            c_np = np.dot(a_np.T, b_np.T)
    elif dtype == 'int8':
        c_np_type = np.int32
        a_np = np.random.randint(low=-128, high=128,  # high is exclusive
                                 size=shape_a).astype(np.int8)
        b_np = np.random.randint(low=-128, high=128,
                                 size=shape_b).astype(np.int8)
        if (layout == "NN"):
            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
        elif (layout == "NT"):
            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
        elif (layout == "TN"):
            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
        elif (layout == "TT"):
            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)
    elif dtype == 'int4':
        c_np_type = np.int32
        a_np_int = np.random.randint(low=-8, high=8,  # high is exclusive
                                     size=shape_a).astype(np.int32)
        b_np_int = np.random.randint(low=-8, high=8,
                                     size=shape_b).astype(np.int32)
        # "TN"
        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
        a_np = np.zeros(shape=(N, int(L / 8)), dtype=np.int32)
        b_np = np.zeros(shape=(M, int(L / 8)), dtype=np.int32)
        # a_np --> col_major, pack eight 4-bit values per int32
        for i in range(N):
            for j in range(int(L / 8)):
                for k in range(8):
                    a_np[i, j] |= (a_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4)

        # b_np --> row_major
        for i in range(M):
            for j in range(int(L / 8)):
                for k in range(8):
                    b_np[i, j] |= (b_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4)
    elif dtype == 'int1':
        c_np_type = np.int32
        # high is exclusive, so high=2 is needed to actually generate 1s
        a_np_int = np.random.randint(low=0, high=2,
                                     size=shape_a).astype(np.int32)
        b_np_int = np.random.randint(low=0, high=2,
                                     size=shape_b).astype(np.int32)
        # "TN"
        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
        a_np = np.zeros(shape=(N, int(L / 32)), dtype=np.int32)
        b_np = np.zeros(shape=(M, int(L / 32)), dtype=np.int32)
        # pack thirty-two 1-bit values per int32 (mask a single bit)
        for i in range(N):
            for j in range(int(L / 32)):
                for k in range(32):
                    a_np[i, j] |= (a_np_int[i, j * 32 + k] & 0x1) << (31 - k)

        for i in range(M):
            for j in range(int(L / 32)):
                for k in range(32):
                    b_np[i, j] |= (b_np_int[i, j * 32 + k] & 0x1) << (31 - k)

    c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx)
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    b_tvm = tvm.nd.array(b_np, ctx=ctx)
    func(a_tvm, b_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)

    evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
    print('Time cost of this operator: %f' %
          evaluator(a_tvm, b_tvm, c_tvm).mean)
Ejemplo n.º 36
0
batch_size = 1
model_name = 'resnet18v2'
log_file = "%s-batchsize%d-optimization.log" % (model_name, batch_size)

input_filename = "kitten.jpg"

tuning_options = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 2000,
    'early_stopping': 600,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4),
    ),
}

tuneable = tvm.autotvm.task.get_config()

#### DEVICE CONFIG ####
target = tvm.target.cuda()
input_shape = (batch_size, 3, 224, 224)
output_shape = (batch_size, 1000)
sym, params = load_onnx_model(model_name + ".onnx")

tune_and_evaluate(sym, params, input_shape, output_shape, tuning_options)
Ejemplo n.º 37
0
    tuning_opt = {
        'log_filename': opt.log_filename,
        'tuner': opt.tuner,
        'n_trial': int(1e9),
        'early_stopping': None,
        'measure_option': autotvm.measure_option(
            builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func),
            runner=autotvm.RPCRunner(
                env.TARGET,
                tracker_host,
                tracker_port,
                number=4,
                min_repeat_ms=150,
                repeat=opt.measurements,
                timeout=60,
                check_correctness=True))
    }
    tune_tasks(tasks, **tuning_opt)

    # Compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[opt.log_filename]):

        # Compile network
        print("Compiling network with best tuning parameters...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
Ejemplo n.º 38
0
# small. The ``early_stopping`` parameter is the minimum number of trials to
# run before a condition that stops the search early can be applied. The
# measure option indicates where trial code will be built, and where it will be
# run. In this case, we're using the ``LocalRunner`` we just created and a
# ``LocalBuilder``. The ``tuning_records`` option specifies a file to write
# the tuning data to.

tuning_option = {
    "tuner": "xgb",
    "trials": 10,
    "early_stopping": 100,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="default"),
        runner=runner),
    "tuning_records": "resnet-50-v2-autotuning.json",
}

################################################################################
# .. admonition:: Defining the Tuning Search Algorithm
#
#   By default this search is guided using an `XGBoost Grid` algorithm.
#   Depending on your model complexity and amount of time available, you might
#   want to choose a different algorithm.
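#
#   A sketch of such a mapping, reusing the tuner constructors shown in the
#   ``tune_tasks`` helper earlier in this collection:

from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner


def create_tuner(name, tsk):
    # illustrative helper, not part of the original script
    if name in ("xgb", "xgb-rank"):
        return XGBTuner(tsk, loss_type="rank")
    if name == "ga":
        return GATuner(tsk, pop_size=100)
    if name == "random":
        return RandomTuner(tsk)
    if name == "gridsearch":
        return GridSearchTuner(tsk)
    raise ValueError("Invalid tuner: " + name)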

################################################################################
# .. admonition:: Setting Tuning Parameters
#
#   In this example, in the interest of time, we set the number of trials and
#   early stopping to small values. You will likely see more performance
#   improvement with higher values, at the cost of longer tuning time.
Ejemplo n.º 39
0
def compile_via_tvm(sym, arg_params, aux_params, symbol_file, data_shape, tune):

    input_shape = [1] + list(data_shape)
    input_name = 'data'

    batch = 1
    seq_length = 128
    input_dict = {
        'data0': (batch, seq_length),
        'data1': (batch, seq_length),
        'data2': (batch,)
    }
    mod, params = relay.frontend.from_mxnet(sym,
                                            dtype={},
                                            shape=input_dict,
                                            arg_params=arg_params,
                                            aux_params=aux_params)

    model_name = symbol_file.split('/')[-1].replace('.json','')
    log_dir = os.getcwd() + "/tuned_logs_c5"
    pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True)
    log_file = log_dir + "/" + "%s.log" % model_name
    graph_opt_sch_file = log_dir + "/" + "%s_graph_opt.log" % model_name

    Path(log_file).touch()
    Path(graph_opt_sch_file).touch()

    if tune:
        tuning_option = {
            'log_filename': log_file,
            'tuner': 'random',
            'early_stopping': None,

            'measure_option': autotvm.measure_option(
                builder=autotvm.LocalBuilder(),
                runner=autotvm.LocalRunner(number=10, repeat=1,
                                           min_repeat_ms=1000),
            ),
        }

        tune_and_evaluate(tuning_option, mod, params, input_shape, log_file,
                graph_opt_sch_file, input_name)

    # with autotvm.apply_graph_best(graph_opt_sch_file):
    with autotvm.apply_history_best(log_file):
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params)

            base_dir = os.getcwd() + "/compiled_models"
            pathlib.Path(base_dir).mkdir(parents=True, exist_ok=True)

            base = base_dir + '/tvm_' + symbol_file.split('/')[-1].replace('.json','')

            path_lib = base + '_deploy_lib.tar'
            path_graph =  base + '_deploy_graph.json'
            path_params = base + '_deploy_params.params'

            lib.export_library(path_lib)
            with open(path_graph, 'w') as fo:
                fo.write(graph)
            with open(path_params, 'wb') as fo:
                fo.write(relay.save_param_dict(params))
Ejemplo n.º 40
0
network = 'resnet-18'
log_file = "%s.%s.log" % (device_key, network)
dtype = 'float32'

tuning_option = {
    'log_filename': log_file,

    'tuner': 'xgb',
    'n_trial': 2000,
    'early_stopping': 800,

    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func='ndk' if use_android else 'default'),
        runner=autotvm.RPCRunner(
            device_key, host='localhost', port=9190,
            number=5,
            timeout=4,
        ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial` and :code:`early_stopping`
#   to larger values, which makes the tuning run longer.
#   If your device runs very slowly or your conv2d operators have many GFLOPs,
#   consider setting a larger timeout.
Ejemplo n.º 41
0
# logging config (for printing tuning log to screen)
logging.getLogger('autotvm').setLevel(logging.DEBUG)
logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

# the last layer in resnet
N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
task = autotvm.task.create(conv2d_no_batching,
                           args=(N, H, W, CO, CI, KH, KW, strides, padding),
                           target='cuda')
print(task.config_space)

# Use the local GPU; repeat each measurement 3 times to reduce variance.
# The timeout for compiling a program is 10 seconds (the builder default);
# the timeout for running is 4 seconds.
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
)

# Begin tuning, log records to file `conv2d.log`
# During tuning we will also try many invalid configs, so expect to see
# many error reports. As long as you see non-zero GFLOPS, it is okay.
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('conv2d.log')])

#########################################################################
# Finally we can inspect the best config from the log file, check correctness,
# and measure running time.

# inspect the best config
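# A sketch of the remainder, mirroring the apply_history_best pattern used in
# the earlier examples of this collection:
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)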