Example #1
def test_file_io():
    temp = utils.tempdir()
    file_path = temp.relpath("temp.log")

    tsk, target = get_sample_task()
    inputs = [
        MeasureInput(target, tsk, tsk.config_space.get(i))
        for i in range(0, 10)
    ]
    results = [MeasureResult((i, ), 0, 0, 0) for i in range(0, 10)]

    invalid_inp = MeasureInput(target, tsk, tsk.config_space.get(10))
    invalid_res = MeasureResult((10, ), 0, 0, 0)

    # Erase the entity map to test if it will be ignored when loading back.
    invalid_inp.config._entity_map = {}

    with open(file_path, "w") as fo:
        cb = autotvm.callback.log_to_file(fo)
        cb(None, inputs, results)
        cb(None, [invalid_inp], [invalid_res])

    ref = zip(inputs, results)
    for x, y in zip(ref, autotvm.record.load_from_file(file_path)):
        assert x[1] == y[1]
Example #2
def test_file_io():
    temp = utils.tempdir()
    file_path = temp.relpath("temp.log")

    tsk, target = get_sample_task()
    inputs = [
        MeasureInput(target, tsk, tsk.config_space.get(i))
        for i in range(0, 10)
    ]
    results = [MeasureResult((i, ), 0, 0, 0) for i in range(0, 10)]

    invalid_inp = MeasureInput(target, tsk, tsk.config_space.get(10))
    invalid_res = MeasureResult((10, ), 0, 0, 0)

    # Erase the entity map to test if it will be ignored when loading back.
    invalid_inp.config._entity_map = {}

    with open(file_path, "w") as fo:
        cb = autotvm.callback.log_to_file(fo)
        cb(None, inputs, results)
        cb(None, [invalid_inp], [invalid_res])

    ref = zip(inputs, results)
    for x, y in zip(ref, autotvm.record.load_from_file(file_path)):
        assert x[1] == y[1]

    # Confirm functionality of multiple file loads
    hist_best = ApplyHistoryBest([file_path, file_path])
    x = hist_best.query(target, tsk.workload)
    assert str(x) == str(inputs[0][2])
Example #3
def test_apply_history_best():
    tsk, target = get_sample_task()

    records = [(MeasureInput(target, tsk, tsk.config_space.get(0)),
                MeasureResult((0.1, ), 0, 2.3, 0)),
               (MeasureInput(target, tsk, tsk.config_space.get(1)),
                MeasureResult((0.3, ), 0, 2.3, 0)),
               (MeasureInput(target, tsk, tsk.config_space.get(2)),
                MeasureResult((0.01, ), 0, 2.3, 0)),
               (MeasureInput(target, tsk, tsk.config_space.get(4)),
                MeasureResult((0.4, ), 0, 2.3, 0))]
    hist_best = ApplyHistoryBest(records)
    x = hist_best.query(target, tsk.workload)
    assert str(x) == str(tsk.config_space.get(2))
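
Beyond the direct query() check above, the same records are usually consumed as a dispatch context during compilation. A minimal sketch, assuming a log file named "tuning.log" written by autotvm.callback.log_to_file and a small placeholder network (the exact return value of relay.build differs across TVM versions):

from tvm import autotvm, relay
from tvm.relay import testing

# Placeholder network; any Relay module whose tuned workloads appear in the log would do.
mod, params = testing.mlp.get_workload(batch_size=1)

# Inside the context, relay.build picks the best recorded config for each tuned
# workload; untuned workloads fall back to the default schedules.
with autotvm.apply_history_best("tuning.log"):
    compiled = relay.build(mod, target="llvm", params=params)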
Example #4
def parse_one_log(best_config_log, new_log_dir):
    target_wkl = None
    for inp, res in load_from_file(best_config_log):
        # Update the target string to generate the right SHA2 hash code.
        if target_wkl is None:
            inp.task.target = inp.target
            target_wkl = Workload.from_task(inp.task)
            target_wkl['target'] = log_target
            if target_wkl not in wkls:
                new_log_file_name = gen_log_file_name_from_workload(target_wkl)
                new_log_path = '{0}/{1}'.format(new_log_dir, new_log_file_name)
                wkls[target_wkl] = (new_log_path, [])

        if res.error_no != 0:
            continue

        # Only focus on the best N configs.
        new_inp = MeasureInput(target=log_target,
                               task=inp.task,
                               config=inp.config)
        if len(wkls[target_wkl][1]) < top_n_cfgs:
            heapq.heappush(wkls[target_wkl][1], (-np.mean(res.costs), new_inp))
        elif np.mean(res.costs) < -wkls[target_wkl][1][0][0]:
            heapq.heappop(wkls[target_wkl][1])
            heapq.heappush(wkls[target_wkl][1], (-np.mean(res.costs), new_inp))
Example #5
def test_PBQPTuner_run():
    target = "llvm"
    dtype = "float32"
    layout = "NCHW"
    dshape = (1, 3, 8, 8)
    conv2d = relay.op.get("nn.conv2d")
    target_ops = [conv2d]

    g, records, ltf_records, ltf_keys, tasks = _create_data(
        target, dshape, dtype, layout)
    costs = [0.02, 0.02, 0.045]
    config_list = []
    cfg_dict = {
        "index":
        -1,
        "code_hash":
        None,
        "entity": [["tile_ic", "sp", [1, 3]], ["tile_oc", "sp", [2, 8]],
                   ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]]
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "index":
        -1,
        "code_hash":
        None,
        "entity": [["tile_ic", "sp", [4, 4]], ["tile_oc", "sp", [2, 16]],
                   ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]]
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "index":
        -1,
        "code_hash":
        None,
        "entity": [["tile_ic", "sp", [16, 2]], ["tile_oc", "sp", [8, 4]],
                   ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]]
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    for cost, config, task in zip(costs, config_list, tasks):
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost, ),
                                  error_no=0,
                                  all_cost=-1,
                                  timestamp=-1)
        records.append((ms_input, ms_output))

    executor = PBQPTuner(g, {"data": dshape}, records, target_ops, target)
    executor.benchmark_layout_transform(layout_records=ltf_records,
                                        infer_layout=True)
    executor.run()
    out = [record[0].config for record in executor.get_optimal_records()]
    expected_out = [
        records[3][0].config, records[1][0].config, records[2][0].config
    ]
    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
                           % (str(expected_out), str(out))
Example #6
def test_DPTuner_run():
    log_file = "%s/test_tuner.log" % (os.getcwd())
    target = "llvm"
    dtype = "float32"
    layout = "NCHW"
    dshape = (1, 3, 8, 8)
    target_ops = [relay.nn.conv2d]

    g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout)
    mod = relay.module.Module()
    mod["main"] = g
    costs = [0.02, 0.02, 0.045]
    config_list = []
    cfg_dict = {"i": -1,
                "c": None,
                "e": [["tile_ic", "sp", [1, 3]],
                      ["tile_oc", "sp", [2, 8]],
                      ["tile_ow", "sp", [4, 2]],
                      ["unroll_kw", "ot", True]],
                "t": ""}
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {"i": -1,
                "c": None,
                "e": [["tile_ic", "sp", [4, 4]],
                      ["tile_oc", "sp", [2, 16]],
                      ["tile_oh", "ot", 1],
                      ["tile_ow", "sp", [4, 2]]],
                "t": ""}
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {"i": -1,
                "c": None,
                "e": [["tile_ic", "sp", [16, 2]],
                      ["tile_oc", "sp", [8, 4]],
                      ["tile_ow", "sp", [2, 4]],
                      ["unroll_kw", "ot", False]],
                "t": ""}
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    for cost, config, task in zip(costs, config_list, tasks):
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
        records.append((ms_input, ms_output))

    executor = DPTuner(mod, {"data": dshape}, records, target_ops, target, log_file=log_file)
    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
    executor.run()
    out = [record[0].config for record in executor.get_optimal_records()]
    expected_out = [records[3][0].config, records[1][0].config, records[2][0].config]
    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
                                % (str(expected_out), str(out))
    assert os.path.isfile(log_file), "No log file with name %s exists." % log_file
Example #7
def batch_loader(log_file, target, batch_size=8):
    """Batch loading measure inputs."""
    tvm_target = tvm.target.create(target)
    batch = []
    for inp, _ in load_from_file(log_file):
        # FIXME (comaniac): If we apply a different target (e.g., llvm to cuda),
        # the task might be missing.
        inp.task.target = tvm_target
        new_inp = MeasureInput(tvm_target, inp.task, inp.config)
        batch.append(new_inp)
        if len(batch) == batch_size:
            yield batch
            batch = []
    # Flush the final partial batch, if any.
    if batch:
        yield batch
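
A short usage sketch for the generator above; the log path and target string are placeholders:

# Hypothetical driver: stream recorded inputs in batches of 8, retargeted to llvm,
# e.g. to re-measure old records on a different machine.
for batch in batch_loader("tuning.log", "llvm", batch_size=8):
    print("loaded a batch of %d measure inputs" % len(batch))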
Example #8
def test_load_dump():
    task, target = get_sample_task()

    inp = MeasureInput(target, task, task.config_space.get(0))
    result = MeasureResult((2.0, 2.23, 0.23, 0.123, 0.234, 0.123),
                           MeasureErrorNo.NO_ERROR, 2.3, time.time())

    for protocol in ['json', 'pickle']:
        row = encode(inp, result, protocol=protocol)
        inp_2, result_2 = decode(row, protocol=protocol)

        assert measure_str_key(inp) == measure_str_key(inp_2), \
            "%s vs %s" % (measure_str_key(inp), measure_str_key(inp_2))
        assert result.costs == result_2.costs
        assert result.error_no == result_2.error_no
        assert result.timestamp == result_2.timestamp
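
The round trip above works because an autotvm log file is simply one encode() string per line, which is what load_from_file parses back. A hedged sketch tying the two together (the file name is a placeholder, and get_sample_task comes from the surrounding test module):

import time
from tvm.autotvm.record import encode, load_from_file
from tvm.autotvm.measure import MeasureInput, MeasureResult, MeasureErrorNo

task, target = get_sample_task()
inp = MeasureInput(target, task, task.config_space.get(0))
res = MeasureResult((0.5, ), MeasureErrorNo.NO_ERROR, 1.0, time.time())

# One JSON-encoded record per line is exactly what autotvm.callback.log_to_file writes.
with open("sample.log", "w") as fout:
    fout.write(encode(inp, res, protocol="json") + "\n")

for rec_inp, rec_res in load_from_file("sample.log"):
    assert rec_res.costs == res.costs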
Example #9
    def measure_configs(self, transitions, n_parallel, measure_batch,
                        callbacks):
        """
        Measure results for current population.
        """
        for i in range(ceil(len(transitions) / n_parallel)):
            configs = []
            batch_size = min(n_parallel, len(transitions) - (i * n_parallel))
            transitions_offset = (i * n_parallel) - 1

            # Get configs
            for j in range(transitions_offset,
                           transitions_offset + batch_size):
                gene = transitions[j].gene
                configs.append(self.space.get(knob2point(gene, self.dims)))

            # Measure batch
            inputs = [
                MeasureInput(self.task.target, self.task, config)
                for config in configs
            ]
            results, end_time = measure_batch(inputs)

            # Unpack result
            for j in range(len(results)):
                self.step_count += 1
                transition = transitions[transitions_offset + j]
                input, result = inputs[j], results[j]
                transition.input = inputs[j]
                transition.result = results[j]
                transition.score = input.task.flop / np.mean(
                    result.costs) if result.error_no == 0 else 0.0
                self.scores.append(transition.score)

                # Update best
                if transition.score > self.best_flops:
                    self.best_flops = transition.score
                    self.best_config = transition.input.config
                    self.best_measure_pair = (transition.input,
                                              transition.result)
                    self.best_iter = self.step_count

        for callback in callbacks:
            inputs = [t.input for t in transitions]
            results = [t.result for t in transitions]
            callback(self, inputs, results)
Example #10
def test_file_io():
    temp = util.tempdir()
    file_path = temp.relpath("temp.log")

    tsk, target = get_sample_task()
    inputs = [
        MeasureInput(target, tsk, tsk.config_space.get(i))
        for i in range(0, 10)
    ]
    results = [MeasureResult((i, ), 0, 0, 0) for i in range(0, 10)]

    with open(file_path, "w") as fo:
        cb = autotvm.callback.log_to_file(fo)
        cb(None, inputs, results)

    ref = zip(inputs, results)
    for x, y in zip(ref, autotvm.record.load_from_file(file_path)):
        assert x[1] == y[1]
Example #11
def tune_kernels(tasks,
                 gen_graph_tuner_candidates,
                 measure_top_n,
                 measure_option,
                 tuner='random',
                 early_stopping=None,
                 n_trial=5000,
                 log_filename='tuning.log'):
    """Tune kernels with the ranking model."""

    remeasure_option = None
    if tuner == 'round':
        # Set up another measure option for the final remeasurement.
        remeasure_option = autotvm.measure_option(
            builder=LocalBuilder(),
            runner=measure_option['runner'].local_runner,
        )
        assert isinstance(measure_option['runner'], RankModelRunner)

    best_results = []

    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        callbacks = []
        if task.name in [
                'dense_small_batch.cuda', 'conv2d_cudnn.cuda',
                'dense_cublas.cuda', 'dense_large_batch.cuda',
                'conv2d_transpose_nchw.cuda', 'dense_tensorcore.cuda'
        ]:
            # Ignore these tasks.
            continue
        if task.name not in measure_option['runner'].models:
            print('not covered by cost models')
            continue

        # create tuner
        if tuner == 'round':
            tuner_obj = RoundTuner(task, n_cfg=measure_top_n)
            # Use different callbacks.
            callbacks = [rank_progress(n_trial, prefix=prefix)]
        else:
            if tuner in ('xgb', 'xgb-rank'):
                tuner_obj = XGBTuner(task, loss_type='rank')
            elif tuner == 'ga':
                tuner_obj = GATuner(task, pop_size=50)
            elif tuner == 'random':
                tuner_obj = RandomTuner(task)
            elif tuner == 'gridsearch':
                tuner_obj = GridSearchTuner(task)
            else:
                raise ValueError("Invalid tuner: " + tuner)

            callbacks = [
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(log_filename)
            ]

        tic = time.time()

        # do tuning
        tuner_obj.tune(n_trial=n_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=callbacks)

        # Round tuner needs an extra measurement step to get the real throughputs.
        if tuner == 'round':
            max_n_layout = 20 if gen_graph_tuner_candidates else 1
            top_cfgs = tuner_obj.get_top_rank_cfgs(max_n_layout)
            measure_batch = create_measure_batch(task, remeasure_option)
            inputs = [
                MeasureInput(task.target, task, config) for config in top_cfgs
            ]
            sys.stderr.write('{} Measure Top {} Configs'.format(
                prefix, len(inputs)))
            results = measure_batch(inputs)
            best_idx, best_flops = max(
                [(idx, i.task.flop / np.mean(r.costs) /
                  1e9 if r.error_no == 0 else 0)
                 for idx, (i, r) in enumerate(zip(inputs, results))],
                key=lambda x: x[1])
            best_results.append((task.workload, best_idx, best_flops))
            sys.stderr.write(' | Best %.2f GFLOPS at Top %d | %.2fs\n' %
                             (best_flops, best_idx, time.time() - tic))
            autotvm.callback.log_to_file(log_filename)(None, inputs, results)
    return best_results
Example #12
    def benchmark_layout_transform(self,
                                   min_exec_num=100,
                                   timeout=10,
                                   use_rpc=False,
                                   device_key=None,
                                   host="localhost",
                                   port=9190,
                                   n_parallel=1,
                                   build_func='default',
                                   layout_records=None,
                                   target_host=None,
                                   infer_layout=False):
        """Benchmark all possible layout transformation in the graph,
        given a set of schedule candidates for each workload of target operator.

        Parameters
        ----------
        min_exec_num : int, optional
            Minimum number of executions. The final execution time is the
            average over all executions.

        timeout : int, optional
            Timeout for each execution.

        use_rpc : boolean, optional
            Whether to use rpc mode for benchmarking.

        device_key : str, optional
            Remote device key which can be queried by
            python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190

        host : str, optional
            IP address used to create RPC tracker on host machine.

        port : int, optional
            Port number used to create RPC tracker on host machine.

        n_parallel: int, optional
            The number of measurement tasks that can run in parallel.
            Set this according to the number of CPU cores (for compilation) and
            the number of devices you have (for measuring the generated code).

        build_func: str or callable, optional
            'default': call the default builder. This works for normal targets (llvm, cuda).

            'ndk': use the Android NDK to create a shared library. Use this for Android targets.

            callable: customized build function for other backends (e.g. VTA).
                      See autotvm/measure/measure_methods.py::default_build_func for an example.

        layout_records : str or iterator of (MeasureInput, MeasureResult), optional
            Collection of layout_transform benchmarking records.
            If it is a str, it should be the filename of a records log file,
            each row of which is an encoded record pair.
            Otherwise, it is an iterator.

            If this argument is set, graph tuner will first check whether layout_transform
            workload already exists in records and skip benchmarking if possible.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if the target is a device.
            When TVM compiles a device-specific program such as CUDA,
            we also need the host (CPU) side code to interact with the driver
            and set up the dimensions and parameters correctly.
            target_host is used to specify the host-side codegen target.
            By default, llvm is used if it is enabled;
            otherwise a stackvm interpreter is used.

        infer_layout : bool, optional
            Whether to infer the layout transformation time if it doesn't exist in the records,
            instead of benchmarking on the target device.

            This might cause a performance loss compared to actually benchmarking the
            layout transformation.
        """
        self._logger.info("Start to benchmark layout transformation...")
        if layout_records is None and infer_layout:
            raise RuntimeError(
                "Requires some records to infer layout transformation time.")

        if isinstance(layout_records, str):
            layout_records = load_from_file(layout_records)
            if not layout_records and infer_layout:
                raise RuntimeError(
                    "Records must be non-empty to infer layout transformation time."
                )

        num_flops, total_time = 0, 0
        if layout_records is not None:
            for record in layout_records:
                ltf_wkl = record[0].task.workload
                self._layout_transform_perf_records[ltf_wkl] = record
                input_shape = ltf_wkl[1][1]
                flops = np.prod(input_shape)
                num_flops += flops
                total_time += record[1].costs[0]
        avg_time = total_time / num_flops if num_flops > 0 else 0

        args_list = []

        def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx,
                                 to_sch_idx, args):
            """Callback function to fetch layout transform args"""
            _, in_layout, out_layout = args
            if in_layout != out_layout:
                args_list.append(args)

        self._iterate_layout_transform(_fetch_args_callback)

        def _log_to_list(record_list):
            """Callback to log result to a list."""
            def _callback(_, inputs, results):
                """Callback implementation"""
                record_list.append((inputs[0], results[0]))

            return _callback

        builder = autotvm.LocalBuilder(n_parallel=n_parallel,
                                       build_func=build_func)
        runner = autotvm.LocalRunner(number=min_exec_num,
                                     repeat=1,
                                     timeout=timeout)
        if use_rpc:
            if device_key is None:
                raise RuntimeError(
                    "device_key need to be set to use rpc tracker mode.")
            runner = autotvm.measure.RPCRunner(device_key,
                                               host,
                                               port,
                                               n_parallel=n_parallel,
                                               number=min_exec_num,
                                               repeat=1,
                                               timeout=timeout)
        measure_option = autotvm.measure_option(builder=builder, runner=runner)
        for args in args_list:
            data, in_layout, out_layout = args
            args = serialize_args(args)
            ltf_workload = (
                'layout_transform', ) + autotvm.task.args_to_workload(args)
            if ltf_workload in self._layout_transform_perf_records:
                continue

            if infer_layout:
                input_shape = ltf_workload[1][1]
                flops = 1
                for i in input_shape:
                    flops *= i

                # Rule out invalid layout transformations
                out = topi.layout_transform(data, in_layout, out_layout)
                out_flops = 1
                for i in topi.util.get_const_tuple(out.shape):
                    out_flops *= i

                if flops != out_flops:
                    inferred_time = INVALID_LAYOUT_TIME
                else:
                    inferred_time = flops * avg_time

                record_input = MeasureInput(target=self._target,
                                            task=None,
                                            config=None)
                record_output = MeasureResult(costs=(inferred_time, ),
                                              error_no=0,
                                              all_cost=-1,
                                              timestamp=-1)
                self._layout_transform_perf_records[ltf_workload] = (
                    record_input, record_output)
                continue

            records = []
            task = autotvm.task.create(layout_transform,
                                       args=args,
                                       target=self._target,
                                       target_host=target_host)
            task.workload = ltf_workload
            tuner = autotvm.tuner.GridSearchTuner(task)
            tuner.tune(n_trial=1,
                       measure_option=measure_option,
                       callbacks=[_log_to_list(records)])
            if not isinstance(records[0][1].costs[0], float):
                records[0] = (records[0][0], records[0][1]._replace(
                    costs=(INVALID_LAYOUT_TIME, )))
            self._layout_transform_perf_records[ltf_workload] = records[0]

        self._iterate_layout_transform(self._create_matrix_callback)
        self._logger.info("Benchmarking layout transformation successful.")
Example #13
def test_tuple():
    target = "llvm"
    dtype = "float32"
    dshape = (1, 5, 32, 32)
    layout = "NCHW"
    target_ops = [relay.nn.conv2d]

    data = relay.var("data", shape=dshape, dtype=dtype)
    w0 = relay.var("w0_weight")
    conv0 = relay.nn.conv2d(data,
                            w0,
                            channels=2,
                            kernel_size=(3, 3),
                            padding=(1, 1))
    w1 = relay.var("w1_weight")
    conv1 = relay.nn.conv2d(data,
                            w1,
                            channels=3,
                            kernel_size=(3, 3),
                            padding=(1, 1))
    out = relay.concatenate([conv0, conv1], axis=1)
    net = relay.Function(relay.analysis.free_vars(out), out)
    net, params = relay.testing.create_workload(net)

    tasks = autotvm.task.extract_from_program(net["main"],
                                              target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d, ))
    wkl_list = [
        create_workload((1, 5, 32, 32), (2, 5, 3, 3), (1, 1), (1, 1), (1, 1),
                        layout, layout, dtype, dtype),
        create_workload((1, 5, 32, 32), (3, 5, 3, 3), (1, 1), (1, 1), (1, 1),
                        layout, layout, dtype, dtype),
    ]
    costs = [0.01, 0.012, 0.03, 0.04]
    config_list = []
    cfg_dict = {
        "i":
        -1,
        "c":
        None,
        "e": [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [1, 2]],
              ["tile_ow", "sp", [4, 8]], ["unroll_kw", "ot", True]],
        "t":
        ""
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "i":
        -1,
        "c":
        None,
        "e": [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [1, 3]],
              ["tile_ow", "sp", [2, 16]], ["unroll_kw", "ot", False]],
        "t":
        ""
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "i":
        -1,
        "c":
        None,
        "e": [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [2, 1]],
              ["tile_ow", "sp", [4, 8]], ["unroll_kw", "ot", True]],
        "t":
        ""
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "i":
        -1,
        "c":
        None,
        "e": [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [3, 1]],
              ["tile_ow", "sp", [2, 16]], ["unroll_kw", "ot", False]],
        "t":
        ""
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))

    records = []

    wkl_list = wkl_list + wkl_list
    tasks = tasks + tasks
    for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks):
        task.workload = wkl
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost, ),
                                  error_no=0,
                                  all_cost=-1,
                                  timestamp=-1)
        records.append((ms_input, ms_output))

    ltf_records = []
    ltf_arg = [
        tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"
    ]
    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
    ltf_wkl = ('layout_transform', ) + autotvm.task.args_to_workload(ltf_arg)
    ltf_task = copy.deepcopy(tasks[0])
    ltf_task.workload = ltf_wkl
    ms_input = MeasureInput(target=target, task=ltf_task, config=None)
    ms_output = MeasureResult(costs=(1.91224744e-05, ),
                              error_no=0,
                              all_cost=-1,
                              timestamp=-1)
    ltf_records.append((ms_input, ms_output))

    executor = DPTuner(net, {"data": dshape}, records, target_ops, target)
    executor.benchmark_layout_transform(layout_records=ltf_records,
                                        infer_layout=True)
    executor.run()
    out = [record[0].config for record in executor.get_optimal_records()]
    expected_out = [records[2][0].config, records[1][0].config]
    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
                                % (str(expected_out), str(out))

    executor = PBQPTuner(net, {"data": dshape}, records, target_ops, target)
    executor.benchmark_layout_transform(layout_records=ltf_records,
                                        infer_layout=True)
    executor.run()
    out = [record[0].config for record in executor.get_optimal_records()]
    expected_out = [records[2][0].config, records[1][0].config]
    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
                                % (str(expected_out), str(out))
Example #14
def _create_data(target, dshape, dtype, layout):
    data = relay.var("data", shape=dshape, dtype=dtype)
    w0 = relay.var("w0_weight")
    conv0 = relay.nn.conv2d(data,
                            w0,
                            channels=16,
                            kernel_size=(3, 3),
                            padding=(1, 1))
    w1 = relay.var("w1_weight")
    conv1 = relay.nn.conv2d(conv0, w1, channels=32, kernel_size=(1, 1))
    w2 = relay.var("w2_weight")
    conv2 = relay.nn.conv2d(conv1,
                            w2,
                            channels=32,
                            kernel_size=(3, 3),
                            padding=(1, 1))
    out = relay.add(conv1, conv2)
    net = relay.Function(relay.analysis.free_vars(out), out)
    mod, params = relay.testing.create_workload(net)
    tasks = autotvm.task.extract_from_program(mod["main"],
                                              target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d, ))
    wkl_list = [
        create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1), (1, 1),
                        layout, layout, dtype, dtype),
        create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0), (1, 1),
                        layout, layout, dtype, dtype),
        create_workload((1, 32, 8, 8), (32, 32, 3, 3), (1, 1), (1, 1), (1, 1),
                        layout, layout, dtype, dtype),
    ]
    costs = [0.04, 0.012, 0.03]
    config_list = []
    cfg_dict = {
        "i":
        -1,
        "c":
        None,
        "e": [["tile_ic", "sp", [3, 1]], ["tile_oc", "sp", [4, 4]],
              ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]],
        "t":
        ""
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "i":
        -1,
        "c":
        None,
        "e": [["tile_ic", "sp", [2, 8]], ["tile_oc", "sp", [1, 32]],
              ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]],
        "t":
        ""
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "i":
        -1,
        "c":
        None,
        "e": [["tile_ic", "sp", [8, 4]], ["tile_oc", "sp", [4, 8]],
              ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]],
        "t":
        ""
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))

    records = []
    for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks):
        task.workload = wkl
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost, ),
                                  error_no=0,
                                  all_cost=-1,
                                  timestamp=-1)
        records.append((ms_input, ms_output))

    ltf_records = []
    ltf_arg = [
        tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"
    ]
    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
    ltf_wkl = ('layout_transform', ) + autotvm.task.args_to_workload(ltf_arg)
    ltf_task = copy.deepcopy(tasks[0])
    ltf_task.workload = ltf_wkl
    ms_input = MeasureInput(target=target, task=ltf_task, config=None)
    ms_output = MeasureResult(costs=(1.91224744e-05, ),
                              error_no=0,
                              all_cost=-1,
                              timestamp=-1)
    ltf_records.append((ms_input, ms_output))

    ltf_keys = []
    ltf_arg = [
        tvm.placeholder((1, 4, 8, 8, 4), dtype=dtype), "NCHW4c", "NCHW8c"
    ]
    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
    ltf_wkl = ('layout_transform', ) + autotvm.task.args_to_workload(ltf_arg)
    ltf_keys.append(ltf_wkl)
    ltf_arg = [
        tvm.placeholder((1, 1, 8, 8, 32), dtype=dtype), "NCHW32c", "NCHW4c"
    ]
    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
    ltf_wkl = ('layout_transform', ) + autotvm.task.args_to_workload(ltf_arg)
    ltf_keys.append(ltf_wkl)
    ltf_arg = [
        tvm.placeholder((1, 4, 8, 8, 8), dtype=dtype), "NCHW8c", "NCHW32c"
    ]
    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
    ltf_wkl = ('layout_transform', ) + autotvm.task.args_to_workload(ltf_arg)
    ltf_keys.append(ltf_wkl)

    return net, records, ltf_records, ltf_keys, tasks
Example #15
def test_triangle_block():
    target = "llvm"
    dtype = "float32"
    dshape = (1, 3, 8, 8)
    layout = "NCHW"
    conv2d = relay.op.get("nn.conv2d")
    target_ops = [conv2d]

    data = relay.var("data", shape=dshape, dtype=dtype)
    w0 = relay.var("w0_weight")
    conv0 = relay.nn.conv2d(data,
                            w0,
                            channels=16,
                            kernel_size=(3, 3),
                            padding=(1, 1))
    w1 = relay.var("w1_weight")
    conv1 = relay.nn.conv2d(conv0, w1, channels=32, kernel_size=(1, 1))
    w2 = relay.var("w2_weight")
    conv2 = relay.nn.conv2d(data,
                            w2,
                            channels=32,
                            kernel_size=(3, 3),
                            padding=(1, 1))
    out = relay.concatenate([conv0, conv1, conv2], axis=1)
    net = relay.Function(relay.analysis.free_vars(out), out)
    net, params = relay.testing.create_workload(net)

    tasks = autotvm.task.extract_from_program(net["main"],
                                              target=target,
                                              params=params,
                                              ops=(conv2d, ))
    costs = [0.04, 0.012, 0.03, 0.02, 0.02, 0.045]
    config_list = []
    cfg_dict = {
        "index":
        -1,
        "code_hash":
        None,
        "entity": [["tile_ic", "sp", [3, 1]], ["tile_oc", "sp", [4, 4]],
                   ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]]
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "index":
        -1,
        "code_hash":
        None,
        "entity": [["tile_ic", "sp", [2, 8]], ["tile_oc", "sp", [1, 32]],
                   ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]]
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "index":
        -1,
        "code_hash":
        None,
        "entity": [["tile_ic", "sp", [8, 4]], ["tile_oc", "sp", [4, 8]],
                   ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]]
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "index":
        -1,
        "code_hash":
        None,
        "entity": [["tile_ic", "sp", [1, 3]], ["tile_oc", "sp", [2, 8]],
                   ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]]
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "index":
        -1,
        "code_hash":
        None,
        "entity": [["tile_ic", "sp", [4, 4]], ["tile_oc", "sp", [2, 16]],
                   ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]]
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
    cfg_dict = {
        "index":
        -1,
        "code_hash":
        None,
        "entity": [["tile_ic", "sp", [16, 2]], ["tile_oc", "sp", [8, 4]],
                   ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]]
    }
    config_list.append(ConfigEntity.from_json_dict(cfg_dict))

    records = []

    tasks = tasks + tasks
    for cost, config, task in zip(costs, config_list, tasks):
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost, ),
                                  error_no=0,
                                  all_cost=-1,
                                  timestamp=-1)
        records.append((ms_input, ms_output))

    ltf_records = []
    ltf_arg = [
        tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"
    ]
    ltf_task = autotvm.task.create('layout_transform', ltf_arg, target)
    ms_input = MeasureInput(target=target, task=ltf_task, config=None)
    ms_output = MeasureResult(costs=(1.91224744e-05, ),
                              error_no=0,
                              all_cost=-1,
                              timestamp=-1)
    ltf_records.append((ms_input, ms_output))

    executor = DPTuner(net, {"data": dshape}, records, target_ops, target)
    executor.benchmark_layout_transform(layout_records=ltf_records,
                                        infer_layout=True)
    executor.run()
    out = [record[0].config for record in executor.get_optimal_records()]
    expected_out = [
        records[3][0].config, records[1][0].config, records[2][0].config
    ]
    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
                                % (str(expected_out), str(out))

    executor = PBQPTuner(net, {"data": dshape}, records, target_ops, target)
    executor.benchmark_layout_transform(layout_records=ltf_records,
                                        infer_layout=True)
    executor.run()
    out = [record[0].config for record in executor.get_optimal_records()]
    expected_out = [
        records[3][0].config, records[1][0].config, records[2][0].config
    ]
    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
                                % (str(expected_out), str(out))
Example #16
def tune_kernels(
    tasks,
    measure_top_n,
    measure_option,
    tuner="random",
    early_stopping=None,
    n_trial=5000,
    log_filename="tuning.log",
):
    """Tune kernels with the ranking model."""

    remeasure_option = None
    if tuner == "round":
        # Set up another measure option for the final remeasurement.
        remeasure_option = autotvm.measure_option(
            builder=LocalBuilder(),
            runner=measure_option["runner"].local_runner,
        )
        assert isinstance(measure_option["runner"], RankModelRunner)

    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        callbacks = []
        if task.name not in measure_option["runner"].models:
            print("%s %s not covered by cost models" % (prefix, task.name))
            continue

        # create tuner
        if tuner == "round":
            tuner_obj = RoundTuner(task, n_cfg=measure_top_n)
            # Use different callbacks.
            callbacks = [rank_progress(n_trial, prefix=prefix)]
        else:
            if tuner in ("xgb", "xgb-rank"):
                tuner_obj = XGBTuner(task, loss_type="rank")
            elif tuner == "ga":
                tuner_obj = GATuner(task, pop_size=50)
            elif tuner == "random":
                tuner_obj = RandomTuner(task)
            elif tuner == "gridsearch":
                tuner_obj = GridSearchTuner(task)
            else:
                raise ValueError("Invalid tuner: " + tuner)

            callbacks = [
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(log_filename),
            ]

        tic = time.time()

        # do tuning
        tuner_obj.tune(
            n_trial=n_trial,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=callbacks,
        )

        # Round tuner needs an extra measurement step to get the real throughputs.
        if tuner == "round":
            top_cfgs = tuner_obj.get_top_rank_cfgs(1)
            measure_batch = create_measure_batch(task, remeasure_option)
            inputs = [
                MeasureInput(task.target, task, config) for config in top_cfgs
            ]
            sys.stderr.write("{} Measure Top {} Configs".format(
                prefix, len(inputs)))
            results = measure_batch(inputs)

            best_idx, best_flops = max(
                [(idx, i.task.flop / np.mean(r.costs) /
                  1e9 if r.error_no == 0 else 0)
                 for idx, (i, r) in enumerate(zip(inputs, results))],
                key=lambda x: x[1],
            )

            sys.stderr.write(" | Best %.2f GFLOPS at Top %d | %.2fs\n" %
                             (best_flops, best_idx, time.time() - tic))
            autotvm.callback.log_to_file(log_filename)(None, inputs, results)