Example #1
def test_task_extraction_cuda():
    auto_scheduler.enable_relay_integration()
    target = tvm.target.Target("cuda")

    mod, params = get_network("mlp")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    assert len(tasks) == 1
    assert sum(task_weights) == 2

    for layout in ["NHWC", "NCHW"]:
        mod, params = get_network("resnet-18", layout=layout)
        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

        assert len(tasks) == 21
        assert sum(task_weights) == 22

        mod, params = get_network("mobilenet", layout=layout)
        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

        assert len(tasks) == 20
        assert sum(task_weights) == 28

    for layout in ["NCDHW", "NDHWC"]:
        mod, params = get_network("resnet3d-18", layout=layout)
        tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

        assert len(tasks) == 21
        assert sum(task_weights) == 22

    auto_scheduler.enable_relay_integration(False)
Example #2
def test_task_extraction_cuda():
    target = tvm.target.Target("cuda")

    mod, params = get_network("mlp")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)

    assert len(tasks) == 1
    assert sum(task_weights) == 2

    for layout in ["NHWC", "NCHW"]:
        mod, params = get_network("resnet-18", layout=layout)
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"], params, target)

        assert len(tasks) == 24
        assert sum(task_weights) == 25

        mod, params = get_network("mobilenet", layout=layout)
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"], params, target)

        assert len(tasks) == 22
        assert sum(task_weights) == 30

    for layout in ["NCDHW", "NDHWC"]:
        mod, params = get_network("resnet3d-18", layout=layout)
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"], params, target)

        assert len(tasks) == 23
        assert sum(task_weights) == 24, sum(task_weights)
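Example #3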
def test_task_extraction_cuda():
    auto_scheduler.enable_relay_integration()

    mod, params = get_network("mlp")
    target = tvm.target.Target("cuda")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

    assert len(tasks) == 1
    assert sum(task_weights) == 2

    mod, params = get_network("resnet-18")
    target = tvm.target.Target("cuda")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

    assert len(tasks) == 21
    assert sum(task_weights) == 22
Example #4
def auto_scheduler_tune(network, target, input_name, log_file):
    if os.path.exists(log_file):
        os.remove(log_file)
    mod, net_params, input_shape, output_shape = get_network(network)
    if network not in ["bert"]:
        # convert to NHWC layout
        desired_layouts = {'nn.conv2d': ['NHWC', 'default']}
        seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
                                        relay.transform.ConvertLayout(desired_layouts)])
        with tvm.transform.PassContext(opt_level=3):
            mod = seq(mod)

    if "cpu" in target.keys:
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=20000,  # 20000 trials are recommended to achieve the best performance
            runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
    else:
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=20000,  # 20000 trials are recommended to achieve the best performance
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], net_params, target)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(tuning_opt)
Example #5
    def local_auto_scheduler(self,
                             repeat=1,
                             min_repeat_ms=300,
                             timeout=10,
                             num_measure_trials=200):
        # extract tasks
        tasks, task_weights = auto_scheduler.extract_tasks(
            self.mod["main"], self.params, self.target)
        for idx, task in enumerate(tasks):
            logger.debug("========== Task %d  (workload key: %s) ==========" %
                         (idx, task.workload_key))
            logger.debug(task.compute_dag)

        # generate tuner
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)

        logging.info("Begin tuning...")
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            repeat=repeat, min_repeat_ms=min_repeat_ms, timeout=timeout)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_measure_trials,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(self.log_file)],
        )
        tuner.tune(tune_option)

        # update self.lib
        with auto_scheduler.ApplyHistoryBest(self.log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                self._lib = relay.build(self.mod,
                                        target=self.target,
                                        params=self.params)
            logger.info(f"load optimized library from {self.log_file}")
def test_custom_hash_func_extract_tasks():
    @_ffi_api.register_func("auto_scheduler.compute_dag.hash_func")
    def counting_unique_hash(str_dag):
        ret = counting_unique_hash.i
        counting_unique_hash.i += 1
        return ret

    counting_unique_hash.i = 0

    mod, _ = get_network("mobilenet", layout="NHWC")
    tasks, _ = auto_scheduler.extract_tasks(mod["main"],
                                            None,
                                            "llvm",
                                            include_simple_tasks=True)

    hash_values = []
    for task in tasks:
        # task.workload_key should look like
        # [43, [3, 3, 1024, 1], [1024], [3, 3, 1024, 1]] where the first int is the result of the hash
        # Extract the hash and keep track of every hash
        hash_value = int(task.workload_key[1:].split(",")[0])
        hash_values.append(hash_value)

    # All values are unique, and we know the min and max
    # This is a sufficient condition to know that hashes in hash_values are an increasing list
    # of hashes up to counting_unique_hash.i - 1
    assert len(hash_values) == len(set(hash_values))
    assert min(hash_values) == 0
    assert max(hash_values) == counting_unique_hash.i - 1
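Example #7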
def test_tuning_cuda():
    auto_scheduler.enable_relay_integration()

    # Extract tasks
    mod, params = get_network("mlp")
    target = tvm.target.Target("cuda")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    objective = lambda costs: sum(c * w for c, w in zip(costs, task_weights))

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=100)
        tuner = auto_scheduler.TaskScheduler(tasks, objective)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            num_measures_per_round=1,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(mod, target=target, params=params)

    # Todo(merrymercy): compile without any history to test the fallback mechanism

    auto_scheduler.enable_relay_integration(False)
Example #8
def tune_network(network, target):
    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)
Example #9
    def verify_task_extraction(func, expected_task, include_simple_tasks=False):
        mod = tvm.IRModule.from_expr(func)
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"], None, target, include_simple_tasks=include_simple_tasks
        )

        assert len(tasks) == expected_task
        assert len(task_weights) == expected_task
Example #10
def tune_network(network, target):
    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)

        # Compile without auto-scheduler and any other optimization for correctness check
        with tvm.transform.PassContext(opt_level=0):
            lib2 = relay.build(mod, target=target, params=params)

        # Check the correctness
        def get_output(data, lib):
            ctx = tvm.gpu()
            module = graph_runtime.GraphModule(lib["default"](ctx))
            module.set_input("data", data)
            module.run()
            return module.get_output(0).asnumpy()

        np.random.seed(0)
        if network == "mlp":
            data = np.random.uniform(size=(1, 32))
        elif network == "winograd-test":
            data = np.random.uniform(size=(1, 23, 40, 32))
        else:
            raise ValueError("Unknown network: " + network)

        actual_output = get_output(data, lib)
        expected_output = get_output(data, lib2)

        tvm.testing.assert_allclose(actual_output,
                                    expected_output,
                                    rtol=1e-4,
                                    atol=1e-4)
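Example #11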
def test_task_extraction_cuda(params):
    target = tvm.target.Target("cuda")
    network, layout, expected_task, expected_weights = params

    mod, params = get_network(network, layout=layout)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)
    for task, weight in zip(tasks, task_weights):
        print(task.desc, task.workload_key, weight)

    assert len(tasks) == expected_task
    assert sum(task_weights) == expected_weights
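Example #12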
def test_dump_workload_to_dag_extract_tasks():
    mod, _ = get_network("mobilenet", layout="NHWC")
    with tempfile.NamedTemporaryFile() as f:
        tasks, _ = auto_scheduler.extract_tasks(
            mod["main"],
            None,
            "llvm",
            include_simple_tasks=True,
            dump_workload_to_dag_log=f.name)
        expected = {task.workload_key: str(task.compute_dag) for task in tasks}
        actual = json.load(f)
        assert expected == actual
Example #13
def autoscheduler_get_tuning_tasks(
    mod: tvm.IRModule,
    params: Dict[str, tvm.nd.NDArray],
    target: str,
    target_host: Optional[str] = None,
    alter_layout: Optional[str] = None,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
):
    """Get the autoscheduler tuning tasks for a given relay module.

    Parameters
    ----------
    mod : tvm.IRModule
        The relay module from which to extract tuning tasks.
    params : dict
        The params for the relay module.
    target : tvm.target.Target
        The compilation target.
    target_host : str, optional
        The compilation target for the host.
    alter_layout : str, optional
        The layout to convert the graph to. Note that the ConvertLayout
        pass doesn't currently guarantee that the whole graph will be
        converted to the chosen layout.
    hardware_params : Optional[HardwareParams]
        Hardware parameters used for the search tasks

    Returns
    -------
    tasks : list of auto_scheduler.SearchTask
        The list of extracted tasks to be tuned.
    weights : List[int]
        The weight (i.e. the number of appearances) of each extracted task.
    """
    target, target_host = Target.check_and_update_host_consist(
        target, target_host)

    if alter_layout:
        mod = common.convert_graph_layout(mod, alter_layout)

    # Extract the tasks
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"],
        params,
        target=target,
        hardware_params=hardware_params,
        include_simple_tasks=include_simple_tasks,
    )

    return tasks, task_weights
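A minimal usage sketch for the helper above (hedged: the tvm.relay.testing workload and the direct call are for illustration only; the helper itself is assumed to be importable from the surrounding module):

from tvm.relay import testing

mod, params = testing.mlp.get_workload(batch_size=1)
tasks, weights = autoscheduler_get_tuning_tasks(
    mod, params, target="llvm", include_simple_tasks=True)
for task, weight in zip(tasks, weights):
    print(task.workload_key, weight)
Example #14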
def tune_and_check(mod, data, weight):
    # Extract tasks from a relay program
    target = tvm.target.Target("llvm")
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod, target=target, params={"weight": weight})

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tune tasks
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=1,
            num_measures_per_round=1,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")

        # Compile
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True},
            ):
                lib = relay.build(mod,
                                  target=target,
                                  params={"weight": weight})

        # Compile without auto-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            lib2 = relay.build(mod, target=target, params={"weight": weight})

        def get_output(data, lib):
            dev = tvm.cpu()
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input("data", data)
            module.run()

            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, lib)
        expected_output = get_output(data, lib2)

        tvm.testing.assert_allclose(actual_output,
                                    expected_output,
                                    rtol=1e-4,
                                    atol=2e-4)
Example #15
def test_check_auto_schedule_tuning(host, port):  # pylint: disable=too-many-locals
    log_file = TEMPORARY_DIRECTORY.relpath("ios_tuning_stat.log")
    target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin")
    mod, params = relay.testing.mlp.get_workload(batch_size=4,
                                                 image_shape=(1, 4, 4))

    try:
        status_ok = True
        measure_runner = auto_scheduler.RPCRunner(
            DEVICE_KEY,
            host,
            port,
            min_repeat_ms=1,
            timeout=10,
            n_parallel=multiprocessing.cpu_count(),
        )
        builder = auto_scheduler.LocalBuilder(timeout=10,
                                              build_func=ios_create_dylib)
        tune_option = auto_scheduler.TuningOptions(
            builder=builder,
            num_measure_trials=2,
            num_measures_per_round=1,
            runner=measure_runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
            verbose=0,
        )

        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"], params, target)
        tasks, task_weights = tasks[:2], task_weights[:2]
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tuner.tune(tune_option, search_policy="sketch.random")

        # Check tuning log
        tuning_statistic = list(load_records(log_file))
        for _, measure_result in tuning_statistic:
            if measure_result.error_no != MeasureErrorNo.NO_ERROR:
                raise ValueError(
                    f"Error for MeasureResult. Error code: {measure_result.error_no},"
                    f" for details see MeasureErrorNO.")

    except Exception as e:  # pylint: disable=broad-except
        status_ok = False
        print(e)

    assert status_ok, "Tuning failed, see logs."
Example #16
def auto_scheduler_tune(network, batch_size, dtype, target, log_file):
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    #if os.path.exists(log_file):
    #    os.remove(log_file)

    layout = "NHWC"
    mod, params, input_name, input_shape, output_shape = get_network(
        network, batch_size, dtype, layout)

    n_trials = network_to_n_trials[(network, batch_size, dtype,
                                    str(target.kind))]

    if "cpu" in target.keys:
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=auto_scheduler.LocalRunner(repeat=10,
                                              enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
    else:
        min_repeat_ms = 450 if network in ["bert"] else 300
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            repeat=1, min_repeat_ms=min_repeat_ms, timeout=10)
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)
    print(log_file)
    update_file(log_file, tasks)
    # NOTE: this early return skips the task printing and tuning below.
    return
    for idx, task in enumerate(tasks):
        print("========== Task %d  (workload key: %s) ==========" %
              (idx, task.workload_key))
        print(task.compute_dag)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(tuning_opt)
Example #17
    def remote_auto_scheduler(self, device_key, rpc_host, rpc_port):
        # generate tasks
        tasks, task_weights = auto_scheduler.extract_tasks(
            self.mod["main"], self.params, self.target)
        for idx, task in enumerate(tasks):
            logger.debug("========== Task %d  (workload key: %s) ==========" %
                         (idx, task.workload_key))
            logger.debug(task.compute_dag)

        # generate tuner
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)

        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=200,
            builder=auto_scheduler.LocalBuilder(),
            runner=auto_scheduler.RPCRunner(
                device_key,
                host=rpc_host,
                port=rpc_port,
                timeout=30,
                repeat=1,
                min_repeat_ms=200,
                enable_cpu_cache_flush=True,
            ),
            measure_callbacks=[auto_scheduler.RecordToFile(self.log_file)],
        )
        tuner.tune(tune_option)

        # update self.lib
        with auto_scheduler.ApplyHistoryBest(self.log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                self._lib = relay.build(self.mod,
                                        target=self.target,
                                        params=self.params)
            logger.info(f"load optimized library from {self.log_file}")
Example #18
def tune_network(network, target):
    auto_scheduler.enable_relay_integration()

    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(mod, target=target, params=params)

    # Todo(merrymercy): when the cpu backend is upstreamed, do the following things:
    # 1. compile without history to test the fallback mechanism
    # 2. check the correctness of layout rewrite / winograd pre-transform

    auto_scheduler.enable_relay_integration(False)
Example #19
#################################################################
# Extract Search Tasks
# --------------------
# Next, we extract the search tasks and their weights from a network.
# The weight of a task is the number of appearances of the task's subgraph
# in the whole network.
# By using the weight, we can approximate the end-to-end latency of the network
# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the
# latency of a task and :code:`weight[t]` is the weight of the task.
# The task scheduler will just optimize this objective.

# Extract tasks from the network
print("Extract tasks...")
mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host)

for idx, task in enumerate(tasks):
    print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)
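# As a quick, hypothetical illustration of the objective described above: given
# a latency estimate for every task, the end-to-end estimate is the weighted
# sum. `task_latencies` below is a placeholder and is NOT returned by
# extract_tasks; it only exists to make the formula concrete.
task_latencies = [1.0e-3] * len(tasks)  # placeholder per-task latencies (seconds)
estimated_latency = sum(latency * weight
                        for latency, weight in zip(task_latencies, task_weights))
print("Estimated end-to-end latency: %.6f s" % estimated_latency)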
######################################################################
# .. note:: How to get the hardware parameters from remote device
#
#   .. code-block:: python
#
#     from tvm.auto_scheduler.utils import request_remote
#     remote = request_remote(device_key, "0.0.0.0", 9190)
#     dev = remote.cl()
#     max_shared_memory_per_block = dev.max_shared_memory_per_block
#     # There is no explicit local memory limitation,
#     # so we can use INT32_MAX to disable the check on local_memory.
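#     max_threads_per_block = dev.max_threads_per_block
#     warp_size = dev.warp_size
#     # A hedged sketch of feeding the queried values into HardwareParams,
#     # following the keyword usage shown in the CUDA examples below; treat
#     # the remaining numbers as placeholders rather than tuned values.
#     hardware_params = auto_scheduler.HardwareParams(
#         num_cores=-1,
#         vector_unit_bytes=16,
#         cache_line_bytes=64,
#         max_shared_memory_per_block=max_shared_memory_per_block,
#         max_local_memory_per_block=2**31 - 1,  # INT32_MAX disables the local memory check
#         max_threads_per_block=max_threads_per_block,
#         max_vthread_extent=warp_size // 4,
#         warp_size=warp_size,
#     )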
Example #20
def _tvm_compile(fx_module,
                 example_inputs,
                 target=None,
                 tuning_logfile=None,
                 use_ansor_tuning=False):
    import tvm
    from tvm import relay, auto_scheduler
    from tvm.contrib import graph_executor
    import os

    # Find the target and device for TVM.
    dev = tvm.cpu(0)
    if target is None:
        raise ValueError("Setup the TVM target correctly.")
    elif isinstance(target, str):
        if "cuda" in target:
            dev = tvm.cuda(0)
        target = tvm.target.Target(target)
    elif isinstance(target, tvm.target.target.Target):
        if "cuda" in target.keys:
            dev = tvm.cuda(0)

    # JIT the model and pass it to the TorchScript-to-Relay frontend parser.
    # TVM tutorials suggest tracing instead of scripting, mainly to avoid
    # Pythonic computation showing up in the JIT module. However, with Python
    # key tracing, AOT Autograd produces simpler graphs, so we use scripting
    # here to retrieve the JIT module.
    jit_mod = torch.jit.script(fx_module)
    shape_list = [(f"inp_{idx}", i.shape)
                  for idx, i in enumerate(example_inputs)]
    mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)

    # TVM Autotuning
    if use_ansor_tuning:
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"], params, target)
        if tuning_logfile is None:
            log_file = f"{time.time()}.json"
        else:
            log_file = f"{tuning_logfile}.json"
        if len(tasks) != 0:
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tune_option = auto_scheduler.TuningOptions(
                num_measure_trials=20000,
                measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
                # early_stopping=1000,
                # verbose=2,
            )
            tuner.tune(tune_option)
    elif tuning_logfile is not None:
        log_file = f"{tuning_logfile}.json"

    if use_ansor_tuning or tuning_logfile is not None:
        assert os.path.exists(log_file)
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)
    else:
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=target, params=params)

    # Get a graph executor graph module
    m = graph_executor.GraphModule(lib["default"](dev))

    def exec_tvm(*args):
        for idx, arg in enumerate(args, 0):
            if arg.dim() != 0:
                m.set_input(
                    f"inp_{idx}",
                    tvm.nd.from_dlpack(
                        torch.utils.dlpack.to_dlpack(arg.contiguous())),
                )
        m.run()
        outs = [
            torch.utils.dlpack.from_dlpack(m.get_output(i).to_dlpack())
            for i in range(m.get_num_outputs())
        ]
        return outs

    return exec_tvm
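Example #21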
def test_task_extraction():
    ishape = (1, 3, 224, 224)
    w1shape = (32, 3, 3, 3)
    w2shape = (32, 32, 3, 3)
    dtype = "float32"
    target = tvm.target.Target("llvm")

    def get_func():
        data = relay.var("data", shape=(ishape), dtype=dtype)
        weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype)
        weight2 = relay.var("weight2", shape=(w2shape), dtype=dtype)

        conv2d = relay.nn.conv2d(data,
                                 weight1,
                                 kernel_size=(3, 3),
                                 padding=(1, 1))
        relu = relay.nn.relu(conv2d)
        conv2d = relay.nn.conv2d(relu,
                                 weight2,
                                 kernel_size=(3, 3),
                                 padding=(1, 1))
        out = relay.nn.relu(conv2d)
        return relay.Function([data, weight1, weight2], out)

    def get_fused_func():
        data = relay.var("data", shape=(ishape), dtype=dtype)
        weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype)
        weight2 = relay.var("weight2", shape=(w2shape), dtype=dtype)

        fused_func = get_func()

        # Mark the function as primitive so that FuseOps leaves it untouched.
        fused_func = fused_func.with_attr("Primitive",
                                          tvm.tir.IntImm("int32", 1))

        call = relay.Call(fused_func, [data, weight1, weight2])
        return relay.Function([data, weight1, weight2], call)

    def get_simple_func():
        data = relay.var("data", relay.TensorType((1, 2, 3), "float32"))
        out = relay.image.affine_grid(data, (150, 150))
        return relay.Function([data], out)

    def get_func_with_unsupported_op():
        def get_postproc_func():
            data = relay.var("data", shape=((1, 3, 6)), dtype=dtype)
            out = relay.nn.relu(data)
            func = relay.Function([data], out)
            func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
            return func

        cls_prob = relay.var("cls_prob",
                             relay.ty.TensorType((1, 3, 3), "float32"))
        loc_pred = relay.var("loc_pred",
                             relay.ty.TensorType((1, 3 * 4), "float32"))
        anchors = relay.var("anchors", relay.ty.TensorType((1, 3, 4),
                                                           "float32"))

        mtl = relay.vision.multibox_transform_loc(cls_prob=cls_prob,
                                                  loc_pred=loc_pred,
                                                  anchor=anchors)
        nms = relay.vision.non_max_suppression(mtl[0],
                                               mtl[1],
                                               mtl[0],
                                               return_indices=False)
        out = relay.Call(get_postproc_func(), [nms])
        return relay.Function([cls_prob, loc_pred, anchors], out)

    func = get_func()
    mod = tvm.IRModule.from_expr(func)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None,
                                                       target)

    # Relay FuseOps puts the two conv2ds into separate functions, resulting in two tasks.
    assert len(tasks) == 2
    assert len(task_weights) == 2

    func = get_fused_func()
    mod = tvm.IRModule.from_expr(func)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None,
                                                       target)

    # Because the function is marked primitive, Relay FuseOps does not break it up, resulting in one task.
    assert len(tasks) == 1
    assert len(task_weights) == 1

    func = get_simple_func()
    mod = tvm.IRModule.from_expr(func)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None,
                                                       target)

    # A Relay function without complex ops does not form a task by default.
    assert len(tasks) == 0
    assert len(task_weights) == 0

    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"], None, target, include_simple_tasks=True)

    # Every Relay function becomes a task regardless of which ops are in its body.
    assert len(tasks) == 1
    assert len(task_weights) == 1

    # Func1 (with NMS) -> Func2 (injective).
    func = get_func_with_unsupported_op()
    mod = tvm.IRModule.from_expr(func)
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"], None, target, include_simple_tasks=True)

    # The function with NMS fails to form a task, but the other function with ReLU becomes one.
    assert len(tasks) == 1
    assert len(task_weights) == 1
Example #22
def main():
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
        timeout=ARGS.rpc_config.session_timeout_sec,
    )

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f"  input_name : {item['name']}")
        print(f"  input_shape: {item['shape']}")
        print(f"  input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    input_data = {
        item["name"]: generate_input_data(item["shape"], item["dtype"])
        for item in ARGS.input_shape
    }

    with ms.Profiler() as profiler:
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"],
            params,
            target=ARGS.target,
            hardware_params=hardware_params,
        )
        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
            print(f"==== Task {idx}: {task.desc} "
                  f"(weight {task_weight} key: {task.workload_key}) =====")
            print(task.compute_dag)

        if ARGS.num_trials > 0:
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tuner.tune(
                auto_scheduler.TuningOptions(
                    num_measure_trials=ARGS.num_trials,
                    runner=runner,
                    measure_callbacks=[
                        auto_scheduler.RecordToFile(log_file),
                    ],
                ),
                adaptive_training=ARGS.adaptive_training,
            )

        relay_build = {
            "graph": relay.build,
            "vm": relay.vm.compile
        }[ARGS.backend]
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True},
            ):
                lib = relay_build(
                    mod,
                    target=ARGS.target,
                    params=params,
                )
    print("Tuning Time:")
    print(profiler.table())

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=create_timer(ARGS.backend),
        backend=ARGS.backend,
    )
Example #23
from matplotlib import pyplot as plt

out_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode="L")
out_cb = img_cb.resize(out_y.size, Image.BICUBIC)
out_cr = img_cr.resize(out_y.size, Image.BICUBIC)
result = Image.merge("YCbCr", [out_y, out_cb, out_cr]).convert("RGB")
canvas = np.full((672, 672 * 2, 3), 255)
canvas[0:224, 0:224, :] = np.asarray(img)
canvas[:, 672:, :] = np.asarray(result)
plt.imshow(canvas.astype(np.uint8))
plt.show()

######################################################################
# Tune the model
# ---------------------------------------------
tasks, task_weights = auto_scheduler.extract_tasks(mod['main'], params, target)

dtype = "float32"
batch_size = 1
log_file = "%s-B%d-%s.json" % ("superres", batch_size, target)

for idx, task in enumerate(tasks):
    print("========== Task %d  (workload key: %s) ==========" %
          (idx, task.workload_key))
    print(task.compute_dag)


def run_tuning():
    print("Begin tuning...")
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1,
                                                        min_repeat_ms=300,
Example #24
def tune_network(network, target):
    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60, device=0)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
            ):
                lib = relay.build(mod, target=target, params=params)

        # Also test that multiple log files can be loaded.
        with auto_scheduler.ApplyHistoryBest([log_file, log_file]) as best:
            assert isinstance(
                best, auto_scheduler.dispatcher.ApplyHistoryBest
            ), "Unable to load multiple log files jointly."

        # Confirm iterables can be directly loaded.
        loaded_recs = auto_scheduler.dispatcher.load_records(log_file)
        with auto_scheduler.ApplyHistoryBest(iter(loaded_recs)) as best:
            assert isinstance(
                best, auto_scheduler.dispatcher.ApplyHistoryBest
            ), "Unable to ingest logs from an interator."

        # Sample a schedule when missing
        with auto_scheduler.ApplyHistoryBestOrSample(None, num_measure=2):
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
            ):
                lib2 = relay.build(mod, target=target, params=params)

        # Compile without auto-scheduler and any other optimization for correctness check
        with tvm.transform.PassContext(opt_level=0):
            ref_lib = relay.build(mod, target=target, params=params)

        # Check the correctness
        def get_output(data, lib):
            dev = tvm.cuda()
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input("data", data)
            module.run()
            return module.get_output(0).numpy()

        np.random.seed(0)
        if network == "mlp":
            data = np.random.uniform(size=(1, 32))
        elif network == "winograd-test":
            data = np.random.uniform(size=(1, 23, 40, 32))
        else:
            raise ValueError("Unknown network: " + network)

        actual_output1 = get_output(data, lib)
        actual_output2 = get_output(data, lib2)
        expected_output = get_output(data, ref_lib)

        tvm.testing.assert_allclose(actual_output1, expected_output, rtol=1e-4, atol=1e-4)
        tvm.testing.assert_allclose(actual_output2, expected_output, rtol=1e-4, atol=1e-4)
Example #25
def main():
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f"  input_name: {item['name']}")
        print(f"  input_shape: {item['shape']}")
        print(f"  input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"],
        params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
        print(
            f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) ====="
        )
        print(task.compute_dag)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(
        auto_scheduler.TuningOptions(
            num_measure_trials=ARGS.num_trials,
            runner=runner,
            measure_callbacks=[
                auto_scheduler.RecordToFile(log_file),
            ],
        ))

    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True},
        ):
            lib = relay.build(
                mod,
                target=ARGS.target,
                params=params,
            )
    graph, rt_mod, params = lib.graph_json, lib.lib, lib.params
    input_data = {}
    for item in ARGS.input_shape:
        input_name = item["name"]
        input_shape = item["shape"]
        input_dtype = item["dtype"]
        if input_dtype.startswith("float"):
            input_data[input_name] = np.random.uniform(
                size=input_shape).astype(input_dtype)
        else:
            input_data[input_name] = np.random.randint(low=0,
                                                       high=10000,
                                                       size=input_shape,
                                                       dtype=input_dtype)

    def f_timer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.graph_executor import GraphModule

        # pylint: enable=import-outside-toplevel

        mod = GraphModule(rt_mod["default"](dev))
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        ftimer = mod.module.time_evaluator(
            "run",
            dev,
            min_repeat_ms=500,
            repeat=3,
        )
        results = list(np.array(ftimer().results) * 1000.0)  # type: ignore
        print("Running time in time_evaluator: ", results)

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_timer,
    )

    def f_per_layer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint: enable=import-outside-toplevel
        mod = create(graph, rt_mod, dev)
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
        graph_time = mod.run_individual(number=10,
                                        repeat=1,
                                        min_repeat_ms=5000)
        print("|graph_nodes| = ", len(graph_nodes))
        print("|graph_time| = ", len(graph_time))
        graph_nodes_time = {
            k: float(v)
            for k, v in zip(graph_nodes, graph_time)
        }
        for k, v in graph_nodes_time.items():
            print(f"{k} : {v:.3f}")

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=rt_mod,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_per_layer,
    )
Example #26
def main():
    # Get Model
    print("Get Model...")
    onnx_model = onnx.load(args.onnx_path)
    shape_dict = {}
    for input in onnx_model.graph.input:
        shape_dict[input.name] = [
            dim.dim_value for dim in input.type.tensor_type.shape.dim
        ]
    mod, params = relay.frontend.from_onnx(onnx_model)

    bs_r = args.bsr
    bs_c = args.bsc
    sparsity = args.sparsity

    # Convert to sparse model
    mod, params = ddo.simplify_fc_transpose.convert(mod["main"], params)
    mod, params = ddo.bsr_conv2d.convert(mod,
                                         params, (bs_r, bs_c),
                                         sparsity_threshold=sparsity,
                                         layout='NHWC')
    mod = tvm.IRModule.from_expr(mod)

    # Set tune config
    target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon")
    device_key = "pixel2"
    rpc_host = "127.0.0.1"
    rpc_port = 9190

    log_file = f"{str(args.onnx_path).split('.')[-2]}.json"

    # Extract tasks
    print("Extract tasks...")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)

    for idx, task in enumerate(tasks):
        print("========== Task %d  (workload key: %s) ==========" %
              (idx, task.workload_key))
        print(task.compute_dag)

    def tune_and_evaluate():
        print("Begin tuning...")
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=200,
            builder=auto_scheduler.LocalBuilder(build_func="ndk"),
            runner=auto_scheduler.RPCRunner(
                device_key,
                host=rpc_host,
                port=rpc_port,
                timeout=30,
                repeat=1,
                min_repeat_ms=200,
                enable_cpu_cache_flush=True,
            ),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

        tuner.tune(tune_option)

        # Compile with the history best
        print("Compile...")
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)

        # Export library
        tmp = tempdir()
        filename = "net.so"
        lib.export_library(tmp.relpath(filename), ndk.create_shared)

        # Upload module to device
        print("Upload...")
        remote = auto_scheduler.utils.request_remote(device_key,
                                                     rpc_host,
                                                     rpc_port,
                                                     timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # Create graph executor
        dev = remote.cpu()
        module = graph_executor.GraphModule(rlib["default"](dev))
        for key, value in shape_dict.items():
            data_tvm = tvm.nd.array(
                (np.random.uniform(size=value)).astype("float32"))
            module.set_input(key, data_tvm)

        # Evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run",
                                              dev,
                                              repeat=3,
                                              min_repeat_ms=500)
        prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))

    tune_and_evaluate()