def test_task_scheduler_round_robin():
    tasks = []
    for n in [2, 4, 8]:
        tasks.append(
            auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n),
                                       "llvm"))

    def objective_func(costs):
        return sum(costs)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name
        num_trials_per_task = 2

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_trials_per_task * len(tasks),
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(tasks,
                                                      objective_func,
                                                      strategy="round-robin")
        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the result of round robin
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, res in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        for task in tasks:
            assert counters[task.workload_key] == num_trials_per_task

        # test continuous tuning (restoring the status)
        task_scheduler = auto_scheduler.TaskScheduler(tasks,
                                                      objective_func,
                                                      strategy="round-robin",
                                                      load_log_file=log_file)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=len(tasks),
            num_measures_per_round=1,
        )
        task_scheduler.tune(tune_option, search_policy="sketch.random")
        del measure_ctx
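
Note that `create_task` is an older entry point; Example #16 below constructs tasks directly with `SearchTask`. A minimal equivalent sketch, assuming `matmul_auto_scheduler_test` is registered as a workload in the surrounding test utilities:

def make_matmul_tasks():
    # Equivalent task creation with the SearchTask API used in Example #16.
    # Assumes matmul_auto_scheduler_test is decorated with
    # @auto_scheduler.register_workload.
    return [
        auto_scheduler.SearchTask(
            func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm")
        for n in [2, 4, 8]
    ]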
Example #2
    def local_auto_scheduler(self,
                             repeat=1,
                             min_repeat_ms=300,
                             timeout=10,
                             num_measure_trials=200):
        # extract tasks
        tasks, task_weights = auto_scheduler.extract_tasks(
            self.mod["main"], self.params, self.target)
        for idx, task in enumerate(tasks):
            logger.debug("========== Task %d  (workload key: %s) ==========" %
                         (idx, task.workload_key))
            logger.debug(task.compute_dag)

        # generate tuner
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)

        logging.info("Begin tuning...")
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            repeat=repeat, min_repeat_ms=min_repeat_ms, timeout=timeout)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_measure_trials,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(self.log_file)],
        )
        tuner.tune(tune_option)

        # update self.lib
        with auto_scheduler.ApplyHistoryBest(self.log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                self._lib = relay.build(self.mod,
                                        target=self.target,
                                        params=self.params)
            logger.info(f"load optimized library from {self.log_file}")
Example #3
def tune_network(network, target):
    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)
Example #4
def schedule_tasks(
    tasks, task_weights, tuning_options, tuning_records=None, log_estimated_latency=False
):
    """Generate the schedules for the different tasks (i.e., subgraphs) contained in the module.
    Store the schedules in a json file that will be used later by the compiler.

    Parameters
    ----------
    tasks : list
        A list of auto_scheduler.SearchTask to tune.
    task_weights : list
        The weight (i.e. the number of appearances) of each extracted task
    tuning_options : dict
        The tuning options
    tuning_records : str, optional
        The json file used to preload the autoscheduler
    log_estimated_latency : bool, optional
        If true, writes the estimated runtime of the model during each step of tuning to file.
    """
    if not log_estimated_latency:
        callbacks = [auto_scheduler.task_scheduler.PrintTableInfo()]
    else:
        callbacks = [
            auto_scheduler.task_scheduler.PrintTableInfo(),
            auto_scheduler.task_scheduler.LogEstimatedLatency("total_latency.tsv"),
        ]

    # Create the scheduler
    tuner = auto_scheduler.TaskScheduler(
        tasks, task_weights, load_log_file=tuning_records, callbacks=callbacks
    )

    # Tune the tasks
    tuner.tune(tuning_options)
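
A possible call site for `schedule_tasks`, mirroring the extraction pattern used elsewhere on this page; `mod`, `params`, `target`, and the `records.json` path are placeholders:

tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
tuning_options = auto_scheduler.TuningOptions(
    num_measure_trials=200,
    measure_callbacks=[auto_scheduler.RecordToFile("records.json")],
)
# Tune from scratch; pass tuning_records="records.json" on a later run to resume.
schedule_tasks(tasks, task_weights, tuning_options)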
Example #5

def test_tuning_cuda():
    auto_scheduler.enable_relay_integration()

    # Extract tasks
    mod, params = get_network("mlp")
    target = tvm.target.Target("cuda")
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    objective = lambda costs: sum(c * w for c, w in zip(costs, task_weights))

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=100)
        tuner = auto_scheduler.TaskScheduler(tasks, objective)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            num_measures_per_round=1,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(mod, target=target, params=params)

    # Todo(merrymercy): compile without any history to test the fallback mechanism

    auto_scheduler.enable_relay_integration(False)
Example #6

def auto_scheduler_tune(network, target, input_name, log_file):
    if os.path.exists(log_file):
        os.remove(log_file)
    mod, net_params, input_shape, output_shape = get_network(network)
    if network not in ["bert"]:
        # convert to NHWC layout
        desired_layouts = {'nn.conv2d': ['NHWC', 'default']}
        seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
                                        relay.transform.ConvertLayout(desired_layouts)])
        with tvm.transform.PassContext(opt_level=3):
            mod = seq(mod)

    if "cpu" in target.keys:
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=20000,  # 20000 trials to achieve the best performance
            runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
    else:
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=20000,  # 20000 trials to achieve the best performance
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], net_params, target)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(tuning_opt)
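
This function only writes the tuning log; applying it follows the same pattern as the other examples on this page. A sketch, assuming `mod`, `net_params`, `target`, and `log_file` from above:

with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(mod, target=target, params=net_params)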
Example #7
def run_tuning():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        runner=auto_scheduler.LocalRunner(repeat=10,
                                          enable_cpu_cache_flush=True),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    if use_sparse:
        from tvm.topi.sparse.utils import sparse_sketch_rules

        search_policy = [
            auto_scheduler.SketchPolicy(
                task,
                program_cost_model=auto_scheduler.XGBModel(),
                init_search_callbacks=sparse_sketch_rules(),
            ) for task in tasks
        ]

        tuner.tune(tune_option, search_policy=search_policy)
    else:
        tuner.tune(tune_option)
Example #8
def tune_and_evaluate():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        builder=auto_scheduler.LocalBuilder(
            build_func="ndk" if use_ndk else "default"),
        runner=auto_scheduler.RPCRunner(
            device_key,
            host=rpc_host,
            port=rpc_port,
            timeout=30,
            repeat=1,
            min_repeat_ms=200,
            enable_cpu_cache_flush=True,
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)

    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True}):
            lib = relay.build(mod, target=target, params=params)

    # Export library
    tmp = tempdir()
    if use_ndk:
        from tvm.contrib import ndk

        filename = "net.so"
        lib.export_library(tmp.relpath(filename), ndk.create_shared)
    else:
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

    # Upload module to device
    print("Upload...")
    remote = auto_scheduler.utils.request_remote(device_key,
                                                 rpc_host,
                                                 rpc_port,
                                                 timeout=10000)
    remote.upload(tmp.relpath(filename))
    rlib = remote.load_module(filename)

    # Create graph executor
    dev = remote.cpu()
    module = graph_executor.GraphModule(rlib["default"](dev))
    data_tvm = tvm.nd.array(
        (np.random.uniform(size=input_shape)).astype(dtype))
    module.set_input("data", data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    print(module.benchmark(dev, repeat=3, min_repeat_ms=500))
Example #9
    def tune_and_evaluate():
        print("Begin tuning...")
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=200,
            builder=auto_scheduler.LocalBuilder(build_func="ndk"),
            runner=auto_scheduler.RPCRunner(
                device_key,
                host=rpc_host,
                port=rpc_port,
                timeout=30,
                repeat=1,
                min_repeat_ms=200,
                enable_cpu_cache_flush=True,
            ),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

        tuner.tune(tune_option)

        # Compile with the history best
        print("Compile...")
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)

        # Export library
        tmp = tempdir()
        filename = "net.so"
        lib.export_library(tmp.relpath(filename), ndk.create_shared)

        # Upload module to device
        print("Upload...")
        remote = auto_scheduler.utils.request_remote(device_key,
                                                     rpc_host,
                                                     rpc_port,
                                                     timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # Create graph executor
        dev = remote.cpu()
        module = graph_executor.GraphModule(rlib["default"](dev))
        for key, value in shape_dict.items():
            data_tvm = tvm.nd.array(
                (np.random.uniform(size=value)).astype("float32"))
            module.set_input(key, data_tvm)

        # Evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run",
                                              dev,
                                              repeat=3,
                                              min_repeat_ms=500)
        prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
Example #10
def tune_network(network, target):
    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)

        # Compile without auto-scheduler and any other optimization for correctness check
        with tvm.transform.PassContext(opt_level=0):
            lib2 = relay.build(mod, target=target, params=params)

        # Check the correctness
        def get_output(data, lib):
            ctx = tvm.gpu()
            module = graph_runtime.GraphModule(lib["default"](ctx))
            module.set_input("data", data)
            module.run()
            return module.get_output(0).asnumpy()

        np.random.seed(0)
        if network == "mlp":
            data = np.random.uniform(size=(1, 32))
        elif network == "winograd-test":
            data = np.random.uniform(size=(1, 23, 40, 32))
        else:
            raise ValueError("Unknown network: " + network)

        actual_output = get_output(data, lib)
        expected_output = get_output(data, lib2)

        tvm.testing.assert_allclose(actual_output,
                                    expected_output,
                                    rtol=1e-4,
                                    atol=1e-4)
Example #11
def tune_and_evaluate():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        builder=auto_scheduler.LocalBuilder(
            build_func="ndk" if use_ndk else "default"),
        runner=auto_scheduler.RPCRunner(device_key,
                                        host="0.0.0.0",
                                        port=9190,
                                        repeat=3,
                                        timeout=50),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)

    # Compile the whole network
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True}):
            lib = relay.build(mod,
                              target=target,
                              target_host=target_host,
                              params=params)

    # Create graph runtime
    print("=============== Request Remote ===============")
    from tvm.auto_scheduler.utils import request_remote

    remote = request_remote(device_key, "0.0.0.0", 9190)
    ctx = remote.cl()
    from tvm.contrib import utils, ndk

    temp = utils.tempdir()
    filename = "deploy_lib.so"
    path_lib = temp.relpath(filename)
    lib.export_library(path_lib, ndk.create_shared)
    remote.upload(path_lib)
    loaded_lib = remote.load_module(filename)
    module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
    data = (np.random.uniform(size=input_shape)).astype(dtype)
    data_tvm = tvm.nd.array(data)
    module.set_input("data", data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run",
                                          ctx,
                                          repeat=3,
                                          min_repeat_ms=500)
    prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
    print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
          (np.mean(prof_res), np.std(prof_res)))
Example #12
def run_tuning():
    print("Begin tuning...")
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)
Example #13
def run_tuning():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        runner=auto_scheduler.LocalRunner(repeat=10,
                                          enable_cpu_cache_flush=True),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)
Example #14

def tune_and_check(mod, data, weight):
    # Extract tasks from a relay program
    target = tvm.target.Target("llvm")
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod, target=target, params={"weight": weight})

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tune tasks
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=1,
            num_measures_per_round=1,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")

        # Compile
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True},
            ):
                lib = relay.build(mod,
                                  target=target,
                                  params={"weight": weight})

        # Compile without auto-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            lib2 = relay.build(mod, target=target, params={"weight": weight})

        def get_output(data, lib):
            dev = tvm.cpu()
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input("data", data)
            module.run()

            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, lib)
        expected_output = get_output(data, lib2)

        tvm.testing.assert_allclose(actual_output,
                                    expected_output,
                                    rtol=1e-4,
                                    atol=2e-4)
Example #15
def run_tuning(tasks, task_weights, log_file):
    print("Begin tuning...")
    measure_runner = auto_scheduler.RPCRunner("m1",
                                              "127.0.0.1",
                                              9190,
                                              min_repeat_ms=300,
                                              timeout=30,
                                              repeat=2)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=10000,
        runner=measure_runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
    )
    tuner.tune(tune_option)
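
The `RPCRunner` above assumes a tracker at 127.0.0.1:9190 with a device registered under the key "m1". A sketch of the usual setup commands:

# Start the RPC tracker and register a device before tuning, e.g.:
#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
#   python -m tvm.exec.rpc_server --tracker=127.0.0.1:9190 --key=m1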
Example #16
def test_task_scheduler_gradient():
    tasks = []
    for n in [2, 4]:
        tasks.append(
            auto_scheduler.SearchTask(
                func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm"
            )
        )

    def objective_func(costs):
        return costs[0]

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        n_trials = 5

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(
            tasks, objective_func=objective_func, callbacks=[]
        )

        # Forcibly rewrite the initial best costs.
        # This makes the test more stable on slow CI machines.
        task_scheduler.best_costs = np.array([1e2, 1e-8])

        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the allocation results
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, _ in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        assert counters[tasks[0].workload_key] == n_trials - 1
        assert counters[tasks[1].workload_key] == 1
        del measure_ctx
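
For contrast with the `costs[0]` objective above, `test_tuning_cuda` earlier on this page weights each task's cost by its appearance count:

# Weighted-sum objective: tasks that appear more often dominate the total.
objective = lambda costs: sum(c * w for c, w in zip(costs, task_weights))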
Example #17
def test_check_auto_schedule_tuning(host, port):  # pylint: disable=too-many-locals
    log_file = TEMPORARY_DIRECTORY.relpath("ios_tuning_stat.log")
    target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin")
    mod, params = relay.testing.mlp.get_workload(batch_size=4,
                                                 image_shape=(1, 4, 4))

    try:
        status_ok = True
        measure_runner = auto_scheduler.RPCRunner(
            DEVICE_KEY,
            host,
            port,
            min_repeat_ms=1,
            timeout=10,
            n_parallel=multiprocessing.cpu_count(),
        )
        builder = auto_scheduler.LocalBuilder(timeout=10,
                                              build_func=ios_create_dylib)
        tune_option = auto_scheduler.TuningOptions(
            builder=builder,
            num_measure_trials=2,
            num_measures_per_round=1,
            runner=measure_runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
            verbose=0,
        )

        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"], params, target)
        tasks, task_weights = tasks[:2], task_weights[:2]
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tuner.tune(tune_option, search_policy="sketch.random")

        # Check tuning log
        tuning_statistic = list(load_records(log_file))
        for _, measure_result in tuning_statistic:
            if measure_result.error_no != MeasureErrorNo.NO_ERROR:
                raise ValueError(
                    f"Error for MeasureResult. Error code: {measure_result.error_no},"
                    f" for details see MeasureErrorNO.")

    except Exception as e:  # pylint: disable=broad-except
        status_ok = False
        print(e)

    assert status_ok, "Tuning failed, see logs."
Example #18
def auto_scheduler_tune(network, batch_size, dtype, target, log_file):
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    #if os.path.exists(log_file):
    #    os.remove(log_file)

    layout = "NHWC"
    mod, params, input_name, input_shape, output_shape = get_network(
        network, batch_size, dtype, layout)

    n_trials = network_to_n_trials[(network, batch_size, dtype,
                                    str(target.kind))]

    if "cpu" in target.keys:
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=auto_scheduler.LocalRunner(repeat=10,
                                              enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
    else:
        min_repeat_ms = 450 if network in ["bert"] else 300
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            repeat=1, min_repeat_ms=min_repeat_ms, timeout=10)
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)
    print(log_file)
    update_file(log_file, tasks)
    return  # NOTE: this early return skips the tuning below; remove it to tune
    for idx, task in enumerate(tasks):
        print("========== Task %d  (workload key: %s) ==========" %
              (idx, task.workload_key))
        print(task.compute_dag)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(tuning_opt)
Example #19
def schedule_tasks(
    tasks: List[auto_scheduler.SearchTask],
    task_weights: List[float],
    tuning_options: auto_scheduler.TuningOptions,
    prior_records: Optional[str] = None,
    log_estimated_latency: bool = False,
):
    """Generate the schedules for the different tasks (i.e., subgraphs) contained in the module.
    Store the schedules in a json file that will be used later by the compiler.

    Parameters
    ----------
    tasks : list
        A list of auto_scheduler.SearchTask to tune.
    task_weights : list
        The weight (i.e. the number of appearances) of each extracted task
    tuning_options: auto_scheduler.TuningOptions
        The options of tuning
    prior_records : str, optional
        The json file used to preload the autoscheduler
    log_estimated_latency : bool, optional
        If true, writes the estimated runtime of the model during each step of tuning to file.
    """
    if not log_estimated_latency:
        callbacks = [auto_scheduler.task_scheduler.PrintTableInfo()]
    else:
        callbacks = [
            auto_scheduler.task_scheduler.PrintTableInfo(),
            auto_scheduler.task_scheduler.LogEstimatedLatency("total_latency.tsv"),
        ]

    # Create the scheduler
    tuner = auto_scheduler.TaskScheduler(tasks,
                                         task_weights,
                                         load_log_file=prior_records,
                                         callbacks=callbacks)

    # Tune the tasks
    tuner.tune(tuning_options)
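
A hedged sketch of resuming a previous run with this typed variant; the record paths are illustrative:

tuning_options = auto_scheduler.TuningOptions(
    num_measure_trials=200,
    measure_callbacks=[auto_scheduler.RecordToFile("records.json")],
)
schedule_tasks(
    tasks,
    task_weights,
    tuning_options,
    prior_records="prior_records.json",  # hypothetical earlier log to preload
    log_estimated_latency=True,          # also writes total_latency.tsv
)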
Example #20
    def remote_auto_scheduler(self, device_key, rpc_host, rpc_port):
        # generate tasks
        tasks, task_weights = auto_scheduler.extract_tasks(
            self.mod["main"], self.params, self.target)
        for idx, task in enumerate(tasks):
            logger.debug("========== Task %d  (workload key: %s) ==========" %
                         (idx, task.workload_key))
            logger.debug(task.compute_dag)

        # generate tuner
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)

        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=200,
            builder=auto_scheduler.LocalBuilder(),
            runner=auto_scheduler.RPCRunner(
                device_key,
                host=rpc_host,
                port=rpc_port,
                timeout=30,
                repeat=1,
                min_repeat_ms=200,
                enable_cpu_cache_flush=True,
            ),
            measure_callbacks=[auto_scheduler.RecordToFile(self.log_file)],
        )
        tuner.tune(tune_option)

        # update self.lib
        with auto_scheduler.ApplyHistoryBest(self.log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                self._lib = relay.build(self.mod,
                                        target=self.target,
                                        params=self.params)
            logger.info(f"load optimized library from {self.log_file}")
Example #21
def tune_network(network, target):
    auto_scheduler.enable_relay_integration()

    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(opt_level=3):
                lib = relay.build(mod, target=target, params=params)

    # Todo(merrymercy): when the cpu backend is upstreamed, do the following things:
    # 1. compile without history to test the fallback mechanism
    # 2. check the correctness of layout rewrite / winograd pre-transform

    auto_scheduler.enable_relay_integration(False)
        "nico2_v100_32",  # change the device key to your key
        "0.0.0.0",
        9190,
        n_parallel=2,
        number=5,
        repeat=1,
        timeout=20,
        min_repeat_ms=300,
    ),
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    verbose=2,
)

print("Begin tuning multiple convs...")
# measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400, timeout=10)
tuner = auto_scheduler.TaskScheduler(tasks, strategy='round-robin')
tuner.tune(tune_option)
sys.exit()  # NOTE: execution stops here; the search below is unreachable

######################################################################
# Run the search
# ^^^^^^^^^^^^^^
# Now we get all inputs ready. Pretty simple, isn't it?
# We can kick off the search and let the auto-scheduler do its magic.
# After some measurement trials, it will return the best schedule it found.

sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option)

# Kill the process for measurement
del measure_ctx
Example #23

def tune_network(network, target):
    # Extract tasks
    mod, params = get_network(network)
    target = tvm.target.Target(target)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        # Tuning
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60, device=0)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, callbacks=[])
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=100,
            num_measures_per_round=2,
            early_stopping=1,
            runner=measure_ctx.runner,
            builder=auto_scheduler.LocalBuilder(timeout=60),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option, search_policy="sketch.random")
        del measure_ctx

        # Compile with the history best
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
            ):
                lib = relay.build(mod, target=target, params=params)

        # Also test that multiple log files can be loaded.
        with auto_scheduler.ApplyHistoryBest([log_file, log_file]) as best:
            assert isinstance(
                best, auto_scheduler.dispatcher.ApplyHistoryBest
            ), "Unable to load multiple log files jointly."

        # Confirm iterables can be directly loaded.
        loaded_recs = auto_scheduler.dispatcher.load_records(log_file)
        with auto_scheduler.ApplyHistoryBest(iter(loaded_recs)) as best:
            assert isinstance(
                best, auto_scheduler.dispatcher.ApplyHistoryBest
            ), "Unable to ingest logs from an interator."

        # Sample a schedule when missing
        with auto_scheduler.ApplyHistoryBestOrSample(None, num_measure=2):
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
            ):
                lib2 = relay.build(mod, target=target, params=params)

        # Compile without auto-scheduler and any other optimization for correctness check
        with tvm.transform.PassContext(opt_level=0):
            ref_lib = relay.build(mod, target=target, params=params)

        # Check the correctness
        def get_output(data, lib):
            dev = tvm.cuda()
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input("data", data)
            module.run()
            return module.get_output(0).numpy()

        np.random.seed(0)
        if network == "mlp":
            data = np.random.uniform(size=(1, 32))
        elif network == "winograd-test":
            data = np.random.uniform(size=(1, 23, 40, 32))
        else:
            raise ValueError("Unknown network: " + network)

        actual_output1 = get_output(data, lib)
        actual_output2 = get_output(data, lib2)
        expected_output = get_output(data, ref_lib)

        tvm.testing.assert_allclose(actual_output1, expected_output, rtol=1e-4, atol=1e-4)
        tvm.testing.assert_allclose(actual_output2, expected_output, rtol=1e-4, atol=1e-4)
Example #24
def _tvm_compile(fx_module,
                 example_inputs,
                 target=None,
                 tuning_logfile=None,
                 use_ansor_tuning=False):
    import tvm
    from tvm import relay, auto_scheduler
    from tvm.contrib import graph_executor
    import os

    # Find the target and device for TVM.
    dev = tvm.cpu(0)
    if target is None:
        raise ValueError("Setup the TVM target correctly.")
    elif isinstance(target, str):
        if "cuda" in target:
            dev = tvm.cuda(0)
        target = tvm.target.Target(target)
    elif isinstance(target, tvm.target.target.Target):
        if "cuda" in target.keys:
            dev = tvm.cuda(0)

    # JIT the model and pass it to Torchscript to Relay frontend parser. TVM
    # tutorials suggest tracing instead of scripting. The main reason is to
    # avoid Pythonic computation to show up in JIT module. However, with Python
    # key tracing, AOT Autograd leads to simpler graphs. Therefore, we use
    # scripting here to retrieve the JIT module.
    jit_mod = torch.jit.script(fx_module)
    shape_list = [(f"inp_{idx}", i.shape)
                  for idx, i in enumerate(example_inputs)]
    mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)

    # TVM Autotuning
    if use_ansor_tuning:
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"], params, target)
        if tuning_logfile is None:
            log_file = f"{time.time()}.json"
        else:
            log_file = f"{tuning_logfile}.json"
        if len(tasks) != 0:
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tune_option = auto_scheduler.TuningOptions(
                num_measure_trials=20000,
                measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
                # early_stopping=1000,
                # verbose=2,
            )
            tuner.tune(tune_option)
    elif tuning_logfile is not None:
        log_file = f"{tuning_logfile}.json"

    if use_ansor_tuning or tuning_logfile is not None:
        assert os.path.exists(log_file)
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True}):
                lib = relay.build(mod, target=target, params=params)
    else:
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=target, params=params)

    # Get a graph executor graph module
    m = graph_executor.GraphModule(lib["default"](dev))

    def exec_tvm(*args):
        for idx, arg in enumerate(args, 0):
            if arg.dim() != 0:
                m.set_input(
                    f"inp_{idx}",
                    tvm.nd.from_dlpack(
                        torch.utils.dlpack.to_dlpack(arg.contiguous())),
                )
        m.run()
        outs = [
            torch.utils.dlpack.from_dlpack(m.get_output(i).to_dlpack())
            for i in range(m.get_num_outputs())
        ]
        return outs

    return exec_tvm
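
A possible invocation of `_tvm_compile`, assuming a small FX-traced module; the network and the target string are illustrative:

import torch

net = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())
fx_module = torch.fx.symbolic_trace(net)
example_inputs = [torch.randn(1, 16)]

# Compiles without tuning since use_ansor_tuning=False and no log file is given.
exec_tvm = _tvm_compile(fx_module, example_inputs, target="llvm")
outs = exec_tvm(*example_inputs)  # list of torch tensors produced by TVM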
Example #25
def main():
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f"  input_name: {item['name']}")
        print(f"  input_shape: {item['shape']}")
        print(f"  input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"],
        params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
        print(
            f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) ====="
        )
        print(task.compute_dag)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(
        auto_scheduler.TuningOptions(
            num_measure_trials=ARGS.num_trials,
            runner=runner,
            measure_callbacks=[
                auto_scheduler.RecordToFile(log_file),
            ],
        ))

    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True},
        ):
            lib = relay.build(
                mod,
                target=ARGS.target,
                params=params,
            )
    graph, rt_mod, params = lib.graph_json, lib.lib, lib.params
    input_data = {}
    for item in ARGS.input_shape:
        input_name = item["name"]
        input_shape = item["shape"]
        input_dtype = item["dtype"]
        if input_dtype.startswith("float"):
            input_data[input_name] = np.random.uniform(
                size=input_shape).astype(input_dtype)
        else:
            input_data[input_name] = np.random.randint(low=0,
                                                       high=10000,
                                                       size=input_shape,
                                                       dtype=input_dtype)

    def f_timer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.graph_executor import GraphModule

        # pylint: enable=import-outside-toplevel

        mod = GraphModule(rt_mod["default"](dev))
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        ftimer = mod.module.time_evaluator(
            "run",
            dev,
            min_repeat_ms=500,
            repeat=3,
        )
        results = list(np.array(ftimer().results) * 1000.0)  # type: ignore
        print("Running time in time_evaluator: ", results)

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_timer,
    )

    def f_per_layer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint: enable=import-outside-toplevel
        mod = create(graph, rt_mod, dev)
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
        graph_time = mod.run_individual(number=10,
                                        repeat=1,
                                        min_repeat_ms=5000)
        print("|graph_nodes| = ", len(graph_nodes))
        print("|graph_time| = ", len(graph_time))
        graph_nodes_time = {
            k: float(v)
            for k, v in zip(graph_nodes, graph_time)
        }
        for k, v in graph_nodes_time.items():
            print(f"{k} : {v:.3f}")

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=rt_mod,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_per_layer,
    )
Example #26
def main():
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
        timeout=ARGS.rpc_config.session_timeout_sec,
    )

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f"  input_name : {item['name']}")
        print(f"  input_shape: {item['shape']}")
        print(f"  input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    input_data = {
        item["name"]: generate_input_data(item["shape"], item["dtype"])
        for item in ARGS.input_shape
    }

    with ms.Profiler() as profiler:
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"],
            params,
            target=ARGS.target,
            hardware_params=hardware_params,
        )
        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
            print(f"==== Task {idx}: {task.desc} "
                  f"(weight {task_weight} key: {task.workload_key}) =====")
            print(task.compute_dag)

        if ARGS.num_trials > 0:
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tuner.tune(
                auto_scheduler.TuningOptions(
                    num_measure_trials=ARGS.num_trials,
                    runner=runner,
                    measure_callbacks=[
                        auto_scheduler.RecordToFile(log_file),
                    ],
                ),
                adaptive_training=ARGS.adaptive_training,
            )

        relay_build = {
            "graph": relay.build,
            "vm": relay.vm.compile
        }[ARGS.backend]
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True},
            ):
                lib = relay_build(
                    mod,
                    target=ARGS.target,
                    params=params,
                )
    print("Tuning Time:")
    print(profiler.table())

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=create_timer(ARGS.backend),
        backend=ARGS.backend,
    )
Example #27
def tune_and_evaluate():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        runner=auto_scheduler.RPCRunner(
            device_key,
            host="0.0.0.0",
            port=9191,
            timeout=30,
            repeat=1,
            min_repeat_ms=200,
            enable_cpu_cache_flush=True,
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)

    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True}):
            lib = relay.build(mod, target=target, params=params)

    # Export library
    tmp = tempdir()
    if use_ndk:
        from tvm.contrib import ndk

        filename = "net.so"
        lib.export_library(tmp.relpath(filename), ndk.create_shared)
    else:
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

    # Upload module to device
    print("Upload...")
    remote = auto_scheduler.utils.request_remote(device_key,
                                                 "0.0.0.0",
                                                 9191,
                                                 timeout=10000)
    remote.upload(tmp.relpath(filename))
    rlib = remote.load_module(filename)

    # Create graph runtime
    dev = remote.cpu()
    module = graph_runtime.GraphModule(rlib["default"](dev))
    data_tvm = tvm.nd.array(
        (np.random.uniform(size=input_shape)).astype(dtype))
    module.set_input("data", data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run",
                                          dev,
                                          repeat=3,
                                          min_repeat_ms=500)
    prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
    print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
          (np.mean(prof_res), np.std(prof_res)))