Example #1
    def __init__(
        self,
        mod: Optional[IRModule] = None,
        target: Optional[Target] = None,
        task_name: Optional[str] = None,
        rand_state: int = -1,
        num_threads: Optional[int] = None,
    ):
        """Constructor.

        Parameters
        ----------
        mod : Optional[IRModule] = None
            The workload to be optimized.
        target : Optional[Target] = None
            The target to be optimized for.
        task_name : Optional[str] = None
            The name of the tuning task.
        rand_state : int = -1
            The random state.
            Needs to be an integer in [1, 2^31-1]; -1 means using a random seed.
        num_threads : Optional[int] = None
            The number of threads to be used; None means using the logical CPU count.
        """
        if num_threads is None:
            num_threads = cpu_count()

        self.__init_handle_by_constructor__(
            _ffi_api.TuneContext,  # type: ignore # pylint: disable=no-member
            mod,
            target,
            task_name,
            rand_state,
            num_threads,
        )
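A minimal usage sketch for the constructor above. Assumptions, not part of the example: the class is TuneContext imported from tvm.meta_schedule, and the tiny element-wise workload stands in for a real one.

import tvm
from tvm import te
from tvm.ir import IRModule
from tvm.meta_schedule import TuneContext  # assumed import path

# Build a tiny element-wise workload to serve as the IRModule.
A = te.placeholder((1024,), name="A")
B = te.compute((1024,), lambda i: A[i] + 1.0, name="B")
mod = IRModule.from_expr(te.create_prim_func([A, B]))

ctx = TuneContext(
    mod=mod,
    target=tvm.target.Target("llvm"),
    task_name="demo_task",
    rand_state=42,     # fixed seed in [1, 2^31-1] for reproducibility
    num_threads=None,  # None falls back to cpu_count()
)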
Example #2
    def __init__(
        self,
        mod: Optional[IRModule] = None,
        target: Optional[Target] = None,
        space_generator: Optional["SpaceGenerator"] = None,
        search_strategy: Optional["SearchStrategy"] = None,
        sch_rules: Optional[List["ScheduleRule"]] = None,
        task_name: Optional[str] = None,
        rand_state: int = -1,
        num_threads: Optional[int] = None,
    ):
        if num_threads is None:
            num_threads = cpu_count()

        self.__init_handle_by_constructor__(
            _ffi_api.TuneContext,  # type: ignore # pylint: disable=no-member
            mod,
            target,
            space_generator,
            search_strategy,
            sch_rules,
            task_name,
            rand_state,
            num_threads,
        )
Example #3
    def __init__(
        self,
        mod: Optional[IRModule] = None,
        *,
        target: Optional[Target] = None,
        space_generator: Optional["SpaceGenerator"] = None,
        search_strategy: Optional["SearchStrategy"] = None,
        sch_rules: Optional[List["ScheduleRule"]] = None,
        postprocs: Optional[List["Postproc"]] = None,
        mutator_probs: Optional[Dict["Mutator", float]] = None,
        task_name: str = "main",
        rand_state: int = -1,
        num_threads: Optional[int] = None,
    ):
        if isinstance(mod, PrimFunc):
            mod = IRModule.from_expr(mod)
        if num_threads is None:
            num_threads = cpu_count()

        self.__init_handle_by_constructor__(
            _ffi_api.TuneContext,  # type: ignore # pylint: disable=no-member
            mod,
            target,
            space_generator,
            search_strategy,
            sch_rules,
            postprocs,
            mutator_probs,
            task_name,
            rand_state,
            num_threads,
        )
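This variant differs from the previous two in two ways: everything after mod is keyword-only, and a bare PrimFunc is coerced into an IRModule by the constructor itself. A short sketch of that coercion (import path assumed, matmul workload illustrative):

import tvm
from tvm import te
from tvm.meta_schedule import TuneContext  # assumed import path

A = te.placeholder((128, 128), name="A")
B = te.placeholder((128, 128), name="B")
k = te.reduce_axis((0, 128), name="k")
C = te.compute((128, 128), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")

ctx = TuneContext(
    te.create_prim_func([A, B, C]),    # a PrimFunc; wrapped via IRModule.from_expr internally
    target=tvm.target.Target("llvm"),  # keyword-only from here on
    task_name="matmul",
)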
Example #4
def main():
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
        timeout=ARGS.rpc_config.session_timeout_sec,
    )

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f"  input_name : {item['name']}")
        print(f"  input_shape: {item['shape']}")
        print(f"  input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    input_data = {
        item["name"]: generate_input_data(item["shape"], item["dtype"])
        for item in ARGS.input_shape
    }

    with ms.Profiler() as profiler:
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"],
            params,
            target=ARGS.target,
            hardware_params=hardware_params,
        )
        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
            print(f"==== Task {idx}: {task.desc} "
                  f"(weight {task_weight} key: {task.workload_key}) =====")
            print(task.compute_dag)

        if ARGS.num_trials > 0:
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tuner.tune(
                auto_scheduler.TuningOptions(
                    num_measure_trials=ARGS.num_trials,
                    runner=runner,
                    measure_callbacks=[
                        auto_scheduler.RecordToFile(log_file),
                    ],
                ),
                adaptive_training=ARGS.adaptive_training,
            )

        relay_build = {
            "graph": relay.build,
            "vm": relay.vm.compile
        }[ARGS.backend]
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True},
            ):
                lib = relay_build(
                    mod,
                    target=ARGS.target,
                    params=params,
                )
    print("Tuning Time:")
    print(profiler.table())

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=create_timer(ARGS.backend),
        backend=ARGS.backend,
    )
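The generate_input_data helper used above is imported from elsewhere in the script. Example #5 below inlines equivalent logic, so a plausible reconstruction looks like this (a sketch mirroring that example, not the actual helper):

import numpy as np

def generate_input_data(input_shape, input_dtype):
    # Mirrors the inline input generation in Example #5; the real helper
    # lives in TVM's testing utilities and may differ in detail.
    if input_dtype.startswith("float"):
        return np.random.uniform(size=input_shape).astype(input_dtype)
    return np.random.randint(low=0, high=10000, size=input_shape, dtype=input_dtype)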
Example #5
def main():
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f"  input_name: {item['name']}")
        print(f"  input_shape: {item['shape']}")
        print(f"  input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"],
        params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
        print(
            f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) ====="
        )
        print(task.compute_dag)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(
        auto_scheduler.TuningOptions(
            num_measure_trials=ARGS.num_trials,
            runner=runner,
            measure_callbacks=[
                auto_scheduler.RecordToFile(log_file),
            ],
        ))

    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True},
        ):
            lib = relay.build(
                mod,
                target=ARGS.target,
                params=params,
            )
    graph, rt_mod, params = lib.graph_json, lib.lib, lib.params
    input_data = {}
    for item in ARGS.input_shape:
        input_name = item["name"]
        input_shape = item["shape"]
        input_dtype = item["dtype"]
        if input_dtype.startswith("float"):
            input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype)
        else:
            input_data[input_name] = np.random.randint(
                low=0,
                high=10000,
                size=input_shape,
                dtype=input_dtype,
            )

    def f_timer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.graph_executor import GraphModule

        # pylint: enable=import-outside-toplevel

        mod = GraphModule(rt_mod["default"](dev))
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        ftimer = mod.module.time_evaluator(
            "run",
            dev,
            min_repeat_ms=500,
            repeat=3,
        )
        results = list(np.array(ftimer().results) * 1000.0)  # type: ignore
        print("Running time in time_evaluator: ", results)

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_timer,
    )

    def f_per_layer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint: enable=import-outside-toplevel
        mod = create(graph, rt_mod, dev)
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
        graph_time = mod.run_individual(number=10,
                                        repeat=1,
                                        min_repeat_ms=5000)
        print("|graph_nodes| = ", len(graph_nodes))
        print("|graph_time| = ", len(graph_time))
        graph_nodes_time = {
            k: float(v)
            for k, v in zip(graph_nodes, graph_time)
        }
        for k, v in graph_nodes_time.items():
            print(f"{k} : {v:.3f}")

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=rt_mod,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_per_layer,
    )
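Both continuations can also be exercised without RPC. A hedged local-run sketch (assumes the LLVM build from above and a local CPU device; note f_per_layer takes the bare runtime module, matching the second run_module_via_rpc call):

import tvm

dev = tvm.cpu(0)
f_timer(lib, dev, input_data)          # end-to-end latency; time_evaluator reports seconds, hence * 1000.0 above
f_per_layer(lib.lib, dev, input_data)  # per-operator breakdown via the debug executor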
Example #6
    def __init__(
        self,
        mod: Optional[IRModule] = None,
        *,
        target: Optional[Target] = None,
        space_generator: Union[None, "SCH_FN_TYPE", "ScheduleFn",
                               "SpaceGenerator"] = None,
        search_strategy: Union[None, "SearchStrategy", "TuneConfig"] = None,
        sch_rules: Union[None, str, List["ScheduleRule"]] = None,
        postprocs: Union[None, str, List["Postproc"]] = None,
        mutator_probs: Union[None, str, Dict["Mutator", float]] = None,
        task_name: str = "main",
        logger: Optional[logging.Logger] = None,
        rand_state: int = -1,
        num_threads: Optional[int] = None,
    ):
        # pylint: disable=import-outside-toplevel
        from . import default_config
        from .space_generator import ScheduleFn
        from .tune import TuneConfig

        # pylint: enable=import-outside-toplevel
        if isinstance(mod, PrimFunc):
            mod = IRModule.from_expr(mod)
        if callable(space_generator):
            space_generator = ScheduleFn(space_generator)
        if isinstance(search_strategy, TuneConfig):
            search_strategy = search_strategy.create_strategy()
        if isinstance(sch_rules, str):
            if sch_rules == "default":
                if target is None:
                    raise ValueError(
                        "target is required when sch_rules is 'default'")
                sch_rules = default_config.schedule_rules(None, target)
            else:
                raise ValueError(
                    "sch_rules should be a list of ScheduleRule or 'default'")
        if isinstance(postprocs, str):
            if postprocs == "default":
                if target is None:
                    raise ValueError(
                        "target is required when postprocs is 'default'")
                postprocs = default_config.postproc(None, target)
            else:
                raise ValueError(
                    "postprocs should be a list of Postproc or 'default'")
        if isinstance(mutator_probs, str):
            if mutator_probs == "default":
                if target is None:
                    raise ValueError(
                        "target is required when mutator_probs is 'default'")
                mutator_probs = default_config.mutator_probs(None, target)
            else:
                raise ValueError(
                    "mutator_probs should be a dict of Mutator to float or 'default'")
        if logger is None:
            # Fall back to a module-level logger; keep a reference either way
            # so a user-supplied logger is not silently discarded.
            logger = logging.getLogger(__name__)
        self.logger = logger
        if num_threads is None:
            num_threads = cpu_count()
        self.__init_handle_by_constructor__(
            _ffi_api.TuneContext,  # type: ignore # pylint: disable=no-member
            mod,
            target,
            space_generator,
            search_strategy,
            sch_rules,
            postprocs,
            mutator_probs,
            task_name,
            make_logging_func(logger),
            rand_state,
            num_threads,
        )
        _ffi_api.TuneContextInitialize(self)  # type: ignore # pylint: disable=no-member
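A usage sketch for the string shortcuts in this variant (import path assumed; each "default" expands via default_config and raises ValueError unless a target is given):

import tvm
from tvm import te
from tvm.ir import IRModule
from tvm.meta_schedule import TuneContext  # assumed import path

A = te.placeholder((256,), name="A")
B = te.compute((256,), lambda i: A[i] * 2.0, name="B")
mod = IRModule.from_expr(te.create_prim_func([A, B]))

ctx = TuneContext(
    mod=mod,
    target=tvm.target.Target("llvm -num-cores 8"),
    sch_rules="default",      # target-specific schedule rules
    postprocs="default",      # target-specific postprocessors
    mutator_probs="default",  # target-specific mutator probabilities
)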
Example #7
def main():
    describe()
    print(f"Workload: {ARGS.workload}")
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")
    workload_func, params = CONFIGS[ARGS.workload]
    params = params[0]  # type: ignore
    workload_func = auto_scheduler.register_workload(workload_func)

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")
    task = auto_scheduler.SearchTask(
        func=workload_func,
        args=params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )

    # Inspect the computational graph
    print("Computational DAG:")
    print(task.compute_dag)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=ARGS.num_trials,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
        runner=runner,
    )
    print("Running AutoTuning:")
    task.tune(tune_option)
    print("History Best:")
    print(task.print_best(log_file))
    sch, args = task.apply_best(log_file)
    print("Lowered TIR:")
    print(tvm.lower(sch, args, simple_mode=True))
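A possible continuation (a sketch, not part of the original script): the schedule and args returned by apply_best can be compiled and run locally, assuming a CPU target.

import numpy as np

func = tvm.build(sch, args, target=ARGS.target)
dev = tvm.cpu(0)
# One buffer per tensor argument, matching shape and dtype.
buffers = [
    tvm.nd.array(np.random.uniform(size=[int(d) for d in a.shape]).astype(a.dtype), dev)
    for a in args
]
func(*buffers)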