def _build_func_common(measure_input, runtime=None, check_gpu=None, build_option=None): """Common part for building a configuration""" target, task, config = measure_input target, task.target_host = Target.check_and_update_host_consist(target, task.target_host) with target: s, args = task.instantiate(config) # check invalidity of template and code hash consistency if not config.valid(): raise InstantiationError(config.errors) opts = build_option or {} if check_gpu: # Add verify pass to filter out invalid configs in advance. opts["tir.add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))] # if target is vta, we need to use vta build if ( hasattr(measure_input.target, "device_name") and measure_input.target.device_name == "vta" ): # pylint: disable=import-outside-toplevel import vta func = vta.build(s, args, target_host=task.target_host) else: with tvm.ir.transform.PassContext(config=opts): func = build(s, args, target_host=task.target_host, runtime=runtime) return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
def __init__( self, func=None, args=None, compute_dag=None, workload_key=None, target=None, target_host=None, hardware_params=None, layout_rewrite_option=None, task_inputs=None, task_inputs_overwrite=False, task_inputs_save_to_file=False, desc="", ): assert ( func is not None or workload_key is not None ), "Either a workload generation function or a workload key should be provided" if func is not None: workload_key = make_workload_key(func, args) if compute_dag is None: compute_dag = ComputeDAG(workload_key) assert target is not None, "Must specify a target." target, target_host = Target.check_and_update_host_consist( target, target_host) if layout_rewrite_option is None: layout_rewrite_option = LayoutRewriteOption.get_target_default( target) task_input_names = [] if isinstance(task_inputs, list): task_input_names = task_inputs elif isinstance(task_inputs, dict): for input_name in task_inputs: register_task_input_buffer( workload_key, input_name, task_inputs[input_name], task_inputs_overwrite, task_inputs_save_to_file, ) task_input_names.append(input_name) elif task_inputs is not None: raise ValueError("task_inputs should be a dict or a list.") self.__init_handle_by_constructor__( _ffi_api.SearchTask, compute_dag, workload_key, target, target_host, hardware_params, layout_rewrite_option, task_input_names, desc, )
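# A hedged usage sketch for the constructor above: a SearchTask is typically
# created from a workload function registered with
# @auto_scheduler.register_workload, which exercises the func/args branch and
# lets check_and_update_host_consist resolve the host target. The names
# example_matmul and example_matmul_task are hypothetical, following the
# auto_scheduler tutorial pattern; shapes and target are arbitrary.
import tvm
from tvm import auto_scheduler, te


@auto_scheduler.register_workload
def example_matmul(N, L, M, dtype):
    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)
    k = te.reduce_axis((0, L), name="k")
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    return [A, B, C]


def example_matmul_task():
    # Create the search task; func/args are turned into a workload key and the
    # host defaults are filled in by check_and_update_host_consist.
    task = auto_scheduler.SearchTask(
        func=example_matmul, args=(128, 128, 128, "float32"), target="llvm"
    )
    print(task.compute_dag)
    return task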
def lower(self, mod, target=None, target_host=None):
    """Lower the module to VM bytecode.

    Parameters
    ----------
    mod : tvm.IRModule
        The Relay module to build.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e. device/context name)
        to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context
        to target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        to set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.
    """
    target = self._update_target(target)
    target_host = self._update_target_host(target, target_host)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False
    )

    tophub_context = self._tophub_context(target)
    with tophub_context:
        self._lower(mod, target, target_host)
def get_sample_task(target=tvm.target.cuda(), target_host=None):
    """Return a sample task for testing."""
    target, target_host = Target.check_and_update_host_consist(target, target_host)
    task = autotvm.task.create(
        "testing/conv2d_no_batching", args=(1, 7, 7, 512, 512, 3, 3), target=target
    )
    return task, target
def test_check_and_update_host_consist_1(): target = None host = "llvm" with pytest.raises( AssertionError, match=r"Target host is not empty when target is empty."): target, host = Target.check_and_update_host_consist(target, host)
def __setstate__(self, state):
    # Register the workload if needed
    try:
        workload = json.loads(state["workload_key"])
    except Exception:  # pylint: disable=broad-except
        raise RuntimeError("Invalid workload key %s" % state["workload_key"])

    # workload[0] is either the compute function name or the ComputeDAG hash.
    # The compute functions are already registered when importing TVM, so here
    # we only register the ComputeDAG workloads. If the same workload has
    # already been registered, the later registration overrides the previous one.
    if workload[0] not in WORKLOAD_FUNC_REGISTRY:
        register_workload_tensors(state["workload_key"], state["compute_dag"].tensors)

    state["target"], state["target_host"] = Target.check_and_update_host_consist(
        state["target"], state["target_host"]
    )
    self.__init_handle_by_constructor__(
        _ffi_api.SearchTask,
        state["compute_dag"],
        state["workload_key"],
        state["target"],
        state["target"].host,
        state["hardware_params"],
        state["layout_rewrite_option"],
        state["task_input_names"],
        state["desc"],
    )
def _build_module_no_factory(mod, target=None, target_host=None, params=None, mod_name="default"): """A wrapper around build which discards the Python GraphFactoryRuntime. This wrapper is suitable to be used from other programming languages as the runtime::Module can be freely passed between language boundaries. """ target, target_host = Target.check_and_update_host_consist(target, target_host) return build(mod, target, params=params, mod_name=mod_name).module
def main(): """Main function""" parser = argparse.ArgumentParser() parser.add_argument("--target", type=str, default="llvm", help="The build target") parser.add_argument("--target-host", type=str, default=None, help="The host code compilation target") parser.add_argument("--rpc-host", type=str, default="127.0.0.1", help="the hostname of the server") parser.add_argument("--rpc-port", type=int, default=9090, help="The port of the RPC") args = parser.parse_args() logging.basicConfig(level=logging.INFO) args.target, args.target_host = Target.check_and_update_host_consist( args.target, args.target_host) measure_peak_all(args.target, args.target_host, args.rpc_host, args.rpc_port)
def extract_from_program(mod, params, target, target_host=None, ops=None): """Extract tuning tasks from a relay program. This function is the single program version of extract_from_multiple_program. Parameters ---------- mod: tvm.IRModule or relay.function.Function The module or function to tune params: dict of str to numpy array The associated parameters of the program target: tvm.target.Target The compilation target target_host: tvm.target.Target The host compilation target ops: List[tvm.ir.Op] or None List of relay ops to be tuned. If not specified, all tunable ops will be extracted. Returns ------- task: Array of autotvm.task.Task collected tasks """ target, target_host = Target.check_and_update_host_consist( target, target_host) return extract_from_multiple_program([mod], [params], target, ops=ops)
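# A hedged usage sketch for extract_from_program above: build a tiny Relay
# module containing a single conv2d and extract its tunable tasks. The helper
# name example_extract_conv2d_tasks is hypothetical and the shapes are
# arbitrary; passing target_host exercises the consistency check.
import numpy as np
import tvm
from tvm import autotvm, relay


def example_extract_conv2d_tasks():
    data = relay.var("data", shape=(1, 3, 224, 224), dtype="float32")
    weight = relay.var("weight", shape=(16, 3, 3, 3), dtype="float32")
    out = relay.nn.conv2d(data, weight, channels=16, kernel_size=(3, 3), padding=(1, 1))
    mod = tvm.IRModule.from_expr(relay.Function([data, weight], out))
    params = {"weight": np.random.uniform(size=(16, 3, 3, 3)).astype("float32")}

    # target_host is folded into the target by check_and_update_host_consist.
    tasks = autotvm.task.extract_from_program(
        mod["main"], params=params, target="llvm", target_host="llvm"
    )
    for task in tasks:
        print(task.name, task.args)
    return tasks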
def test_check_and_update_host_consist_3(): target = Target(target="cuda", host="llvm") host = None target, host = Target.check_and_update_host_consist(target, host) assert target.kind.name == "cuda" assert target.host.kind.name == "llvm" assert host.kind.name == "llvm" assert target.host == host
def measure_bandwidth_all_types(
    total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True
):
    """measure memory bandwidth for all types

    Parameters
    ----------
    total_item: int
        number of elements in input array
    item_per_thread: int
        number of elements each thread accumulates
    n_times: int
        number of runs for averaging
    target: :any:`tvm.target.Target`
        the target and option of the compilation.
    target_host : str or :any:`tvm.target.Target`
        host compilation target
    remote: tvm.rpc.RPCSession
        remote rpc session
    dev: Device
        the device of array
    verbose: bool
        whether to output immediate results

    Returns
    -------
    result: list
        a list of (type_name, GBPS) pairs
    """
    target, target_host = Target.check_and_update_host_consist(target, target_host)
    max_threads = target.max_num_threads

    result = []
    for base_type in ["float"]:
        for bits in [32]:
            for lanes in [1, 2, 4, 8, 16]:
                max_speed = -1e9
                # try different strides
                for stride in [max_threads, total_item // (lanes * item_per_thread)]:
                    speed = measure_bandwidth_sum(
                        total_item,
                        item_per_thread,
                        stride,
                        base_type,
                        bits,
                        lanes,
                        target,
                        target_host,
                        remote,
                        dev,
                        n_times,
                    )
                    max_speed = max(max_speed, speed)
                type_name = base_type + str(bits)
                result.append(["%sx%d" % (type_name, lanes), max_speed])
                if verbose:
                    logging.info("\t%-10s %.2f GBPS", result[-1][0], result[-1][1])
    return result
def build(self, mod, target=None, target_host=None, params=None):
    """
    Parameters
    ----------
    mod : :py:class:`~tvm.IRModule`
        The IRModule to build.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e. device/context name)
        to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context
        to target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        to set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    Returns
    -------
    graph_json : str
        The JSON string of the compiled graph.

    mod : tvm.runtime.Module
        The compiled runtime module containing the generated code.

    params : dict of str to NDArray
        The parameters of the final graph.
    """
    target = _update_target(target)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False
    )

    # Setup the params.
    if params:
        self._set_params(params)

    # Build the IR module. If auto_scheduler is not enabled,
    # then use the TOPI-defined schedule.
    use_auto_scheduler = PassContext.current().config.get(
        "relay.backend.use_auto_scheduler", False
    )

    # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
    old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

    self._build(mod, target, target_host)
    autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

    # Get artifacts
    graph_json = self.get_json()
    mod = self.get_module()
    params = self.get_params()

    return graph_json, mod, params
def test_check_and_update_host_consist_4(): """Test `check_and_update_host_consist` by using TVM Objects""" cuda_device_type = tvm.device("cuda").device_type target = {cuda_device_type: Target(target="cuda", host="llvm")} host = None target_1, host_1 = Target.check_and_update_host_consist(target, host) assert isinstance(target_1, dict) assert target_1[cuda_device_type].kind.name == "cuda" assert target_1[cuda_device_type].host.kind.name == "llvm" assert host_1 is None target = {cuda_device_type: Target(tvm.runtime.container.String("cuda"))} host = Target(tvm.runtime.container.String("llvm")) target = tvm.runtime.convert(target) assert isinstance(target, tvm.ir.container.Map) target_2, host_2 = Target.check_and_update_host_consist(target, host) assert isinstance(target_2, dict) assert target_2[cuda_device_type].kind.name == "cuda" assert host_2.kind.name == "llvm"
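# A hedged sketch complementing the tests above: the common call pattern with
# plain target strings, where the host is parsed, folded into target.host, and
# the merged host is also returned. example_merge_host is a hypothetical helper
# name used only for illustration.
from tvm.target import Target


def example_merge_host():
    target, target_host = Target.check_and_update_host_consist("cuda", "llvm")
    assert target.kind.name == "cuda"
    assert target.host.kind.name == "llvm"
    assert target_host.kind.name == "llvm"
    return target, target_host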
def __setstate__(self, state): import cloudpickle # pylint: disable=import-outside-toplevel self.name = state["name"] self.args = state["args"] self.kwargs = state["kwargs"] self.config_space = state["config_space"] self.func = cloudpickle.loads(state["func"]) self.flop = state["flop"] self.target, self.target_host = Target.check_and_update_host_consist( state["target"], state["target_host"])
def __getstate__(self): self.target, self.target_host = Target.check_and_update_host_consist( self.target, self.target_host) return { "compute_dag": self.compute_dag, "workload_key": self.workload_key, "target": self.target, "target_host": self.target_host, "hardware_params": self.hardware_params, "layout_rewrite_option": self.layout_rewrite_option, "task_input_names": self.task_input_names, "desc": self.desc, }
def autoscheduler_get_tuning_tasks( mod: tvm.IRModule, params: Dict[str, tvm.nd.NDArray], target: str, target_host: Optional[str] = None, alter_layout: Optional[str] = None, hardware_params: Optional[HardwareParams] = None, include_simple_tasks: bool = False, ): """Get the autoscheduler tuning tasks for a given relay module. Parameters ---------- mod : tvm.IRModule The relay module from which to extract tuning tasks. params : dict The params for the relay module. target : tvm.target.Target The compilation target. target_host : str, optional The compilation target for the host. alter_layout : str, optional The layout to convert the graph to. Note, the convert layout pass doesn't currently guarantee the whole of the graph will be converted to the chosen layout. hardware_params : Optional[HardwareParams] Hardware parameters used for the search tasks Returns ------- tasks : list of autotvm.Tasks list of tasks to be tuned weights : List[int] the weight (i.e. the number of appearance) of extracted tasks """ target, target_host = Target.check_and_update_host_consist( target, target_host) if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) # Extract the tasks tasks, task_weights = auto_scheduler.extract_tasks( mod["main"], params, target=target, hardware_params=hardware_params, include_simple_tasks=include_simple_tasks, ) return tasks, task_weights
def compile(mod, target=None, target_host=None, params=None):
    """Compile the module to VM executable. A helper function for VMCompiler.

    Parameters
    ----------
    mod : tvm.IRModule
        The Relay module to build.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e. device/context name)
        to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context
        to target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        to set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    Returns
    -------
    exec : tvm.runtime.vm.Executable
        The VM executable that contains both library code and bytecode.
    """
    if target_host is not None:
        warnings.warn(
            "target_host parameter is going to be deprecated. "
            "Please pass in tvm.target.Target(target, host=target_host) instead."
        )
    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False
    )
    compiler = VMCompiler()
    if params:
        compiler.set_params(params)
    compiler.lower(mod, target)
    compiler.codegen()
    return compiler.get_exec()
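# A hedged end-to-end sketch for compile above: lower a tiny Relay function to a
# VM executable and run it on CPU. example_run_vm is a hypothetical helper name
# and the add-one workload is arbitrary; passing target_host triggers the
# deprecation notice and is merged into target.host by the consistency check.
import numpy as np
import tvm
from tvm import relay
from tvm.runtime.vm import VirtualMachine


def example_run_vm():
    x = relay.var("x", shape=(4,), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x], x + relay.const(1.0)))

    exe = relay.vm.compile(mod, target="llvm", target_host="llvm")

    vm = VirtualMachine(exe, tvm.cpu())
    out = vm.run(np.arange(4, dtype="float32"))
    print(out)
    return out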
def __getstate__(self): # custom pickle implementation is required for # some unpickable local task functions. # So we only pickle the name of the function # and restore the function by name when unpickling it. import cloudpickle # pylint: disable=import-outside-toplevel self.target, self.target_host = Target.check_and_update_host_consist( self.target, self.target_host) return { "name": self.name, "args": self.args, "kwargs": self.kwargs, "config_space": self.config_space, "flop": self.flop, "target": self.target, "target_host": self.target_host, "func": cloudpickle.dumps(self.func), }
def _local_build_worker(inp_serialized, build_func, verbose): tic = time.time() inp = MeasureInput.deserialize(inp_serialized) task = inp.task task.target, task.target_host = Target.check_and_update_host_consist( task.target, task.target_host ) error_no = MeasureErrorNo.NO_ERROR error_msg = None args = [] try: sch, args = task.compute_dag.apply_steps_from_state( inp.state, layout_rewrite=task.layout_rewrite_option ) # pylint: disable=broad-except except Exception: error_no = MeasureErrorNo.INSTANTIATION_ERROR error_msg = make_traceback_info() if error_no == 0: dirname = tempfile.mkdtemp() filename = os.path.join(dirname, "tmp_func." + build_func.output_format) try: with transform.PassContext(): func = build_module.build(sch, args, target=task.target) func.export_library(filename, build_func) # pylint: disable=broad-except except Exception: error_no = MeasureErrorNo.COMPILE_HOST error_msg = make_traceback_info() else: filename = "" if verbose >= 1: if error_no == MeasureErrorNo.NO_ERROR: print(".", end="", flush=True) else: print(".E", end="", flush=True) # Build error return filename, args, error_no, error_msg, time.time() - tic
def autotvm_get_tuning_tasks( mod: tvm.IRModule, params: Dict[str, tvm.nd.NDArray], target: str, target_host: Optional[str] = None, alter_layout: Optional[str] = None, ): """Get the autotvm tuning tasks for a given relay module. Parameters ---------- mod : tvm.IRModule The relay module from which to extract tuning tasks. params : dict The params for the relay module. target : tvm.target.Target The compilation target. target_host : str, optional The compilation target for the host. alter_layout : str, optional The layout to convert the graph to. Note, the convert layout pass doesn't currently guarantee the whole of the graph will be converted to the chosen layout. Returns ------- tasks : list of autotvm.Tasks list of tasks to be tuned """ target, target_host = Target.check_and_update_host_consist( target, target_host) if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) tasks = autotvm.task.extract_from_program( mod["main"], target=target, params=params, ) return tasks
def create(task_name, args, target, target_host=None): """Create a tuning task and initialize its search space Parameters ---------- task_name : str The AutoTVM task name args : List Positional arguments target : Target The compilation target target_host: Target, optional The compilation target for host side Returns ------- tsk: Task a task object """ args = serialize_args(args) ret = Task(task_name, args) if isinstance(target, str): target = Target(target) target, target_host = Target.check_and_update_host_consist( target, target_host) # init config space ret.config_space = ConfigSpace() ctx = ApplyConfig(ret.config_space) with ctx: with target: sch, _ = ret.func(*args) ret.config_space.code_hash = getattr(sch, "code_hash", None) ret.flop = ret.config_space.flop or compute_flop(sch) ret.target = target ret.target_host = target_host return ret
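# A hedged usage sketch for create above, mirroring get_sample_task earlier in
# this collection. It assumes the "testing/conv2d_no_batching" template has
# already been registered (it is defined next to get_sample_task in TVM's
# tuning tests); example_create_sample_task is a hypothetical helper name.
import tvm
from tvm import autotvm


def example_create_sample_task():
    task = autotvm.task.create(
        "testing/conv2d_no_batching",
        args=(1, 7, 7, 512, 512, 3, 3),
        target="cuda",
        target_host="llvm",
    )
    # create() initializes the search space; each entry is one candidate
    # schedule configuration for the template.
    print(len(task.config_space))
    return task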
def optimize(self, mod, target=None, target_host=None, params=None):
    """Helper method that optimizes a Relay module via VM.

    Parameters
    ----------
    mod : tvm.IRModule

    target : str, :any:`tvm.target.Target`, or dict of str (i.e. device/context
        name) to str/tvm.target.Target, optional

    target_host : str or :any:`tvm.target.Target`, optional
        The compilation target for host.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    Returns
    -------
    mod : tvm.IRModule
        The optimized relay module.

    params : dict
        The parameters of the final module.
    """
    if target_host is not None:
        warnings.warn(
            "target_host parameter is going to be deprecated. "
            "Please pass in tvm.target.Target(target, host=target_host) instead."
        )
    target = self._update_target(target)
    target_host = self._update_target_host(target, target_host)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False
    )
    if params:
        self.set_params(params)
    return self._optimize(mod, target, target_host), self.get_params()
def recover_measure_input(inp, rebuild_state=False):
    """
    Recover a deserialized MeasureInput by rebuilding the missing fields.
    1. Rebuild the compute_dag in inp.task
    2. (Optional) Rebuild the stages in inp.state

    Parameters
    ----------
    inp: MeasureInput
        The deserialized MeasureInput
    rebuild_state: bool = False
        Whether to rebuild the stages in MeasureInput.State

    Returns
    -------
    new_input: MeasureInput
        The fully recovered MeasureInput with all fields rebuilt.
    """
    # pylint: disable=import-outside-toplevel
    from .search_task import SearchTask  # lazily import to avoid recursive dependency

    task = inp.task
    task.target, task.target_host = Target.check_and_update_host_consist(
        task.target, task.target_host
    )
    new_task = SearchTask(
        workload_key=task.workload_key,
        target=task.target,
        hardware_params=task.hardware_params,
        layout_rewrite_option=task.layout_rewrite_option,
        task_inputs=list(task.task_input_names),
    )

    if rebuild_state:
        new_state = new_task.compute_dag.infer_bound_from_state(inp.state)
    else:
        new_state = inp.state

    return MeasureInput(new_task, new_state)
def measure_peak_all(target, target_host, host, port): """measure memory bandwidth and peak compute for gpu devices Parameters ---------- target: str or :any:`tvm.target.Target` target_host: str host: str port: int """ target, target_host = Target.check_and_update_host_consist(target, target_host) remote = rpc.connect(host, port) n_times = 20 bandwidth_total_item = 1 << 25 bandwidth_item_per_thread = 32 compute_total_item = 1 << 21 compute_item_per_thread = 4096 if str(target).startswith("opencl"): dev = remote.cl() elif str(target).startswith("cuda"): dev = remote.cuda() elif str(target).startswith("metal"): dev = remote.metal() else: raise RuntimeError("Unsupported target") logging.info("========== measure memory bandwidth ==========") measure_bandwidth_all_types( bandwidth_total_item, bandwidth_item_per_thread, n_times, target, target_host, remote, dev ) logging.info("========== measure peak compute ==========") measure_compute_all_types( compute_total_item, compute_item_per_thread, n_times, target, target_host, remote, dev )
def build(mod, target, target_host=None): """Backend build function. Parameters ---------- mod : tvm.IRModule or Dict[str, tvm.IRModule] Input module target : tvm.Target The target to run the code on. target_host : tvm.Target The host target. Returns ------- module : tvm.Module The runtime module. """ target_host = None if target_host == "" else target_host target, target_host = Target.check_and_update_host_consist( target, target_host) return tvm.driver.build(mod, target=target)
def build(inputs, args=None, target=None, target_host=None, name="default_function", binds=None): """Build a function with arguments as signature. Code will be generated for devices coupled with target information. Parameters ---------- inputs : tvm.te.Schedule, IRModule, or dict of target to IRModule The schedule to be built args : list of Buffer or Tensor or Var, optional The argument lists to the function. target : str or :any:`tvm.target.Target`, optional The target and option of the compilation. target_host : str or :any:`tvm.target.Target` optional Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm intepreter is used. name : str, optional The name of result function. binds : dict, optional Dictionary that maps the binding of symbolic buffer to Tensor. By default, a new buffer is created for each tensor in the argument. Returns ------- ret : tvm.module A module that combines both host and device code. Examples ________ There are two typical example uses of this function depending on the type of the argument `inputs`: 1. it is an IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.te.create_schedule(C.op) m = tvm.lower(s, [A, B, C], name="test_add") rt_mod = tvm.build(m, target="llvm") 2. it is a dict of compilation target to IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s1 = tvm.te.create_schedule(C.op) with tvm.target.cuda() as cuda_tgt: s2 = topi.cuda.schedule_injective(cuda_tgt, [C]) m1 = tvm.lower(s1, [A, B, C], name="test_add1") m2 = tvm.lower(s2, [A, B, C], name="test_add2") rt_mod = tvm.build({"llvm": m1, "cuda": m2}, target_host="llvm") Note ---- See the note on :any:`tvm.target` on target string format. 
""" if isinstance(inputs, schedule.Schedule): if args is None: raise ValueError("args must be given for build from schedule") input_mod = lower(inputs, args, name=name, binds=binds) elif isinstance(inputs, (list, tuple, container.Array)): merged_mod = tvm.IRModule({}) for x in inputs: merged_mod.update(x) input_mod = merged_mod elif isinstance(inputs, tvm.IRModule): input_mod = inputs elif not isinstance(inputs, (dict, container.Map)): raise ValueError( f"Inputs must be Schedule, IRModule or dict of target to IRModule, " f"but got {type(inputs)}.") if not isinstance(inputs, (dict, container.Map)): target = Target.current() if target is None else target target = target if target else "llvm" target_input_mod = {target: input_mod} else: target_input_mod = inputs for tar, mod in target_input_mod.items(): if not isinstance(tar, (str, Target)): raise ValueError("The key of inputs must be str or " "Target when inputs is dict.") if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") target_input_mod, target_host = Target.check_and_update_host_consist( target_input_mod, target_host) if not target_host: for tar, mod in target_input_mod.items(): tar = Target(tar) device_type = ndarray.device(tar.kind.name, 0).device_type if device_type == ndarray.cpu(0).device_type: target_host = tar break if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" target_input_mod, target_host = Target.check_and_update_host_consist( target_input_mod, target_host) mod_host_all = tvm.IRModule({}) device_modules = [] for tar, input_mod in target_input_mod.items(): mod_host, mdev = _build_for_device(input_mod, tar, target_host) mod_host_all.update(mod_host) device_modules.append(mdev) # Generate a unified host module. rt_mod_host = codegen.build_module(mod_host_all, target_host) # Import all modules. for mdev in device_modules: if mdev: rt_mod_host.import_module(mdev) if not isinstance(target_host, Target): target_host = Target(target_host) if (target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c" and target_host.attrs.get("system-lib", 0) == 1): if target_host.kind.name == "c": create_csource_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateCSourceCrtMetadataModule") return create_csource_crt_metadata_module([rt_mod_host], target_host) if target_host.kind.name == "llvm": create_llvm_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateLLVMCrtMetadataModule") return create_llvm_crt_metadata_module([rt_mod_host], target_host) return rt_mod_host
def _build_for_device(input_mod, target, target_host): """Build the lowered functions for a device with the given compilation target. Parameters ---------- input_mod : IRModule The schedule to be built. target : str or :any:`tvm.target.Target` The target and option of the compilation. target_host : str or :any:`tvm.target.Target` The host compilation target. Returns ------- fhost : IRModule The host IRModule. mdev : tvm.module A module that contains device code. """ target, target_host = Target.check_and_update_host_consist( target, target_host) device_type = ndarray.device(target.kind.name, 0).device_type mod_mixed = input_mod mod_mixed = tvm.tir.transform.Apply( lambda f: f.with_attr("target", target))(mod_mixed) opt_mixed = [tvm.tir.transform.VerifyMemory()] if len(mod_mixed.functions) == 1: opt_mixed += [ tvm.tir.transform.Apply( lambda f: f.with_attr("tir.is_entry_func", True)) ] if PassContext.current().config.get("tir.detect_global_barrier", False): opt_mixed += [tvm.tir.transform.ThreadSync("global")] opt_mixed += [ tvm.tir.transform.ThreadSync("shared"), tvm.tir.transform.ThreadSync("warp"), tvm.tir.transform.InferFragment(), tvm.tir.transform.LowerThreadAllreduce(), tvm.tir.transform.MakePackedAPI(), tvm.tir.transform.SplitHostDevice(), ] mod_mixed = tvm.transform.Sequential(opt_mixed)(mod_mixed) # device optimizations opt_device = tvm.transform.Sequential([ tvm.tir.transform.Filter( lambda f: "calling_conv" in f.attrs and f.attrs[ "calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH), tvm.tir.transform.LowerWarpMemory(), tvm.tir.transform.Simplify(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerCustomDatatypes(), tvm.tir.transform.LowerIntrin(), ]) mod_dev = opt_device(mod_mixed) # host optimizations opt_host = tvm.transform.Sequential([ tvm.tir.transform.Filter( lambda f: "calling_conv" not in f.attrs or f.attrs[ "calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH), tvm.tir.transform.Apply(lambda f: f.with_attr("target", target_host)), tvm.tir.transform.LowerTVMBuiltin(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerCustomDatatypes(), tvm.tir.transform.LowerIntrin(), tvm.tir.transform.CombineContextCall(), ]) mod_host = opt_host(mod_mixed) if device_type == ndarray.cpu(0).device_type and target_host == target: assert len(mod_dev.functions) == 0 if "gpu" in target.keys and len(mod_dev.functions) == 0: warnings.warn( "Specified target %s, but cannot find device code, did you do " "bind?" % target) rt_mod_dev = codegen.build_module( mod_dev, target) if len(mod_dev.functions) != 0 else None return mod_host, rt_mod_dev
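# A hedged sketch illustrating the split performed by _build_for_device above:
# for a GPU target, functions bound to GPU threads end up in the device module
# while the launch logic stays in the host module. Assumes a CUDA-enabled build
# of TVM; example_host_device_split is a hypothetical helper name.
import tvm
from tvm import te


def example_host_device_split():
    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(bx, te.thread_axis("blockIdx.x"))
    s[B].bind(tx, te.thread_axis("threadIdx.x"))

    mod = tvm.build(s, [A, B], target="cuda", target_host="llvm")
    # The top-level module holds the host-side code; the imported module holds
    # the CUDA kernel produced by the device-side pipeline.
    print(mod.imported_modules[0].get_source())
    return mod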
def build( inputs: Union[schedule.Schedule, PrimFunc, IRModule, Mapping[str, IRModule]], args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None, target: Optional[Union[str, Target]] = None, target_host: Optional[Union[str, Target]] = None, runtime: Optional[ "tvm.relay.backend.Runtime"] = None, # Type is annotated this way to avoid cyclic dependency name: Optional[str] = "default_function", binds: Optional[Mapping[tensor.Tensor, Buffer]] = None, ): """Build a function with arguments as signature. Code will be generated for devices coupled with target information. Parameters ---------- inputs : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule, Mapping[str, IRModule]] The input to be built args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, Var]]] The argument lists to the function. target : Optional[Union[str, Target]] The target and option of the compilation. target_host : Optional[Union[str, Target]] Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm interpreter is used. runtime : Optional[Runtime] Runtime to generate artifacts for name : Optional[str] The name of result function. binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]] Dictionary that maps the binding of symbolic buffer to Tensor. By default, a new buffer is created for each tensor in the argument. Returns ------- ret : tvm.module A module that combines both host and device code. Examples ________ There are two typical example uses of this function depending on the type of the argument `inputs`: 1. it is an IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.te.create_schedule(C.op) m = tvm.lower(s, [A, B, C], name="test_add") rt_mod = tvm.build(m, target="llvm") 2. it is a dict of compilation target to IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s1 = tvm.te.create_schedule(C.op) with tvm.target.cuda() as cuda_tgt: s2 = topi.cuda.schedule_injective(cuda_tgt, [C]) m1 = tvm.lower(s1, [A, B, C], name="test_add1") m2 = tvm.lower(s2, [A, B, C], name="test_add2") rt_mod = tvm.build({"llvm": m1, "cuda": m2}) Note ---- See the note on :any:`tvm.target` on target string format. """ if isinstance(inputs, schedule.Schedule): if args is None: raise ValueError("args must be given for build from schedule") input_mod = lower(inputs, args, name=name, binds=binds) elif isinstance(inputs, (list, tuple, container.Array)): merged_mod = tvm.IRModule({}) for x in inputs: merged_mod.update(lower(x)) input_mod = merged_mod elif isinstance(inputs, PrimFunc): input_mod = lower(inputs, name=name) elif isinstance(inputs, tvm.IRModule): if name is not None: warnings.warn("Specifying name with IRModule input is useless") input_mod = lower(inputs) elif not isinstance(inputs, (dict, container.Map)): raise ValueError( f"Inputs must be Schedule, IRModule or dict of target to IRModule, " f"but got {type(inputs)}.") if target_host is not None: warnings.warn( "target_host parameter is going to be deprecated. " "Please pass in tvm.target.Target(target, host=target_host) instead." 
) if not isinstance(inputs, (dict, container.Map)): target = Target.current() if target is None else target target = target if target else "llvm" target_input_mod = {target: input_mod} else: target_input_mod = inputs # Because modules can be created from a variety of sources, we annotate them # with the relevant attributes here to ensure they propagate annotated_mods = {} for tar, mod in target_input_mod.items(): if not isinstance(tar, (str, Target)): raise ValueError("The key of inputs must be str or " "Target when inputs is dict.") if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") annotated_mods[tar] = mod.with_attr("runtime", runtime) annotated_mods, target_host = Target.check_and_update_host_consist( annotated_mods, target_host) if not target_host: for tar, mod in annotated_mods.items(): tar = Target(tar) device_type = ndarray.device(tar.kind.name, 0).device_type if device_type == ndarray.cpu(0).device_type: target_host = tar break if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" annotated_mods, target_host = Target.check_and_update_host_consist( annotated_mods, target_host) rt_mod_host = _driver_ffi.preprocess_module(annotated_mods, target_host) annotated_mods, target_host = Target.check_and_update_host_consist( annotated_mods, target_host) if not isinstance(target_host, Target): target_host = Target(target_host) if str(runtime) == "crt" and runtime["system-lib"]: if target_host.kind.name == "c": create_csource_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateCSourceCrtMetadataModule") to_return = create_csource_crt_metadata_module([rt_mod_host], target_host, runtime) elif target_host.kind.name == "llvm": create_llvm_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateLLVMCrtMetadataModule") to_return = create_llvm_crt_metadata_module([rt_mod_host], target_host, runtime) else: to_return = rt_mod_host return OperatorModule.from_module(to_return, ir_module_by_target=annotated_mods, name=name)
def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): # fmt: off # pylint: disable=line-too-long """Helper function that builds a Relay function to run on TVM graph executor. Parameters ---------- ir_mod : :py:class:`~tvm.IRModule` The IR module to build. Using relay.Function is deprecated. target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context name) to str/tvm.target.Target, optional For heterogeneous compilation, it is a dictionary indicating context to target mapping. For homogeneous compilation, it is a build target. target_host : str or :any:`tvm.target.Target`, optional Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm intepreter is used. params : dict of str to NDArray Input parameters to the graph that do not change during inference time. Used for constant folding. mod_name: Optional[str] The module name we will build Returns ------- factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule The runtime factory for the TVM graph executor. """ # pylint: enable=line-too-long # fmt: on if not isinstance(ir_mod, (IRModule, _function.Function)): raise ValueError("Type of input parameter mod must be tvm.IRModule") if isinstance(ir_mod, _function.Function): if params: ir_mod = bind_params_by_name(ir_mod, params) ir_mod = IRModule.from_expr(ir_mod) warnings.warn( "Please use input parameter mod (tvm.IRModule) " "instead of deprecated parameter mod (tvm.relay.function.Function)", DeprecationWarning, ) target = build_target_by_device_type_map(target) if isinstance(target_host, (str, Target)): target_host = Target(target_host) elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False) # Retrieve the executor from the target executor = get_executor_from_target(target, target_host) # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): tophub_context = autotvm.tophub.context(list(target.values())) else: tophub_context = autotvm.utils.EmptyContext() with tophub_context: bld_mod = BuildModule() executor_config, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params, executor=executor, mod_name=mod_name) func_metadata = bld_mod.get_function_metadata() if executor == "aot": executor_factory = _executor_factory.AOTExecutorFactoryModule( ir_mod, target, runtime_mod, mod_name, params, func_metadata) elif executor == "graph": executor_factory = _executor_factory.GraphExecutorFactoryModule( ir_mod, target, executor_config, runtime_mod, mod_name, params, func_metadata) else: assert False, "Executor " + executor + " not supported" return executor_factory
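# A hedged usage sketch for the relay build entry point above: compile a tiny
# Relay module for CPU and run it through the graph executor. The helper name
# example_build_and_run is hypothetical; the input values are arbitrary.
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor


def example_build_and_run():
    x = relay.var("x", shape=(1, 4), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

    # target_host is merged into the target via check_and_update_host_consist.
    lib = relay.build(mod, target="llvm", target_host="llvm")

    dev = tvm.cpu(0)
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input("x", np.array([[-1.0, 0.0, 1.0, 2.0]], dtype="float32"))
    module.run()
    print(module.get_output(0))
    return module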
def tune_model( tvmc_model: TVMCModel, target: str, tuning_records: Optional[str] = None, prior_records: Optional[str] = None, enable_autoscheduler: bool = False, rpc_key: Optional[str] = None, hostname: Optional[str] = None, port: Optional[Union[int, str]] = 9090, trials: int = 10000, target_host: Optional[str] = None, tuner: str = "xgb", min_repeat_ms: Optional[int] = None, early_stopping: Optional[int] = None, desired_layout: Optional[str] = None, timeout: int = 10, repeat: int = 1, number: int = 10, parallel: int = 4, hardware_params: Optional[HardwareParams] = None, include_simple_tasks: bool = False, log_estimated_latency: bool = False, additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None, ): """Use tuning to automatically optimize the functions in a model. Parameters ---------- tvmc_model : TVMCModel The model to be optimized. target : str Compilation target as plain string, inline JSON or path to a JSON file. tuning_records: str, optional The path to a file that tuning results will be saved to. If not specified, a temporary file will be used. prior_records: str, optional A path to previous tuning results that will be used to hot-start the tuning cost model if provided. enable_autoscheduler : bool, optional When true, use autoscheduling rather than autotvm. This should produce faster kernels for compatible model-target pairs. rpc_key : str, optional The RPC tracker key of the target device. Required when rpc_tracker is provided. hostname : str, optional The IP address of an RPC tracker, used when benchmarking remotely. port : int or str, optional The port of the RPC tracker to connect to. Defaults to 9090. trials : int, optional The number of schedules to try out for the entire model. Note that the default value is chosen as a decent average for most models, but larger models may need more trials to reach a good result while smaller models will converge with fewer trials. tuner : str, optional The type of tuner to use when tuning with autotvm. Can be one of "ga", "gridsearch", "random", "xgb", "xgb_knob", and "xgb-rank". min_repeat_ms : int, optional Minimum time to run each trial. Defaults to 0 on x86 and 1000 on other targets. early_stopping : int, optional When specified, stop tuning after this number of trials if results aren't improving. desired_layout : str, optional Can be one of "NCHW" or "NHWC". When specified, compatible operations in the graph will have their layout set to this format. Tasks will then be tuned using this specified layout. timeout : int, optional, If a kernel trial lasts longer than this duration in seconds, it will be considered a failure. repeat : int, optional How many times each measurement should be repeated. number : int, optional The number of runs a single repeat is made of. parallel : int, optional The maximum number of parallel devices to use when tuning. hardware_params : auto_scheduler.HardwareParams, optional When using the autoscheduler, this object defines the configuration of the target hardware. include_simple_tasks : bool, optional Whether to extract simple operations or only computationally intensive ones when using the autoscheduler. log_estimated_latency : bool, optional If using the autoscheduler, write the estimated latency at each step of tuning to file. additional_target_options: Optional[Dict[str, Dict[str, Any]]] Additional target options in a dictionary to combine with initial Target arguments Returns ------- tuning_records : str The path to the produced tuning log file. 
""" target, extra_targets = common.target_from_cli(target, additional_target_options) target, target_host = Target.check_and_update_host_consist( target, target_host) # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source # model is fixed. For now, creating a clone avoids the issue. mod = deepcopy(tvmc_model.mod) params = tvmc_model.params if tuning_records is None: tuning_records = tvmc_model.default_tuning_records_path() for codegen_from_cli in extra_targets: codegen = composite_target.get_codegen_by_target( codegen_from_cli["name"]) partition_function = codegen["pass_pipeline"] mod = partition_function(mod, params, **codegen_from_cli["opts"]) # min_repeat_ms should be: # a. the value provided by the user, if any, or # b. 0ms in case target is "cpu"; otherwise 1000ms if min_repeat_ms is None: min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000 logger.info("Default --min-repeat-ms for this target is %s", min_repeat_ms) if rpc_key: if hostname is None or port is None: raise common.TVMCException( "You must provide a hostname and port to connect to a remote RPC device." ) if isinstance(port, str): port = int(port) logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key, hostname, port) runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner runner = runner_ctor( key=rpc_key, host=hostname, port=port, number=number, repeat=repeat, n_parallel=parallel, timeout=timeout, min_repeat_ms=min_repeat_ms, ) else: logger.info("Starting localhost tuning.") runner_ctor = (auto_scheduler.LocalRPCMeasureContext if enable_autoscheduler else autotvm.LocalRunner) local_server = runner_ctor( number=number, repeat=repeat, timeout=timeout, min_repeat_ms=min_repeat_ms, ) # For autoscheduling on some devices, we need to maintain a LocalRPCMeasureContext object. if enable_autoscheduler: runner = local_server.runner else: runner = local_server if enable_autoscheduler: tasks, weights = autoscheduler_get_tuning_tasks( mod=mod, params=params, target=target, alter_layout=desired_layout, hardware_params=hardware_params, include_simple_tasks=include_simple_tasks, ) # Create the autoscheduler tuning options tuning_options = auto_scheduler.TuningOptions( num_measure_trials=trials, measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)], runner=runner, early_stopping=early_stopping, ) logger.info("Autoscheduling with configuration: %s", tuning_options) # Schedule the tasks (i.e., produce a schedule for each task) schedule_tasks(tasks, weights, tuning_options, prior_records, log_estimated_latency) else: tasks = autotvm_get_tuning_tasks( mod=mod, params=params, target=target, alter_layout=desired_layout, ) # In autotvm, trials is specified per task. We can convert the per-model input # provided to per-task trials by dividing by the number of tasks. trials = int(trials / len(tasks)) logger.info("Autotuning with %d trials per task.", trials) tuning_options = { "tuner": tuner, "trials": trials, "early_stopping": early_stopping, "measure_option": autotvm.measure_option( builder=autotvm.LocalBuilder(build_func="default"), runner=runner), "tuning_records": prior_records, } logger.info("Autotuning with configuration: %s", tuning_options) tune_tasks(tasks, tuning_records, **tuning_options) return tuning_records