def build(inputs, args=None, target=None, target_host=None, name="default_function", binds=None): """Build a function with arguments as signature. Code will be generated for devices coupled with target information. Parameters ---------- inputs : tvm.te.Schedule, IRModule, or dict of target to IRModule The schedule to be built args : list of Buffer or Tensor or Var, optional The argument lists to the function. target : str or :any:`tvm.target.Target`, optional The target and option of the compilation. target_host : str or :any:`tvm.target.Target` optional Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm intepreter is used. name : str, optional The name of result function. binds : dict, optional Dictionary that maps the binding of symbolic buffer to Tensor. By default, a new buffer is created for each tensor in the argument. Returns ------- ret : tvm.module A module that combines both host and device code. Examples ________ There are two typical example uses of this function depending on the type of the argument `inputs`: 1. it is an IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.te.create_schedule(C.op) m = tvm.lower(s, [A, B, C], name="test_add") rt_mod = tvm.build(m, target="llvm") 2. it is a dict of compilation target to IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s1 = tvm.te.create_schedule(C.op) with tvm.target.cuda() as cuda_tgt: s2 = topi.cuda.schedule_injective(cuda_tgt, [C]) m1 = tvm.lower(s1, [A, B, C], name="test_add1") m2 = tvm.lower(s2, [A, B, C], name="test_add2") rt_mod = tvm.build({"llvm": m1, "cuda": m2}, target_host="llvm") Note ---- See the note on :any:`tvm.target` on target string format. """ if isinstance(inputs, schedule.Schedule): if args is None: raise ValueError("args must be given for build from schedule") input_mod = lower(inputs, args, name=name, binds=binds) elif isinstance(inputs, (list, tuple, container.Array)): merged_mod = tvm.IRModule({}) for x in inputs: merged_mod.update(x) input_mod = merged_mod elif isinstance(inputs, tvm.IRModule): input_mod = inputs elif not isinstance(inputs, (dict, container.Map)): raise ValueError( f"Inputs must be Schedule, IRModule or dict of target to IRModule, " f"but got {type(inputs)}.") if not isinstance(inputs, (dict, container.Map)): target = Target.current() if target is None else target target = target if target else "llvm" target_input_mod = {target: input_mod} else: target_input_mod = inputs for tar, mod in target_input_mod.items(): if not isinstance(tar, (str, Target)): raise ValueError("The key of inputs must be str or " "Target when inputs is dict.") if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") target_input_mod, target_host = Target.check_and_update_host_consist( target_input_mod, target_host) if not target_host: for tar, mod in target_input_mod.items(): tar = Target(tar) device_type = ndarray.device(tar.kind.name, 0).device_type if device_type == ndarray.cpu(0).device_type: target_host = tar break if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" target_input_mod, target_host = Target.check_and_update_host_consist( target_input_mod, target_host) mod_host_all = tvm.IRModule({}) device_modules = [] for tar, input_mod in target_input_mod.items(): mod_host, mdev = _build_for_device(input_mod, tar, target_host) mod_host_all.update(mod_host) device_modules.append(mdev) # Generate a unified host module. rt_mod_host = codegen.build_module(mod_host_all, target_host) # Import all modules. for mdev in device_modules: if mdev: rt_mod_host.import_module(mdev) if not isinstance(target_host, Target): target_host = Target(target_host) if (target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c" and target_host.attrs.get("system-lib", 0) == 1): if target_host.kind.name == "c": create_csource_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateCSourceCrtMetadataModule") return create_csource_crt_metadata_module([rt_mod_host], target_host) if target_host.kind.name == "llvm": create_llvm_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateLLVMCrtMetadataModule") return create_llvm_crt_metadata_module([rt_mod_host], target_host) return rt_mod_host
def _build_for_device(input_mod, target, target_host): """Build the lowered functions for a device with the given compilation target. Parameters ---------- input_mod : IRModule The schedule to be built. target : str or :any:`tvm.target.Target` The target and option of the compilation. target_host : str or :any:`tvm.target.Target` The host compilation target. Returns ------- fhost : IRModule The host IRModule. mdev : tvm.module A module that contains device code. """ target, target_host = Target.check_and_update_host_consist( target, target_host) device_type = ndarray.device(target.kind.name, 0).device_type mod_mixed = input_mod mod_mixed = tvm.tir.transform.Apply( lambda f: f.with_attr("target", target))(mod_mixed) opt_mixed = [tvm.tir.transform.VerifyMemory()] if len(mod_mixed.functions) == 1: opt_mixed += [ tvm.tir.transform.Apply( lambda f: f.with_attr("tir.is_entry_func", True)) ] if PassContext.current().config.get("tir.detect_global_barrier", False): opt_mixed += [tvm.tir.transform.ThreadSync("global")] opt_mixed += [ tvm.tir.transform.ThreadSync("shared"), tvm.tir.transform.ThreadSync("warp"), tvm.tir.transform.InferFragment(), tvm.tir.transform.LowerThreadAllreduce(), tvm.tir.transform.MakePackedAPI(), tvm.tir.transform.SplitHostDevice(), ] mod_mixed = tvm.transform.Sequential(opt_mixed)(mod_mixed) # device optimizations opt_device = tvm.transform.Sequential([ tvm.tir.transform.Filter( lambda f: "calling_conv" in f.attrs and f.attrs[ "calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH), tvm.tir.transform.LowerWarpMemory(), tvm.tir.transform.Simplify(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerCustomDatatypes(), tvm.tir.transform.LowerIntrin(), ]) mod_dev = opt_device(mod_mixed) # host optimizations opt_host = tvm.transform.Sequential([ tvm.tir.transform.Filter( lambda f: "calling_conv" not in f.attrs or f.attrs[ "calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH), tvm.tir.transform.Apply(lambda f: f.with_attr("target", target_host)), tvm.tir.transform.LowerTVMBuiltin(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerCustomDatatypes(), tvm.tir.transform.LowerIntrin(), tvm.tir.transform.CombineContextCall(), ]) mod_host = opt_host(mod_mixed) if device_type == ndarray.cpu(0).device_type and target_host == target: assert len(mod_dev.functions) == 0 if "gpu" in target.keys and len(mod_dev.functions) == 0: warnings.warn( "Specified target %s, but cannot find device code, did you do " "bind?" % target) rt_mod_dev = codegen.build_module( mod_dev, target) if len(mod_dev.functions) != 0 else None return mod_host, rt_mod_dev
def build(inputs, args=None, target=None, target_host=None, name="default_function", binds=None): """Build a function with arguments as signature. Code will be generated for devices coupled with target information. Parameters ---------- inputs : tvm.te.Schedule, LoweredFunc, or dict of target to LoweredFunc list The schedule to be built args : list of Buffer or Tensor or Var, optional The argument lists to the function. target : str or :any:`tvm.target.Target`, optional The target and option of the compilation. target_host : str or :any:`tvm.target.Target` optional Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm intepreter is used. name : str, optional The name of result function. binds : dict, optional Dictionary that maps the binding of symbolic buffer to Tensor. By default, a new buffer is created for each tensor in the argument. Returns ------- ret : tvm.module A module that combines both host and device code. Examples ________ There are two typical example uses of this function depending on the type of the argument `inputs`: 1. it is a list of lowered functions: .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.te.create_schedule(C.op) f = tvm.lower(s, [A, B, C], name="test_add") m = tvm.build(f, target="llvm") 2. it is a dict of compilation target to list of lowered functions: .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s1 = tvm.te.create_schedule(C.op) with tvm.target.cuda() as cuda_tgt: s2 = topi.cuda.schedule_injective(cuda_tgt, [C]) f1 = tvm.lower(s1, [A, B, C], name="test_add1") f2 = tvm.lower(s2, [A, B, C], name="test_add2") m = tvm.build({"llvm": [f1], "cuda": [f2]}, target_host="llvm") Note ---- See the note on :any:`tvm.target` on target string format. """ if isinstance(inputs, schedule.Schedule): if args is None: raise ValueError("args must be given for build from schedule") flist = lower(inputs, args, name=name, binds=binds) if isinstance(flist, LoweredFunc): flist = [flist] elif isinstance(inputs, LoweredFunc): if args: raise ValueError("args must be done when build from LoweredFunc.") flist = [inputs] elif isinstance(inputs, (list, tuple, container.Array)): flist = inputs elif not isinstance(inputs, (dict, container.Map)): raise ValueError("inputs must be Schedule, LoweredFunc, list of " "LoweredFunc, or dict of target to list of " "LoweredFunc.") if not isinstance(inputs, (dict, container.Map)): target = _target.Target.current() if target is None else target target = target if target else "llvm" target_flist = {target: flist} else: target_flist = inputs for tar, flist in target_flist.items(): if not isinstance(tar, (str, _target.Target)): raise ValueError("The key of inputs must be str or " "_target.Target when inputs is dict.") fname_set = set() for x in flist: if not isinstance(x, LoweredFunc): raise ValueError("inputs must be Schedule, LoweredFunc, list " "of LoweredFunc, or dict of str to list of " "LoweredFunc.") if x.name in fname_set: raise ValueError("Duplicate function name %s" % x.name) fname_set.add(x.name) if not target_host: for tar, _ in target_flist.items(): tar = _target.create(tar) device_type = ndarray.context(tar.target_name, 0).device_type if device_type == ndarray.cpu(0).device_type: target_host = tar break if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" mod_host_all = tvm.IRModule({}) device_modules = [] for tar, flist in target_flist.items(): mod_host, mdev = _build_for_device(flist, tar, target_host) mod_host_all.update(mod_host) device_modules.append(mdev) # Generate a unified host module. rt_mod_host = codegen.build_module(mod_host_all, target_host) # Import all modules. for mdev in device_modules: if mdev: rt_mod_host.import_module(mdev) return rt_mod_host
def _build_for_device(flist, target, target_host): """Build the lowered functions for a device with the given compilation target. Parameters ---------- flist : list of LoweredFunc The schedule to be built. target : str or :any:`tvm.target.Target` The target and option of the compilation. target_host : str or :any:`tvm.target.Target` The host compilation target. Returns ------- fhost : list of LoweredFunc A list of lowered functions for the host. mdev : tvm.module A module that contains device code. """ target = _target.create(target) target_host = _target.create(target_host) device_type = ndarray.context(target.target_name, 0).device_type for func in flist: if not ir_pass.VerifyMemory(func, device_type): raise ValueError( "Direct host side access to device memory is detected in %s. " "Did you forget to bind?" % func.name) mod_mixed = tvm.testing.LoweredFuncsToIRModule(flist) opt_mixed = [tvm.tir.transform.Apply(lambda f: f.with_attr("target", target))] if BuildConfig.current().detect_global_barrier: opt_mixed += [tvm.tir.transform.ThreadSync("global")] opt_mixed += [tvm.tir.transform.ThreadSync("shared"), tvm.tir.transform.ThreadSync("warp"), tvm.tir.transform.InferFragment(), tvm.tir.transform.LowerThreadAllreduce(), tvm.tir.transform.BindDeviceType(), tvm.tir.transform.SplitHostDevice()] mod_mixed = tvm.ir.transform.Sequential(opt_mixed)(mod_mixed) # device optimizations opt_device = tvm.ir.transform.Sequential( [tvm.tir.transform.Filter( lambda f: "calling_conv" in f.attrs and f.attrs["calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH), tvm.tir.transform.LowerWarpMemory(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerIntrin()]) mod_dev = opt_device(mod_mixed) # host optimizations opt_host = tvm.ir.transform.Sequential( [tvm.tir.transform.Filter( lambda f: "calling_conv" not in f.attrs or f.attrs["calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH), tvm.tir.transform.Apply(lambda f: f.with_attr("target", target)), tvm.tir.transform.LowerTVMBuiltin(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerIntrin(), tvm.tir.transform.CombineContextCall()]) mod_host = opt_host(mod_mixed) if device_type == ndarray.cpu(0).device_type and target_host == target: assert len(mod_dev.functions) == 0 if "gpu" in target.keys and len(mod_dev.functions) == 0: warnings.warn( "Specified target %s, but cannot find device code, did you do " "bind?" % target) rt_mod_dev = codegen.build_module(mod_dev, target) if len(mod_dev.functions) != 0 else None return mod_host, rt_mod_dev
def _build_for_device(flist, target, target_host): """Build the lowered functions for a device with the given compilation target. Parameters ---------- flist : list of LoweredFunc The schedule to be built. target : str or :any:`tvm.target.Target` The target and option of the compilation. target_host : str or :any:`tvm.target.Target` The host compilation target. Returns ------- fhost : list of LoweredFunc A list of lowered functions for the host. mdev : tvm.module A module that contains device code. """ target = _target.create(target) device_type = ndarray.context(target.target_name, 0).device_type fhost = [] fdevice = [] for func in flist: if not ir_pass.VerifyMemory(func, device_type): raise ValueError( "Direct host side access to device memory is detected in %s. " "Did you forget to bind?" % func.name) if func.func_type == LoweredFunc.MixedFunc: if current_build_config().detect_global_barrier: func = ir_pass.ThreadSync(func, "global") func = ir_pass.ThreadSync(func, "shared") func = ir_pass.ThreadSync(func, "warp") func = ir_pass.InferFragment(func) warp_size = target.thread_warp_size func = ir_pass.LowerThreadAllreduce(func, warp_size) fsplits = list(ir_pass.SplitHostDevice(func)) fhost.append(fsplits[0]) for x in fsplits[1:]: fdevice.append(x) elif func.func_type == LoweredFunc.HostFunc: fhost.append(func) elif func.func_type == LoweredFunc.DeviceFunc: fdevice.append(func) else: raise ValueError("unknown function type %d" % func.func_type) for i, func in enumerate(fdevice): warp_size = target.thread_warp_size fdevice[i] = ir_pass.LowerWarpMemory(func, warp_size) if "gpu" in target.keys and not fdevice: warnings.warn( "Specified target %s, but cannot find device code, did you do " "bind?" % target) fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost] fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost] if device_type == ndarray.cpu(0).device_type and target_host == target: assert not fdevice target_host = _target.create(target_host) fdevice = [ir_pass.LowerDeviceStorageAccessInfo(x) for x in fdevice] fhost = [ir_pass.LowerDeviceStorageAccessInfo(x) for x in fhost] fdevice = [ir_pass.LowerIntrin(x, target.target_name) for x in fdevice] fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost] fhost = [ir_pass.CombineContextCall(x) for x in fhost] mdev = codegen.build_module(fdevice, str(target)) if fdevice else None return fhost, mdev
def _build_for_device(flist, target, target_host): """Build the lowered functions for a device with the given compilation target. Parameters ---------- flist : list of LoweredFunc The schedule to be built. target : str or :any:`tvm.target.Target` The target and option of the compilation. target_host : str or :any:`tvm.target.Target` The host compilation target. Returns ------- fhost : list of LoweredFunc A list of lowered functions for the host. mdev : tvm.module A module that contains device code. """ @tvm.tir.transform.prim_func_pass(opt_level=0) class BindTarget: def __init__(self, target): self.target = target # pylint: disable=unused-argument def transform_function(self, func, mod, ctx): return func.with_attr("target", self.target) target = _target.create(target) device_type = ndarray.context(target.target_name, 0).device_type fhost = [] fdevice = [] for func in flist: if not ir_pass.VerifyMemory(func, device_type): raise ValueError( "Direct host side access to device memory is detected in %s. " "Did you forget to bind?" % func.name) if func.func_type == LoweredFunc.MixedFunc: if BuildConfig.current().detect_global_barrier: func = ir_pass.ThreadSync(func, "global") func = ir_pass.ThreadSync(func, "shared") func = ir_pass.ThreadSync(func, "warp") func = ir_pass.InferFragment(func) warp_size = target.thread_warp_size func = ir_pass.LowerThreadAllreduce(func, warp_size) fsplits = list(ir_pass.SplitHostDevice(func)) fhost.append(fsplits[0]) for x in fsplits[1:]: fdevice.append(x) elif func.func_type == LoweredFunc.HostFunc: fhost.append(func) elif func.func_type == LoweredFunc.DeviceFunc: fdevice.append(func) else: raise ValueError("unknown function type %d" % func.func_type) if "gpu" in target.keys and not fdevice: warnings.warn( "Specified target %s, but cannot find device code, did you do " "bind?" % target) fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost] if device_type == ndarray.cpu(0).device_type and target_host == target: assert not fdevice target_host = _target.create(target_host) # device optimizations mod_dev = tvm.testing.LoweredFuncsToIRModule(fdevice) opt_device = tvm.ir.transform.Sequential( [BindTarget(target), tvm.tir.transform.LowerWarpMemory(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerIntrin()]) mod_dev = opt_device(mod_dev) # host optimizations mod_host = tvm.testing.LoweredFuncsToIRModule(fhost) opt_host = tvm.ir.transform.Sequential( [BindTarget(target_host), tvm.tir.transform.LowerTVMBuiltin(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerIntrin(), tvm.tir.transform.CombineContextCall()]) mod_host = opt_host(mod_host) rt_mod_dev = codegen.build_module(mod_dev, target) if fdevice else None return mod_host, rt_mod_dev