def add(self, sig):
    """
    Compile the DUFunc for the given signature.
    """
    args, return_type = sigutils.normalize_signature(sig)
    return self._compile_for_argtys(args, return_type)
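# Usage sketch for the `add` method above (assumption: this is numba's
# DUFunc.add; a DUFunc is what `@vectorize` with no signatures returns).
# It eagerly compiles a loop for one signature:
from numba import vectorize

@vectorize
def mul(a, b):
    return a * b

mul.add("float64(float64, float64)")  # pre-compile this signature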
def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None,
        opt=True, cache=False, **kws):
    """
    JIT compile a python function conforming to the CUDA Python
    specification. If a signature is supplied, then a function is returned
    that takes a function to compile.

    :param func_or_sig: A function to JIT compile, or a signature of a
       function to compile. If a function is supplied, then a
       :class:`numba.cuda.compiler.AutoJitCUDAKernel` is returned. If a
       signature is supplied, then a function is returned. The returned
       function accepts another function, which it will compile and then
       return a :class:`numba.cuda.compiler.AutoJitCUDAKernel`.

       .. note:: A kernel cannot have any return value.
    :param device: Indicates whether this is a device function.
    :type device: bool
    :param link: A list of files containing PTX source to link with the
       function
    :type link: list
    :param debug: If True, check for exceptions thrown when executing the
       kernel. Since this degrades performance, this should only be used for
       debugging purposes. If set to True, then ``opt`` should be set to
       False. Defaults to False. (The default value can be overridden by
       setting environment variable ``NUMBA_CUDA_DEBUGINFO=1``.)
    :param fastmath: When True, enables fastmath optimizations as outlined in
       the :ref:`CUDA Fast Math documentation <cuda-fast-math>`.
    :param max_registers: Request that the kernel is limited to using at most
       this number of registers per thread. The limit may not be respected if
       the ABI requires a greater number of registers than that requested.
       Useful for increasing occupancy.
    :param opt: Whether to compile from LLVM IR to PTX with optimization
                enabled. When ``True``, ``-opt=3`` is passed to NVVM. When
                ``False``, ``-opt=0`` is passed to NVVM. Defaults to ``True``.
    :type opt: bool
    :param lineinfo: If True, generate a line mapping between source code and
       assembly code. This enables inspection of the source code in NVIDIA
       profiling tools and correlation with program counter sampling.
    :type lineinfo: bool
    :param cache: If True, enables the file-based cache for this function.
    :type cache: bool
    """

    if link and config.ENABLE_CUDASIM:
        raise NotImplementedError('Cannot link PTX in the simulator')

    if kws.get('boundscheck'):
        raise NotImplementedError("bounds checking is not supported for CUDA")

    if kws.get('argtypes') is not None:
        msg = _msg_deprecated_signature_arg.format('argtypes')
        raise DeprecationError(msg)
    if kws.get('restype') is not None:
        msg = _msg_deprecated_signature_arg.format('restype')
        raise DeprecationError(msg)
    if kws.get('bind') is not None:
        msg = _msg_deprecated_signature_arg.format('bind')
        raise DeprecationError(msg)

    debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
    fastmath = kws.get('fastmath', False)
    extensions = kws.get('extensions', [])

    if debug and opt:
        msg = ("debug=True with opt=True (the default) is not supported by "
               "CUDA. This may result in a crash - set debug=False or "
               "opt=False.")
        warn(NumbaInvalidConfigWarning(msg))

    if device and kws.get('link'):
        raise ValueError("link keyword invalid for device function")

    if sigutils.is_signature(func_or_sig):
        if config.ENABLE_CUDASIM:
            def jitwrapper(func):
                return FakeCUDAKernel(func, device=device, fastmath=fastmath)
            return jitwrapper

        argtypes, restype = sigutils.normalize_signature(func_or_sig)

        if restype and not device and restype != types.void:
            raise TypeError("CUDA kernel must have void return type.")

        def _jit(func):
            targetoptions = kws.copy()
            targetoptions['debug'] = debug
            targetoptions['link'] = link
            targetoptions['opt'] = opt
            targetoptions['fastmath'] = fastmath
            targetoptions['device'] = device
            targetoptions['extensions'] = extensions
            disp = CUDADispatcher(func, targetoptions=targetoptions)

            if cache:
                disp.enable_caching()

            if device:
                from numba.core import typeinfer
                with typeinfer.register_dispatcher(disp):
                    disp.compile_device(argtypes)
            else:
                disp.compile(argtypes)

            disp._specialized = True
            disp.disable_compile()

            return disp

        return _jit
    else:
        if func_or_sig is None:
            if config.ENABLE_CUDASIM:
                def autojitwrapper(func):
                    return FakeCUDAKernel(func, device=device,
                                          fastmath=fastmath)
            else:
                def autojitwrapper(func):
                    return jit(func, device=device, debug=debug, opt=opt,
                               link=link, cache=cache, **kws)

            return autojitwrapper
        # func_or_sig is a function
        else:
            if config.ENABLE_CUDASIM:
                return FakeCUDAKernel(func_or_sig, device=device,
                                      fastmath=fastmath)
            else:
                targetoptions = kws.copy()
                targetoptions['debug'] = debug
                targetoptions['opt'] = opt
                targetoptions['link'] = link
                targetoptions['fastmath'] = fastmath
                targetoptions['device'] = device
                targetoptions['extensions'] = extensions
                disp = CUDADispatcher(func_or_sig,
                                      targetoptions=targetoptions)

                if cache:
                    disp.enable_caching()

                return disp
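# Usage sketch for the `jit` decorator above (assumption: it is exposed
# as `numba.cuda.jit` and a CUDA-capable setup is available). Supplying a
# signature compiles the kernel eagerly; kernels must return void:
from numba import cuda

@cuda.jit("void(float32[:], float32[:])")
def scale(inp, out):
    i = cuda.grid(1)
    if i < inp.size:
        out[i] = 2.0 * inp[i]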
def get_overload(self, sig):
    """
    Return the compiled function for the given signature.
    """
    args, return_type = sigutils.normalize_signature(sig)
    return self.overloads[tuple(args)].entry_point
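# Usage sketch: once a dispatcher has compiled a signature, the compiled
# entry point can be retrieved with `get_overload` (the dispatcher name
# `disp` and the signature below are illustrative):
#
#     cfunc = disp.get_overload("void(float32[:], float32[:])")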
def __init__(self, py_func, sigs, targetoptions):
    self.py_func = py_func
    self.sigs = []
    self.link = targetoptions.pop('link', ())
    self._can_compile = True
    self._type = self._numba_type_

    # The compiling counter is only used when compiling device functions as
    # it is used to detect recursion - recursion is not possible when
    # compiling a kernel.
    self._compiling_counter = CompilingCounter()

    # Specializations for given sets of argument types
    self.specializations = {}

    # A mapping of signatures to compile results
    self.overloads = collections.OrderedDict()

    self.targetoptions = targetoptions

    # defensive copy
    self.targetoptions['extensions'] = \
        list(self.targetoptions.get('extensions', []))

    self.typingctx = self.targetdescr.typing_context

    self._tm = default_type_manager

    pysig = utils.pysignature(py_func)
    arg_count = len(pysig.parameters)
    argnames = tuple(pysig.parameters)
    default_values = self.py_func.__defaults__ or ()
    defargs = tuple(OmittedArg(val) for val in default_values)
    can_fallback = False  # CUDA cannot fallback to object mode

    try:
        lastarg = list(pysig.parameters.values())[-1]
    except IndexError:
        has_stararg = False
    else:
        has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL

    exact_match_required = False

    _dispatcher.Dispatcher.__init__(self, self._tm.get_pointer(),
                                    arg_count, self._fold_args, argnames,
                                    defargs, can_fallback, has_stararg,
                                    exact_match_required)

    if sigs:
        if len(sigs) > 1:
            raise TypeError("Only one signature supported at present")

        if targetoptions.get('device'):
            argtypes, restype = sigutils.normalize_signature(sigs[0])
            self.compile_device(argtypes)
        else:
            self.compile(sigs[0])

        self._can_compile = False

    if targetoptions.get('device'):
        self._register_device_function()
def wrapped(func):
    fn_argtys, fn_retty = sigutils.normalize_signature(sig)
    signature = typing.signature(fn_retty, *fn_argtys)
    entry = ExportEntry(symbol=sym, signature=signature, function=func)
    export_registry.append(entry)
    # Return the original function so the decorator is transparent
    # (assumed decorator behavior; the enclosing function is not shown).
    return func
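# Context sketch: `wrapped` reads like the inner function of an
# @export(prototype)-style decorator factory (the enclosing function that
# supplies `sym` and `sig` is not shown here). A hypothetical use:
#
#     @export('mult f8(f8, f8)')
#     def mult(a, b):
#         return a * b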
def compile_instance(func, sig,
                     target: TargetInfo,
                     typing_context,
                     target_context,
                     pipeline_class,
                     main_library,
                     debug=False):
    """Compile a function with the given signature. Return the function
    name when successful.
    """
    flags = compiler.Flags()
    flags.set('no_compile')
    flags.set('no_cpython_wrapper')
    if get_version('numba') >= (0, 49):
        flags.set('no_cfunc_wrapper')

    fname = func.__name__ + sig.mangling()
    args, return_type = sigutils.normalize_signature(
        sig.tonumba(bool_is_int8=True))
    try:
        cres = compiler.compile_extra(typingctx=typing_context,
                                      targetctx=target_context,
                                      func=func,
                                      args=args,
                                      return_type=return_type,
                                      flags=flags,
                                      library=main_library,
                                      locals={},
                                      pipeline_class=pipeline_class)
    except UnsupportedError as msg:
        for m in re.finditer(r'[|]UnsupportedError[|](.*?)\n',
                             str(msg), re.S):
            warnings.warn(f'Skipping {fname}: {m.group(0)[18:]}')
        return
    except nb_errors.TypingError as msg:
        for m in re.finditer(r'[|]UnsupportedError[|](.*?)\n',
                             str(msg), re.S):
            warnings.warn(f'Skipping {fname}: {m.group(0)[18:]}')
            break
        else:
            raise
        return

    result = get_called_functions(cres.library, cres.fndesc.llvm_func_name)

    for f in result['declarations']:
        if target.supports(f):
            continue
        warnings.warn(f'Skipping {fname} that uses undefined function `{f}`')
        return

    nvvmlib = libfuncs.Library.get('nvvm')
    llvmlib = libfuncs.Library.get('llvm')
    for f in result['intrinsics']:
        if target.is_gpu:
            if f in nvvmlib:
                continue
        if target.is_cpu:
            if f in llvmlib:
                continue
        warnings.warn(
            f'Skipping {fname} that uses unsupported intrinsic `{f}`')
        return

    make_wrapper(fname, args, return_type, cres, target, verbose=debug)

    main_module = main_library._final_module
    for lib in result['libraries']:
        main_module.link_in(
            lib._get_module_for_linking(), preserve=True,
        )

    return fname
def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None,
        opt=True, **kws):
    """
    JIT compile a python function conforming to the CUDA Python
    specification. If a signature is supplied, then a function is returned
    that takes a function to compile.

    :param func_or_sig: A function to JIT compile, or a signature of a
       function to compile. If a function is supplied, then a
       :class:`numba.cuda.compiler.AutoJitCUDAKernel` is returned. If a
       signature is supplied, then a function is returned. The returned
       function accepts another function, which it will compile and then
       return a :class:`numba.cuda.compiler.AutoJitCUDAKernel`.

       .. note:: A kernel cannot have any return value.
    :param device: Indicates whether this is a device function.
    :type device: bool
    :param link: A list of files containing PTX source to link with the
       function
    :type link: list
    :param debug: If True, check for exceptions thrown when executing the
       kernel. Since this degrades performance, this should only be used for
       debugging purposes. Defaults to False. (The default value can be
       overridden by setting environment variable
       ``NUMBA_CUDA_DEBUGINFO=1``.)
    :param fastmath: If True, enables flush-to-zero and fused-multiply-add,
       and disables precise division and square root. This parameter has no
       effect on device functions, whose fastmath setting depends on the
       kernel function from which they are called.
    :param max_registers: Request that the kernel is limited to using at most
       this number of registers per thread. The limit may not be respected if
       the ABI requires a greater number of registers than that requested.
       Useful for increasing occupancy.
    :param opt: Whether to compile from LLVM IR to PTX with optimization
                enabled. When ``True``, ``-opt=3`` is passed to NVVM. When
                ``False``, ``-opt=0`` is passed to NVVM. Defaults to ``True``.
    :type opt: bool
    """
    debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug

    if link and config.ENABLE_CUDASIM:
        raise NotImplementedError('Cannot link PTX in the simulator')

    if kws.get('boundscheck'):
        raise NotImplementedError("bounds checking is not supported for CUDA")

    if kws.get('argtypes') is not None:
        msg = _msg_deprecated_signature_arg.format('argtypes')
        raise DeprecationError(msg)
    if kws.get('restype') is not None:
        msg = _msg_deprecated_signature_arg.format('restype')
        raise DeprecationError(msg)
    if kws.get('bind') is not None:
        msg = _msg_deprecated_signature_arg.format('bind')
        raise DeprecationError(msg)

    fastmath = kws.get('fastmath', False)

    if not sigutils.is_signature(func_or_sig):
        if func_or_sig is None:
            if config.ENABLE_CUDASIM:
                def autojitwrapper(func):
                    return FakeCUDAKernel(func, device=device,
                                          fastmath=fastmath, debug=debug)
            else:
                def autojitwrapper(func):
                    return jit(func, device=device, debug=debug, opt=opt,
                               **kws)

            return autojitwrapper
        # func_or_sig is a function
        else:
            if config.ENABLE_CUDASIM:
                return FakeCUDAKernel(func_or_sig, device=device,
                                      fastmath=fastmath, debug=debug)
            elif device:
                return jitdevice(func_or_sig, debug=debug, opt=opt, **kws)
            else:
                targetoptions = kws.copy()
                targetoptions['debug'] = debug
                targetoptions['opt'] = opt
                targetoptions['link'] = link
                sigs = None
                return Dispatcher(func_or_sig, sigs,
                                  targetoptions=targetoptions)
    else:
        if config.ENABLE_CUDASIM:
            def jitwrapper(func):
                return FakeCUDAKernel(func, device=device,
                                      fastmath=fastmath, debug=debug)
            return jitwrapper

        if isinstance(func_or_sig, list):
            msg = 'Lists of signatures are not yet supported in CUDA'
            raise ValueError(msg)
        elif sigutils.is_signature(func_or_sig):
            sigs = [func_or_sig]
        else:
            raise ValueError("Expecting signature or list of signatures")

        for sig in sigs:
            argtypes, restype = sigutils.normalize_signature(sig)

            if restype and not device and restype != types.void:
                raise TypeError("CUDA kernel must have void return type.")

        def kernel_jit(func):
            targetoptions = kws.copy()
            targetoptions['debug'] = debug
            targetoptions['link'] = link
            targetoptions['opt'] = opt
            return Dispatcher(func, sigs, targetoptions=targetoptions)

        def device_jit(func):
            return compile_device(func, restype, argtypes, inline=inline,
                                  debug=debug)

        if device:
            return device_jit
        else:
            return kernel_jit
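# Usage sketch for the device-function path of this older `jit` variant
# (assumption: exposed as `numba.cuda.jit`). Device functions may return
# values and are callable only from kernels or other device functions:
from numba import cuda

@cuda.jit(device=True)
def clamp(x, lo, hi):
    return max(lo, min(x, hi))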
def declare_device(name, sig):
    argtypes, restype = sigutils.normalize_signature(sig)
    return declare_device_function(name, restype, argtypes)
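# Usage sketch (assumption: exposed as `numba.cuda.declare_device`).
# This declares an externally implemented device function so kernels can
# call it; the object code providing `ext_add` (a hypothetical name)
# would be supplied via the kernel's `link` list:
from numba import cuda

ext_add = cuda.declare_device('ext_add', 'int32(int32, int32)')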
def _compile_element_wise_function(nb_func, targetoptions, sig):
    # Do compilation
    # Return CompileResult to test
    cres = nb_func.compile(sig, **targetoptions)
    args, return_type = sigutils.normalize_signature(sig)
    return cres, args, return_type
def compile(self, sig):
    args, _ = sigutils.normalize_signature(sig)
    sig = (types.ffi_forced_object,) * len(args)
    return super().compile(sig)
def compile_to_LLVM(functions_and_signatures,
                    target: TargetInfo,
                    pipeline_class=compiler.Compiler,
                    debug=False):
    """Compile functions with given signatures to target-specific LLVM IR.

    Parameters
    ----------
    functions_and_signatures : list
      A list of (Python function, signatures) pairs.
    target : TargetInfo
      Specify target device information.
    debug : bool

    Returns
    -------
    module : llvmlite.binding.ModuleRef
      LLVM module instance. To get the IR string, use `str(module)`.
    """
    target_desc = registry.cpu_target

    if target is None:
        target = TargetInfo.host()
        typing_context = target_desc.typing_context
        target_context = target_desc.target_context
    else:
        typing_context = typing.Context()
        target_context = RemoteCPUContext(typing_context, target)
        # Bring over Array overloads (a hack):
        target_context._defns = target_desc.target_context._defns

    typing_context.target_info = target
    target_context.target_info = target

    codegen = target_context.codegen()
    main_library = codegen.create_library('rbc.irtools.compile_to_IR')
    main_module = main_library._final_module

    flags = compiler.Flags()
    flags.set('no_compile')
    flags.set('no_cpython_wrapper')

    function_names = []
    for func, signatures in functions_and_signatures:
        for sig in signatures:
            fname = func.__name__ + sig.mangling
            function_names.append(fname)
            args, return_type = sigutils.normalize_signature(
                sig.tonumba(bool_is_int8=True))
            cres = compiler.compile_extra(typingctx=typing_context,
                                          targetctx=target_context,
                                          func=func,
                                          args=args,
                                          return_type=return_type,
                                          flags=flags,
                                          library=main_library,
                                          locals={},
                                          pipeline_class=pipeline_class)
            make_wrapper(fname, args, return_type, cres)

    seen = set()
    for _library in main_library._linking_libraries:
        if _library not in seen:
            seen.add(_library)
            main_module.link_in(
                _library._get_module_for_linking(), preserve=True,
            )

    main_library._optimize_final_module()

    # Catch undefined functions:
    used_functions = set(function_names)
    for fname in function_names:
        deps = get_function_dependencies(main_module, fname)
        for fn, descr in deps.items():
            used_functions.add(fn)
            if descr == 'undefined':
                if fn.startswith('numba_') and target.has_numba:
                    continue
                if fn.startswith('Py') and target.has_cpython:
                    continue
                raise RuntimeError('function `%s` is undefined' % (fn))

    # for global_variable in main_module.global_variables:
    #     global_variable.linkage = llvm.Linkage.private

    unused_functions = [f.name for f in main_module.functions
                        if f.name not in used_functions]

    if debug:
        print('compile_to_IR: the following functions are used')
        for fname in used_functions:
            lf = main_module.get_function(fname)
            print('  [ALIVE]', fname, 'with', lf.linkage)

    if unused_functions:
        if debug:
            print('compile_to_IR: the following functions are not used'
                  ' and will be removed:')
        for fname in unused_functions:
            lf = main_module.get_function(fname)
            if lf.is_declaration:
                # if the function is a declaration,
                # we just put the linkage as external
                lf.linkage = llvm.Linkage.external
            else:
                # but if the function is not a declaration,
                # we change the linkage to private
                lf.linkage = llvm.Linkage.private
            if debug:
                print('  [DEAD]', fname, 'with', lf.linkage)

    main_library._optimize_final_module()

    # TODO: determine unused global_variables and struct_types

    main_module.verify()
    main_library._finalized = True

    main_module.triple = target.triple
    main_module.data_layout = target.datalayout

    return main_module
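# Usage sketch (names follow this module; `my_func` and `my_sig` are
# hypothetical, with `my_sig` an rbc signature object). Passing
# `target=None` selects the host target per the code above:
#
#     module = compile_to_LLVM([(my_func, [my_sig])], target=None)
#     print(str(module))  # LLVM IR text, per the docstring above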