def compile_device(self, args):
    """Compile the device function for the given argument types.

    Each signature is compiled once by caching the compiled function inside
    this object.

    Returns the `CompileResult`.
    """
    if args not in self.overloads:
        debug = self.targetoptions.get('debug')
        inline = self.targetoptions.get('inline')

        nvvm_options = {
            'debug': debug,
            'opt': 3 if self.targetoptions.get('opt') else 0
        }

        cres = compile_cuda(self.py_func, None, args, debug=debug,
                            inline=inline, nvvm_options=nvvm_options)
        self.overloads[args] = cres

        # The inserted function uses the id of the CompileResult as a key,
        # consistent with get_overload() above.
        cres.target_context.insert_user_function(id(cres), cres.fndesc,
                                                 [cres.library])
    else:
        cres = self.overloads[args]

    return cres
def compile_device(self, args):
    """Compile the device function for the given argument types.

    Each signature is compiled once by caching the compiled function inside
    this object.

    Returns the `CompileResult`.
    """
    if args not in self.overloads:
        debug = self.targetoptions.get('debug')
        inline = self.targetoptions.get('inline')
        fastmath = self.targetoptions.get('fastmath')

        nvvm_options = {
            'debug': debug,
            'opt': 3 if self.targetoptions.get('opt') else 0,
            'fastmath': fastmath
        }

        cres = compile_cuda(self.py_func, None, args, debug=debug,
                            inline=inline, fastmath=fastmath,
                            nvvm_options=nvvm_options)
        self.overloads[args] = cres

        cres.target_context.insert_user_function(cres.entry_point,
                                                 cres.fndesc,
                                                 [cres.library])
    else:
        cres = self.overloads[args]

    return cres
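For orientation, here is a minimal sketch of how compile_device gets exercised from user code; the function names (affine, apply_affine) are hypothetical, but the pattern is the standard one: a function decorated with @cuda.jit(device=True) is only compiled for the argument types it is actually called with, and repeat calls with the same signature reuse the cached overload.

from numba import cuda

@cuda.jit(device=True)
def affine(a, x, b):
    # Device function: compiled lazily, once per argument-type signature.
    return a * x + b

@cuda.jit
def apply_affine(out, a, x, b):
    i = cuda.grid(1)
    if i < x.size:
        # The first call with a given set of argument types triggers a
        # device compilation for that signature; subsequent calls reuse
        # the cached CompileResult stored in self.overloads.
        out[i] = affine(a, x[i], b)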
def __init__(self, py_func, argtypes, link=None, debug=False,
             lineinfo=False, inline=False, fastmath=False, extensions=None,
             max_registers=None, opt=True, device=False):
    if device:
        raise RuntimeError('Cannot compile a device function as a kernel')

    super().__init__()

    self.py_func = py_func
    self.argtypes = argtypes
    self.debug = debug
    self.lineinfo = lineinfo
    self.extensions = extensions or []

    nvvm_options = {
        'debug': self.debug,
        'lineinfo': self.lineinfo,
        'fastmath': fastmath,
        'opt': 3 if opt else 0
    }

    cres = compile_cuda(self.py_func, types.void, self.argtypes,
                        debug=self.debug,
                        lineinfo=self.lineinfo,
                        inline=inline,
                        fastmath=fastmath,
                        nvvm_options=nvvm_options)
    tgt_ctx = cres.target_context
    code = self.py_func.__code__
    filename = code.co_filename
    linenum = code.co_firstlineno
    lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
                                              debug, nvvm_options, filename,
                                              linenum, max_registers)

    if not link:
        link = []

    # A kernel needs cooperative launch if grid_sync is being used.
    self.cooperative = 'cudaCGGetIntrinsicHandle' in lib.get_asm_str()
    # We need to link against cudadevrt if grid sync is being used.
    if self.cooperative:
        link.append(get_cudalib('cudadevrt', static=True))
    for filepath in link:
        lib.add_linking_file(filepath)

    # populate members
    self.entry_name = kernel.name
    self.signature = cres.signature
    self._type_annotation = cres.type_annotation
    self._codelibrary = lib
    self.call_helper = cres.call_helper
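The lowering script below calls compile_cuda on axpy.py_func, but the axpy kernel itself is not shown in this snippet. A minimal definition consistent with the argument types used there (float32[:], int32, float32[:], float32[:]) might look like this sketch:

from numba import cuda

@cuda.jit
def axpy(r, a, x, y):
    # r = a * x + y, one element per thread.
    i = cuda.grid(1)
    if i < r.size:
        r[i] = a * x[i] + y[i]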
print(f'Compute capability {cc} / argtypes {types}:\n')
print(llir)
print()

# LLVM to PTX

from numba.core.compiler_lock import global_compiler_lock  # noqa
from numba.cuda.cudadrv import nvvm                        # noqa
from numba.cuda.compiler import compile_cuda               # noqa
from numba import float32, int32, void                     # noqa

# Have to cheat a bit here to get everything needed to give to NVVM
with global_compiler_lock:
    argtys = (float32[:], int32, float32[:], float32[:])
    returnty = void
    cres = compile_cuda(axpy.py_func, void, argtys, debug=False,
                        inline=False)
    fname = cres.fndesc.llvm_func_name
    lib, kernel = cres.target_context.prepare_cuda_kernel(cres.library,
                                                          fname,
                                                          cres.signature.args,
                                                          debug=False)

llvm_module = lib._final_module

cc = (5, 2)
arch = nvvm.get_arch_option(*cc)
llvmir = str(llvm_module)
ptx = nvvm.llvm_to_ptx(llvmir, opt=3, arch=arch)

print(ptx.decode('utf-8'))

# PTX to module
def __init__(self, py_func, argtypes, link=None, debug=False,
             lineinfo=False, inline=False, fastmath=False, extensions=None,
             max_registers=None, opt=True, device=False):
    if device:
        raise RuntimeError('Cannot compile a device function as a kernel')

    super().__init__()

    # _DispatcherBase.nopython_signatures() expects this attribute to be
    # present, because it assumes an overload is a CompileResult. In the
    # CUDA target, _Kernel instances are stored instead, so we provide this
    # attribute here to avoid duplicating nopython_signatures() in the CUDA
    # target with slight modifications.
    self.objectmode = False

    # The finalizer constructed by _DispatcherBase._make_finalizer also
    # expects overloads to be a CompileResult. It uses the entry_point to
    # remove a CompileResult from a target context. However, since we never
    # insert kernels into a target context (there is no need because they
    # cannot be called by other functions, only through the dispatcher) it
    # suffices to pretend we have an entry point of None.
    self.entry_point = None

    self.py_func = py_func
    self.argtypes = argtypes
    self.debug = debug
    self.lineinfo = lineinfo
    self.extensions = extensions or []

    nvvm_options = {
        'debug': self.debug,
        'lineinfo': self.lineinfo,
        'fastmath': fastmath,
        'opt': 3 if opt else 0
    }

    cres = compile_cuda(self.py_func, types.void, self.argtypes,
                        debug=self.debug,
                        lineinfo=self.lineinfo,
                        inline=inline,
                        fastmath=fastmath,
                        nvvm_options=nvvm_options)
    tgt_ctx = cres.target_context
    code = self.py_func.__code__
    filename = code.co_filename
    linenum = code.co_firstlineno
    lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
                                              debug, nvvm_options, filename,
                                              linenum, max_registers)

    if not link:
        link = []

    # A kernel needs cooperative launch if grid_sync is being used.
    self.cooperative = 'cudaCGGetIntrinsicHandle' in lib.get_asm_str()
    # We need to link against cudadevrt if grid sync is being used.
    if self.cooperative:
        link.append(get_cudalib('cudadevrt', static=True))
    for filepath in link:
        lib.add_linking_file(filepath)

    # populate members
    self.entry_name = kernel.name
    self.signature = cres.signature
    self._type_annotation = cres.type_annotation
    self._codelibrary = lib
    self.call_helper = cres.call_helper
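To see the cooperative-launch detection above fire, the kernel only has to use grid synchronisation. The sketch below (kernel name and body are illustrative, not from the source) uses cuda.cg.this_grid() / grid.sync(), which is the grid sync feature that the check on lib.get_asm_str() is looking for, and which in turn causes cudadevrt to be linked in.

from numba import cuda

@cuda.jit
def normalize(x, total):
    # Grid sync makes this a cooperative kernel: __init__ above detects it
    # by searching the generated assembly for cudaCGGetIntrinsicHandle.
    grid = cuda.cg.this_grid()
    i = cuda.grid(1)
    if i < x.size:
        cuda.atomic.add(total, 0, x[i])
    # All blocks must reach this point before any thread reads the sum.
    grid.sync()
    if i < x.size:
        x[i] /= total[0]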