def numba_buffer_adaptor(self, buf, ptr):
    """
    Emit a call to the ``numba_adapt_buffer`` helper with *buf* and *ptr*.

    The helper is declared as ``void (py_buffer_t *, void *)`` and both of
    its arguments are tagged with the no-capture attribute.  The call
    instruction is returned.
    """
    arg_types = [ir.PointerType(self.py_buffer_t), self.voidptr]
    signature = Type.function(Type.void(), arg_types)
    adaptor = self._get_function(signature, name="numba_adapt_buffer")
    # Tag both pointer arguments with the no-capture attribute.
    for index in (0, 1):
        adaptor.args[index].add_attribute(lc.ATTR_NO_CAPTURE)
    return self.builder.call(adaptor, (buf, ptr))
def restore_thread(self, thread_state):
    """
    Restore *thread_state*, reacquiring the GIL.

    Emits a call to ``PyEval_RestoreThread``; nothing is returned.
    """
    signature = Type.function(Type.void(), [self.voidptr])
    restore = self._get_function(signature, name="PyEval_RestoreThread")
    self.builder.call(restore, [thread_state])
def wavebarrier_impl(context, builder, sig, args):
    """
    Lower a wave barrier as a call to ``__hsail_wavebarrier``.

    The routine takes no arguments (*args* must be empty) and the
    module-level ``_void_value`` is returned.
    """
    assert not args
    barrier_type = Type.function(Type.void(), [])
    barrier = builder.module.get_or_insert_function(
        barrier_type, name="__hsail_wavebarrier")
    # The declaration uses the SPIR function calling convention.
    barrier.calling_convention = target.CC_SPIR_FUNC
    builder.call(barrier, [])
    return _void_value
def object_dump(self, obj):
    """
    Dump a Python object on C stderr, for debugging purposes.

    Emits a call to ``_PyObject_Dump`` on *obj* and returns the call
    instruction.
    """
    dump_type = Type.function(Type.void(), [self.pyobj])
    dump = self._get_function(dump_type, name="_PyObject_Dump")
    return self.builder.call(dump, (obj,))
def ptx_warp_sync(context, builder, sig, args):
    """
    Lower a warp sync as a call to ``llvm.nvvm.bar.warp.sync``.

    The intrinsic is declared as taking one 32-bit integer; *args* is
    forwarded unchanged as the call arguments.  Returns the context's
    dummy value.
    """
    intrinsic_type = Type.function(Type.void(), (Type.int(32),))
    intrinsic = builder.module.get_or_insert_function(
        intrinsic_type, name='llvm.nvvm.bar.warp.sync')
    builder.call(intrinsic, args)
    return context.get_dummy_value()
def complex128_power_impl(context, builder, sig, args):
    """
    Lower ``a ** b`` for two complex128 operands.

    An exponent exactly equal to ``2 + 0j`` is lowered as ``a * a``; any
    other exponent goes through the external ``numba.math.cpow`` routine,
    which receives pointers to both operands and to the result slot.
    """
    base_val, exp_val = args
    base = Complex128(context, builder, value=base_val)
    exponent = Complex128(context, builder, value=exp_val)
    result = Complex128(context, builder)
    module = builder.module
    base_ptr = base._getpointer()
    exp_ptr = exponent._getpointer()
    result_ptr = result._getpointer()

    # Squaring is special-cased because cpow loses a lot of precision.
    two = context.get_constant(types.float64, 2)
    zero = context.get_constant(types.float64, 0)
    exp_is_two = builder.and_(
        builder.fcmp(lc.FCMP_OEQ, exponent.real, two),
        builder.fcmp(lc.FCMP_OEQ, exponent.imag, zero),
    )

    with builder.if_else(exp_is_two) as (square, general):
        with square:
            # Lower as a multiplication of the base by itself.
            product = complex_mul_impl(context, builder, sig,
                                       (base_val, base_val))
            squared = Complex128(context, builder, value=product)
            result.real = squared.real
            result.imag = squared.imag
        with general:
            # Lower as a call to the external power routine.
            cpow_type = Type.function(Type.void(), [base_ptr.type] * 3)
            cpow = module.get_or_insert_function(cpow_type,
                                                 name="numba.math.cpow")
            builder.call(cpow, (base_ptr, exp_ptr, result_ptr))

    loaded = builder.load(result_ptr)
    return impl_ret_untracked(context, builder, sig.return_type, loaded)
def ptx_threadfence_device(context, builder, sig, args):
    """
    Lower a device-level thread fence as a call to ``llvm.nvvm.membar.gl``.

    The intrinsic takes no arguments (*args* must be empty).  Returns the
    context's dummy value.
    """
    assert not args
    fence_type = Type.function(Type.void(), ())
    fence = builder.module.get_or_insert_function(
        fence_type, name="llvm.nvvm.membar.gl")
    builder.call(fence, ())
    return context.get_dummy_value()
def err_set_string(self, exctype, msg):
    """
    Emit a call to ``PyErr_SetString(exctype, msg)``.

    A ``str`` *exctype* is resolved through ``get_c_object``; a ``str``
    *msg* is inserted into the module as a constant string.  Any other
    value is passed to the call unchanged.  The call instruction is
    returned.
    """
    setter_type = Type.function(Type.void(), [self.pyobj, self.cstring])
    setter = self._get_function(setter_type, name="PyErr_SetString")
    exc_value = (self.get_c_object(exctype)
                 if isinstance(exctype, str) else exctype)
    msg_value = (self.context.insert_const_string(self.module, msg)
                 if isinstance(msg, str) else msg)
    return self.builder.call(setter, (exc_value, msg_value))
def nrt_adapt_buffer_from_python(self, buf, ptr):
    """
    Emit a call to ``NRT_adapt_buffer_from_python`` with *buf* and *ptr*.

    Requires ``self.context.enable_nrt`` to be true.  Both arguments of the
    helper are tagged with the no-capture attribute; the call instruction
    is returned.
    """
    assert self.context.enable_nrt
    arg_types = [Type.pointer(self.py_buffer_t), self.voidptr]
    signature = Type.function(Type.void(), arg_types)
    adaptor = self._get_function(signature,
                                 name="NRT_adapt_buffer_from_python")
    # Tag both pointer arguments with the no-capture attribute.
    for index in (0, 1):
        adaptor.args[index].add_attribute(lc.ATTR_NO_CAPTURE)
    return self.builder.call(adaptor, (buf, ptr))
def ptx_syncthreads(context, builder, sig, args):
    """
    Lower a thread barrier as a call to ``llvm.nvvm.barrier0``.

    The intrinsic takes no arguments (*args* must be empty).  Returns the
    context's dummy value.
    """
    assert not args
    module = cgutils.get_module(builder)
    barrier_type = Type.function(Type.void(), ())
    barrier = module.get_or_insert_function(barrier_type,
                                            name='llvm.nvvm.barrier0')
    builder.call(barrier, ())
    return context.get_dummy_value()
def gil_release(self, gil):
    """
    Release the GIL that was acquired by gil_ensure().

    Every call must be paired with a preceding gil_ensure(); *gil* is the
    pointer that call returned.  The call instruction is returned.
    """
    state_ptr_type = Type.pointer(self.gil_state)
    release_type = Type.function(Type.void(), [state_ptr_type])
    release = self._get_function(release_type, "numba_gil_release")
    return self.builder.call(release, [gil])
def raise_object(self, exc=None):
    """
    Raise an arbitrary exception through ``numba_do_raise``.

    *exc* may be a type, a value, a ``(type, args)`` pair, or None when
    re-raising; in the None case a null object is passed instead.
    A reference to the argument is consumed.
    """
    raiser_type = Type.function(Type.void(), [self.pyobj])
    raiser = self._get_function(raiser_type, name="numba_do_raise")
    to_raise = self.get_null_object() if exc is None else exc
    return self.builder.call(raiser, (to_raise,))
def gil_ensure(self):
    """
    Ensure the GIL is acquired.

    A slot of type ``self.gil_state`` is allocated and handed to
    ``numba_gil_ensure``.  The returned pointer to that slot must later be
    consumed by gil_release().
    """
    state_ptr_type = Type.pointer(self.gil_state)
    ensure_type = Type.function(Type.void(), [state_ptr_type])
    ensure = self._get_function(ensure_type, "numba_gil_ensure")
    state_slot = cgutils.alloca_once(self.builder, self.gil_state)
    self.builder.call(ensure, [state_slot])
    return state_slot
def lower_finalize_func(self, lower):
    """
    Lower the generator's finalizer.

    Declares (or fetches) the finalizer function in ``lower.module`` under
    ``self.gendesc.llvm_finalizer_name``, opens its entry block, and
    delegates the body to ``lower_finalize_func_body``.
    """
    gen_arg_type = self.context.get_value_type(self.gentype)
    finalizer_type = Type.function(Type.void(), [gen_arg_type])
    finalizer = lower.module.get_or_insert_function(
        finalizer_type, name=self.gendesc.llvm_finalizer_name)
    builder = Builder.new(finalizer.append_basic_block("entry"))
    # The sole argument is cast to the generator's value type.
    gen_ptr_type = self.context.get_value_type(self.gentype)
    gen_ptr = builder.bitcast(finalizer.args[0], gen_ptr_type)
    self.lower_finalize_func_body(builder, gen_ptr)
def test_nvvm_from_llvm(self):
    """
    Compile a trivial LLVM kernel to PTX and check the output.

    The generated PTX must mention the kernel name and declare the
    address size matching ``is64bit``.
    """
    m = Module("test_nvvm_from_llvm")
    fty = Type.function(Type.void(), [Type.int()])
    kernel = m.add_function(fty, name='mycudakernel')
    bldr = Builder(kernel.append_basic_block('entry'))
    bldr.ret_void()
    set_cuda_kernel(kernel)
    fix_data_layout(m)
    ptx = llvm_to_ptx(str(m)).decode('utf8')
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn('mycudakernel', ptx)
    if is64bit:
        self.assertIn('.address_size 64', ptx)
    else:
        self.assertIn('.address_size 32', ptx)
def test_inline_rsqrt(self):
    """
    Build a kernel that applies ``rsqrt.approx.f32`` through inline
    assembly and check that the instruction appears in the generated PTX.
    """
    module = Module.new(__name__)
    kernel_type = Type.function(Type.void(), [Type.pointer(Type.float())])
    kernel = module.add_function(kernel_type, "cu_rsqrt")
    builder = Builder.new(kernel.append_basic_block("entry"))

    # float -> float inline assembly wrapping the PTX instruction.
    asm_type = Type.function(Type.float(), [Type.float()])
    rsqrt_asm = InlineAsm.get(asm_type, "rsqrt.approx.f32 $0, $1;",
                              "=f,f", side_effect=True)

    # Load the value, transform it, and store the result back in place.
    loaded = builder.load(kernel.args[0])
    transformed = builder.call(rsqrt_asm, [loaded])
    builder.store(transformed, kernel.args[0])
    builder.ret_void()

    # Generate PTX
    nvvm.fix_data_layout(module)
    nvvm.set_cuda_kernel(kernel)
    ptx = nvvm.llvm_to_ptx(str(module))
    self.assertTrue("rsqrt.approx.f32" in str(ptx))
def complex_power_impl(context, builder, sig, args):
    """
    Lower ``a ** b`` for two complex operands of type ``sig.args[0]``.

    An exponent exactly equal to ``2 + 0j`` is lowered as ``a * a``; any
    other exponent goes through ``numba_cpowf`` (complex64) or
    ``numba_cpow`` (complex128), which receives pointers to both operands
    and to the result slot.
    """
    base_val, exp_val = args
    cty = sig.args[0]
    float_ty = cty.underlying_float
    base = context.make_helper(builder, cty, value=base_val)
    exponent = context.make_helper(builder, cty, value=exp_val)
    result = context.make_helper(builder, cty)
    module = builder.module
    base_ptr = base._getpointer()
    exp_ptr = exponent._getpointer()
    result_ptr = result._getpointer()

    # Squaring is special-cased because cpow loses a lot of precision.
    two = context.get_constant(float_ty, 2)
    zero = context.get_constant(float_ty, 0)
    exp_is_two = builder.and_(
        builder.fcmp_ordered('==', exponent.real, two),
        builder.fcmp_ordered('==', exponent.imag, zero),
    )

    with builder.if_else(exp_is_two) as (square, general):
        with square:
            # Lower as a multiplication of the base by itself.
            product = complex_mul_impl(context, builder, sig,
                                       (base_val, base_val))
            squared = context.make_helper(builder, cty, value=product)
            result.real = squared.real
            result.imag = squared.imag
        with general:
            # Lower as a call to the external routine for this width.
            cpow_names = {
                types.complex64: "numba_cpowf",
                types.complex128: "numba_cpow",
            }
            cpow_name = cpow_names[cty]
            cpow_type = Type.function(Type.void(), [base_ptr.type] * 3)
            cpow = module.get_or_insert_function(cpow_type, name=cpow_name)
            builder.call(cpow, (base_ptr, exp_ptr, result_ptr))

    loaded = builder.load(result_ptr)
    return impl_ret_untracked(context, builder, sig.return_type, loaded)
def from_native_generator(self, val, typ, env=None):
    """
    Make a Numba generator (a _dynfunc.Generator instance) from the
    generator structure pointer *val*.

    *env* is an optional _dynfunc.Environment instance to be wrapped in
    the generator; a null object is used when it is None.  Returns the
    result of the emitted ``numba_make_generator`` call.
    """
    struct_type = self.context.get_data_type(typ)
    assert not struct_type.is_pointer
    struct_size = self.context.get_abi_sizeof(struct_type)
    desc = self.context.get_generator_desc(typ)

    # The PyCFunctionWithKeywords generated by PyCallWrapper.
    next_type = Type.function(self.pyobj, [self.pyobj] * 3)
    next_fn = self._get_function(next_type,
                                 name=desc.llvm_cpython_wrapper_name)

    # The raw finalizer generated by _lower_generator_finalize_func(),
    # or a null function pointer when the type has no finalizer.
    finalizer_type = Type.function(Type.void(), [self.voidptr])
    finalizer = (
        self._get_function(finalizer_type, name=desc.llvm_finalizer_name)
        if typ.has_finalizer
        else Constant.null(Type.pointer(finalizer_type))
    )

    # PyObject *numba_make_generator(state_size, initial_state, nextfunc,
    #                                finalizer, env)
    maker_args = [
        self.py_ssize_t,
        self.voidptr,
        Type.pointer(next_type),
        Type.pointer(finalizer_type),
        self.voidptr,
    ]
    maker = self._get_function(Type.function(self.pyobj, maker_args),
                               name="numba_make_generator")

    size_const = ir.Constant(self.py_ssize_t, struct_size)
    state_ptr = self.builder.bitcast(val, self.voidptr)
    env_obj = self.get_null_object() if env is None else env
    env_ptr = self.builder.bitcast(env_obj, self.voidptr)
    return self.builder.call(
        maker, (size_const, state_ptr, next_fn, finalizer, env_ptr))
def test_inline_rsqrt(self):
    """
    Build a kernel that applies ``rsqrt.approx.f32`` through inline
    assembly and check that the instruction appears in the generated PTX.
    """
    mod = Module(__name__)
    fnty = Type.function(Type.void(), [Type.pointer(Type.float())])
    fn = mod.add_function(fnty, 'cu_rsqrt')
    bldr = Builder(fn.append_basic_block('entry'))

    # float -> float inline assembly wrapping the PTX instruction.
    rsqrt_approx_fnty = Type.function(Type.float(), [Type.float()])
    inlineasm = InlineAsm.get(rsqrt_approx_fnty,
                              'rsqrt.approx.f32 $0, $1;',
                              '=f,f', side_effect=True)

    # Load the value, transform it, and store the result back in place.
    val = bldr.load(fn.args[0])
    res = bldr.call(inlineasm, [val])
    bldr.store(res, fn.args[0])
    bldr.ret_void()

    # Generate PTX
    nvvm.fix_data_layout(mod)
    nvvm.set_cuda_kernel(fn)
    nvvmir = str(mod)
    ptx = nvvm.llvm_to_ptx(nvvmir)
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn('rsqrt.approx.f32', str(ptx))
def complex_power_impl(context, builder, sig, args):
    """
    Lower ``a ** b`` for two complex operands of type ``sig.args[0]``.

    An exponent exactly equal to ``2 + 0j`` is lowered as ``a * a``; any
    other exponent goes through the external ``numba.math.cpow`` routine,
    which receives pointers to both operands and to the result slot.
    """
    base_val, exp_val = args
    cty = sig.args[0]
    float_ty = cty.underlying_float
    base = context.make_helper(builder, cty, value=base_val)
    exponent = context.make_helper(builder, cty, value=exp_val)
    result = context.make_helper(builder, cty)
    module = builder.module
    base_ptr = base._getpointer()
    exp_ptr = exponent._getpointer()
    result_ptr = result._getpointer()

    # Squaring is special-cased because cpow loses a lot of precision.
    two = context.get_constant(float_ty, 2)
    zero = context.get_constant(float_ty, 0)
    exp_is_two = builder.and_(
        builder.fcmp(lc.FCMP_OEQ, exponent.real, two),
        builder.fcmp(lc.FCMP_OEQ, exponent.imag, zero),
    )

    with builder.if_else(exp_is_two) as (square, general):
        with square:
            # Lower as a multiplication of the base by itself.
            product = complex_mul_impl(context, builder, sig,
                                       (base_val, base_val))
            squared = context.make_helper(builder, cty, value=product)
            result.real = squared.real
            result.imag = squared.imag
        with general:
            # Lower as a call to the external power routine.
            cpow_type = Type.function(Type.void(), [base_ptr.type] * 3)
            cpow = module.get_or_insert_function(cpow_type,
                                                 name="numba.math.cpow")
            builder.call(cpow, (base_ptr, exp_ptr, result_ptr))

    loaded = builder.load(result_ptr)
    return impl_ret_untracked(context, builder, sig.return_type, loaded)
def build(self):
    """
    Build the ``__gufunc__.<name>`` wrapper around ``self.func``.

    The wrapper is emitted into a fresh IR module that is then added to
    ``self.library``.  Returns a ``(wrapper, self.env)`` pair, where
    *wrapper* is the function looked up again from the library.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)
    # void wrapper(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(
        Type.void(),
        [byte_ptr_ptr_t, intp_ptr_t, intp_ptr_t, byte_ptr_t])

    wrapper_module = self.library.create_ir_module('')
    # Declare the wrapped function inside the wrapper module.
    func_type = self.call_conv.get_function_type(self.fndesc.restype,
                                                 self.fndesc.argtypes)
    func = wrapper_module.add_function(func_type, name=self.func.name)
    func.attributes.add("alwaysinline")
    wrapper = wrapper_module.add_function(fnty,
                                          "__gufunc__." + self.func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))
    # The value at dims[0] is used as the loop count.
    loopcount = builder.load(arg_dims, name="loopcount")

    # Unpack shapes
    # NOTE(review): unique_syms is filled here but never read afterwards
    # in this block.
    unique_syms = set()
    for grp in (self.sin, self.sout):
        for syms in grp:
            unique_syms |= set(syms)

    # Number each distinct input symbol in first-seen order.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            if s not in sym_map:
                sym_map[s] = len(sym_map)

    # The size for symbol number i is loaded from dims[i + 1].
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(
            builder.gep(arg_dims,
                        [self.context.get_constant(types.intp, i + 1)]))

    # Prepare inputs
    arrays = []
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(
            zip(self.signature.args, self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args, arg_dims,
                         arg_steps, i, step_offset, typ, sym, sym_dim)
        # Only non-scalar arguments advance the step offset.
        if not ary.as_scalar:
            step_offset += ary.ndim
        arrays.append(ary)

    bbreturn = cgutils.get_function(builder).append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as ind:
        args = [a.array_value for a in arrays]
        # NOTE(review): innercall is not used after this assignment.
        innercall, error = self.gen_loop_body(builder, func, args)
        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)

        for a in arrays:
            a.next(ind)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder)
    builder.ret_void()

    self.library.add_ir_module(wrapper_module)
    wrapper = self.library.get_function(wrapper.name)

    # Set core function to internal so that it is not generated
    self.func.linkage = LINKAGE_INTERNAL

    return wrapper, self.env
def generate_kernel_wrapper(self, func, argtypes):
    """
    Generate a ``cudaPy_<name>`` kernel wrapper around *func*.

    The wrapper is built in a separate module, linked into the module of
    *func*, and the linked-in wrapper function is returned.  *argtypes*
    are the argument types of the wrapped function.
    """
    module = func.module

    arginfo = self.get_arg_packer(argtypes)
    argtys = list(arginfo.argument_types)
    wrapfnty = Type.function(Type.void(), argtys)
    wrapper_module = self.create_module("cuda.kernel.wrapper")
    # Declaration of the wrapped function: int status return, with a
    # return-value slot prepended to the packed arguments.
    fnty = Type.function(Type.int(),
                         [self.call_conv.get_return_type(types.pyobject)]
                         + argtys)
    # From here on, func refers to the declaration in wrapper_module.
    func = wrapper_module.add_function(fnty, name=func.name)
    wrapfn = wrapper_module.add_function(wrapfnty,
                                         name="cudaPy_" + func.name)
    builder = Builder.new(wrapfn.append_basic_block(''))

    # Define error handling variables
    def define_error_gv(postfix):
        # Null-initialized int global named after the wrapper + postfix.
        gv = wrapper_module.add_global_variable(Type.int(),
                                                name=wrapfn.name + postfix)
        gv.initializer = Constant.null(gv.type.pointee)
        return gv

    gv_exc = define_error_gv("__errcode__")
    gv_tid = []
    gv_ctaid = []
    for i in 'xyz':
        gv_tid.append(define_error_gv("__tid%s__" % i))
        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))

    callargs = arginfo.from_arguments(builder, wrapfn.args)
    status, _ = self.call_conv.call_function(builder, func, types.void,
                                             argtypes, callargs)

    # Check error status
    with cgutils.if_likely(builder, status.is_ok):
        builder.ret_void()

    with builder.if_then(builder.not_(status.is_python_exc)):
        # User exception raised
        old = Constant.null(gv_exc.type.pointee)

        # Use atomic cmpxchg to prevent rewriting the error status
        # Only the first error is recorded
        casfnty = lc.Type.function(old.type,
                                   [gv_exc.type, old.type, old.type])
        casfn = wrapper_module.add_function(casfnty,
                                            name="___numba_cas_hack")
        xchg = builder.call(casfn, [gv_exc, old, status.code])
        changed = builder.icmp(ICMP_EQ, xchg, old)

        # If the xchange is successful, save the thread ID.
        sreg = nvvmutils.SRegBuilder(builder)
        with builder.if_then(changed):
            for dim, ptr, in zip("xyz", gv_tid):
                val = sreg.tid(dim)
                builder.store(val, ptr)

            for dim, ptr, in zip("xyz", gv_ctaid):
                val = sreg.ctaid(dim)
                builder.store(val, ptr)

    builder.ret_void()
    # force inline
    # inline_function(status.code)
    nvvm.set_cuda_kernel(wrapfn)
    # Round-trip the wrapper module through its textual form to link it
    # into the original module.
    module.link_in(ll.parse_assembly(str(wrapper_module)))
    module.verify()

    wrapfn = module.get_function(wrapfn.name)
    return wrapfn
def build(self):
    """
    Build the ``__gufunc__.<name>`` wrapper around ``self.func``.

    The wrapper is emitted into a fresh IR module that is then added to
    ``self.library``.  Returns a ``(wrapper, self.env)`` pair, where
    *wrapper* is the function looked up again from the library.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)
    # void wrapper(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper_module = self.library.create_ir_module('')
    # Declare the wrapped function inside the wrapper module.
    func_type = self.call_conv.get_function_type(self.fndesc.restype,
                                                 self.fndesc.argtypes)
    func = wrapper_module.add_function(func_type, name=self.func.name)
    func.attributes.add("alwaysinline")
    wrapper = wrapper_module.add_function(fnty,
                                          "__gufunc__." + self.func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))
    # The value at dims[0] is used as the loop count.
    loopcount = builder.load(arg_dims, name="loopcount")
    pyapi = self.context.get_python_api(builder)

    # Unpack shapes
    # NOTE(review): unique_syms is filled here but never read afterwards
    # in this block.
    unique_syms = set()
    for grp in (self.sin, self.sout):
        for syms in grp:
            unique_syms |= set(syms)

    # Number each distinct input symbol in first-seen order.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            if s not in sym_map:
                sym_map[s] = len(sym_map)

    # The size for symbol number i is loaded from dims[i + 1].
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(builder.gep(arg_dims,
                                              [self.context.get_constant(
                                                  types.intp,
                                                  i + 1)]))

    # Prepare inputs
    arrays = []
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(zip(self.signature.args,
                                       self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args,
                         arg_steps, i, step_offset, typ, sym, sym_dim)
        # Each argument advances the step offset by its symbol count.
        step_offset += len(sym)
        arrays.append(ary)

    bbreturn = builder.append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder, pyapi)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as loop:
        args = [a.get_array_at_offset(loop.index) for a in arrays]
        # NOTE(review): innercall is not used after this assignment.
        innercall, error = self.gen_loop_body(builder, pyapi, func, args)
        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder, pyapi)
    builder.ret_void()

    self.library.add_ir_module(wrapper_module)
    wrapper = self.library.get_function(wrapper.name)

    # Set core function to internal so that it is not generated
    self.func.linkage = LINKAGE_INTERNAL

    return wrapper, self.env
def _build_wrapper(self, library, name):
    """
    The LLVM IRBuilder code to create the gufunc wrapper.

    The *library* arg is the CodeLibrary to which the wrapper should be
    added.  The *name* arg is the name of the wrapper function being
    created.  Nothing is returned; the wrapper module is added to
    *library*, which is also linked against ``self.library``.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)
    # void wrapper(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper_module = library.create_ir_module('')
    # Declare the wrapped function inside the wrapper module.
    func_type = self.call_conv.get_function_type(self.fndesc.restype,
                                                 self.fndesc.argtypes)
    fname = self.fndesc.llvm_func_name
    func = wrapper_module.add_function(func_type, name=fname)
    func.attributes.add("alwaysinline")
    wrapper = wrapper_module.add_function(fnty, name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder(wrapper.append_basic_block("entry"))
    # The value at dims[0] is used as the loop count.
    loopcount = builder.load(arg_dims, name="loopcount")
    pyapi = self.context.get_python_api(builder)

    # Unpack shapes
    # NOTE(review): unique_syms is filled here but never read afterwards
    # in this block.
    unique_syms = set()
    for grp in (self.sin, self.sout):
        for syms in grp:
            unique_syms |= set(syms)

    # Number each distinct input symbol in first-seen order.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            if s not in sym_map:
                sym_map[s] = len(sym_map)

    # The size for symbol number i is loaded from dims[i + 1].
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(builder.gep(arg_dims,
                                              [self.context.get_constant(
                                                  types.intp,
                                                  i + 1)]))

    # Prepare inputs
    arrays = []
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(zip(self.signature.args,
                                       self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args,
                         arg_steps, i, step_offset, typ, sym, sym_dim)
        # Each argument advances the step offset by its symbol count.
        step_offset += len(sym)
        arrays.append(ary)

    bbreturn = builder.append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder, pyapi)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as loop:
        args = [a.get_array_at_offset(loop.index) for a in arrays]
        # NOTE(review): innercall is not used after this assignment.
        innercall, error = self.gen_loop_body(builder, pyapi, func, args)
        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder, pyapi)
    builder.ret_void()

    # Link
    library.add_ir_module(wrapper_module)
    library.add_linking_library(self.library)
def release_buffer(self, pbuf):
    """
    Emit a call to ``numba_release_buffer`` on the buffer pointer *pbuf*.

    Returns the call instruction.
    """
    buffer_ptr_type = ir.PointerType(self.py_buffer_t)
    release_type = Type.function(Type.void(), [buffer_ptr_type])
    release = self._get_function(release_type, name="numba_release_buffer")
    return self.builder.call(release, [pbuf])
def build_ufunc_wrapper(library, context, fname, signature, objmode,
                        envptr, env):
    """
    Wrap the scalar function with a loop that iterates over the arguments

    The wrapper ``__ufunc__.<fname>`` is emitted into a new
    ``ufunc_wrapper`` library that is linked against *library*.  Returns
    the value of ``get_pointer_to_function`` for the wrapper.
    """
    assert isinstance(fname, str)
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    # void wrapper(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapperlib = context.codegen().create_library('ufunc_wrapper')
    wrapper_module = wrapperlib.create_ir_module('')
    # In object mode every argument and the result are pyobjects.
    if objmode:
        func_type = context.call_conv.get_function_type(
            types.pyobject, [types.pyobject] * len(signature.args))
    else:
        func_type = context.call_conv.get_function_type(
            signature.return_type, signature.args)

    func = wrapper_module.add_function(func_type, name=fname)
    func.attributes.add("alwaysinline")

    wrapper = wrapper_module.add_function(fnty, "__ufunc__." + func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder(wrapper.append_basic_block("entry"))

    # The value at dims[0] is used as the loop count.
    loopcount = builder.load(arg_dims, name="loopcount")

    # Prepare inputs
    arrays = []
    for i, typ in enumerate(signature.args):
        arrays.append(UArrayArg(context, builder, arg_args, arg_steps, i,
                                typ))

    # Prepare output
    out = UArrayArg(context, builder, arg_args, arg_steps, len(arrays),
                    signature.return_type)

    # Setup indices
    offsets = []
    zero = context.get_constant(types.intp, 0)
    for _ in arrays:
        p = cgutils.alloca_once(builder, intp_t)
        offsets.append(p)
        builder.store(zero, p)

    store_offset = cgutils.alloca_once(builder, intp_t)
    builder.store(zero, store_offset)

    # True only when every input array reports itself as unit-strided.
    unit_strided = cgutils.true_bit
    for ary in arrays:
        unit_strided = builder.and_(unit_strided, ary.is_unit_strided)

    pyapi = context.get_python_api(builder)
    # NOTE(review): slowloop / fastloop below are assigned but not read
    # afterwards in this block.
    if objmode:
        # General loop
        gil = pyapi.gil_ensure()
        with cgutils.for_range(builder, loopcount, intp=intp_t):
            slowloop = build_obj_loop_body(context, func, builder,
                                           arrays, out, offsets,
                                           store_offset, signature,
                                           pyapi, envptr, env)
        pyapi.gil_release(gil)
        builder.ret_void()

    else:
        with builder.if_else(unit_strided) as (is_unit_strided, is_strided):
            with is_unit_strided:
                with cgutils.for_range(builder, loopcount,
                                       intp=intp_t) as loop:
                    fastloop = build_fast_loop_body(context, func, builder,
                                                    arrays, out, offsets,
                                                    store_offset, signature,
                                                    loop.index, pyapi)

            with is_strided:
                # General loop
                with cgutils.for_range(builder, loopcount, intp=intp_t):
                    slowloop = build_slow_loop_body(context, func, builder,
                                                    arrays, out, offsets,
                                                    store_offset, signature,
                                                    pyapi)

        builder.ret_void()
    del builder

    # Link and finalize
    wrapperlib.add_ir_module(wrapper_module)
    wrapperlib.add_linking_library(library)
    return wrapperlib.get_pointer_to_function(wrapper.name)
def err_set_object(self, exctype, excval):
    """
    Emit a call to ``PyErr_SetObject(exctype, excval)``.

    Returns the call instruction.
    """
    setter_type = Type.function(Type.void(), [self.pyobj, self.pyobj])
    setter = self._get_function(setter_type, name="PyErr_SetObject")
    return self.builder.call(setter, (exctype, excval))
def generate_kernel_wrapper(self, library, fname, argtypes):
    """
    Generate the kernel wrapper in the given ``library``.
    The function being wrapped has the name ``fname`` and argument types
    ``argtypes``.  The wrapper function is returned.
    """
    arginfo = self.get_arg_packer(argtypes)
    argtys = list(arginfo.argument_types)
    wrapfnty = Type.function(Type.void(), argtys)
    wrapper_module = self.create_module("cuda.kernel.wrapper")
    # Declaration of the wrapped function: int status return, with a
    # return-value slot prepended to the packed arguments.
    fnty = Type.function(Type.int(),
                         [self.call_conv.get_return_type(types.pyobject)]
                         + argtys)
    func = wrapper_module.add_function(fnty, name=fname)
    wrapfn = wrapper_module.add_function(wrapfnty,
                                         name="cudaPy_" + func.name)
    builder = Builder(wrapfn.append_basic_block(''))

    # Define error handling variables
    def define_error_gv(postfix):
        # Null-initialized int global named after the wrapper + postfix.
        gv = wrapper_module.add_global_variable(Type.int(),
                                                name=wrapfn.name + postfix)
        gv.initializer = Constant.null(gv.type.pointee)
        return gv

    gv_exc = define_error_gv("__errcode__")
    gv_tid = []
    gv_ctaid = []
    for i in 'xyz':
        gv_tid.append(define_error_gv("__tid%s__" % i))
        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))

    callargs = arginfo.from_arguments(builder, wrapfn.args)
    status, _ = self.call_conv.call_function(
        builder, func, types.void, argtypes, callargs)

    # Check error status
    with cgutils.if_likely(builder, status.is_ok):
        builder.ret_void()

    with builder.if_then(builder.not_(status.is_python_exc)):
        # User exception raised
        old = Constant.null(gv_exc.type.pointee)

        # Use atomic cmpxchg to prevent rewriting the error status
        # Only the first error is recorded
        casfnty = lc.Type.function(old.type,
                                   [gv_exc.type, old.type, old.type])
        casfn = wrapper_module.add_function(casfnty,
                                            name="___numba_cas_hack")
        xchg = builder.call(casfn, [gv_exc, old, status.code])
        changed = builder.icmp(ICMP_EQ, xchg, old)

        # If the xchange is successful, save the thread ID.
        sreg = nvvmutils.SRegBuilder(builder)
        with builder.if_then(changed):
            for dim, ptr, in zip("xyz", gv_tid):
                val = sreg.tid(dim)
                builder.store(val, ptr)

            for dim, ptr, in zip("xyz", gv_ctaid):
                val = sreg.ctaid(dim)
                builder.store(val, ptr)

    builder.ret_void()
    nvvm.set_cuda_kernel(wrapfn)
    library.add_ir_module(wrapper_module)
    library.finalize()
    # Look the wrapper up again from the finalized library.
    wrapfn = library.get_function(wrapfn.name)
    return wrapfn
def sys_write_stdout(self, fmt, *args):
    """
    Emit a call to the variadic ``PySys_WriteStdout(fmt, *args)``.

    Returns the call instruction.
    """
    writer_type = Type.function(Type.void(), [self.cstring], var_arg=True)
    writer = self._get_function(writer_type, name="PySys_WriteStdout")
    call_args = (fmt,) + args
    return self.builder.call(writer, call_args)
def release_record_buffer(self, pbuf):
    """
    Emit a call to ``numba_release_record_buffer`` on *pbuf*.

    Returns the call instruction.
    """
    release_type = Type.function(Type.void(), [self.voidptr])
    release = self._get_function(release_type,
                                 name="numba_release_record_buffer")
    return self.builder.call(release, [pbuf])
def decref(self, obj):
    """
    Emit a call to ``Py_DecRef`` on *obj*.  Nothing is returned.
    """
    decref_type = Type.function(Type.void(), [self.pyobj])
    decref_fn = self._get_function(decref_type, name="Py_DecRef")
    self.builder.call(decref_fn, [obj])
def __init__(self):
    """Initialize the instance with the void type passed to the base class."""
    base_init = super(MiddleIrTypeVoid, self).__init__
    base_init(Type.void())
def err_set_none(self, exctype):
    """
    Emit a call to ``PyErr_SetNone(exctype)``.

    A ``str`` *exctype* is resolved through ``get_c_object``; any other
    value is passed unchanged.  Returns the call instruction.
    """
    setter_type = Type.function(Type.void(), [self.pyobj])
    setter = self._get_function(setter_type, name="PyErr_SetNone")
    exc_value = (self.get_c_object(exctype)
                 if isinstance(exctype, str) else exctype)
    return self.builder.call(setter, (exc_value,))
def err_clear(self):
    """
    Emit a call to ``PyErr_Clear()`` and return the call instruction.
    """
    clear_type = Type.function(Type.void(), ())
    clear = self._get_function(clear_type, name="PyErr_Clear")
    return self.builder.call(clear, ())
def sys_write_stdout(self, fmt, *args):
    """
    Emit a call to the variadic ``PySys_WriteStdout(fmt, *args)``.

    Returns the call instruction.
    """
    writer_type = Type.function(Type.void(), [self.cstring], var_arg=True)
    writer = self._get_function(writer_type, name="PySys_WriteStdout")
    call_args = (fmt,) + args
    return self.builder.call(writer, call_args)
def wavebarrier_impl(context, builder, sig, args):
    """
    Lower a wave barrier as a call to ``llvm.amdgcn.wave.barrier``.

    The intrinsic takes no arguments (*args* must be empty) and the
    module-level ``_void_value`` is returned.
    """
    assert not args
    barrier_type = Type.function(Type.void(), [])
    barrier = builder.module.declare_intrinsic(
        'llvm.amdgcn.wave.barrier', fnty=barrier_type)
    builder.call(barrier, [])
    return _void_value
def err_write_unraisable(self, obj):
    """
    Emit a call to ``PyErr_WriteUnraisable(obj)``.

    Returns the call instruction.
    """
    writer_type = Type.function(Type.void(), [self.pyobj])
    writer = self._get_function(writer_type, name="PyErr_WriteUnraisable")
    return self.builder.call(writer, (obj,))
def build_ufunc_wrapper(library, context, func, signature, objmode, envptr,
                        env):
    """
    Wrap the scalar function with a loop that iterates over the arguments

    The wrapper ``__ufunc__.<func.name>`` is emitted into a new IR module
    that is added to *library*.  Returns the wrapper function looked up
    again from the library.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    # void wrapper(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper_module = library.create_ir_module('')
    # In object mode every argument and the result are pyobjects.
    if objmode:
        func_type = context.call_conv.get_function_type(
            types.pyobject, [types.pyobject] * len(signature.args))
    else:
        func_type = context.call_conv.get_function_type(
            signature.return_type, signature.args)

    # NOTE(review): oldfunc is not read afterwards in this block.
    oldfunc = func
    # From here on, func refers to the declaration in wrapper_module.
    func = wrapper_module.add_function(func_type, name=func.name)
    func.attributes.add("alwaysinline")

    wrapper = wrapper_module.add_function(fnty, "__ufunc__." + func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))

    # The value at dims[0] is used as the loop count.
    loopcount = builder.load(arg_dims, name="loopcount")

    # Prepare inputs
    arrays = []
    for i, typ in enumerate(signature.args):
        arrays.append(UArrayArg(context, builder, arg_args, arg_steps, i,
                                typ))

    # Prepare output
    out = UArrayArg(context, builder, arg_args, arg_steps, len(arrays),
                    signature.return_type)

    # Setup indices
    offsets = []
    zero = context.get_constant(types.intp, 0)
    for _ in arrays:
        p = cgutils.alloca_once(builder, intp_t)
        offsets.append(p)
        builder.store(zero, p)

    store_offset = cgutils.alloca_once(builder, intp_t)
    builder.store(zero, store_offset)

    # True only when every input array reports itself as unit-strided.
    unit_strided = cgutils.true_bit
    for ary in arrays:
        unit_strided = builder.and_(unit_strided, ary.is_unit_strided)

    pyapi = context.get_python_api(builder)
    # NOTE(review): slowloop / fastloop below are assigned but not read
    # afterwards in this block.
    if objmode:
        # General loop
        gil = pyapi.gil_ensure()
        with cgutils.for_range(builder, loopcount, intp=intp_t):
            slowloop = build_obj_loop_body(context, func, builder,
                                           arrays, out, offsets,
                                           store_offset, signature,
                                           pyapi, envptr, env)
        pyapi.gil_release(gil)
        builder.ret_void()

    else:
        with builder.if_else(unit_strided) as (is_unit_strided, is_strided):
            with is_unit_strided:
                with cgutils.for_range(builder, loopcount,
                                       intp=intp_t) as loop:
                    fastloop = build_fast_loop_body(context, func, builder,
                                                    arrays, out, offsets,
                                                    store_offset, signature,
                                                    loop.index, pyapi)

            with is_strided:
                # General loop
                with cgutils.for_range(builder, loopcount, intp=intp_t):
                    slowloop = build_slow_loop_body(context, func, builder,
                                                    arrays, out, offsets,
                                                    store_offset, signature,
                                                    pyapi)

        builder.ret_void()
    del builder

    # Run optimizer
    library.add_ir_module(wrapper_module)
    wrapper = library.get_function(wrapper.name)

    return wrapper
# Type-level constants built from TCon / TApp.
int64 = TCon("Int64")
float32 = TCon("Float")
double64 = TCon("Double")
void = TCon("Void")
# array(t) applies the "Array" type constructor to t.
array = lambda t: TApp(TCon("Array"), t)

array_int32 = array(int32)
array_int64 = array(int64)
array_double64 = array(double64)

# Shorthands for LLVM types.
pointer = Type.pointer
int_type = Type.int()
float_type = Type.float()
double_type = Type.double()
bool_type = Type.int(1)
void_type = Type.void()
void_ptr = pointer(Type.int(8))

struct_type = Type.struct([])


def array_type(elt_type):
    """Return the LLVM struct type describing an array of *elt_type*."""
    # NOTE(review): the last entry is a str placed inside the list of
    # element types; it may have been intended as the struct's name
    # rather than a field -- confirm against the Type.struct API in use.
    return Type.struct([
        pointer(elt_type),  # data
        int_type,  # dimensions
        pointer(int_type),  # shape
        'ndarray_' + str(elt_type),  # name
    ])


int32_array = pointer(array_type(int_type))
int64_array = pointer(array_type(Type.int(64)))
def err_write_unraisable(self, obj):
    """
    Emit a call to ``PyErr_WriteUnraisable(obj)``.

    Returns the call instruction.
    """
    writer_type = Type.function(Type.void(), [self.pyobj])
    writer = self._get_function(writer_type, name="PyErr_WriteUnraisable")
    return self.builder.call(writer, (obj,))
def generate_kernel_wrapper(self, library, fname, argtypes, debug):
    """
    Build a kernel wrapper around the function named ``fname`` and add it
    to ``library``.

    ``argtypes`` are the argument types of the wrapped function.  The
    wrapper takes the packed form of those arguments, returns void, and is
    named after the wrapped function with the ``cudapy`` namespace
    prepended.  Seven integer globals (``__errcode__`` plus
    ``__tid{x,y,z}__`` / ``__ctaid{x,y,z}__``) are always defined next to
    it.  When ``debug`` is true the wrapper also checks the call status and
    records the error code together with the thread and block indices.

    The finalized wrapper function, looked up from ``library``, is
    returned.
    """
    packer = self.get_arg_packer(argtypes)
    packed_tys = list(packer.argument_types)
    wrapfnty = Type.function(Type.void(), packed_tys)
    wrapper_module = self.create_module("cuda.kernel.wrapper")

    # Declaration of the wrapped function: status code return, with the
    # return-value slot prepended to the packed arguments.
    inner_fnty = Type.function(
        Type.int(),
        [self.call_conv.get_return_type(types.pyobject)] + packed_tys)
    func = wrapper_module.add_function(inner_fnty, name=fname)

    wrapper_name = itanium_mangler.prepend_namespace(func.name, ns='cudapy')
    wrapfn = wrapper_module.add_function(wrapfnty, name=wrapper_name)
    builder = Builder(wrapfn.append_basic_block(''))

    def _error_global(suffix):
        # Null-initialised integer global named after the wrapper.
        var = wrapper_module.add_global_variable(Type.int(),
                                                 name=wrapfn.name + suffix)
        var.initializer = Constant.null(var.type.pointee)
        return var

    errcode_gv = _error_global("__errcode__")
    tid_gvs = []
    ctaid_gvs = []
    for axis in 'xyz':
        tid_gvs.append(_error_global("__tid%s__" % axis))
        ctaid_gvs.append(_error_global("__ctaid%s__" % axis))

    unpacked = packer.from_arguments(builder, wrapfn.args)
    status, _ = self.call_conv.call_function(builder, func, types.void,
                                             argtypes, unpacked)

    if debug:
        # Fast path: nothing went wrong.
        with cgutils.if_likely(builder, status.is_ok):
            builder.ret_void()

        with builder.if_then(builder.not_(status.is_python_exc)):
            # User exception raised
            no_error = Constant.null(errcode_gv.type.pointee)
            code_ty = no_error.type

            # Swap the error code in through the ``___numba_cas_hack``
            # helper so that an already recorded error is not overwritten;
            # only the first error is kept.
            cas_fnty = lc.Type.function(
                code_ty, [errcode_gv.type, code_ty, code_ty])
            cas_fn = wrapper_module.add_function(cas_fnty,
                                                 name="___numba_cas_hack")
            previous = builder.call(cas_fn,
                                    [errcode_gv, no_error, status.code])
            swapped = builder.icmp(ICMP_EQ, previous, no_error)

            # On a successful exchange, remember which thread failed.
            sreg = nvvmutils.SRegBuilder(builder)
            with builder.if_then(swapped):
                for axis, slot in zip("xyz", tid_gvs):
                    builder.store(sreg.tid(axis), slot)

                for axis, slot in zip("xyz", ctaid_gvs):
                    builder.store(sreg.ctaid(axis), slot)

    builder.ret_void()

    nvvm.set_cuda_kernel(wrapfn)
    library.add_ir_module(wrapper_module)
    library.finalize()
    return library.get_function(wrapfn.name)
def fatal_error(self, msg):
    """
    Emit a call to ``Py_FatalError``; *msg* is inserted into the module as
    a constant string and passed as the only argument.
    """
    signature = Type.function(Type.void(), [self.cstring])
    callee = self._get_function(signature, name="Py_FatalError")
    message = self.context.insert_const_string(self.module, msg)
    self.builder.call(callee, (message,))
def build_ufunc_wrapper(library, context, func, signature, objmode, env):
    """
    Wrap the scalar function with a loop that iterates over the arguments.

    A fresh IR module is created from *library*.  It receives a declaration
    of *func* (marked ``alwaysinline``) and a wrapper function named
    ``"__ufunc__." + func.name`` whose type is
    ``void(i8** args, intp* dims, intp* steps, i8* data)``.  The wrapper
    loads the loop count from ``dims`` and runs the object-mode loop body
    (with the GIL held), the fast loop body when every input is
    unit-strided, or the slow loop body otherwise.

    Parameters
    ----------
    library : code library used to create the IR module and to look the
        finished wrapper up again.
    context : target context providing types, constants, the calling
        convention and the Python API helper.
    func : the scalar function to wrap.  Its ``name`` is read and, once the
        wrapper has been added to the library, its ``linkage`` is set to
        ``LINKAGE_INTERNAL``.
    signature : signature of the scalar function (``args`` and
        ``return_type`` are read).
    objmode : when true, the function is declared with ``pyobject``
        arguments/return and the object-mode loop body is emitted.
    env : forwarded unchanged to ``build_obj_loop_body``; unused when
        *objmode* is false.

    Returns
    -------
    The wrapper function as returned by ``library.get_function``.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper_module = library.create_ir_module('')
    if objmode:
        func_type = context.call_conv.get_function_type(
            types.pyobject, [types.pyobject] * len(signature.args))
    else:
        func_type = context.call_conv.get_function_type(
            signature.return_type, signature.args)

    # Keep the caller's function object: its linkage is changed at the end.
    oldfunc = func
    func = wrapper_module.add_function(func_type, name=func.name)
    func.attributes.add("alwaysinline")

    wrapper = wrapper_module.add_function(fnty, "__ufunc__." + func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))

    loopcount = builder.load(arg_dims, name="loopcount")

    actual_args = context.call_conv.get_arguments(func)

    # Prepare inputs
    arrays = [
        UArrayArg(context, builder, arg_args, arg_steps, i,
                  context.get_argument_type(typ))
        for i, typ in enumerate(signature.args)
    ]

    # Prepare output; its slot index is the number of actual arguments of
    # the declared function.
    valty = context.get_data_type(signature.return_type)
    out = UArrayArg(context, builder, arg_args, arg_steps,
                    len(actual_args), valty)

    # Setup indices: one zero-initialised offset slot per input ...
    offsets = []
    zero = context.get_constant(types.intp, 0)
    for _ in arrays:
        p = cgutils.alloca_once(builder, intp_t)
        offsets.append(p)
        builder.store(zero, p)

    # ... and one for the output.
    store_offset = cgutils.alloca_once(builder, intp_t)
    builder.store(zero, store_offset)

    # True only when every input reports unit stride.
    unit_strided = cgutils.true_bit
    for ary in arrays:
        unit_strided = builder.and_(unit_strided, ary.is_unit_strided)

    if objmode:
        # General loop, executed while holding the GIL.
        pyapi = context.get_python_api(builder)
        gil = pyapi.gil_ensure()
        with cgutils.for_range(builder, loopcount, intp=intp_t):
            build_obj_loop_body(context, func, builder, arrays, out,
                                offsets, store_offset, signature, pyapi,
                                env)
        pyapi.gil_release(gil)
        builder.ret_void()

    else:
        with cgutils.ifelse(builder, unit_strided) as (is_unit_strided,
                                                       is_strided):
            with is_unit_strided:
                with cgutils.for_range(builder, loopcount,
                                       intp=intp_t) as ind:
                    build_fast_loop_body(context, func, builder, arrays,
                                         out, offsets, store_offset,
                                         signature, ind)
                builder.ret_void()

            with is_strided:
                # General loop
                with cgutils.for_range(builder, loopcount, intp=intp_t):
                    build_slow_loop_body(context, func, builder, arrays,
                                         out, offsets, store_offset,
                                         signature)
                builder.ret_void()

        # Both branches return above; this return is emitted at the
        # builder's position after the if/else.
        builder.ret_void()
    del builder

    # Hand the module to the library and fetch the wrapper back from it.
    library.add_ir_module(wrapper_module)
    wrapper = library.get_function(wrapper.name)
    oldfunc.linkage = LINKAGE_INTERNAL

    return wrapper
def _build_wrapper(self, library, name):
    """
    The LLVM IRBuilder code to create the gufunc wrapper.

    The *library* arg is the CodeLibrary to which the wrapper should be
    added.  The *name* arg is the name of the wrapper function being
    created.

    The wrapper has type
    ``void(i8** args, intp* dims, intp* steps, i8* data)``.  It loads the
    loop count from ``dims[0]``, loads one dimension per distinct input
    symbol from the following ``dims`` entries, runs ``gen_prologue``,
    calls ``gen_loop_body`` once per iteration (leaving the loop as soon as
    an error is flagged), runs ``gen_epilogue`` and returns.  Nothing is
    returned to the Python caller; the module is added to *library* and
    ``self.library`` is registered as a linking library.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    fnty = Type.function(
        Type.void(),
        [byte_ptr_ptr_t, intp_ptr_t, intp_ptr_t, byte_ptr_t])

    wrapper_module = library.create_ir_module('')
    func_type = self.call_conv.get_function_type(self.fndesc.restype,
                                                 self.fndesc.argtypes)
    fname = self.fndesc.llvm_func_name
    func = wrapper_module.add_function(func_type, name=fname)
    func.attributes.add("alwaysinline")

    wrapper = wrapper_module.add_function(fnty, name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder(wrapper.append_basic_block("entry"))
    loopcount = builder.load(arg_dims, name="loopcount")
    pyapi = self.context.get_python_api(builder)

    # Unpack shapes: number the input symbols in order of first appearance.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            sym_map.setdefault(s, len(sym_map))

    # dims[0] is the loop count, so symbol ``i`` lives at dims[i + 1].
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(
            builder.gep(arg_dims,
                        [self.context.get_constant(types.intp, i + 1)]))

    # Prepare inputs and outputs.  ``step_offset`` starts after one entry
    # per array and advances by the number of symbols of each array.
    arrays = []
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(
            zip(self.signature.args, self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args, arg_steps, i,
                         step_offset, typ, sym, sym_dim)
        step_offset += len(sym)
        arrays.append(ary)

    bbreturn = builder.append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder, pyapi)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as loop:
        args = [a.get_array_at_offset(loop.index) for a in arrays]
        _, error = self.gen_loop_body(builder, pyapi, func, args)
        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder, pyapi)

    builder.ret_void()

    # Link
    library.add_ir_module(wrapper_module)
    library.add_linking_library(self.library)