def _get_equal(context, module, datamodel, container_element_type):
    """
    Emit the equality helpers for a container element type into ``module``.

    Two functions are fetched-or-inserted in ``module``:

    * ``.numba_<mangled>.<elem>_equal.wrap`` -- uses the context's calling
      convention and evaluates ``operator.eq`` on two values of the
      element's front-end type, returning the boolean cast to int32.
    * ``.numba_<mangled>.<elem>_equal`` -- takes two data pointers, loads
      both values through ``datamodel`` and calls the wrapper above.

    The second function is returned.  Its int32 result is:

    * ``0`` when the wrapper's status reports ``is_none``,
    * the boolean comparison result cast to int32 when the status is ok,
    * ``-1`` when the status is not ok.
    """
    assert datamodel.contains_nrt_meminfo()

    elem_type = datamodel.fe_type
    elem_ptr_ty = datamodel.get_data_type().as_pointer()

    wrap_fnty = context.call_conv.get_function_type(
        types.int32, [elem_type, elem_type])
    argtypes = [elem_type, elem_type]

    # ---- wrapper using the Numba calling convention --------------------
    wrap_name = '.numba_{}.{}_equal.wrap'.format(
        context.fndesc.mangled_name, container_element_type)
    wrapfn = cgutils.get_or_insert_function(module, wrap_fnty, name=wrap_name)

    wrap_builder = Builder(wrapfn.append_basic_block())
    wrap_args = context.call_conv.decode_arguments(
        wrap_builder, argtypes, wrapfn)
    eq_sig = typing.signature(types.boolean, elem_type, elem_type)
    eq_fnty = context.typing_context.resolve_value_type(operator.eq)
    # The return value is discarded here.
    # NOTE(review): presumably this call is needed so that the signature is
    # resolved before ``get_function`` is asked for it -- confirm.
    eq_fnty.get_call_type(context.typing_context, eq_sig.args, {})
    eq_impl = context.get_function(eq_fnty, eq_sig)
    eq_result = eq_impl(wrap_builder, wrap_args)
    eq_as_int = context.cast(
        wrap_builder, eq_result, types.boolean, types.int32)
    context.call_conv.return_value(wrap_builder, eq_as_int)

    # ---- pointer-based entry point --------------------------------------
    equal_fnty = ir.FunctionType(ir.IntType(32), [elem_ptr_ty, elem_ptr_ty])
    equal_name = '.numba_{}.{}_equal'.format(
        context.fndesc.mangled_name, container_element_type)
    equal_fn = cgutils.get_or_insert_function(
        module, equal_fnty, name=equal_name)

    builder = Builder(equal_fn.append_basic_block())
    left_ptr, right_ptr = equal_fn.args[0], equal_fn.args[1]
    lhs = datamodel.load_from_data_pointer(builder, left_ptr)
    rhs = datamodel.load_from_data_pointer(builder, right_ptr)

    status, retval = context.call_conv.call_function(
        builder, wrapfn, types.boolean, argtypes, [lhs, rhs])

    with builder.if_then(status.is_ok, likely=True):
        with builder.if_then(status.is_none):
            # Ok status but flagged as "none": report 0.
            builder.ret(context.get_constant(types.int32, 0))
        retval = context.cast(builder, retval, types.boolean, types.int32)
        builder.ret(retval)

    # Status was not ok: report -1.
    builder.ret(context.get_constant(types.int32, -1))
    return equal_fn
def lower_finalize_func(self, lower):
    """
    Lower the generator's finalizer.

    A ``void (gentype)`` function named ``self.gendesc.llvm_finalizer_name``
    is fetched from (or inserted into) ``lower.module``.  Its single
    argument is bitcast to the LLVM value type of ``self.gentype`` and the
    body is produced by ``self.lower_finalize_func_body``.
    """
    arg_llty = self.context.get_value_type(self.gentype)
    finalizer_ty = Type.function(Type.void(), [arg_llty])
    finalizer = lower.module.get_or_insert_function(
        finalizer_ty, name=self.gendesc.llvm_finalizer_name)

    builder = Builder(finalizer.append_basic_block('entry'))

    # The value type is looked up a second time for the cast, as before.
    genptrty = self.context.get_value_type(self.gentype)
    raw_arg = finalizer.args[0]
    self.lower_finalize_func_body(builder, builder.bitcast(raw_arg, genptrty))
def test_nvvm_from_llvm(self):
    """
    Compile a trivial LLVM kernel to PTX and check the generated text.

    The PTX must mention the kernel name and declare an address size that
    matches ``is64bit``.
    """
    m = Module("test_nvvm_from_llvm")
    fty = Type.function(Type.void(), [Type.int()])
    kernel = m.add_function(fty, name='mycudakernel')
    bldr = Builder(kernel.append_basic_block('entry'))
    bldr.ret_void()
    set_cuda_kernel(kernel)

    fix_data_layout(m)
    ptx = llvm_to_ptx(str(m)).decode('utf8')

    # assertIn reports both operands on failure, unlike assertTrue(a in b).
    self.assertIn('mycudakernel', ptx)
    expected_address_size = (
        '.address_size 64' if is64bit else '.address_size 32')
    self.assertIn(expected_address_size, ptx)
def __init__(self, context, library, fndesc, interp):
    """
    Set up the state needed to lower one function.

    Stores the arguments, creates the LLVM module, the Python execution
    environment, the declared function with its entry block and builder,
    then hands over to ``self.init()`` for subclass-specific setup.
    """
    self.context = context
    self.library = library
    self.fndesc = fndesc
    blocks = utils.SortedMap(utils.iteritems(interp.blocks))
    self.blocks = blocks

    # LLVM module named after the function's unique name
    self.module = library.create_ir_module(fndesc.unique_name)

    # Environment made available to the compiled function; it carries the
    # globals of the module returned by ``fndesc.lookup_module()``.
    module_globals = fndesc.lookup_module().__dict__
    self.env = _dynfunc.Environment(globals=module_globals)

    # error code -> exception class or instance
    self.exceptions = {}

    # Declared function, its entry block and a builder for that block
    self.function = function = context.declare_function(self.module, fndesc)
    self.entry_block = entry_block = function.append_basic_block('entry')
    self.builder = Builder.new(entry_block)

    # Bookkeeping used while lowering
    self.blkmap = {}
    self.varmap = {}
    self.firstblk = min(blocks.keys())
    self.loc = -1

    # Hook for subclasses
    self.init()
def __init__(self, context, library, fndesc, interp):
    """
    Set up the state needed to lower one function.

    Stores the arguments and the context's calling convention, creates the
    LLVM module, the Python execution environment, the declared function
    with its entry block, builder and call helper, then hands over to
    ``self.init()`` for subclass-specific setup.
    """
    self.context = context
    self.library = library
    self.fndesc = fndesc
    blocks = utils.SortedMap(utils.iteritems(interp.blocks))
    self.blocks = blocks
    self.interp = interp
    self.call_conv = call_conv = context.call_conv

    # LLVM module named after the function's unique name
    self.module = library.create_ir_module(fndesc.unique_name)

    # Environment made available to the compiled function; it carries the
    # globals of the module returned by ``fndesc.lookup_module()``.
    module_globals = fndesc.lookup_module().__dict__
    self.env = _dynfunc.Environment(globals=module_globals)

    # Declared function, entry block, builder and call helper
    self.function = function = context.declare_function(self.module, fndesc)
    self.entry_block = entry_block = function.append_basic_block('entry')
    self.builder = builder = Builder.new(entry_block)
    self.call_helper = call_conv.init_call_helper(builder)

    # Bookkeeping used while lowering
    self.blkmap = {}
    self.varmap = {}
    self.firstblk = min(blocks.keys())
    self.loc = -1

    # Hook for subclasses
    self.init()
def build_wrapper(fn):
    """
    Fill ``fn`` with a body that compares its two arguments with
    ``operator.eq`` and returns the boolean result cast to int32.

    Relies on ``context``, ``argtypes`` and ``fe_type`` from the enclosing
    scope.
    """
    entry = fn.append_basic_block()
    builder = Builder(entry)
    decoded = context.call_conv.decode_arguments(builder, argtypes, fn)

    eq_sig = typing.signature(types.boolean, fe_type, fe_type)
    typingctx = context.typing_context
    eq_fnty = typingctx.resolve_value_type(operator.eq)
    # Return value intentionally unused.
    eq_fnty.get_call_type(context.typing_context, eq_sig.args, {})
    compare = context.get_function(eq_fnty, eq_sig)

    outcome = compare(builder, decoded)
    context.call_conv.return_value(
        builder,
        context.cast(builder, outcome, types.boolean, types.int32))
def code(self, codegen):
    """
    Generate the body of this method into its pre-declared LLVM function.

    The function named ``<class>::<method>`` is looked up in
    ``codegen.module``; an ``IndexError`` is raised when no function has
    that name.  While the body is generated, ``codegen`` is switched to the
    method's function, builder and exit block; the previous function and
    builder are restored before returning the LLVM function.
    """
    klass = codegen.current_class
    method_name = f'{klass.name}::{self.name}'
    matches = [fn for fn in codegen.module.functions
               if fn.name == method_name]
    func = matches[0]
    codegen.function_stack.append(func)

    # Remember the surrounding generation state so it can be restored.
    saved_function = codegen.current_function
    saved_builder = codegen.builder

    codegen.current_function = func
    entry_block = codegen.add_block('entry')
    exit_block = codegen.add_block('exit')
    codegen.exit_blocks.append(exit_block)
    codegen.builder = Builder(entry_block)

    if self.is_constructor:
        # Store the class vtable global through the pointer obtained from
        # the first argument.
        this = codegen.gep(func.args[0], INDICES)
        vtable = codegen.module.get_global(f'{klass.name}_vtable')
        codegen.builder.store(vtable, this)

    body = self.body
    ret = codegen.visit(body) if body else None

    codegen.branch(exit_block)
    if not ret:
        # Nothing truthy came back from the body: finish the exit block
        # with a void return.
        codegen.position_at_end(exit_block)
        codegen.builder.ret_void()

    codegen.current_function = saved_function
    codegen.builder = saved_builder
    codegen.exit_blocks.pop()
    codegen.function_stack.pop()
    return func
def test_inline_rsqrt(self):
    """
    Check that an inline-asm ``rsqrt.approx.f32`` call survives into PTX.

    The kernel loads a float through its pointer argument, passes it to the
    inline assembly and stores the result back through the same pointer.
    """
    mod = Module.new(__name__)
    fnty = Type.function(Type.void(), [Type.pointer(Type.float())])
    fn = mod.add_function(fnty, "cu_rsqrt")
    bldr = Builder.new(fn.append_basic_block("entry"))

    rsqrt_approx_fnty = Type.function(Type.float(), [Type.float()])
    inlineasm = InlineAsm.get(rsqrt_approx_fnty,
                              "rsqrt.approx.f32 $0, $1;",
                              "=f,f", side_effect=True)
    val = bldr.load(fn.args[0])
    res = bldr.call(inlineasm, [val])

    bldr.store(res, fn.args[0])
    bldr.ret_void()

    # generate ptx
    nvvm.fix_data_layout(mod)
    nvvm.set_cuda_kernel(fn)
    nvvmir = str(mod)
    ptx = nvvm.llvm_to_ptx(nvvmir)

    # assertIn reports both operands on failure, unlike assertTrue(a in b).
    self.assertIn("rsqrt.approx.f32", str(ptx))
def test_inline_rsqrt(self):
    """
    Check that an inline-asm ``rsqrt.approx.f32`` call survives into PTX.

    The kernel loads a float through its pointer argument, passes it to the
    inline assembly and stores the result back through the same pointer.
    """
    mod = Module(__name__)
    fnty = Type.function(Type.void(), [Type.pointer(Type.float())])
    fn = mod.add_function(fnty, 'cu_rsqrt')
    bldr = Builder(fn.append_basic_block('entry'))

    rsqrt_approx_fnty = Type.function(Type.float(), [Type.float()])
    inlineasm = InlineAsm.get(rsqrt_approx_fnty,
                              'rsqrt.approx.f32 $0, $1;',
                              '=f,f', side_effect=True)
    val = bldr.load(fn.args[0])
    res = bldr.call(inlineasm, [val])

    bldr.store(res, fn.args[0])
    bldr.ret_void()

    # generate ptx
    nvvm.fix_data_layout(mod)
    nvvm.set_cuda_kernel(fn)
    nvvmir = str(mod)
    ptx = nvvm.llvm_to_ptx(nvvmir)

    # assertIn reports both operands on failure, unlike assertTrue(a in b).
    self.assertIn('rsqrt.approx.f32', str(ptx))
def build(self):
    """
    Create the ``wrapper.<name>`` function and generate its body.

    The wrapper takes three Python-object arguments and returns one, which
    is the signature of PyCFunctionWithKeywords (see CPython's
    methodobject.h).  Returns the wrapper function together with the Python
    API object used to build it.
    """
    wrapper_name = "wrapper.%s" % self.func.name

    pyobj = self.context.get_argument_type(types.pyobject)
    wrapper_ty = Type.function(pyobj, [pyobj, pyobj, pyobj])
    wrapper = self.module.add_function(wrapper_ty, name=wrapper_name)

    entry = wrapper.append_basic_block('entry')
    builder = Builder.new(entry)

    # Arguments, in order:
    # - `closure` receives the `self` pointer stored in the PyCFunction
    #   object (see _dynfunc.c)
    # - `args` and `kws` receive the tuple and dict objects of positional
    #   and keyword arguments, respectively.
    closure, args, kws = wrapper.args
    labelled = ((closure, 'py_closure'), (args, 'py_args'), (kws, 'py_kws'))
    for llvm_arg, label in labelled:
        llvm_arg.name = label

    api = self.context.get_python_api(builder)
    self.build_wrapper(api, builder, closure, args, kws)

    return wrapper, api
def build_ufunc_wrapper(library, context, fname, signature, objmode, envptr,
                        env):
    """
    Wrap the scalar function with a loop that iterates over the arguments.

    A new ``ufunc_wrapper`` library is created.  It declares the scalar
    function ``fname`` (marked ``alwaysinline``) and defines
    ``__ufunc__.<fname>`` taking ``(args, dims, steps, data)``.

    Parameters
    ----------
    library
        Library added to the wrapper library as a linking library.
    context
        Target context used for types, constants and the calling convention.
    fname : str
        Name of the scalar function to wrap.
    signature
        Signature whose ``args`` / ``return_type`` describe the scalar
        function.
    objmode : bool
        When true, the declared scalar function takes and returns Python
        objects, and the loop body is built by ``build_obj_loop_body``
        between ``gil_ensure`` and ``gil_release``.
    envptr, env
        Forwarded unchanged to ``build_obj_loop_body``; only used when
        ``objmode`` is true.

    Returns
    -------
    The result of ``wrapperlib.get_pointer_to_function`` for the wrapper.
    """
    assert isinstance(fname, str)
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapperlib = context.codegen().create_library('ufunc_wrapper')
    wrapper_module = wrapperlib.create_ir_module('')
    if objmode:
        func_type = context.call_conv.get_function_type(
            types.pyobject, [types.pyobject] * len(signature.args))
    else:
        func_type = context.call_conv.get_function_type(
            signature.return_type, signature.args)

    func = wrapper_module.add_function(func_type, name=fname)
    func.attributes.add("alwaysinline")

    wrapper = wrapper_module.add_function(fnty, "__ufunc__." + func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder(wrapper.append_basic_block("entry"))

    # The loop count is loaded from the `dims` pointer.
    loopcount = builder.load(arg_dims, name="loopcount")

    # Prepare inputs
    arrays = [
        UArrayArg(context, builder, arg_args, arg_steps, i, typ)
        for i, typ in enumerate(signature.args)
    ]

    # Prepare output: it follows the inputs in `args` / `steps`
    out = UArrayArg(context, builder, arg_args, arg_steps, len(arrays),
                    signature.return_type)

    # Setup indices: one zero-initialised offset slot per input ...
    offsets = []
    zero = context.get_constant(types.intp, 0)
    for _ in arrays:
        p = cgutils.alloca_once(builder, intp_t)
        offsets.append(p)
        builder.store(zero, p)

    # ... and one for the output
    store_offset = cgutils.alloca_once(builder, intp_t)
    builder.store(zero, store_offset)

    # True only when every input reports unit stride
    unit_strided = cgutils.true_bit
    for ary in arrays:
        unit_strided = builder.and_(unit_strided, ary.is_unit_strided)

    pyapi = context.get_python_api(builder)
    if objmode:
        # General loop
        gil = pyapi.gil_ensure()
        with cgutils.for_range(builder, loopcount, intp=intp_t):
            build_obj_loop_body(context, func, builder, arrays, out,
                                offsets, store_offset, signature, pyapi,
                                envptr, env)
        pyapi.gil_release(gil)
        builder.ret_void()

    else:
        with builder.if_else(unit_strided) as (is_unit_strided, is_strided):
            with is_unit_strided:
                with cgutils.for_range(builder, loopcount,
                                       intp=intp_t) as loop:
                    build_fast_loop_body(context, func, builder, arrays,
                                         out, offsets, store_offset,
                                         signature, loop.index, pyapi)

            with is_strided:
                # General loop
                with cgutils.for_range(builder, loopcount, intp=intp_t):
                    build_slow_loop_body(context, func, builder, arrays,
                                         out, offsets, store_offset,
                                         signature, pyapi)

        builder.ret_void()
    del builder

    # Link and finalize
    wrapperlib.add_ir_module(wrapper_module)
    wrapperlib.add_linking_library(library)
    return wrapperlib.get_pointer_to_function(wrapper.name)
class CodeGenerator(Printable):
    """
    LLVM IR generator for the language.

    Owns a single LLVM module (named ``'opal-lang'``), a current builder and
    function, and two name-keyed tables: ``symtab`` (name -> LLVM value) and
    ``typetab`` (name -> type).  AST nodes that expose a ``code`` method are
    asked to generate their own code through :meth:`visit`.
    """

    def __init__(self):
        """Create the module, declare the builtins and open ``main``."""
        # TODO: come up with a less naive way of handling the symtab and types
        self.classes = None
        self.symtab = {}
        self.typetab = {}
        # While true, `visit` returns immediately without generating code.
        self.is_break = False
        self.current_class = None
        self.loop_end_blocks = []
        self.loop_cond_blocks = []
        context = ir.Context()
        self.module = Module(name='opal-lang', context=context)
        self.blocks = []
        self.scope = {}
        self._add_builtins()
        # `main` is a `void ()` function with an entry and an exit block.
        func_ty = ir.FunctionType(ir.VoidType(), [])
        func = Function(self.module, func_ty, 'main')
        self.current_function = func
        entry_block = self.add_block('entry')
        exit_block = self.add_block('exit')
        self.function_stack = [func]
        self.builder = Builder(entry_block)
        self.exit_blocks = [exit_block]
        self.block_stack = [entry_block]

    def __str__(self):
        """Return the textual form of the module."""
        return str(self.module)

    def _add_builtins(self):
        """Declare the external functions the generated code may call."""
        malloc_ty = ir.FunctionType(Int8.as_llvm().as_pointer(),
                                    [Integer.as_llvm()])
        ir.Function(self.module, malloc_ty, 'malloc')
        free_ty = ir.FunctionType(Any.as_llvm(), [Int8.as_llvm().as_pointer()])
        ir.Function(self.module, free_ty, 'free')
        puts_ty = ir.FunctionType(Integer.as_llvm(),
                                  [Int8.as_llvm().as_pointer()])
        ir.Function(self.module, puts_ty, 'puts')
        int_to_string_ty = ir.FunctionType(Int8.as_llvm().as_pointer(), [
            Integer.as_llvm(),
            Int8.as_llvm().as_pointer(),
            Integer.as_llvm()
        ])
        ir.Function(self.module, int_to_string_ty, 'int_to_string')
        # printf is the only variadic declaration.
        printf_ty = ir.FunctionType(Integer.as_llvm(),
                                    [Int8.as_llvm().as_pointer()],
                                    var_arg=True)
        ir.Function(self.module, printf_ty, 'printf')
        vector_init_ty = ir.FunctionType(Any.as_llvm(),
                                         [List.as_llvm().as_pointer()])
        ir.Function(self.module, vector_init_ty, 'vector_init')
        vector_append_ty = ir.FunctionType(
            Any.as_llvm(),
            [List.as_llvm().as_pointer(), Int8.as_llvm().as_pointer()])
        ir.Function(self.module, vector_append_ty, 'vector_append')
        vector_get_ty = ir.FunctionType(
            Int8.as_llvm().as_pointer(),
            [List.as_llvm().as_pointer(), Integer.as_llvm()])
        ir.Function(self.module, vector_get_ty, 'vector_get')
        vector_size_ty = ir.FunctionType(Integer.as_llvm(),
                                         [List.as_llvm().as_pointer()])
        ir.Function(self.module, vector_size_ty, 'vector_size')

    def alloc(self, typ, name=''):
        """Emit an ``alloca`` of ``typ`` and return it."""
        return self.builder.alloca(typ, name=name)

    def alloc_and_store(self, val, typ, name=''):
        """Allocate a slot of ``typ``, store ``val`` in it, return the slot."""
        var_addr = self.alloc(typ, name)
        self.builder.store(val, var_addr)
        return var_addr

    def add_block(self, name):
        """Append a basic block named ``name`` to the current function."""
        return self.current_function.append_basic_block(name)

    def assign(self, name, value, typ, is_class=False):
        """
        Bind ``name`` to ``value``.

        * ``is_class`` true: ``value`` itself is recorded in the symbol
          table (no store is emitted) and returned.
        * ``name`` already has a truthy symbol-table entry: ``value`` is
          stored through that entry and the store instruction is returned;
          the type table is left untouched.
        * otherwise: a new slot is allocated, ``value`` stored in it, and
          the slot is recorded and returned.
        """
        if is_class:
            self.symtab[name] = value
            self.typetab[name] = typ
            return value
        old_val = self.symtab.get(name)
        if old_val:
            new_val = self.builder.store(value, old_val)
            # NOTE(review): operands[1] is presumably the store's pointer
            # operand, i.e. the same address as `old_val` -- confirm.
            self.symtab[name] = new_val.operands[1]
            return new_val
        var_address = self.alloc_and_store(value, typ, name=name)
        self.symtab[name] = var_address
        self.typetab[name] = typ
        return var_address

    def get_var(self, name):
        """Return the symbol-table entry for ``name`` (KeyError if absent)."""
        return self.symtab[name]

    def get_var_type(self, name):
        """Return the recorded type for ``name`` (KeyError if absent)."""
        return self.typetab[name]

    # noinspection SpellCheckingInspection
    def bitcast(self, value, type_):
        """Thin wrapper over ``builder.bitcast``."""
        return self.builder.bitcast(value, type_)

    def branch(self, block):
        """Thin wrapper over ``builder.branch``."""
        return self.builder.branch(block)

    # noinspection SpellCheckingInspection
    def cbranch(self, cond, true_block, false_block):
        """Thin wrapper over ``builder.cbranch``."""
        return self.builder.cbranch(cond, true_block, false_block)

    def gep(self, ptr, indices, inbounds=False, name=''):
        """Thin wrapper over ``builder.gep``."""
        return self.builder.gep(ptr, indices, inbounds, name)

    def generate_code(self, code):
        """
        Parse ``code`` and generate IR for the resulting program.

        A newline is appended to ``code`` before parsing.  Class metadata is
        generated for every class collected by the AST visitor before the
        program node is asked to generate its own code.
        """
        visitor = ASTVisitor()
        ast = visitor.transform(parser.parse(f"{code}\n"))
        self.classes = visitor.classes
        for klass in self.classes:
            self.generate_classes_metadata(klass)
        assert isinstance(ast, Program)
        return ast.accept(self)

    def load(self, ptr, name=''):
        """Thin wrapper over ``builder.load``."""
        return self.builder.load(ptr, name)

    def position_at_end(self, block):
        """Thin wrapper over ``builder.position_at_end``."""
        return self.builder.position_at_end(block)

    def select(self, val, true, false):
        """Thin wrapper over ``builder.select``."""
        return self.builder.select(val, true, false)

    @staticmethod
    def insert_const_string(module, string):
        """
        Return the global holding ``string`` as a NUL-terminated constant.

        The global is looked up by the name from :meth:`get_string_name` and
        only created (private linkage, constant, ``unnamed_addr``) when it
        is not already in ``module.globals``.
        """
        text = Constant.stringz(string)
        name = CodeGenerator.get_string_name(string)
        gv = module.globals.get(name)
        if gv is None:
            gv = module.add_global_variable(text.type, name=name)
            gv.linkage = PRIVATE_LINKAGE
            gv.unnamed_addr = True
            gv.global_constant = True
            gv.initializer = text
        return gv

    @staticmethod
    def get_string_name(string):
        """Return ``'str_' + <SHA3-256 hex digest of the UTF-8 string>``."""
        m = sha3_256()
        m.update(string.encode('utf-8'))
        return '_'.join(['str', str(m.hexdigest())])

    def call(self, name, args):
        """Call the module-level global ``name`` with ``args``."""
        func = self.module.get_global(name)
        return self.builder.call(func, args)

    def const(self, val):
        """
        Build an LLVM constant from a Python ``bool``, ``int`` or ``float``.

        Raises ``NotImplementedError`` for any other Python type.
        """
        # has to come first because freaking `isinstance(True, int) == True`
        if isinstance(val, bool):
            return ir.Constant(Bool.as_llvm(), val and 1 or 0)
        if isinstance(val, int):
            return ir.Constant(Integer.as_llvm(), val)
        if isinstance(val, float):
            return ir.Constant(Float.as_llvm(), val)
        raise NotImplementedError

    @staticmethod
    def generic_codegen(node):
        """Fallback used by :meth:`visit`; always raises."""
        raise NotImplementedError('No visit_{} method'.format(
            type(node).__name__.lower()))

    def visit(self, node: ASTNode):
        """
        Dynamically invoke the code generator for each specific node.

        Returns ``None`` without doing anything while ``self.is_break`` is
        set.  Nodes with a ``code`` attribute generate their own code;
        otherwise a ``visit_<lowercased class name>`` method is looked up on
        this object, falling back to :meth:`generic_codegen`.

        :param node: ASTNode
        """
        if self.is_break:
            return
        can_code_gen = hasattr(node, 'code')
        if can_code_gen:
            # noinspection PyUnresolvedReferences
            return node.code(codegen=self)
        method = 'visit_' + type(node).__name__.lower()  # pragma: no cover
        return getattr(self, method, self.generic_codegen)(node)  # pragma: no cover

    # TODO: refactor to create smaller, specific functions
    def generate_classes_metadata(self, klass: Klass):
        """
        Declare the LLVM types, method functions and vtable for ``klass``.

        Raises ``CodegenError`` when the class is not ``Object`` and its
        parent is not among ``self.classes``.

        The vtable type ``<name>_vtable_type`` is laid out as: pointer to
        the parent's vtable type (or to itself when there is no parent), an
        ``i8*`` for the class-name string, then one entry per method in
        declaration order.  The class type itself gets a single field: a
        pointer to its vtable type.
        """
        name = klass.name
        parent = klass.parent
        undefined_parent_class = name != 'Object' and parent not in [
            c.name for c in self.classes
        ]
        if undefined_parent_class:
            raise CodegenError(f'Parent class {parent} not defined')
        vtable_typ_name = f"{name}_vtable_type"
        vtable_typ = self.module.context.get_identified_type(vtable_typ_name)
        type_ = self.module.context.get_identified_type(name)
        funk_types = OrderedDict()
        funktions = OrderedDict()
        object_type = self.module.context.get_identified_type('Object')
        # Declare one LLVM function per method, named `<class>::<method>`,
        # taking a pointer to the class type as its first parameter.
        for func in klass.functions:
            funk_name = f'{name}::{func.name}'
            signature = [
                get_param_type(param.type, object_type)
                for param in func.params
            ]
            if func.ret_type:
                ret = get_param_type(func.ret_type, object_type)
            else:
                ret = ir.VoidType()
            func_ty = ir.FunctionType(ret, [type_.as_pointer()] + signature)
            funk_types[funk_name] = func_ty
            funk = Function(self.module, func_ty, funk_name)
            funktions[funk_name] = funk
        vtable_name = f"{name}_vtable"
        vtable_elements = [el.type for el in funktions.values()]
        vtable_type_name = f"{parent}_vtable_type"
        # NOTE(review): this and/or form also falls back to `vtable_typ`
        # when the parent's identified type evaluates as falsy -- confirm
        # that is intended before rewriting it as a conditional expression.
        parent_type = \
            parent and self.module.context.get_identified_type(vtable_type_name) or vtable_typ
        vtable_elements.insert(0, parent_type.as_pointer())
        vtable_elements.insert(1, ir.IntType(8).as_pointer())
        vtable_typ.set_body(*vtable_elements)
        # --
        class_string = CodeGenerator.insert_const_string(self.module, name)
        if klass.parent:
            parent_table_typ = self.module.context.get_identified_type(
                f"{parent}_vtable_type")
            vtable_constant = ir.Constant(
                parent_table_typ.as_pointer(),
                self.module.get_global(f'{parent}_vtable').get_reference())
        else:
            # No parent: the first vtable slot is built from `None`.
            vtable_constant = ir.Constant(vtable_typ.as_pointer(), None)
        fields = [vtable_constant, class_string.gep(INDICES)]
        fields += [
            ir.Constant(item.type, item.get_reference())
            for item in funktions.values()
        ]
        vtable = self.module.add_global_variable(vtable_typ, name=vtable_name)
        vtable.linkage = PRIVATE_LINKAGE
        vtable.unnamed_addr = False
        vtable.global_constant = True
        vtable.initializer = vtable_typ(fields)
        type_ = self.module.context.get_identified_type(name)
        elements = []
        elements.insert(0, vtable_typ.as_pointer())
        type_.set_body(*elements)

    def vector_get(self, vector, index):
        """
        Call the ``vector_get`` builtin and return its pointer result
        converted to the integer type with ``ptrtoint``.
        """
        val = self.call('vector_get', [vector, index])
        val = self.builder.ptrtoint(val, Integer.as_llvm())
        return val

    def cast(self, from_, to):
        """
        Cast an integer or float value to ``Bool`` by comparing it with zero.

        The value is first stored to a fresh slot and loaded back.  Any
        other source/target combination raises ``NotImplementedError``.
        """
        if from_.type == Integer.as_llvm() and to is Bool:
            result = self.alloc_and_store(from_, Integer.as_llvm())
            result = self.load(result)
            return self.builder.icmp_signed('!=', result, self.const(0))
        if from_.type == Float.as_llvm() and to is Bool:
            result = self.alloc_and_store(from_, Float.as_llvm())
            result = self.load(result)
            return self.builder.fcmp_ordered('!=', result, self.const(0.0))
        raise NotImplementedError('Unsupported cast')

    def get_klass_by_name(self, name):
        """
        Return the first class in ``self.classes`` named ``name``.

        Falls off the end (returning ``None``) when there is no match.
        """
        for klass in self.classes:
            if klass.name == name:
                return klass
def generate_kernel_wrapper(self, func, argtypes):
    """
    Build a ``void`` CUDA kernel ``cudaPy_<name>`` that calls ``func``.

    The wrapper is generated in a fresh ``cuda.kernel.wrapper`` module that
    re-declares ``func`` by name, then linked into ``func.module``.  The
    function fetched from that module after linking is returned.

    Seven int globals named ``<wrapper><postfix>`` are defined for error
    reporting: ``__errcode__`` plus ``__tid{x,y,z}__`` and
    ``__ctaid{x,y,z}__``.
    """
    module = func.module
    argtys = [self.get_argument_type(ty) for ty in argtypes]
    wrapfnty = Type.function(Type.void(), argtys)
    wrapper_module = self.create_module("cuda.kernel.wrapper")
    # Declaration of the wrapped function: int status return, with a
    # return-value slot prepended to the arguments.
    fnty = Type.function(Type.int(),
                         [self.get_return_type(types.pyobject)] + argtys)
    func = wrapper_module.add_function(fnty, name=func.name)
    wrapfn = wrapper_module.add_function(wrapfnty,
                                         name="cudaPy_" + func.name)
    builder = Builder.new(wrapfn.append_basic_block(''))

    # Define error handling variables
    def define_error_gv(postfix):
        # Zero-initialised int global named after the wrapper.
        gv = wrapper_module.add_global_variable(Type.int(),
                                                name=wrapfn.name + postfix)
        gv.initializer = Constant.null(gv.type.pointee)
        return gv

    gv_exc = define_error_gv("__errcode__")
    gv_tid = []
    gv_ctaid = []
    # tid and ctaid globals are defined interleaved, one pair per axis.
    for i in 'xyz':
        gv_tid.append(define_error_gv("__tid%s__" % i))
        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))

    callargs = []
    for at, av in zip(argtypes, wrapfn.args):
        av = self.get_argument_value(builder, at, av)
        callargs.append(av)

    status, _ = self.call_function(builder, func, types.void, argtypes,
                                   callargs)

    # Check error status
    with cgutils.if_likely(builder, status.ok):
        builder.ret_void()

    with cgutils.ifthen(builder, builder.not_(status.exc)):
        # User exception raised
        old = Constant.null(gv_exc.type.pointee)

        # Use atomic cmpxchg to prevent rewriting the error status
        # Only the first error is recorded
        # NOTE(review): `___numba_cas_hack` is only declared here; it is
        # presumably substituted with a real compare-and-swap elsewhere --
        # confirm.
        casfnty = lc.Type.function(old.type, [gv_exc.type, old.type,
                                              old.type])

        casfn = wrapper_module.add_function(casfnty,
                                            name="___numba_cas_hack")
        xchg = builder.call(casfn, [gv_exc, old, status.code])
        changed = builder.icmp(ICMP_EQ, xchg, old)

        # If the xchange is successful, save the thread ID.
        sreg = nvvmutils.SRegBuilder(builder)
        with cgutils.ifthen(builder, changed):
            for dim, ptr, in zip("xyz", gv_tid):
                val = sreg.tid(dim)
                builder.store(val, ptr)

            for dim, ptr, in zip("xyz", gv_ctaid):
                val = sreg.ctaid(dim)
                builder.store(val, ptr)

    builder.ret_void()
    # force inline
    # inline_function(status.code)
    nvvm.set_cuda_kernel(wrapfn)
    # The wrapper module is round-tripped through its textual form before
    # being linked into the original module.
    module.link_in(ll.parse_assembly(str(wrapper_module)))
    module.verify()

    wrapfn = module.get_function(wrapfn.name)
    return wrapfn
def setup_function(self, fndesc):
    """
    Declare the function described by ``fndesc`` in ``self.module`` and
    prepare its entry block, IR builder and call helper.
    """
    self.function = function = self.context.declare_function(
        self.module, fndesc)
    self.entry_block = entry = function.append_basic_block('entry')
    self.builder = builder = Builder(entry)
    self.call_helper = self.call_conv.init_call_helper(builder)
def build_ufunc_wrapper(library, context, func, signature, objmode, env):
    """
    Wrap the scalar function with a loop that iterates over the arguments

    A new IR module is created from ``library``.  It re-declares ``func`` by
    name (marked ``alwaysinline``) and defines ``__ufunc__.<name>`` taking
    ``(args, dims, steps, data)``.  After the module is added to
    ``library``, the original ``func`` has its linkage set to internal and
    the wrapper fetched from the library is returned.

    ``objmode`` selects a Python-object signature and the object-mode loop
    body (built between ``gil_ensure`` and ``gil_release``); ``env`` is
    forwarded to ``build_obj_loop_body`` only.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper_module = library.create_ir_module('')
    if objmode:
        func_type = context.call_conv.get_function_type(
            types.pyobject, [types.pyobject] * len(signature.args))
    else:
        func_type = context.call_conv.get_function_type(
            signature.return_type, signature.args)

    # Keep a handle on the original function; `func` is rebound to the
    # declaration inside the wrapper module.
    oldfunc = func
    func = wrapper_module.add_function(func_type,
                                       name=func.name)
    func.attributes.add("alwaysinline")

    wrapper = wrapper_module.add_function(fnty, "__ufunc__." + func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))

    # The loop count is loaded from the `dims` pointer.
    loopcount = builder.load(arg_dims, name="loopcount")

    actual_args = context.call_conv.get_arguments(func)

    # Prepare inputs
    arrays = []
    for i, typ in enumerate(signature.args):
        arrays.append(
            UArrayArg(context, builder, arg_args, arg_steps, i,
                      context.get_argument_type(typ)))

    # Prepare output
    # The output index is the number of actual arguments of `func`.
    valty = context.get_data_type(signature.return_type)
    out = UArrayArg(context, builder, arg_args, arg_steps, len(actual_args),
                    valty)

    # Setup indices
    # One zero-initialised offset slot per input, plus one for the output.
    offsets = []
    zero = context.get_constant(types.intp, 0)
    for _ in arrays:
        p = cgutils.alloca_once(builder, intp_t)
        offsets.append(p)
        builder.store(zero, p)

    store_offset = cgutils.alloca_once(builder, intp_t)
    builder.store(zero, store_offset)

    # True only when every input reports unit stride.
    unit_strided = cgutils.true_bit
    for ary in arrays:
        unit_strided = builder.and_(unit_strided, ary.is_unit_strided)

    if objmode:
        # General loop
        pyapi = context.get_python_api(builder)
        gil = pyapi.gil_ensure()
        with cgutils.for_range(builder, loopcount, intp=intp_t):
            slowloop = build_obj_loop_body(context, func, builder,
                                           arrays, out, offsets,
                                           store_offset, signature,
                                           pyapi, env)
        pyapi.gil_release(gil)
        builder.ret_void()

    else:

        with cgutils.ifelse(builder, unit_strided) as (is_unit_strided,
                                                       is_strided):

            with is_unit_strided:
                with cgutils.for_range(builder, loopcount, intp=intp_t) as ind:
                    fastloop = build_fast_loop_body(context, func, builder,
                                                    arrays, out, offsets,
                                                    store_offset, signature,
                                                    ind)
                # Each branch returns on its own after its loop.
                builder.ret_void()

            with is_strided:
                # General loop
                with cgutils.for_range(builder, loopcount, intp=intp_t):
                    slowloop = build_slow_loop_body(context, func, builder,
                                                    arrays, out, offsets,
                                                    store_offset, signature)

                builder.ret_void()

        # NOTE(review): a further return is emitted after both branches
        # have already returned; presumably it terminates the block that
        # follows the if/else -- confirm.
        builder.ret_void()
    del builder

    # Run optimizer
    library.add_ir_module(wrapper_module)
    wrapper = library.get_function(wrapper.name)
    oldfunc.linkage = LINKAGE_INTERNAL

    return wrapper
def generate_kernel_wrapper(self, func, argtypes):
    """
    Build a ``void`` CUDA kernel ``cudaPy_<name>`` that calls ``func``.

    The wrapper lives in a fresh ``cuda.kernel.wrapper`` module that
    re-declares ``func`` by name; that module is linked into
    ``func.module`` and the wrapper fetched from it is returned.

    Seven int globals named ``<wrapper><postfix>`` are defined for error
    reporting: ``__errcode__`` plus ``__tid{x,y,z}__`` and
    ``__ctaid{x,y,z}__``.
    """
    module = func.module

    packer = self.get_arg_packer(argtypes)
    llvm_argtys = list(packer.argument_types)
    kernel_fnty = Type.function(Type.void(), llvm_argtys)
    wrapper_module = self.create_module("cuda.kernel.wrapper")
    inner_fnty = Type.function(
        Type.int(),
        [self.call_conv.get_return_type(types.pyobject)] + llvm_argtys)
    func = wrapper_module.add_function(inner_fnty, name=func.name)
    wrapfn = wrapper_module.add_function(kernel_fnty,
                                         name="cudaPy_" + func.name)
    builder = Builder.new(wrapfn.append_basic_block(''))

    def new_error_global(postfix):
        # Zero-initialised int global named after the wrapper.
        gvar = wrapper_module.add_global_variable(
            Type.int(), name=wrapfn.name + postfix)
        gvar.initializer = Constant.null(gvar.type.pointee)
        return gvar

    gv_exc = new_error_global("__errcode__")
    gv_tid, gv_ctaid = [], []
    # Keep tid/ctaid definitions interleaved, one pair per axis.
    for axis in 'xyz':
        gv_tid.append(new_error_global("__tid%s__" % axis))
        gv_ctaid.append(new_error_global("__ctaid%s__" % axis))

    callargs = packer.from_arguments(builder, wrapfn.args)
    status, _ = self.call_conv.call_function(
        builder, func, types.void, argtypes, callargs)

    # Fast path: the call succeeded.
    with cgutils.if_likely(builder, status.is_ok):
        builder.ret_void()

    with builder.if_then(builder.not_(status.is_python_exc)):
        # User exception raised.  Only the first error is recorded: the
        # error code is written through a compare-and-swap against zero.
        zero = Constant.null(gv_exc.type.pointee)
        cas_fnty = lc.Type.function(
            zero.type, [gv_exc.type, zero.type, zero.type])
        cas_fn = wrapper_module.add_function(cas_fnty,
                                             name="___numba_cas_hack")
        previous = builder.call(cas_fn, [gv_exc, zero, status.code])
        swapped = builder.icmp(ICMP_EQ, previous, zero)

        # When the swap went through, also record thread and block ids.
        sreg = nvvmutils.SRegBuilder(builder)
        with builder.if_then(swapped):
            for read_sreg, slots in ((sreg.tid, gv_tid),
                                     (sreg.ctaid, gv_ctaid)):
                for dim, slot in zip("xyz", slots):
                    builder.store(read_sreg(dim), slot)

    builder.ret_void()

    nvvm.set_cuda_kernel(wrapfn)
    module.link_in(ll.parse_assembly(str(wrapper_module)))
    module.verify()

    return module.get_function(wrapfn.name)
def generate_kernel_wrapper(self, library, fname, argtypes, debug):
    """
    Generate the kernel wrapper in the given ``library``.
    The function being wrapped have the name ``fname`` and argument types
    ``argtypes``.  The wrapper function is returned.

    The wrapper's name is ``fname`` with the ``cudapy`` namespace prepended
    by ``itanium_mangler.prepend_namespace``.  Seven int globals named
    ``<wrapper><postfix>`` are always defined (``__errcode__``,
    ``__tid{x,y,z}__``, ``__ctaid{x,y,z}__``), but the code that inspects
    the call status and writes to them is only emitted when ``debug`` is
    true.  ``library`` is finalized before the wrapper is fetched from it.
    """
    arginfo = self.get_arg_packer(argtypes)
    argtys = list(arginfo.argument_types)
    wrapfnty = Type.function(Type.void(), argtys)
    wrapper_module = self.create_module("cuda.kernel.wrapper")
    # Declaration of the wrapped function: int status return, with a
    # return-value slot prepended to the arguments.
    fnty = Type.function(Type.int(),
                         [self.call_conv.get_return_type(types.pyobject)] + argtys)
    func = wrapper_module.add_function(fnty, name=fname)

    prefixed = itanium_mangler.prepend_namespace(func.name, ns='cudapy')
    wrapfn = wrapper_module.add_function(wrapfnty, name=prefixed)
    builder = Builder(wrapfn.append_basic_block(''))

    # Define error handling variables
    def define_error_gv(postfix):
        # Zero-initialised int global named after the wrapper.
        gv = wrapper_module.add_global_variable(Type.int(),
                                                name=wrapfn.name + postfix)
        gv.initializer = Constant.null(gv.type.pointee)
        return gv

    gv_exc = define_error_gv("__errcode__")
    gv_tid = []
    gv_ctaid = []
    # tid and ctaid globals are defined interleaved, one pair per axis.
    for i in 'xyz':
        gv_tid.append(define_error_gv("__tid%s__" % i))
        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))

    callargs = arginfo.from_arguments(builder, wrapfn.args)
    status, _ = self.call_conv.call_function(builder, func, types.void,
                                             argtypes, callargs)

    if debug:
        # Check error status
        with cgutils.if_likely(builder, status.is_ok):
            builder.ret_void()

        with builder.if_then(builder.not_(status.is_python_exc)):
            # User exception raised
            old = Constant.null(gv_exc.type.pointee)

            # Use atomic cmpxchg to prevent rewriting the error status
            # Only the first error is recorded
            # NOTE(review): `___numba_cas_hack` is only declared here; it
            # is presumably substituted with a real compare-and-swap
            # elsewhere -- confirm.
            casfnty = lc.Type.function(old.type, [gv_exc.type, old.type,
                                                  old.type])

            casfn = wrapper_module.add_function(casfnty,
                                                name="___numba_cas_hack")
            xchg = builder.call(casfn, [gv_exc, old, status.code])
            changed = builder.icmp(ICMP_EQ, xchg, old)

            # If the xchange is successful, save the thread ID.
            sreg = nvvmutils.SRegBuilder(builder)
            with builder.if_then(changed):
                for dim, ptr, in zip("xyz", gv_tid):
                    val = sreg.tid(dim)
                    builder.store(val, ptr)

                for dim, ptr, in zip("xyz", gv_ctaid):
                    val = sreg.ctaid(dim)
                    builder.store(val, ptr)

    # Without `debug` this is the only return in the wrapper.
    builder.ret_void()

    nvvm.set_cuda_kernel(wrapfn)
    library.add_ir_module(wrapper_module)
    library.finalize()
    wrapfn = library.get_function(wrapfn.name)
    return wrapfn
def setup_function(self, fndesc):
    """
    Declare the function described by ``fndesc`` in ``self.module`` and
    prepare its entry block, IR builder and call helper.
    """
    self.function = function = self.context.declare_function(
        self.module, fndesc)
    self.entry_block = entry = function.append_basic_block('entry')
    self.builder = builder = Builder.new(entry)
    self.call_helper = self.call_conv.init_call_helper(builder)
def generate_kernel_wrapper(self, library, fname, argtypes):
    """
    Generate the kernel wrapper in the given ``library``.

    The wrapped function is declared by name (``fname``) with argument
    types ``argtypes``; the wrapper is named ``cudaPy_<fname>``.  Seven int
    globals named ``<wrapper><postfix>`` are defined for error reporting:
    ``__errcode__`` plus ``__tid{x,y,z}__`` and ``__ctaid{x,y,z}__``.
    ``library`` is finalized and the wrapper fetched from it is returned.
    """
    packer = self.get_arg_packer(argtypes)
    llvm_argtys = list(packer.argument_types)
    kernel_fnty = Type.function(Type.void(), llvm_argtys)
    wrapper_module = self.create_module("cuda.kernel.wrapper")
    inner_fnty = Type.function(
        Type.int(),
        [self.call_conv.get_return_type(types.pyobject)] + llvm_argtys)
    func = wrapper_module.add_function(inner_fnty, name=fname)
    wrapfn = wrapper_module.add_function(kernel_fnty,
                                         name="cudaPy_" + func.name)
    builder = Builder(wrapfn.append_basic_block(''))

    def new_error_global(postfix):
        # Zero-initialised int global named after the wrapper.
        gvar = wrapper_module.add_global_variable(
            Type.int(), name=wrapfn.name + postfix)
        gvar.initializer = Constant.null(gvar.type.pointee)
        return gvar

    gv_exc = new_error_global("__errcode__")
    gv_tid, gv_ctaid = [], []
    # Keep tid/ctaid definitions interleaved, one pair per axis.
    for axis in 'xyz':
        gv_tid.append(new_error_global("__tid%s__" % axis))
        gv_ctaid.append(new_error_global("__ctaid%s__" % axis))

    callargs = packer.from_arguments(builder, wrapfn.args)
    status, _ = self.call_conv.call_function(
        builder, func, types.void, argtypes, callargs)

    # Fast path: the call succeeded.
    with cgutils.if_likely(builder, status.is_ok):
        builder.ret_void()

    with builder.if_then(builder.not_(status.is_python_exc)):
        # User exception raised.  Only the first error is recorded: the
        # error code is written through a compare-and-swap against zero.
        zero = Constant.null(gv_exc.type.pointee)
        cas_fnty = lc.Type.function(
            zero.type, [gv_exc.type, zero.type, zero.type])
        cas_fn = wrapper_module.add_function(cas_fnty,
                                             name="___numba_cas_hack")
        previous = builder.call(cas_fn, [gv_exc, zero, status.code])
        swapped = builder.icmp(ICMP_EQ, previous, zero)

        # When the swap went through, also record thread and block ids.
        sreg = nvvmutils.SRegBuilder(builder)
        with builder.if_then(swapped):
            for read_sreg, slots in ((sreg.tid, gv_tid),
                                     (sreg.ctaid, gv_ctaid)):
                for dim, slot in zip("xyz", slots):
                    builder.store(read_sreg(dim), slot)

    builder.ret_void()

    nvvm.set_cuda_kernel(wrapfn)
    library.add_ir_module(wrapper_module)
    library.finalize()
    return library.get_function(wrapfn.name)
def build(self):
    """
    Build the gufunc loop wrapper around the core function.

    A wrapper named ``"__gufunc__." + self.func.name`` taking
    ``(args, dims, steps, data)`` is emitted into a fresh IR module.  It
    loads the loop count from ``dims[0]``, loads one extent per distinct
    input symbol from the following ``dims`` entries, and runs the loop
    body once per iteration, branching to the return block when the loop
    body reports an error.

    Returns
    -------
    tuple
        ``(wrapper, self.env)`` where ``wrapper`` is the function looked
        up from ``self.library`` after the wrapper module was added.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper_module = self.library.create_ir_module('')
    func_type = self.call_conv.get_function_type(self.fndesc.restype,
                                                 self.fndesc.argtypes)
    # Declare the core function inside the wrapper module under the same
    # name as ``self.func``.
    func = wrapper_module.add_function(func_type, name=self.func.name)
    func.attributes.add("alwaysinline")
    wrapper = wrapper_module.add_function(fnty,
                                          "__gufunc__." + self.func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))
    loopcount = builder.load(arg_dims, name="loopcount")
    pyapi = self.context.get_python_api(builder)

    # Unpack shapes: number each distinct input symbol in order of first
    # appearance.
    # NOTE(review): symbols that occur only in ``self.sout`` get no entry
    # in ``sym_dim`` -- confirm GUArrayArg never looks one up.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            if s not in sym_map:
                sym_map[s] = len(sym_map)

    # dims[0] is the loop count, so symbol number i is read from
    # dims[i + 1].
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(
            builder.gep(arg_dims,
                        [self.context.get_constant(types.intp, i + 1)]))

    # Prepare inputs
    arrays = []
    # The running step offset starts after one entry per argument and
    # advances by the number of symbols of each argument.
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(zip(self.signature.args,
                                       self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args, arg_steps, i,
                         step_offset, typ, sym, sym_dim)
        step_offset += len(sym)
        arrays.append(ary)

    bbreturn = builder.append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder, pyapi)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as loop:
        args = [a.get_array_at_offset(loop.index) for a in arrays]
        # Only the error flag of the loop body result is needed here.
        _, error = self.gen_loop_body(builder, pyapi, func, args)
        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder, pyapi)

    builder.ret_void()

    self.library.add_ir_module(wrapper_module)
    wrapper = self.library.get_function(wrapper.name)

    # Set core function to internal so that it is not generated
    self.func.linkage = LINKAGE_INTERNAL

    return wrapper, self.env
def build_ufunc_wrapper(library, context, func, signature, objmode, envptr,
                        env):
    """
    Wrap the scalar function with a loop that iterates over the arguments.

    A wrapper named ``"__ufunc__." + func.name`` taking
    ``(args, dims, steps, data)`` is emitted into a fresh IR module of
    ``library``.  The loop count is loaded from ``dims[0]``.

    Parameters
    ----------
    library
        Code library that provides the IR module and receives the wrapper.
    context
        Target context used for value types, constants, the calling
        convention and the Python API.
    func
        The scalar function; only its ``name`` is read, to redeclare it in
        the wrapper module.
    signature
        Supplies ``args`` and ``return_type`` of the scalar function.
    objmode
        When true, the scalar function is declared with an all-pyobject
        signature and the loop runs between ``gil_ensure`` and
        ``gil_release``; otherwise a unit-strided loop or a general
        strided loop is selected at run time.
    envptr, env
        Forwarded unchanged to ``build_obj_loop_body`` (object mode only).

    Returns
    -------
    The wrapper function looked up from ``library``.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper_module = library.create_ir_module('')
    if objmode:
        func_type = context.call_conv.get_function_type(
            types.pyobject, [types.pyobject] * len(signature.args))
    else:
        func_type = context.call_conv.get_function_type(
            signature.return_type, signature.args)

    # From here on ``func`` is the declaration inside the wrapper module.
    func = wrapper_module.add_function(func_type, name=func.name)
    func.attributes.add("alwaysinline")

    wrapper = wrapper_module.add_function(fnty, "__ufunc__." + func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))

    loopcount = builder.load(arg_dims, name="loopcount")

    # Prepare inputs
    arrays = [UArrayArg(context, builder, arg_args, arg_steps, i, typ)
              for i, typ in enumerate(signature.args)]

    # Prepare output: it occupies the slot right after the inputs.
    out = UArrayArg(context, builder, arg_args, arg_steps, len(arrays),
                    signature.return_type)

    # Setup indices: one zero-initialised offset slot per input, plus one
    # for the output.
    offsets = []
    zero = context.get_constant(types.intp, 0)
    for _ in arrays:
        p = cgutils.alloca_once(builder, intp_t)
        offsets.append(p)
        builder.store(zero, p)

    store_offset = cgutils.alloca_once(builder, intp_t)
    builder.store(zero, store_offset)

    # True only when every input reports itself as unit strided.
    unit_strided = cgutils.true_bit
    for ary in arrays:
        unit_strided = builder.and_(unit_strided, ary.is_unit_strided)

    pyapi = context.get_python_api(builder)
    if objmode:
        # General loop, executed between gil_ensure and gil_release.
        gil = pyapi.gil_ensure()
        with cgutils.for_range(builder, loopcount, intp=intp_t):
            build_obj_loop_body(context, func, builder, arrays, out,
                                offsets, store_offset, signature, pyapi,
                                envptr, env)
        pyapi.gil_release(gil)
        builder.ret_void()

    else:
        with builder.if_else(unit_strided) as (is_unit_strided, is_strided):
            with is_unit_strided:
                with cgutils.for_range(builder, loopcount,
                                       intp=intp_t) as loop:
                    build_fast_loop_body(context, func, builder, arrays,
                                         out, offsets, store_offset,
                                         signature, loop.index, pyapi)

            with is_strided:
                # General loop
                with cgutils.for_range(builder, loopcount, intp=intp_t):
                    build_slow_loop_body(context, func, builder, arrays,
                                         out, offsets, store_offset,
                                         signature, pyapi)

        builder.ret_void()
    del builder

    # Hand the finished module to the library and fetch the wrapper from it.
    library.add_ir_module(wrapper_module)
    wrapper = library.get_function(wrapper.name)

    return wrapper
def _build_wrapper(self, library, name):
    """
    The LLVM IRBuilder code to create the gufunc wrapper.
    The *library* arg is the CodeLibrary to which the wrapper should
    be added.
    The *name* arg is the name of the wrapper function being created.

    Nothing is returned: the wrapper module is added to *library* and
    ``self.library`` is registered on it as a linking library.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper_module = library.create_ir_module('')
    func_type = self.call_conv.get_function_type(self.fndesc.restype,
                                                 self.fndesc.argtypes)
    # Declare the core function in the wrapper module under its LLVM name.
    fname = self.fndesc.llvm_func_name
    func = wrapper_module.add_function(func_type, name=fname)

    func.attributes.add("alwaysinline")
    wrapper = wrapper_module.add_function(fnty, name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder(wrapper.append_basic_block("entry"))
    loopcount = builder.load(arg_dims, name="loopcount")
    pyapi = self.context.get_python_api(builder)

    # Unpack shapes: give each distinct input symbol an index in order of
    # first appearance.
    # NOTE(review): output-only symbols never reach ``sym_dim`` -- confirm
    # that GUArrayArg does not need them.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            if s not in sym_map:
                sym_map[s] = len(sym_map)

    # The first dims entry is the loop count; symbol extents follow it.
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(
            builder.gep(arg_dims,
                        [self.context.get_constant(types.intp, i + 1)]))

    # Prepare inputs
    arrays = []
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(zip(self.signature.args,
                                       self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args, arg_steps, i,
                         step_offset, typ, sym, sym_dim)
        # Each argument consumes one step-offset entry per symbol.
        step_offset += len(sym)
        arrays.append(ary)

    bbreturn = builder.append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder, pyapi)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as loop:
        args = [a.get_array_at_offset(loop.index) for a in arrays]
        # The first element of the loop-body result is not used here.
        _, error = self.gen_loop_body(builder, pyapi, func, args)
        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder, pyapi)

    builder.ret_void()

    # Link
    library.add_ir_module(wrapper_module)
    library.add_linking_library(self.library)
def _build_wrapper(self, library, name):
    """
    The LLVM IRBuilder code to create the gufunc wrapper.
    The *library* arg is the CodeLibrary to which the wrapper should
    be added.
    The *name* arg is the name of the wrapper function being created.

    The wrapper takes ``(args, dims, steps, data)``; it loads the loop
    count from ``dims[0]`` and one extent per distinct input symbol from
    the entries that follow.  The method returns nothing: the wrapper
    module is added to *library*, which is then linked against
    ``self.library``.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    fnty = Type.function(
        Type.void(),
        [byte_ptr_ptr_t, intp_ptr_t, intp_ptr_t, byte_ptr_t])

    wrapper_module = library.create_ir_module('')
    func_type = self.call_conv.get_function_type(self.fndesc.restype,
                                                 self.fndesc.argtypes)
    fname = self.fndesc.llvm_func_name
    func = wrapper_module.add_function(func_type, name=fname)

    func.attributes.add("alwaysinline")
    wrapper = wrapper_module.add_function(fnty, name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder(wrapper.append_basic_block("entry"))
    loopcount = builder.load(arg_dims, name="loopcount")
    pyapi = self.context.get_python_api(builder)

    # Unpack shapes: the first time an input symbol is seen it receives
    # the next free slot number; later sightings leave the slot unchanged.
    # NOTE(review): symbols found only in ``self.sout`` are not numbered --
    # verify that GUArrayArg never requests their extent.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            sym_map.setdefault(s, len(sym_map))

    # Slot i is stored at dims[i + 1] because dims[0] is the loop count.
    sym_dim = {}
    for s, i in sym_map.items():
        dim_index = self.context.get_constant(types.intp, i + 1)
        dim_ptr = builder.gep(arg_dims, [dim_index])
        sym_dim[s] = builder.load(dim_ptr)

    # Prepare inputs
    arrays = []
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(
            zip(self.signature.args, self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args, arg_steps, i,
                         step_offset, typ, sym, sym_dim)
        step_offset += len(sym)
        arrays.append(ary)

    bbreturn = builder.append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder, pyapi)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as loop:
        args = [a.get_array_at_offset(loop.index) for a in arrays]
        # Only the error flag is consumed; the other result is discarded.
        _, error = self.gen_loop_body(builder, pyapi, func, args)
        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder, pyapi)

    builder.ret_void()

    # Link
    library.add_ir_module(wrapper_module)
    library.add_linking_library(self.library)
def build(self):
    """
    Build the gufunc loop wrapper around the core function.

    A wrapper named ``"__gufunc__." + self.func.name`` taking
    ``(args, dims, steps, data)`` is emitted into a fresh IR module.  The
    loop count is loaded from ``dims[0]`` and one extent per distinct
    input symbol from the entries after it.  On every iteration the loop
    body is generated from the arrays' current values, after which each
    array is advanced with ``next``.

    Returns
    -------
    tuple
        ``(wrapper, self.env)`` where ``wrapper`` is the function looked
        up from ``self.library`` after the wrapper module was added.
    """
    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    fnty = Type.function(
        Type.void(),
        [byte_ptr_ptr_t, intp_ptr_t, intp_ptr_t, byte_ptr_t])

    wrapper_module = self.library.create_ir_module('')
    func_type = self.call_conv.get_function_type(self.fndesc.restype,
                                                 self.fndesc.argtypes)
    # Declare the core function inside the wrapper module under the same
    # name as ``self.func``.
    func = wrapper_module.add_function(func_type, name=self.func.name)
    func.attributes.add("alwaysinline")
    wrapper = wrapper_module.add_function(fnty,
                                          "__gufunc__." + self.func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))
    loopcount = builder.load(arg_dims, name="loopcount")

    # Unpack shapes: number each distinct input symbol in order of first
    # appearance.
    # NOTE(review): symbols that occur only in ``self.sout`` get no entry
    # in ``sym_dim`` -- confirm GUArrayArg never looks one up.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            if s not in sym_map:
                sym_map[s] = len(sym_map)

    # dims[0] is the loop count, so symbol number i is read from
    # dims[i + 1].
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(
            builder.gep(arg_dims,
                        [self.context.get_constant(types.intp, i + 1)]))

    # Prepare inputs
    arrays = []
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(
            zip(self.signature.args, self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args, arg_dims,
                         arg_steps, i, step_offset, typ, sym, sym_dim)
        # Arguments flagged ``as_scalar`` do not advance the step offset.
        if not ary.as_scalar:
            step_offset += ary.ndim
        arrays.append(ary)

    bbreturn = cgutils.get_function(builder).append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as ind:
        args = [a.array_value for a in arrays]
        # Only the error flag of the loop body result is needed here.
        _, error = self.gen_loop_body(builder, func, args)

        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)

        # Advance every array for the next iteration.
        for a in arrays:
            a.next(ind)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder)

    builder.ret_void()

    self.library.add_ir_module(wrapper_module)
    wrapper = self.library.get_function(wrapper.name)

    # Set core function to internal so that it is not generated
    self.func.linkage = LINKAGE_INTERNAL

    return wrapper, self.env