def close(self):
    """Finish code generation.

    Terminates the declaration block with a branch into the first body
    block, then inlines every call instruction that was queued for
    auto-inlining.
    """
    # The declaration block must fall through into the body once all
    # declarations have been emitted there.
    with _change_block_temporarily(self.builder, self.declare_block):
        self.builder.branch(self.first_body_block)
    # Perform the queued auto-inlining.
    for call in self._auto_inline_list:
        lc.inline_function(call)
def generate_kernel_wrapper(self, func, argtypes):
    """Generate a void CUDA kernel wrapper ("cudaPy_<name>") around *func*.

    Each incoming wrapper argument is adapted with get_argument_value()
    before the wrapped function is called; the inner call is then
    force-inlined and the module verified.  Returns the wrapper function.
    """
    module = func.module
    argtys = self.get_arguments(func.type.pointee)
    # Kernels return void.
    fnty = Type.function(Type.void(), argtys)
    wrapfn = module.add_function(fnty, name="cudaPy_" + func.name)
    builder = Builder.new(wrapfn.append_basic_block(''))
    # Adapt raw wrapper arguments to the wrapped function's convention.
    callargs = []
    for at, av in zip(argtypes, wrapfn.args):
        av = self.get_argument_value(builder, at, av)
        callargs.append(av)
    status, _ = self.call_function(builder, func, types.void, argtypes,
                                   callargs)
    # TODO handle status
    builder.ret_void()
    del builder
    # force inline
    # NOTE(review): status.code appears to be the call instruction being
    # inlined — confirm against call_function's contract.
    inline_function(status.code)
    module.verify()
    return wrapfn
def test_inline_call(self):
    """Inlining a call site removes the call and folds the constant."""
    mod = Module.new(__name__)
    # Callee: bar(x) returns x + x.
    target = mod.add_function(Type.function(Type.int(), [Type.int()]),
                              name="bar")
    b = Builder.new(target.append_basic_block("entry"))
    b.ret(b.add(target.args[0], target.args[0]))
    # Caller: foo() returns bar(1234).
    source = mod.add_function(Type.function(Type.int(), []), name="foo")
    b = Builder.new(source.append_basic_block("entry"))
    call = b.call(target, [Constant.int(Type.int(), 1234)])
    b.ret(call)
    before = str(source)
    self.assertIn("call", before)
    self.assertTrue(inline_function(call))
    after = str(source)
    # The call must be gone, replaced by the folded result 1234 + 1234.
    self.assertNotIn("call", after)
    self.assertIn("2468", after)
def test_inline_call(self):
    """Check that inline_function() expands a call site in place."""
    # NOTE(review): this duplicates an identically-named test earlier in
    # this file (double-quoted variant) — confirm both copies are wanted.
    mod = Module.new(__name__)
    # Callee 'bar': returns its argument doubled (x + x).
    callee = mod.add_function(Type.function(Type.int(), [Type.int()]),
                              name='bar')
    builder = Builder.new(callee.append_basic_block('entry'))
    builder.ret(builder.add(callee.args[0], callee.args[0]))
    # Caller 'foo': calls bar(1234).
    caller = mod.add_function(Type.function(Type.int(), []), name='foo')
    builder = Builder.new(caller.append_basic_block('entry'))
    callinst = builder.call(callee, [Constant.int(Type.int(), 1234)])
    builder.ret(callinst)
    pre_inlining = str(caller)
    self.assertIn('call', pre_inlining)
    # Inlining must report success...
    self.assertTrue(inline_function(callinst))
    post_inlining = str(caller)
    # ...and leave no call instruction, only the folded constant 2468.
    self.assertNotIn('call', post_inlining)
    self.assertIn('2468', post_inlining)
def generate_kernel_wrapper(self, func, argtypes):
    """Generate a CUDA kernel wrapper ("cudaPy_<name>") around *func*.

    Besides adapting arguments and calling the wrapped function, the
    wrapper records error state in zero-initialized module globals: the
    first error code (claimed via atomic cmpxchg so only one thread
    wins) and the tid/ctaid coordinates of the winning thread.
    """
    module = func.module
    argtys = self.get_arguments(func.type.pointee)
    fnty = Type.function(Type.void(), argtys)
    wrapfn = module.add_function(fnty, name="cudaPy_" + func.name)
    builder = Builder.new(wrapfn.append_basic_block(''))

    # Define error handling variables
    def define_error_gv(postfix):
        # One zero-initialized i32 global per wrapper, named e.g.
        # "cudaPy_<name>__errcode__".
        gv = module.add_global_variable(Type.int(),
                                        name=wrapfn.name + postfix)
        gv.initializer = Constant.null(gv.type.pointee)
        return gv

    gv_exc = define_error_gv("__errcode__")
    gv_tid = []
    gv_ctaid = []
    for i in 'xyz':
        gv_tid.append(define_error_gv("__tid%s__" % i))
        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))

    # Adapt raw wrapper arguments to the wrapped function's convention.
    callargs = []
    for at, av in zip(argtypes, wrapfn.args):
        av = self.get_argument_value(builder, at, av)
        callargs.append(av)

    status, _ = self.call_function(builder, func, types.void, argtypes,
                                   callargs)

    # Check error status
    with cgutils.if_likely(builder, status.ok):
        builder.ret_void()

    with cgutils.ifthen(builder, builder.not_(status.exc)):
        # User exception raised
        old = Constant.null(gv_exc.type.pointee)
        # Use atomic cmpxchg to prevent rewriting the error status
        # Only the first error is recorded
        xchg = builder.atomic_cmpxchg(gv_exc, old, status.code,
                                      "monotonic")
        changed = builder.icmp(ICMP_EQ, xchg, old)
        # If the xchange is successful, save the thread ID.
        sreg = nvvmutils.SRegBuilder(builder)
        with cgutils.ifthen(builder, changed):
            for dim, ptr, in zip("xyz", gv_tid):
                val = sreg.tid(dim)
                builder.store(val, ptr)
            for dim, ptr, in zip("xyz", gv_ctaid):
                val = sreg.ctaid(dim)
                builder.store(val, ptr)

    builder.ret_void()

    # force inline
    inline_function(status.code)
    module.verify()
    return wrapfn
def build_ufunc_wrapper(context, func, signature):
    """
    Wrap the scalar function with a loop that iterates over the arguments.

    Emits "__ufunc__.<name>" with the argument list
    (args, dims, steps, data); a fast loop is used when every input is
    unit-strided, otherwise a general strided loop.  The core function is
    made internal and the loop bodies are force-inlined before the module
    is optimized.  Returns the wrapper function.
    """
    module = func.module

    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    # void(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper = module.add_function(fnty, "__ufunc__." + func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))

    # Element count: loaded from the first entry of *dims*.
    loopcount = builder.load(arg_dims, name="loopcount")

    actual_args = context.get_arguments(func)

    # Prepare inputs
    arrays = []
    for i, typ in enumerate(signature.args):
        arrays.append(UArrayArg(context, builder, arg_args, arg_steps, i,
                                context.get_argument_type(typ)))

    # Prepare output
    valty = context.get_data_type(signature.return_type)
    out = UArrayArg(context, builder, arg_args, arg_steps,
                    len(actual_args), valty)

    # Setup indices: one zeroed offset slot per input plus one for the
    # output store position.
    offsets = []
    zero = context.get_constant(types.intp, 0)
    for _ in arrays:
        p = cgutils.alloca_once(builder, intp_t)
        offsets.append(p)
        builder.store(zero, p)

    store_offset = cgutils.alloca_once(builder, intp_t)
    builder.store(zero, store_offset)

    # AND together every input's unit-stride flag to pick the fast path.
    unit_strided = cgutils.true_bit
    for ary in arrays:
        unit_strided = builder.and_(unit_strided, ary.is_unit_strided)

    with cgutils.ifelse(builder, unit_strided) as (is_unit_strided,
                                                   is_strided):

        with is_unit_strided:
            with cgutils.for_range(builder, loopcount, intp=intp_t) as ind:
                # fastloop escapes the with-block for inlining below.
                fastloop = build_fast_loop_body(context, func, builder,
                                                arrays, out, offsets,
                                                store_offset, signature,
                                                ind)
            builder.ret_void()

        with is_strided:
            # General loop
            with cgutils.for_range(builder, loopcount, intp=intp_t):
                slowloop = build_slow_loop_body(context, func, builder,
                                                arrays, out, offsets,
                                                store_offset, signature)
            builder.ret_void()

    builder.ret_void()
    del builder

    # Set core function to internal so that it is not generated
    func.linkage = LINKAGE_INTERNAL
    # Force inline of code function
    inline_function(slowloop)
    inline_function(fastloop)
    # Run optimizer
    context.optimize(module)

    if config.DUMP_OPTIMIZED:
        print(module)

    return wrapper
def build(self):
    """Build the gufunc wrapper "__gufunc__.<name>".

    Emits an LLVM function with arguments (args, dims, steps, data)
    that loops over the outer dimension, invoking gen_loop_body() once
    per element and branching to a shared return block on error.
    Returns ``(wrapper, self.env)``.
    """
    module = self.func.module

    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    # void(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper = module.add_function(fnty, "__gufunc__." + self.func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))
    # Outer loop count: first entry of *dims*.
    loopcount = builder.load(arg_dims, name="loopcount")

    # Unpack shapes
    # NOTE(review): unique_syms is computed but never read below —
    # confirm whether it is still needed.
    unique_syms = set()
    for grp in (self.sin, self.sout):
        for syms in grp:
            unique_syms |= set(syms)

    # Assign each input dimension symbol a stable index.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            if s not in sym_map:
                sym_map[s] = len(sym_map)

    # Load the runtime extent of each symbol; symbol i lives at
    # dims[i + 1] because dims[0] is the loop count.
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(builder.gep(arg_dims,
                                              [self.context.get_constant(
                                                  types.intp, i + 1)]))

    # Prepare inputs
    arrays = []
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(zip(self.signature.args,
                                       self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args, arg_dims,
                         arg_steps, i, step_offset, typ, sym, sym_dim)
        # Non-scalar arguments consume extra inner-step slots.
        if not ary.as_scalar:
            step_offset += ary.ndim
        arrays.append(ary)

    bbreturn = cgutils.get_function(builder).append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as ind:
        args = [a.array_value for a in arrays]
        innercall, error = self.gen_loop_body(builder, args)
        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)
        # Advance every array argument to the next outer element.
        for a in arrays:
            a.next(ind)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder)

    builder.ret_void()

    module.verify()
    # Set core function to internal so that it is not generated
    self.func.linkage = LINKAGE_INTERNAL
    # Force inline of code function
    inline_function(innercall)
    # Run optimizer
    self.context.optimize(module)

    if config.DUMP_OPTIMIZED:
        print(module)

    wrapper.verify()
    return wrapper, self.env
def build_gufunc_wrapper(context, func, signature, sin, sout):
    """Build a gufunc wrapper "__gufunc__.<name>" around *func*.

    The wrapper has the argument list (args, dims, steps, data) and
    loops over the outer dimension, calling the core function once per
    element and advancing every array argument.  The core function is
    made internal and its call force-inlined before optimization.

    Parameters
    ----------
    context : target context used for typing, constants and optimization
    func : LLVM function implementing the core kernel
    signature : numba signature of the core function
    sin, sout : sequences of symbolic dimension tuples for the inputs
        and outputs (the gufunc layout signature)

    Returns the wrapper function.
    """
    module = func.module

    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    # void(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper = module.add_function(fnty, "__gufunc__." + func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))
    # Outer loop count: first entry of *dims*.
    loopcount = builder.load(arg_dims, name="loopcount")

    # Unpack shapes
    unique_syms = set()
    for grp in (sin, sout):
        for syms in grp:
            unique_syms |= set(syms)

    # Assign each input dimension symbol a stable index.
    # BUGFIX: the original wrapped this in ``for grp in (sin, sout)``
    # but iterated ``sin`` in the inner loop, never using ``grp`` and
    # scanning the inputs twice; build the map from the inputs once,
    # matching the sibling ``build`` method in this file.
    sym_map = {}
    for syms in sin:
        for s in syms:
            if s not in sym_map:
                sym_map[s] = len(sym_map)

    # Load the runtime extent of each symbol; symbol i lives at
    # dims[i + 1] because dims[0] is the loop count.
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(builder.gep(arg_dims,
                                              [context.get_constant(
                                                  types.intp, i + 1)]))

    # Prepare inputs
    arrays = []
    step_offset = len(sin) + len(sout)
    for i, (typ, sym) in enumerate(zip(signature.args, sin + sout)):
        ary = GUArrayArg(context, builder, arg_args, arg_dims, arg_steps,
                         i, step_offset, typ, sym, sym_dim)
        # Non-scalar arguments consume extra inner-step slots.
        if not ary.as_scalar:
            step_offset += ary.ndim
        arrays.append(ary)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as ind:
        args = [a.array_value for a in arrays]
        status, retval = context.call_function(builder, func,
                                               signature.return_type,
                                               signature.args, args)
        # ignore status
        # ignore retval
        for a in arrays:
            a.next(ind)

    builder.ret_void()

    # Set core function to internal so that it is not generated
    func.linkage = LINKAGE_INTERNAL
    # Force inline of code function
    inline_function(status.code)
    # Run optimizer
    context.optimize(module)

    if config.DUMP_OPTIMIZED:
        print(module)

    wrapper.verify()
    return wrapper
def build_ufunc_wrapper(context, func, signature):
    """
    Wrap the scalar function with a loop that iterates over the arguments
    """
    # NOTE(review): this definition duplicates an identically-named
    # build_ufunc_wrapper earlier in this file — confirm which copy is
    # intended to be live.
    module = func.module

    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    # void(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t,
                                       intp_ptr_t, byte_ptr_t])

    wrapper = module.add_function(fnty, "__ufunc__." + func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))

    # Element count: loaded from the first entry of *dims*.
    loopcount = builder.load(arg_dims, name="loopcount")

    actual_args = context.get_arguments(func)

    # Prepare inputs
    arrays = []
    for i, typ in enumerate(signature.args):
        arrays.append(
            UArrayArg(context, builder, arg_args, arg_steps, i,
                      context.get_argument_type(typ)))

    # Prepare output
    valty = context.get_data_type(signature.return_type)
    out = UArrayArg(context, builder, arg_args, arg_steps,
                    len(actual_args), valty)

    # Setup indices: one zeroed offset slot per input plus one for the
    # output store position.
    offsets = []
    zero = context.get_constant(types.intp, 0)
    for _ in arrays:
        p = cgutils.alloca_once(builder, intp_t)
        offsets.append(p)
        builder.store(zero, p)

    store_offset = cgutils.alloca_once(builder, intp_t)
    builder.store(zero, store_offset)

    # AND together every input's unit-stride flag to pick the fast path.
    unit_strided = cgutils.true_bit
    for ary in arrays:
        unit_strided = builder.and_(unit_strided, ary.is_unit_strided)

    with cgutils.ifelse(builder, unit_strided) as (is_unit_strided,
                                                   is_strided):

        with is_unit_strided:
            with cgutils.for_range(builder, loopcount, intp=intp_t) as ind:
                # fastloop escapes the with-block for inlining below.
                fastloop = build_fast_loop_body(context, func, builder,
                                                arrays, out, offsets,
                                                store_offset, signature,
                                                ind)
            builder.ret_void()

        with is_strided:
            # General loop
            with cgutils.for_range(builder, loopcount, intp=intp_t):
                slowloop = build_slow_loop_body(context, func, builder,
                                                arrays, out, offsets,
                                                store_offset, signature)
            builder.ret_void()

    builder.ret_void()
    del builder

    # Set core function to internal so that it is not generated
    func.linkage = LINKAGE_INTERNAL
    # Force inline of code function
    inline_function(slowloop)
    inline_function(fastloop)
    # Run optimizer
    context.optimize(module)

    if config.DUMP_OPTIMIZED:
        print(module)

    return wrapper
def build(self):
    """Build the gufunc wrapper "__gufunc__.<name>".

    Same pipeline as the earlier ``build`` definition in this file:
    emit the (args, dims, steps, data) wrapper, loop the outer
    dimension via gen_loop_body() with an error-escape block, then
    internalize and inline the core call.  Returns ``(wrapper,
    self.env)``.
    """
    module = self.func.module

    byte_t = Type.int(8)
    byte_ptr_t = Type.pointer(byte_t)
    byte_ptr_ptr_t = Type.pointer(byte_ptr_t)
    intp_t = self.context.get_value_type(types.intp)
    intp_ptr_t = Type.pointer(intp_t)

    # void(char **args, intp *dims, intp *steps, char *data)
    fnty = Type.function(
        Type.void(), [byte_ptr_ptr_t, intp_ptr_t, intp_ptr_t,
                      byte_ptr_t])

    wrapper = module.add_function(fnty, "__gufunc__." + self.func.name)
    arg_args, arg_dims, arg_steps, arg_data = wrapper.args
    arg_args.name = "args"
    arg_dims.name = "dims"
    arg_steps.name = "steps"
    arg_data.name = "data"

    builder = Builder.new(wrapper.append_basic_block("entry"))
    # Outer loop count: first entry of *dims*.
    loopcount = builder.load(arg_dims, name="loopcount")

    # Unpack shapes
    # NOTE(review): unique_syms is computed but never read below —
    # confirm whether it is still needed.
    unique_syms = set()
    for grp in (self.sin, self.sout):
        for syms in grp:
            unique_syms |= set(syms)

    # Assign each input dimension symbol a stable index.
    sym_map = {}
    for syms in self.sin:
        for s in syms:
            if s not in sym_map:
                sym_map[s] = len(sym_map)

    # Load the runtime extent of each symbol; symbol i lives at
    # dims[i + 1] because dims[0] is the loop count.
    sym_dim = {}
    for s, i in sym_map.items():
        sym_dim[s] = builder.load(
            builder.gep(arg_dims,
                        [self.context.get_constant(types.intp, i + 1)]))

    # Prepare inputs
    arrays = []
    step_offset = len(self.sin) + len(self.sout)
    for i, (typ, sym) in enumerate(
            zip(self.signature.args, self.sin + self.sout)):
        ary = GUArrayArg(self.context, builder, arg_args, arg_dims,
                         arg_steps, i, step_offset, typ, sym, sym_dim)
        # Non-scalar arguments consume extra inner-step slots.
        if not ary.as_scalar:
            step_offset += ary.ndim
        arrays.append(ary)

    bbreturn = cgutils.get_function(builder).append_basic_block('.return')

    # Prologue
    self.gen_prologue(builder)

    # Loop
    with cgutils.for_range(builder, loopcount, intp=intp_t) as ind:
        args = [a.array_value for a in arrays]
        innercall, error = self.gen_loop_body(builder, args)
        # If error, escape
        cgutils.cbranch_or_continue(builder, error, bbreturn)
        # Advance every array argument to the next outer element.
        for a in arrays:
            a.next(ind)

    builder.branch(bbreturn)
    builder.position_at_end(bbreturn)

    # Epilogue
    self.gen_epilogue(builder)

    builder.ret_void()

    module.verify()
    # Set core function to internal so that it is not generated
    self.func.linkage = LINKAGE_INTERNAL
    # Force inline of code function
    inline_function(innercall)
    # Run optimizer
    self.context.optimize(module)

    if config.DUMP_OPTIMIZED:
        print(module)

    wrapper.verify()
    return wrapper, self.env