def __init__(self, module, opt_level=3, loop_vectorize=True):
    # opt_level is used for both module-level (opt) and
    # instruction-level (cg) optimization, for the TargetMachine
    # and the PassManager.
    # (Assumes a module-level import:
    #  from llvm.workaround.avx_support import detect_avx_support)
    if not detect_avx_support():
        tm = le.TargetMachine.new(opt=opt_level, cm=le.CM_JITDEFAULT,
                                  features='-avx')
    else:
        tm = le.TargetMachine.new(opt=opt_level, cm=le.CM_JITDEFAULT,
                                  features='')
    pass_opts = dict(
        fpm=False,
        mod=module,
        opt=opt_level,
        vectorize=False,
        loop_vectorize=loop_vectorize,
        inline_threshold=self.inline_threshold,
    )
    pms = lp.build_pass_managers(tm=tm, **pass_opts)
    pms.pm.run(module)
def __init__(self, module_name, optimize=llvm_config.llvm_optimize,
             verify=llvm_config.llvm_verify):
    self.module = core.Module.new(module_name)
    self.engine_builder = ee.EngineBuilder.new(self.module)
    self.engine_builder.force_jit()
    opt_level = 3 if optimize else 0
    self.engine_builder.opt(opt_level)
    self.exec_engine = self.engine_builder.create()
    tm = ee.TargetMachine.new(opt=opt_level, cm=ee.CM_JITDEFAULT)
    self.tm = tm
    _, fpm = passes.build_pass_managers(tm, opt=opt_level,
                                        loop_vectorize=(opt_level > 0),
                                        mod=self.module,
                                        vectorize=(opt_level > 0))
    self.pass_manager = fpm
    # self.fpm = fpm
    for p in self._verify_passes:
        self.pass_manager.add(p)
    if optimize:
        for p in (self._opt_passes + self._verify_passes):
            self.pass_manager.add(p)
def __initialize(self, opt, cg, inline):
    assert self.__singleton is None
    m = self.__module = lc.Module.new("numba_executable_module")
    # Create the TargetMachine
    features = ''
    # try:
    #     from llvm.workaround.avx_support import detect_avx_support
    #     if not detect_avx_support():
    #         features = '-avx'
    # except ImportError:
    #     # Old llvm; disable AVX for all
    features = '-avx'
    tm = self.__machine = le.TargetMachine.new(opt=cg, cm=le.CM_JITDEFAULT,
                                               features=features)
    # Create the ExecutionEngine
    self.__engine = le.EngineBuilder.new(m).create(tm)
    # Build a PassManager which will be used for every module
    has_loop_vectorizer = llvm.version >= (3, 2)
    passmanagers = lp.build_pass_managers(tm, opt=opt,
                                          inline_threshold=inline,
                                          loop_vectorize=has_loop_vectorizer,
                                          fpm=False)
    self.__pm = passmanagers.pm
    self.__string_constants = {}
def __init__(self, module, opt_level=3):
    tc = le.TargetMachine.new(features='', cm=le.CM_JITDEFAULT)
    self.pm, self.fpm = lp.build_pass_managers(tc, loop_vectorize=False,
                                               vectorize=False, fpm=False,
                                               mod=module)
def example(title, module_builder, opt):
    print(title.center(80, '='))
    mod, fn = module_builder()
    eb = le.EngineBuilder.new(mod).opt(3)
    if opt:
        print('opt')
        tm = eb.select_target()
        pms = lp.build_pass_managers(mod=mod, tm=tm, opt=3,
                                     loop_vectorize=True, fpm=False)
        pms.pm.run(mod)
    print(mod)
    print(mod.to_native_assembly())
    engine = eb.create()
    ptr = engine.get_pointer_to_function(fn)
    callable = CFUNCTYPE(None, POINTER(c_float), POINTER(c_float),
                         POINTER(c_float), c_int)(ptr)
    N = 20
    in1 = (c_float * N)(*range(N))
    in2 = (c_float * N)(*range(N))
    out = (c_float * N)()
    print('in1: ', list(in1))
    print('in2: ', list(in2))
    callable(in1, in2, out, N)
    print('out', list(out))
def __initialize(self, opt, cg, inline):
    assert self.__singleton is None
    m = self.__module = lc.Module.new("numba_executable_module")
    # Create the TargetMachine
    features = ''
    try:
        from llvm.workaround.avx_support import detect_avx_support
        if not detect_avx_support():
            features = '-avx'
    except ImportError:
        # Old llvm; disable AVX for all
        features = '-avx'
    tm = self.__machine = le.TargetMachine.new(opt=cg, cm=le.CM_JITDEFAULT,
                                               features=features)
    # Create the ExecutionEngine
    self.__engine = le.EngineBuilder.new(m).create(tm)
    # Build a PassManager which will be used for every module
    has_loop_vectorizer = llvm.version >= (3, 2)
    passmanagers = lp.build_pass_managers(tm, opt=opt,
                                          inline_threshold=inline,
                                          loop_vectorize=has_loop_vectorizer,
                                          fpm=False)
    self.__pm = passmanagers.pm
    self.__string_constants = {}
def build_pass_manager(self):
    opt = 0  # let Impala optimize
    # opt = 3  # optimize ourselves
    pms = lp.build_pass_managers(tm=self.tm, opt=opt,
                                 loop_vectorize=True, fpm=False)
    return pms.pm
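# A hedged usage sketch (not part of the original snippets): how the pass
# manager returned by a build_pass_manager()-style helper is typically
# applied. build_pass_managers() returns a namedtuple; with fpm=False only
# the module-level pass manager (.pm) is meaningful, and it is run over a
# module. The module/target-machine construction below is illustrative only.
import llvm.core as lc
import llvm.ee as le
import llvm.passes as lp

mod = lc.Module.new('demo')
tm = le.TargetMachine.new(opt=0)
pms = lp.build_pass_managers(tm=tm, opt=0, loop_vectorize=True, fpm=False)
pms.pm.run(mod)  # runs the module passes in place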
def _cull_exports(self):
    """Read all the exported functions/modules in the translator
    environment, and join them into a single LLVM module.

    Resets the export environment afterwards.
    """
    self.exported_signatures = export_registry

    # Create new module containing everything
    llvm_module = lc.Module.new(self.module_name)

    # Compile all exported functions
    typing_ctx = CPUTarget.typing_context
    # TODO: use a non-JIT-ing target
    target_ctx = CPUTarget.target_context
    modules = []
    flags = Flags()
    if not self.export_python_wrap:
        flags.set("no_compile")
    for entry in self.exported_signatures:
        cres = compile_extra(typing_ctx, target_ctx, entry.function,
                             entry.signature.args,
                             entry.signature.return_type, flags,
                             locals={})
        if self.export_python_wrap:
            module = cres.llvm_func.module
            cres.llvm_func.linkage = lc.LINKAGE_INTERNAL
            wrappername = "wrapper." + cres.llvm_func.name
            wrapper = module.get_function_named(wrappername)
            wrapper.name = entry.symbol
        else:
            cres.llvm_func.name = entry.symbol
        modules.append(cres.llvm_module)

    # Link all exported functions
    for mod in modules:
        llvm_module.link_in(mod, preserve=self.export_python_wrap)

    # Optimize
    tm = le.TargetMachine.new(opt=3)
    pms = lp.build_pass_managers(tm=tm, opt=3, loop_vectorize=True,
                                 fpm=False)
    pms.pm.run(llvm_module)

    if self.export_python_wrap:
        self._emit_python_wrapper(llvm_module)

    del self.exported_signatures[:]
    print(llvm_module)
    return llvm_module
def optimize_function(self, func):
    """Run O1 function passes.
    """
    pms = lp.build_pass_managers(tm=self.tm, opt=1, pm=False,
                                 mod=func.module)
    fpm = pms.fpm
    fpm.initialize()
    fpm.run(func)
    fpm.finalize()
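# Hedged sketch of the FunctionPassManager lifecycle used by
# optimize_function() above, on a made-up module and function; the module,
# function, and target machine here are illustrative, and only the
# initialize()/run()/finalize() pattern is the point.
import llvm.core as lc
import llvm.ee as le
import llvm.passes as lp

mod = lc.Module.new('fpm_demo')
i32 = lc.Type.int(32)
fn = mod.add_function(lc.Type.function(i32, [i32]), 'ident')
bb = fn.append_basic_block('entry')
b = lc.Builder.new(bb)
b.ret(fn.args[0])

tm = le.TargetMachine.new(opt=1)
pms = lp.build_pass_managers(tm=tm, opt=1, pm=False, mod=mod)
fpm = pms.fpm
fpm.initialize()   # set up the function passes
fpm.run(fn)        # apply the O1 function passes to this one function
fpm.finalize()     # tear down the function passes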
def make_llvm_context(name="mymodule"):
    "Return an LLVM context (engine, module, passmanager)"
    module = lc.Module.new("executable_module")
    features = '-avx'
    tm = le.TargetMachine.new(opt=3, cm=le.CM_JITDEFAULT,
                              features=features)
    engine = le.EngineBuilder.new(module).create(tm)
    passmanagers = lp.build_pass_managers(tm, opt=3,
                                          inline_threshold=1000,
                                          fpm=False)
    return LLVMContext(engine, module, passmanagers.pm)
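# LLVMContext is not defined in this snippet; a plausible minimal definition
# (an assumption, not taken from the original source) is a namedtuple
# bundling the three objects the function returns:
from collections import namedtuple
LLVMContext = namedtuple('LLVMContext', ['engine', 'module', 'pm'])

# ctx = make_llvm_context()
# ctx.pm.run(ctx.module)  # optimize, then JIT via ctx.engine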
def optimize_pythonapi(self, func):
    # Simplify the function using O1 function passes
    pms = lp.build_pass_managers(tm=self.tm, opt=1, mod=func.module)
    fpm = pms.fpm
    fpm.initialize()
    fpm.run(func)
    fpm.finalize()

    # remove extra refct api calls
    remove_refct_calls(func)
def _cull_exports(self):
    """Read all the exported functions/modules in the translator
    environment, and join them into a single LLVM module.

    Resets the export environment afterwards.
    """
    self.exported_signatures = export_registry

    # Create new module containing everything
    llvm_module = lc.Module.new(self.module_name)

    # Compile all exported functions
    typing_ctx = CPUTarget.typing_context
    # TODO: use a non-JIT-ing target
    target_ctx = CPUTarget.target_context
    modules = []
    flags = Flags()
    if not self.export_python_wrap:
        flags.set("no_compile")
    for entry in self.exported_signatures:
        cres = compile_extra(typing_ctx, target_ctx, entry.function,
                             entry.signature.args,
                             entry.signature.return_type, flags,
                             locals={})
        if self.export_python_wrap:
            module = cres.llvm_func.module
            cres.llvm_func.linkage = lc.LINKAGE_INTERNAL
            wrappername = "wrapper." + cres.llvm_func.name
            wrapper = module.get_function_named(wrappername)
            wrapper.name = entry.symbol
        else:
            cres.llvm_func.name = entry.symbol
        modules.append(cres.llvm_module)

    # Link all exported functions
    for mod in modules:
        llvm_module.link_in(mod, preserve=self.export_python_wrap)

    # Optimize
    tm = le.TargetMachine.new(opt=3)
    pms = lp.build_pass_managers(tm=tm, opt=3, loop_vectorize=True,
                                 fpm=False)
    pms.pm.run(llvm_module)

    if self.export_python_wrap:
        self._emit_python_wrapper(llvm_module)

    # del self.exported_signatures[:]
    print(llvm_module)
    return llvm_module
def load(arch):
    '''Load the LLRT module corresponding to the given architecture.

    Creates a new module and optimizes it using the information from
    the host machine.
    '''
    if arch != 'x86_64':
        arch = 'x86'
    path = os.path.join(os.path.dirname(__file__), 'llrt',
                        'llrt_%s.ll' % arch)
    with open(path) as fin:
        lib = lc.Module.from_assembly(fin)
    # Run passes to optimize
    tm = le.TargetMachine.new()
    pms = lp.build_pass_managers(tm, opt=3, fpm=False)
    pms.pm.run(lib)
    return lib
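# Hedged usage sketch for load(): select the LLRT variant matching the host.
# Any architecture string other than 'x86_64' falls back to the 'x86' module.
import platform
lib = load(platform.machine())  # e.g. 'x86_64' -> llrt_x86_64.ll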
def codegen(ast, specializer, retty, argtys):
    cgen = LLVMEmitter(specializer, retty, argtys)
    module = cgen.visit(ast)
    cgen.function.verify()

    tm = le.TargetMachine.new(opt=3, cm=le.CM_JITDEFAULT, features='')
    pms = lp.build_pass_managers(tm=tm, fpm=False, mod=module, opt=3,
                                 vectorize=False, loop_vectorize=True)
    pms.pm.run(module)

    debug(cgen.function)
    debug(module.to_native_assembly())
    return cgen.function
def codegen(ast, specializer, retty, argtys):
    cgen = LLVMEmitter(specializer, retty, argtys)
    module = cgen.visit(ast)
    cgen.function.verify()

    tm = le.TargetMachine.new(opt=3, cm=le.CM_JITDEFAULT, features='')
    pms = lp.build_pass_managers(tm=tm, fpm=False, mod=module, opt=3,
                                 vectorize=False, loop_vectorize=True)
    pms.pm.run(module)

    debug(cgen.function)
    debug(module.to_native_assembly())
    return cgen.function
def build_pass_manager(self):
    if config.OPT == 3:
        # This uses the same passes as clang -O3
        pms = lp.build_pass_managers(tm=self.tm, opt=3,
                                     loop_vectorize=True, fpm=False)
        return pms.pm
    else:
        # This uses a minimal set of passes for fast code.
        # TODO: make it generate vector code
        tm = self.tm
        pm = lp.PassManager.new()
        pm.add(tm.target_data.clone())
        pm.add(lp.TargetLibraryInfo.new(tm.triple))
        # Re-enable for target information for vectorization
        # tm.add_analysis_passes(pm)
        passes = '''
            basicaa
            scev-aa
            mem2reg
            sroa
            adce
            dse
            sccp
            instcombine
            simplifycfg
            loops
            indvars
            loop-simplify
            licm
            simplifycfg
            instcombine
            loop-vectorize
            instcombine
            simplifycfg
            globalopt
            globaldce
        '''.split()
        for p in passes:
            pm.add(lp.Pass.new(p))
        return pm
def build_pass_manager(self):
    if 0 < config.OPT <= 3:
        # This uses the same passes as clang -O3
        pms = lp.build_pass_managers(tm=self.tm, opt=config.OPT,
                                     loop_vectorize=config.LOOP_VECTORIZE,
                                     fpm=False)
        return pms.pm
    else:
        # This uses a minimal set of passes for fast code.
        # TODO: make it generate vector code
        tm = self.tm
        pm = lp.PassManager.new()
        pm.add(tm.target_data.clone())
        pm.add(lp.TargetLibraryInfo.new(tm.triple))
        # Re-enable for target information for vectorization
        # tm.add_analysis_passes(pm)
        passes = '''
            basicaa
            scev-aa
            mem2reg
            sroa
            adce
            dse
            sccp
            instcombine
            simplifycfg
            loops
            indvars
            loop-simplify
            licm
            simplifycfg
            instcombine
            loop-vectorize
            instcombine
            simplifycfg
            globalopt
            globaldce
        '''.split()
        for p in passes:
            pm.add(lp.Pass.new(p))
        return pm
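# Hedged sketch (illustrative, not from the original): the manual pipeline
# above registers passes by name through lp.Pass.new(); a single standalone
# pass can be applied the same way.
import llvm.passes as lp

pm = lp.PassManager.new()
pm.add(lp.Pass.new('mem2reg'))  # promote stack allocas to SSA registers
# pm.run(some_module)  # some_module: an llvm.core.Module built elsewhere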
def __initialize(self, opt, cg, inline):
    assert self.__singleton is None
    m = self.__module = lc.Module.new("numba_executable_module")
    # Create the TargetMachine
    # FIXME: The following is a workaround for missing AVX support
    #        in old linux kernels.
    from llvm.ee import FORCE_DISABLE_AVX
    if FORCE_DISABLE_AVX:
        features = '-avx'
    else:
        features = ''
    tm = self.__machine = le.TargetMachine.new(opt=cg, cm=le.CM_JITDEFAULT,
                                               features=features)
    # Create the ExecutionEngine
    self.__engine = le.EngineBuilder.new(m).create(tm)
    # Build a PassManager which will be used for every module
    has_loop_vectorizer = llvm.version >= (3, 2)
    passmanagers = lp.build_pass_managers(tm, opt=opt,
                                          inline_threshold=inline,
                                          loop_vectorize=has_loop_vectorizer,
                                          fpm=False)
    self.__pm = passmanagers.pm
def func_ptr(self):
    if self._func_ptr is None:
        module = self.module.clone()
        if self._ee is None:
            from llvm.passes import build_pass_managers
            import llvm.ee as le
            tm = le.TargetMachine.new(opt=3, cm=le.CM_JITDEFAULT,
                                      features='')
            pms = build_pass_managers(tm, opt=3, fpm=False,
                                      vectorize=True, loop_vectorize=True)
            pms.pm.run(module)
            if sys.version_info >= (3,):
                import builtins
            else:
                import __builtin__ as builtins
            builtins._temp = module.clone()
            builtins._tempname = self.func.name
            # self._ee = le.ExecutionEngine.new(module)
            # FIXME: Temporarily disabling AVX, because of misdetection
            #        in linux VMs. Some code is in llvmpy's workarounds
            #        submodule related to this.
            self._ee = le.EngineBuilder.new(module).mattrs("-avx").create()
        func = module.get_function_named(self.func.name)
        self._func_ptr = self._ee.get_pointer_to_function(func)
    return self._func_ptr
def unbound_single_ckernel(self):
    """Creates an UnboundCKernelFunction with the ExprSingleOperation
    prototype.
    """
    import ctypes
    if self._unbound_single_ckernel is None:
        i8_p_type = Type.pointer(Type.int(8))
        func_type = Type.function(void_type,
                                  [i8_p_type, Type.pointer(i8_p_type),
                                   i8_p_type])
        module = self.module.clone()
        single_ck_func_name = self.func.name + "_single_ckernel"
        single_ck_func = Function.new(module, func_type,
                                      name=single_ck_func_name)
        block = single_ck_func.append_basic_block('entry')
        builder = lc.Builder.new(block)
        dst_ptr_arg, src_ptr_arr_arg, extra_ptr_arg = single_ck_func.args
        dst_ptr_arg.name = 'dst_ptr'
        src_ptr_arr_arg.name = 'src_ptrs'
        extra_ptr_arg.name = 'extra_ptr'
        # Build up the kernel data structure. Currently, this means
        # adding a shape field for each array argument. First comes
        # the kernel data prefix with a spot for the 'owner' reference
        # added.
        input_field_indices = []
        kernel_data_fields = [Type.struct([i8_p_type]*3)]
        kernel_data_ctypes_fields = [('base', JITKernelData)]
        for i, (kind, a) in enumerate(izip(self.kinds, self.argtypes)):
            if isinstance(kind, tuple):
                if kind[0] != lla.C_CONTIGUOUS:
                    raise ValueError('only support C contiguous array presently')
                input_field_indices.append(len(kernel_data_fields))
                kernel_data_fields.append(Type.array(
                                intp_type, len(self.dshapes[i])-1))
                kernel_data_ctypes_fields.append(('operand_%d' % i,
                                c_ssize_t * (len(self.dshapes[i])-1)))
            elif kind in [SCALAR, POINTER]:
                input_field_indices.append(None)
            else:
                raise TypeError(("unbound_single_ckernel codegen doesn't " +
                                 "support the parameter kind %r yet") % (kind,))
        # Make an LLVM and ctypes type for the extra data pointer.
        kernel_data_llvmtype = Type.struct(kernel_data_fields)
        class kernel_data_ctypestype(ctypes.Structure):
            _fields_ = kernel_data_ctypes_fields
        # Cast the extra pointer to the right llvm type
        extra_struct = builder.bitcast(extra_ptr_arg,
                                       Type.pointer(kernel_data_llvmtype))
        # Convert the src pointer args to the
        # appropriate kinds for the llvm call
        args = []
        for i, (kind, atype) in enumerate(izip(self.kinds[:-1],
                                               self.argtypes)):
            if kind == SCALAR:
                src_ptr = builder.bitcast(builder.load(
                                builder.gep(src_ptr_arr_arg,
                                    (lc.Constant.int(intp_type, i),))),
                                Type.pointer(atype))
                src_val = builder.load(src_ptr)
                args.append(src_val)
            elif kind == POINTER:
                src_ptr = builder.bitcast(builder.load(
                                builder.gep(src_ptr_arr_arg,
                                    (lc.Constant.int(intp_type, i),))),
                                Type.pointer(atype))
                args.append(src_ptr)
            elif isinstance(kind, tuple):
                src_ptr = builder.bitcast(builder.load(
                                builder.gep(src_ptr_arr_arg,
                                    (lc.Constant.int(intp_type, i),))),
                                Type.pointer(kind[2]))
                # First get the shape of this parameter. This will
                # be a combination of Fixed and TypeVar (Var unsupported
                # here for now)
                shape = self.dshapes[i][:-1]
                # Get the llvm array
                arr_var = builder.alloca(atype.pointee)
                builder.store(src_ptr,
                              builder.gep(arr_var,
                                  (lc.Constant.int(int32_type, 0),
                                   lc.Constant.int(int32_type, 0))))
                for j, sz in enumerate(shape):
                    if isinstance(sz, Fixed):
                        # If the shape is already known at JIT compile time,
                        # insert the constant
                        shape_el_ptr = builder.gep(arr_var,
                                        (lc.Constant.int(int32_type, 0),
                                         lc.Constant.int(int32_type, 1),
                                         lc.Constant.int(intp_type, j)))
                        builder.store(lc.Constant.int(intp_type,
                                                      operator.index(sz)),
                                      shape_el_ptr)
                    elif isinstance(sz, TypeVar):
                        # TypeVar types are only known when the kernel
                        # is bound, so copy it from the extra data pointer
                        sz_from_extra_ptr = builder.gep(extra_struct,
                                        (lc.Constant.int(int32_type, 0),
                                         lc.Constant.int(int32_type,
                                                input_field_indices[i]),
                                         lc.Constant.int(intp_type, j)))
                        sz_from_extra = builder.load(sz_from_extra_ptr)
                        shape_el_ptr = builder.gep(arr_var,
                                        (lc.Constant.int(int32_type, 0),
                                         lc.Constant.int(int32_type, 1),
                                         lc.Constant.int(intp_type, j)))
                        builder.store(sz_from_extra, shape_el_ptr)
                    else:
                        raise TypeError(("unbound_single_ckernel codegen " +
                                         "doesn't support dimension " +
                                         "type %r") % type(sz))
                args.append(arr_var)
        # Call the function and store in the dst
        kind = self.kinds[-1]
        func = module.get_function_named(self.func.name)
        if kind == SCALAR:
            dst_ptr = builder.bitcast(dst_ptr_arg,
                                      Type.pointer(self.return_type))
            dst_val = builder.call(func, args)
            builder.store(dst_val, dst_ptr)
        elif kind == POINTER:
            dst_ptr = builder.bitcast(dst_ptr_arg,
                                      Type.pointer(self.return_type))
            builder.call(func, args + [dst_ptr])
        elif isinstance(kind, tuple):
            dst_ptr = builder.bitcast(dst_ptr_arg,
                                      Type.pointer(kind[2]))
            # First get the shape of the output. This will
            # be a combination of Fixed and TypeVar (Var unsupported
            # here for now)
            shape = self.dshapes[-1][:-1]
            # Get the llvm array
            arr_var = builder.alloca(self.argtypes[-1].pointee)
            builder.store(dst_ptr,
                          builder.gep(arr_var,
                              (lc.Constant.int(int32_type, 0),
                               lc.Constant.int(int32_type, 0))))
            for j, sz in enumerate(shape):
                if isinstance(sz, Fixed):
                    # If the shape is already known at JIT compile time,
                    # insert the constant
                    shape_el_ptr = builder.gep(arr_var,
                                    (lc.Constant.int(int32_type, 0),
                                     lc.Constant.int(int32_type, 1),
                                     lc.Constant.int(intp_type, j)))
                    builder.store(lc.Constant.int(intp_type,
                                                  operator.index(sz)),
                                  shape_el_ptr)
                elif isinstance(sz, TypeVar):
                    # TypeVar types are only known when the kernel is bound,
                    # so copy it from the extra data pointer
                    sz_from_extra_ptr = builder.gep(extra_struct,
                                    (lc.Constant.int(int32_type, 0),
                                     lc.Constant.int(int32_type,
                                            input_field_indices[-1]),
                                     lc.Constant.int(intp_type, j)))
                    sz_from_extra = builder.load(sz_from_extra_ptr)
                    shape_el_ptr = builder.gep(arr_var,
                                    (lc.Constant.int(int32_type, 0),
                                     lc.Constant.int(int32_type, 1),
                                     lc.Constant.int(intp_type, j)))
                    builder.store(sz_from_extra, shape_el_ptr)
                else:
                    raise TypeError(("unbound_single_ckernel codegen " +
                                     "doesn't support dimension " +
                                     "type %r") % type(sz))
            builder.call(func, args + [arr_var])
        else:
            raise TypeError(("single_ckernel codegen doesn't " +
                             "support kind %r") % kind)
        builder.ret_void()

        #print("Function before optimization passes:")
        #print(single_ck_func)
        #module.verify()

        import llvm.ee as le
        from llvm.passes import build_pass_managers
        tm = le.TargetMachine.new(opt=3, cm=le.CM_JITDEFAULT, features='')
        pms = build_pass_managers(tm, opt=3, fpm=False,
                                  vectorize=True, loop_vectorize=True)
        pms.pm.run(module)

        #print("Function after optimization passes:")
        #print(single_ck_func)

        # DEBUGGING: Verify the module.
        #module.verify()

        # TODO: Cache the EE - the interplay with the func_ptr
        #       was broken, so just avoiding caching for now
        # FIXME: Temporarily disabling AVX, because of misdetection
        #        in linux VMs. Some code is in llvmpy's workarounds
        #        submodule related to this.
        ee = le.EngineBuilder.new(module).mattrs("-avx").create()
        func_ptr = ee.get_pointer_to_function(single_ck_func)
        # Create a function which copies the shape from data
        # descriptors to the extra data struct.
        if len(kernel_data_ctypes_fields) == 1:
            def bind_func(estruct, dst_dd, src_dd_list):
                pass
        else:
            def bind_func(estruct, dst_dd, src_dd_list):
                for i, (ds, dd) in enumerate(
                                izip(self.dshapes, src_dd_list + [dst_dd])):
                    shape = [operator.index(dim)
                             for dim in dd.dshape[-len(ds):-1]]
                    cshape = getattr(estruct, 'operand_%d' % i)
                    for j, dim_size in enumerate(shape):
                        cshape[j] = dim_size
        self._unbound_single_ckernel = UnboundCKernelFunction(
                        ExprSingleOperation(func_ptr),
                        kernel_data_ctypestype,
                        bind_func,
                        (ee, func_ptr))
    return self._unbound_single_ckernel
def optimize(module, lfunc):
    tm = le.TargetMachine.new(opt=3, cm=le.CM_JITDEFAULT, features='')
    pms = build_pass_managers(tm, opt=3, fpm=False,
                              vectorize=True, loop_vectorize=True)
    pms.pm.run(module)
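# Hedged usage sketch for optimize() above: it only touches `module`
# (`lfunc` is unused in this snippet), so a call looks like this; the
# module construction is illustrative.
import llvm.core as lc

mod = lc.Module.new('opt_demo')
# ... emit functions into mod ...
optimize(mod, None)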
def build_pass_manager(self):
    pms = lp.build_pass_managers(tm=self.tm, opt=3,
                                 loop_vectorize=True, fpm=False)
    return pms.pm
def __init__(self, module, opt_level=3):
    tc = le.TargetMachine.new(features='', cm=le.CM_JITDEFAULT)
    self.pm, self.fpm = lp.build_pass_managers(tc, loop_vectorize=False,
                                               vectorize=True, fpm=False,
                                               mod=module)
def optimize_llvm_function(func, opt_level=3, inline_threshold=15000):
    tm = TargetMachine.new(opt=opt_level)
    pm = lp.build_pass_managers(tm, opt=opt_level, loop_vectorize=True,
                                fpm=False,
                                inline_threshold=inline_threshold).pm
    pm.run(func.module)
def jit_compile_unbound_single_ckernel(bek, strided):
    """Creates an UnboundCKernelFunction with either the
    ExprSingleOperation prototype or the ExprStridedOperation
    prototype depending on the `strided` parameter.

    Parameters
    ----------
    bek : BlazeElementKernel
        The blaze kernel to compile into an unbound single ckernel.
    strided : bool
        If true, returns an ExprStridedOperation, otherwise an
        ExprSingleOperation.
    """
    inarg_count = len(bek.kinds) - 1
    module = bek.module.clone()
    if strided:
        ck_func_name = bek.func.name + "_strided_ckernel"
        ck_func = Function.new(module, strided_ckernel_func_type,
                               name=ck_func_name)
    else:
        ck_func_name = bek.func.name + "_single_ckernel"
        ck_func = Function.new(module, single_ckernel_func_type,
                               name=ck_func_name)
    entry_block = ck_func.append_basic_block('entry')
    builder = lc.Builder.new(entry_block)
    if strided:
        dst_ptr_arg, dst_stride_arg, \
            src_ptr_arr_arg, src_stride_arr_arg, \
            count_arg, extra_ptr_arg = ck_func.args
        dst_stride_arg.name = 'dst_stride'
        src_stride_arr_arg.name = 'src_strides'
        count_arg.name = 'count'
    else:
        dst_ptr_arg, src_ptr_arr_arg, extra_ptr_arg = ck_func.args
    dst_ptr_arg.name = 'dst_ptr'
    src_ptr_arr_arg.name = 'src_ptrs'
    extra_ptr_arg.name = 'extra_ptr'

    # Build llvm and ctypes structures for the kernel data, using
    # the argument types.
    kd_llvmtype, kd_ctypestype = args_to_kernel_data_struct(bek.kinds,
                                                            bek.argtypes)
    # Cast the extra pointer to the right llvm type
    extra_struct = builder.bitcast(extra_ptr_arg,
                                   Type.pointer(kd_llvmtype))

    if strided:
        # Allocate an array of pointer counters for the strided loop
        src_ptr_arr_tmp = builder.alloca_array(int8_p_type,
                        lc.Constant.int(int32_type, inarg_count),
                        'src_ptr_arr')
        # Copy the pointers
        for i in range(inarg_count):
            builder.store(builder.load(builder.gep(src_ptr_arr_arg,
                            (lc.Constant.int(int32_type, i),))),
                          builder.gep(src_ptr_arr_tmp,
                            (lc.Constant.int(int32_type, i),)))
        # Get all the src strides
        src_stride_vals = [builder.load(builder.gep(src_stride_arr_arg,
                                (lc.Constant.int(int32_type, i),)))
                           for i in range(inarg_count)]
        # Replace src_ptr_arr_arg with this local variable
        src_ptr_arr_arg = src_ptr_arr_tmp

        # Initialize some more basic blocks for the strided loop
        looptest_block = ck_func.append_basic_block('looptest')
        loopbody_block = ck_func.append_basic_block('loopbody')
        end_block = ck_func.append_basic_block('finish')

        # Finish the entry block by branching to the looptest block
        builder.branch(looptest_block)

        # The looptest block continues the loop while counter != 0
        builder.position_at_end(looptest_block)
        counter_phi = builder.phi(count_arg.type)
        counter_phi.add_incoming(count_arg, entry_block)
        dst_ptr_phi = builder.phi(dst_ptr_arg.type)
        dst_ptr_phi.add_incoming(dst_ptr_arg, entry_block)
        dst_ptr_arg = dst_ptr_phi
        kzero = lc.Constant.int(count_arg.type, 0)
        pred = builder.icmp(lc.ICMP_NE, counter_phi, kzero)
        builder.cbranch(pred, loopbody_block, end_block)

        # The loopbody block decrements the counter, and executes
        # one kernel iteration
        builder.position_at_end(loopbody_block)
        kone = lc.Constant.int(counter_phi.type, 1)
        counter_dec = builder.sub(counter_phi, kone)
        counter_phi.add_incoming(counter_dec, loopbody_block)

    # Convert the src pointer args to the
    # appropriate kinds for the llvm call
    args = build_llvm_src_ptrs(builder, src_ptr_arr_arg,
                               bek.dshapes, bek.kinds[:-1], bek.argtypes)
    # Call the function and store in the dst
    kind = bek.kinds[-1]
    func = module.get_function_named(bek.func.name)
    if kind == lla.SCALAR:
        dst_ptr = builder.bitcast(dst_ptr_arg,
                                  Type.pointer(bek.return_type))
        dst_val = builder.call(func, args)
        builder.store(dst_val, dst_ptr)
    else:
        dst_ptr = build_llvm_arg_ptr(builder, dst_ptr_arg,
                                     bek.dshapes[-1], kind,
                                     bek.argtypes[-1])
        builder.call(func, args + [dst_ptr])

    if strided:
        # Finish the loopbody block by incrementing all the pointers
        # and branching to the looptest block
        dst_ptr_inc = builder.gep(dst_ptr_arg, (dst_stride_arg,))
        dst_ptr_phi.add_incoming(dst_ptr_inc, loopbody_block)
        # Increment the src pointers
        for i in range(inarg_count):
            src_ptr_val = builder.load(builder.gep(src_ptr_arr_tmp,
                            (lc.Constant.int(int32_type, i),)))
            src_ptr_inc = builder.gep(src_ptr_val, (src_stride_vals[i],))
            builder.store(src_ptr_inc,
                          builder.gep(src_ptr_arr_tmp,
                            (lc.Constant.int(int32_type, i),)))
        builder.branch(looptest_block)

        # The end block just returns
        builder.position_at_end(end_block)
    builder.ret_void()

    #print("Function before optimization passes:")
    #print(ck_func)
    #module.verify()

    import llvm.ee as le
    from llvm.passes import build_pass_managers
    tm = le.TargetMachine.new(opt=3, cm=le.CM_JITDEFAULT, features='')
    pms = build_pass_managers(tm, opt=3, fpm=False,
                              vectorize=True, loop_vectorize=True)
    pms.pm.run(module)

    #print("Function after optimization passes:")
    #print(ck_func)

    # DEBUGGING: Verify the module.
    #module.verify()

    # TODO: Cache the EE - the interplay with the func_ptr
    #       was broken, so just avoiding caching for now
    # FIXME: Temporarily disabling AVX, because of misdetection
    #        in linux VMs. Some code is in llvmpy's workarounds
    #        submodule related to this.
    ee = le.EngineBuilder.new(module).mattrs("-avx").create()
    func_ptr = ee.get_pointer_to_function(ck_func)
    # Create a function which copies the shape from data
    # descriptors to the extra data struct.
    if len(kd_ctypestype._fields_) == 1:
        # If there were no extra data fields, it's a no-op function
        def bind_func(estruct, dst_dd, src_dd_list):
            pass
    else:
        def bind_func(estruct, dst_dd, src_dd_list):
            for i, (ds, dd) in enumerate(
                            izip(bek.dshapes, src_dd_list + [dst_dd])):
                shape = [operator.index(dim)
                         for dim in dd.dshape[-len(ds):-1]]
                cshape = getattr(estruct, 'operand_%d' % i)
                for j, dim_size in enumerate(shape):
                    cshape[j] = dim_size
    if strided:
        optype = ExprStridedOperation
    else:
        optype = ExprSingleOperation
    return UnboundCKernelFunction(optype(func_ptr), kd_ctypestype,
                                  bind_func, (ee, func_ptr))