def int_sign_impl(context, builder, sig, args): """ np.sign(int) """ [x] = args POS = Constant.int(x.type, 1) NEG = Constant.int(x.type, -1) ZERO = Constant.int(x.type, 0) cmp_zero = builder.icmp(lc.ICMP_EQ, x, ZERO) cmp_pos = builder.icmp(lc.ICMP_SGT, x, ZERO) presult = cgutils.alloca_once(builder, x.type) bb_zero = builder.append_basic_block(".zero") bb_postest = builder.append_basic_block(".postest") bb_pos = builder.append_basic_block(".pos") bb_neg = builder.append_basic_block(".neg") bb_exit = builder.append_basic_block(".exit") builder.cbranch(cmp_zero, bb_zero, bb_postest) with builder.goto_block(bb_zero): builder.store(ZERO, presult) builder.branch(bb_exit) with builder.goto_block(bb_postest): builder.cbranch(cmp_pos, bb_pos, bb_neg) with builder.goto_block(bb_pos): builder.store(POS, presult) builder.branch(bb_exit) with builder.goto_block(bb_neg): builder.store(NEG, presult) builder.branch(bb_exit) builder.position_at_end(bb_exit) res = builder.load(presult) return impl_ret_untracked(context, builder, sig.return_type, res)
def codegen(context, builder, sig, args): fnty = ir.FunctionType( ll_status, [ll_list_type, ll_ssize_t, ll_bytes], ) [tl, tindex] = sig.args [l, index] = args fn = builder.module.get_or_insert_function( fnty, name='numba_list_{}'.format(op)) dm_item = context.data_model_manager[tl.item_type] ll_item = context.get_data_type(tl.item_type) ptr_item = cgutils.alloca_once(builder, ll_item) lp = _container_get_data(context, builder, tl, l) status = builder.call( fn, [ lp, index, _as_bytes(builder, ptr_item), ], ) # Load item if output is available found = builder.icmp_signed('>=', status, status.type(int(ListStatus.LIST_OK))) out = context.make_optional_none( builder, tl.item_type if IS_NOT_NONE else types.int64) pout = cgutils.alloca_once_value(builder, out) with builder.if_then(found): if IS_NOT_NONE: item = dm_item.load_from_data_pointer(builder, ptr_item) context.nrt.incref(builder, tl.item_type, item) loaded = context.make_optional_value(builder, tl.item_type, item) builder.store(loaded, pout) out = builder.load(pout) return context.make_tuple(builder, resty, [status, out])
def impl_list_getiter(context, builder, sig, args): """Implement iter(List).""" [tl] = sig.args [l] = args iterablety = types.ListTypeIterableType(tl) it = context.make_helper(builder, iterablety.iterator_type) fnty = ir.FunctionType(ir.VoidType(), [ll_listiter_type, ll_list_type],) fn = builder.module.get_or_insert_function(fnty, name="numba_list_iter") proto = ctypes.CFUNCTYPE(ctypes.c_size_t) listiter_sizeof = proto(_helperlib.c_helpers["list_iter_sizeof"]) state_type = ir.ArrayType(ir.IntType(8), listiter_sizeof()) pstate = cgutils.alloca_once(builder, state_type, zfill=True) it.state = _as_bytes(builder, pstate) it.parent = l dp = _container_get_data(context, builder, iterablety.parent, args[0]) builder.call(fn, [it.state, dp]) return impl_ret_borrowed(context, builder, sig.return_type, it._getvalue(),)
def string_split_impl(context, builder, sig, args): nitems = cgutils.alloca_once(builder, lir.IntType(64)) # input str, sep, size pointer fnty = lir.FunctionType( lir.IntType(8).as_pointer().as_pointer(), [ lir.IntType(8).as_pointer(), lir.IntType(8).as_pointer(), lir.IntType(64).as_pointer() ]) fn = builder.module.get_or_insert_function(fnty, name="str_split") ptr = builder.call(fn, args + [nitems]) size = builder.load(nitems) # TODO: use ptr instead of allocating and copying, use NRT_MemInfo_new # TODO: deallocate ptr _list = numba.cpython.listobj.ListInstance.allocate( context, builder, sig.return_type, size) _list.size = size with cgutils.for_range(builder, size) as loop: value = builder.load(cgutils.gep_inbounds(builder, ptr, loop.index)) # TODO: refcounted str _list.setitem(loop.index, value, incref=False) return impl_ret_new_ref(context, builder, sig.return_type, _list.value)
def bytes_to_charseq(context, builder, fromty, toty, val): barr = cgutils.create_struct_proxy(fromty)(context, builder, value=val) src = builder.bitcast(barr.data, ir.IntType(8).as_pointer()) src_length = barr.nitems lty = context.get_value_type(toty) dstint_t = ir.IntType(8) dst_ptr = cgutils.alloca_once(builder, lty) dst = builder.bitcast(dst_ptr, dstint_t.as_pointer()) dst_length = ir.Constant(src_length.type, toty.count) is_shorter_value = builder.icmp_unsigned('<', src_length, dst_length) count = builder.select(is_shorter_value, src_length, dst_length) with builder.if_then(is_shorter_value): cgutils.memset(builder, dst, ir.Constant(src_length.type, toty.count), 0) with cgutils.for_range(builder, count) as loop: in_ptr = builder.gep(src, [loop.index]) in_val = builder.zext(builder.load(in_ptr), dstint_t) builder.store(in_val, builder.gep(dst, [loop.index])) return builder.load(dst_ptr)
def unbox_COO(typ: COOType, obj: COO, c) -> NativeValue: ret_ptr = cgutils.alloca_once(c.builder, c.context.get_value_type(typ)) is_error_ptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) fail_obj = c.context.get_constant_null(typ) with local_return(c.builder) as ret: fail_blk = c.builder.append_basic_block("fail") with c.builder.goto_block(fail_blk): c.builder.store(cgutils.true_bit, is_error_ptr) c.builder.store(fail_obj, ret_ptr) ret() data = _unbox_native_field(typ.data_type, obj, "data", c) with cgutils.if_unlikely(c.builder, data.is_error): c.builder.branch(fail_blk) coords = _unbox_native_field(typ.coords_type, obj, "coords", c) with cgutils.if_unlikely(c.builder, coords.is_error): c.builder.branch(fail_blk) shape = _unbox_native_field(typ.shape_type, obj, "shape", c) with cgutils.if_unlikely(c.builder, shape.is_error): c.builder.branch(fail_blk) fill_value = _unbox_native_field(typ.fill_value_type, obj, "fill_value", c) with cgutils.if_unlikely(c.builder, fill_value.is_error): c.builder.branch(fail_blk) coo = cgutils.create_struct_proxy(typ)(c.context, c.builder) coo.coords = coords.value coo.data = data.value coo.shape = shape.value coo.fill_value = fill_value.value c.builder.store(cgutils.false_bit, is_error_ptr) c.builder.store(coo._getvalue(), ret_ptr) return NativeValue(c.builder.load(ret_ptr), is_error=c.builder.load(is_error_ptr))
def _list_new_codegen(context, builder, itemty, new_size, error_handler): fnty = ir.FunctionType( ll_status, [ll_list_type.as_pointer(), ll_ssize_t, ll_ssize_t], ) fn = builder.module.get_or_insert_function(fnty, name='numba_list_new') # Determine sizeof item types ll_item = context.get_data_type(itemty) sz_item = context.get_abi_sizeof(ll_item) reflp = cgutils.alloca_once(builder, ll_list_type, zfill=True) status = builder.call( fn, [reflp, ll_ssize_t(sz_item), new_size], ) msg = "Failed to allocate list" error_handler( builder, status, msg, ) lp = builder.load(reflp) return lp
def masked_scalar_is_null_impl(context, builder, sig, args): """ Implement `MaskedType` is `NA` """ if isinstance(sig.args[1], NAType): masked_type, na = sig.args value = args[0] else: na, masked_type = sig.args value = args[1] indata = cgutils.create_struct_proxy(masked_type)(context, builder, value=value) result = cgutils.alloca_once(builder, ir.IntType(1)) with builder.if_else(indata.valid) as (then, otherwise): with then: builder.store(context.get_constant(types.boolean, 0), result) with otherwise: builder.store(context.get_constant(types.boolean, 1), result) return builder.load(result)
def unbox_list(typ, obj, c): """ Convert list *obj* to a native list. If list was previously unboxed, we reuse the existing native list to ensure consistency. """ size = c.pyapi.list_size(obj) errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) listptr = cgutils.alloca_once(c.builder, c.context.get_value_type(typ)) # See if the list was previously unboxed, if so, re-use the meminfo. ptr = c.pyapi.object_get_private_data(obj) with c.builder.if_else(cgutils.is_not_null(c.builder, ptr)) as ( has_meminfo, otherwise, ): with has_meminfo: # List was previously unboxed => reuse meminfo list = listobj.ListInstance.from_meminfo(c.context, c.builder, typ, ptr) list.size = size if typ.reflected: list.parent = obj c.builder.store(list.value, listptr) with otherwise: _python_list_to_native(typ, obj, c, size, listptr, errorptr) def cleanup(): # Clean up the associated pointer, as the meminfo is now invalid. c.pyapi.object_reset_private_data(obj) return NativeValue( c.builder.load(listptr), is_error=c.builder.load(errorptr), cleanup=cleanup )
def lower_dist_arr_reduce(context, builder, sig, args): op_typ = args[1].type # store an int to specify data type typ_enum = _numba_to_c_type_map[sig.args[0].dtype] typ_arg = cgutils.alloca_once_value( builder, lir.Constant(lir.IntType(32), typ_enum)) ndims = sig.args[0].ndim out = make_array(sig.args[0])(context, builder, args[0]) # store size vars array struct to pointer size_ptr = cgutils.alloca_once(builder, out.shape.type) builder.store(out.shape, size_ptr) size_arg = builder.bitcast(size_ptr, lir.IntType(64).as_pointer()) ndim_arg = cgutils.alloca_once_value( builder, lir.Constant(lir.IntType(32), sig.args[0].ndim)) call_args = [ builder.bitcast(out.data, lir.IntType(8).as_pointer()), size_arg, builder.load(ndim_arg), args[1], builder.load(typ_arg) ] # array, shape, ndim, extra last arg type for type enum arg_typs = [ lir.IntType(8).as_pointer(), lir.IntType(64).as_pointer(), lir.IntType(32), op_typ, lir.IntType(32) ] fnty = lir.FunctionType(lir.IntType(32), arg_typs) fn = builder.module.get_or_insert_function(fnty, name="hpat_dist_arr_reduce") builder.call(fn, call_args) res = out._getvalue() return impl_ret_borrowed(context, builder, sig.return_type, res)
def codegen(context, builder, sig, args): fnty = ir.FunctionType( ll_status, [ll_dict_type.as_pointer(), ll_ssize_t, ll_ssize_t], ) fn = builder.module.get_or_insert_function( fnty, name='numba_dict_new_minsize') # Determine sizeof key and value types ll_key = context.get_data_type(keyty.instance_type) ll_val = context.get_data_type(valty.instance_type) sz_key = context.get_abi_sizeof(ll_key) sz_val = context.get_abi_sizeof(ll_val) refdp = cgutils.alloca_once(builder, ll_dict_type, zfill=True) status = builder.call( fn, [refdp, ll_ssize_t(sz_key), ll_ssize_t(sz_val)], ) _raise_if_error( context, builder, status, msg="Failed to allocate dictionary", ) dp = builder.load(refdp) return dp
def make_enumerate_object(context, builder, sig, args): assert len(args) == 1 or len(args) == 2 # enumerate(it) or enumerate(it, start) srcty = sig.args[0] if len(args) == 1: src = args[0] start_val = context.get_constant(types.intp, 0) elif len(args) == 2: src = args[0] start_val = context.cast(builder, args[1], sig.args[1], types.intp) iterobj = call_getiter(context, builder, srcty, src) enum = context.make_helper(builder, sig.return_type) countptr = cgutils.alloca_once(builder, start_val.type) builder.store(start_val, countptr) enum.count = countptr enum.iter = iterobj res = enum._getvalue() return impl_ret_new_ref(context, builder, sig.return_type, res)
def atomic_sub_wrapper(context, builder, sig, args): # dpcpp yet does not support ``__spirv_AtomicFSubEXT``. To support atomic.sub we # reuse atomic.add and negate the value. For example, atomic.add(A, index, -val) is # equivalent to atomic.sub(A, index, val). val = args[2] new_val = cgutils.alloca_once( builder, context.get_value_type(sig.args[2]), size=context.get_constant(types.uintp, 1), name="new_val_0", ) val_dtype = sig.args[2] if val_dtype == types.float32 or val_dtype == types.float64: builder.store(builder.fmul(val, context.get_constant(sig.args[2], -1)), new_val) elif val_dtype == types.int32 or val_dtype == types.int64: builder.store(builder.mul(val, context.get_constant(sig.args[2], -1)), new_val) else: raise TypeError("Unsupported type %s" % val_dtype) args[2] = builder.load(new_val) return native_atomic_add(context, builder, sig, args)
def create_null_ptr(builder, context): """ Allocates a new LLVM Value storing a ``void*`` and returns the Value to caller. Args: builder: The LLVM IR builder to be used for code generation. context: The LLVM IR builder context. Returns: An LLVM value storing a null pointer """ null_ptr = cgutils.alloca_once( builder=builder, ty=context.get_value_type(types.voidptr), size=context.get_constant(types.uintp, 1), ) builder.store( builder.inttoptr( context.get_constant(types.uintp, 0), get_llvm_type(context=context, type=types.voidptr), ), null_ptr, ) return null_ptr
def details(context, builder, signature, args): ll_void = context.get_value_type(types.void) ll_Py_UCS4 = context.get_value_type(_Py_UCS4) ll_intc = context.get_value_type(types.intc) ll_intc_ptr = ll_intc.as_pointer() ll_uchar = context.get_value_type(types.uchar) ll_uchar_ptr = ll_uchar.as_pointer() ll_ushort = context.get_value_type(types.ushort) ll_ushort_ptr = ll_ushort.as_pointer() fnty = lc.Type.function( ll_void, [ ll_Py_UCS4, # code ll_intc_ptr, # upper ll_intc_ptr, # lower ll_intc_ptr, # title ll_uchar_ptr, # decimal ll_uchar_ptr, # digit ll_ushort_ptr, # flags ], ) fn = builder.module.get_or_insert_function(fnty, name="numba_gettyperecord") upper = cgutils.alloca_once(builder, ll_intc, name="upper") lower = cgutils.alloca_once(builder, ll_intc, name="lower") title = cgutils.alloca_once(builder, ll_intc, name="title") decimal = cgutils.alloca_once(builder, ll_uchar, name="decimal") digit = cgutils.alloca_once(builder, ll_uchar, name="digit") flags = cgutils.alloca_once(builder, ll_ushort, name="flags") byref = [upper, lower, title, decimal, digit, flags] builder.call(fn, [args[0]] + byref) buf = [] for x in byref: buf.append(builder.load(x)) res = context.make_tuple(builder, signature.return_type, tuple(buf)) return impl_ret_untracked(context, builder, signature.return_type, res)
def _prepare_call_to_object_mode(context, builder, pyapi, func, signature, args): mod = builder.module bb_core_return = builder.append_basic_block('ufunc.core.return') # Call to # PyObject* ndarray_new(int nd, # npy_intp *dims, /* shape */ # npy_intp *strides, # void* data, # int type_num, # int itemsize) ll_int = context.get_value_type(types.int32) ll_intp = context.get_value_type(types.intp) ll_intp_ptr = Type.pointer(ll_intp) ll_voidptr = context.get_value_type(types.voidptr) ll_pyobj = context.get_value_type(types.pyobject) fnty = Type.function( ll_pyobj, [ll_int, ll_intp_ptr, ll_intp_ptr, ll_voidptr, ll_int, ll_int]) fn_array_new = mod.get_or_insert_function(fnty, name="numba_ndarray_new") # Convert each llarray into pyobject error_pointer = cgutils.alloca_once(builder, Type.int(1), name='error') builder.store(cgutils.true_bit, error_pointer) # The PyObject* arguments to the kernel function object_args = [] object_pointers = [] for i, (arg, argty) in enumerate(zip(args, signature.args)): # Allocate NULL-initialized slot for this argument objptr = cgutils.alloca_once(builder, ll_pyobj, zfill=True) object_pointers.append(objptr) if isinstance(argty, types.Array): # Special case arrays: we don't need full-blown NRT reflection # since the argument will be gone at the end of the kernel arycls = context.make_array(argty) array = arycls(context, builder, value=arg) zero = Constant.int(ll_int, 0) # Extract members of the llarray nd = Constant.int(ll_int, argty.ndim) dims = builder.gep(array._get_ptr_by_name('shape'), [zero, zero]) strides = builder.gep(array._get_ptr_by_name('strides'), [zero, zero]) data = builder.bitcast(array.data, ll_voidptr) dtype = np.dtype(str(argty.dtype)) # Prepare other info for reconstruction of the PyArray type_num = Constant.int(ll_int, dtype.num) itemsize = Constant.int(ll_int, dtype.itemsize) # Call helper to reconstruct PyArray objects obj = builder.call(fn_array_new, [nd, dims, strides, data, type_num, itemsize]) else: # Other argument types => use generic boxing obj = pyapi.from_native_value(argty, arg) builder.store(obj, objptr) object_args.append(obj) obj_is_null = cgutils.is_null(builder, obj) builder.store(obj_is_null, error_pointer) cgutils.cbranch_or_continue(builder, obj_is_null, bb_core_return) # Call ufunc core function object_sig = [types.pyobject] * len(object_args) status, retval = context.call_conv.call_function(builder, func, types.pyobject, object_sig, object_args) builder.store(status.is_error, error_pointer) # Release returned object pyapi.decref(retval) builder.branch(bb_core_return) # At return block builder.position_at_end(bb_core_return) # Release argument objects for objptr in object_pointers: pyapi.decref(builder.load(objptr)) innercall = status.code return innercall, builder.load(error_pointer)
def box_COO(typ: COOType, val: "some LLVM thing", c) -> COO: ret_ptr = cgutils.alloca_once(c.builder, c.pyapi.pyobj) fail_obj = c.pyapi.get_null_object() coo = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) with local_return(c.builder) as ret: data_obj = c.box(typ.data_type, coo.data) with cgutils.if_unlikely(c.builder, cgutils.is_null(c.builder, data_obj)): c.builder.store(fail_obj, ret_ptr) ret() coords_obj = c.box(typ.coords_type, coo.coords) with cgutils.if_unlikely(c.builder, cgutils.is_null(c.builder, coords_obj)): c.pyapi.decref(data_obj) c.builder.store(fail_obj, ret_ptr) ret() shape_obj = c.box(typ.shape_type, coo.shape) with cgutils.if_unlikely(c.builder, cgutils.is_null(c.builder, shape_obj)): c.pyapi.decref(coords_obj) c.pyapi.decref(data_obj) c.builder.store(fail_obj, ret_ptr) ret() fill_value_obj = c.box(typ.fill_value_type, coo.fill_value) with cgutils.if_unlikely(c.builder, cgutils.is_null(c.builder, fill_value_obj)): c.pyapi.decref(shape_obj) c.pyapi.decref(coords_obj) c.pyapi.decref(data_obj) c.builder.store(fail_obj, ret_ptr) ret() class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(COO)) with cgutils.if_unlikely(c.builder, cgutils.is_null(c.builder, class_obj)): c.pyapi.decref(shape_obj) c.pyapi.decref(coords_obj) c.pyapi.decref(data_obj) c.pyapi.decref(fill_value_obj) c.builder.store(fail_obj, ret_ptr) ret() args = c.pyapi.tuple_pack([coords_obj, data_obj, shape_obj]) c.pyapi.decref(shape_obj) c.pyapi.decref(coords_obj) c.pyapi.decref(data_obj) with cgutils.if_unlikely(c.builder, cgutils.is_null(c.builder, args)): c.pyapi.decref(fill_value_obj) c.pyapi.decref(class_obj) c.builder.store(fail_obj, ret_ptr) ret() kwargs = c.pyapi.dict_pack([("fill_value", fill_value_obj)]) c.pyapi.decref(fill_value_obj) with cgutils.if_unlikely(c.builder, cgutils.is_null(c.builder, kwargs)): c.pyapi.decref(class_obj) c.builder.store(fail_obj, ret_ptr) ret() c.builder.store(c.pyapi.call(class_obj, args, kwargs), ret_ptr) c.pyapi.decref(class_obj) c.pyapi.decref(args) c.pyapi.decref(kwargs) return c.builder.load(ret_ptr)
def _build_array(context, builder, array_ty, input_types, inputs): """Utility function to handle allocation of an implicit output array given the target context, builder, output array type, and a list of _ArrayHelper instances. """ # First, strip optional types, ufunc loops are typed on concrete types input_types = [ x.type if isinstance(x, types.Optional) else x for x in input_types ] intp_ty = context.get_value_type(types.intp) def make_intp_const(val): return context.get_constant(types.intp, val) ZERO = make_intp_const(0) ONE = make_intp_const(1) src_shape = cgutils.alloca_once(builder, intp_ty, array_ty.ndim, "src_shape") dest_ndim = make_intp_const(array_ty.ndim) dest_shape = cgutils.alloca_once(builder, intp_ty, array_ty.ndim, "dest_shape") dest_shape_addrs = tuple( cgutils.gep_inbounds(builder, dest_shape, index) for index in range(array_ty.ndim)) # Initialize the destination shape with all ones. for dest_shape_addr in dest_shape_addrs: builder.store(ONE, dest_shape_addr) # For each argument, try to broadcast onto the destination shape, # mutating along any axis where the argument shape is not one and # the destination shape is one. for arg_number, arg in enumerate(inputs): if not hasattr(arg, "ndim"): # Skip scalar arguments continue arg_ndim = make_intp_const(arg.ndim) for index in range(arg.ndim): builder.store(arg.shape[index], cgutils.gep_inbounds(builder, src_shape, index)) arg_result = context.compile_internal( builder, _broadcast_onto, _broadcast_onto_sig, [arg_ndim, src_shape, dest_ndim, dest_shape]) with cgutils.if_unlikely(builder, builder.icmp(lc.ICMP_SLT, arg_result, ONE)): msg = "unable to broadcast argument %d to output array" % ( arg_number, ) loc = errors.loc_info.get('loc', None) if loc is not None: msg += '\nFile "%s", line %d, ' % (loc.filename, loc.line) context.call_conv.return_user_exc(builder, ValueError, (msg, )) real_array_ty = array_ty.as_array dest_shape_tup = tuple( builder.load(dest_shape_addr) for dest_shape_addr in dest_shape_addrs) array_val = arrayobj._empty_nd_impl(context, builder, real_array_ty, dest_shape_tup) # Get the best argument to call __array_wrap__ on array_wrapper_index = select_array_wrapper(input_types) array_wrapper_ty = input_types[array_wrapper_index] try: # __array_wrap__(source wrapped array, out array) -> out wrapped array array_wrap = context.get_function( '__array_wrap__', array_ty(array_wrapper_ty, real_array_ty)) except NotImplementedError: # If it's the same priority as a regular array, assume we # should use the allocated array unchanged. if array_wrapper_ty.array_priority != types.Array.array_priority: raise out_val = array_val._getvalue() else: wrap_args = (inputs[array_wrapper_index].return_val, array_val._getvalue()) out_val = array_wrap(builder, wrap_args) ndim = array_ty.ndim shape = cgutils.unpack_tuple(builder, array_val.shape, ndim) strides = cgutils.unpack_tuple(builder, array_val.strides, ndim) return _ArrayHelper(context, builder, shape, strides, array_val.data, array_ty.layout, array_ty.dtype, ndim, out_val)
def build_ufunc_wrapper(library, context, fname, signature, objmode, cres): """ Wrap the scalar function with a loop that iterates over the arguments Returns ------- (library, env, name) """ assert isinstance(fname, str) byte_t = Type.int(8) byte_ptr_t = Type.pointer(byte_t) byte_ptr_ptr_t = Type.pointer(byte_ptr_t) intp_t = context.get_value_type(types.intp) intp_ptr_t = Type.pointer(intp_t) fnty = Type.function(Type.void(), [byte_ptr_ptr_t, intp_ptr_t, intp_ptr_t, byte_ptr_t]) wrapperlib = context.codegen().create_library('ufunc_wrapper') wrapper_module = wrapperlib.create_ir_module('') if objmode: func_type = context.call_conv.get_function_type( types.pyobject, [types.pyobject] * len(signature.args)) else: func_type = context.call_conv.get_function_type( signature.return_type, signature.args) func = wrapper_module.add_function(func_type, name=fname) func.attributes.add("alwaysinline") wrapper = wrapper_module.add_function(fnty, "__ufunc__." + func.name) arg_args, arg_dims, arg_steps, arg_data = wrapper.args arg_args.name = "args" arg_dims.name = "dims" arg_steps.name = "steps" arg_data.name = "data" builder = Builder(wrapper.append_basic_block("entry")) # Prepare Environment envname = context.get_env_name(cres.fndesc) env = cres.environment envptr = builder.load(context.declare_env_global(builder.module, envname)) # Emit loop loopcount = builder.load(arg_dims, name="loopcount") # Prepare inputs arrays = [] for i, typ in enumerate(signature.args): arrays.append(UArrayArg(context, builder, arg_args, arg_steps, i, typ)) # Prepare output out = UArrayArg(context, builder, arg_args, arg_steps, len(arrays), signature.return_type) # Setup indices offsets = [] zero = context.get_constant(types.intp, 0) for _ in arrays: p = cgutils.alloca_once(builder, intp_t) offsets.append(p) builder.store(zero, p) store_offset = cgutils.alloca_once(builder, intp_t) builder.store(zero, store_offset) unit_strided = cgutils.true_bit for ary in arrays: unit_strided = builder.and_(unit_strided, ary.is_unit_strided) pyapi = context.get_python_api(builder) if objmode: # General loop gil = pyapi.gil_ensure() with cgutils.for_range(builder, loopcount, intp=intp_t): build_obj_loop_body( context, func, builder, arrays, out, offsets, store_offset, signature, pyapi, envptr, env, ) pyapi.gil_release(gil) builder.ret_void() else: with builder.if_else(unit_strided) as (is_unit_strided, is_strided): with is_unit_strided: with cgutils.for_range(builder, loopcount, intp=intp_t) as loop: build_fast_loop_body( context, func, builder, arrays, out, offsets, store_offset, signature, loop.index, pyapi, env=envptr, ) with is_strided: # General loop with cgutils.for_range(builder, loopcount, intp=intp_t): build_slow_loop_body( context, func, builder, arrays, out, offsets, store_offset, signature, pyapi, env=envptr, ) builder.ret_void() del builder # Link and finalize wrapperlib.add_ir_module(wrapper_module) wrapperlib.add_linking_library(library) return _wrapper_info(library=wrapperlib, env=env, name=wrapper.name)
def init_gdb_codegen(cgctx, builder, signature, args, const_args, do_break=False): int8_t = ir.IntType(8) int32_t = ir.IntType(32) intp_t = ir.IntType(utils.MACHINE_BITS) char_ptr = ir.PointerType(ir.IntType(8)) zero_i32t = int32_t(0) mod = builder.module pid = cgutils.alloca_once(builder, int32_t, size=1) # 32bit pid, 11 char max + terminator pidstr = cgutils.alloca_once(builder, int8_t, size=12) # str consts intfmt = cgctx.insert_const_string(mod, '%d') gdb_str = cgctx.insert_const_string(mod, config.GDB_BINARY) attach_str = cgctx.insert_const_string(mod, 'attach') new_args = [] # add break point command to known location # this command file thing is due to commands attached to a breakpoint # requiring an interactive prompt # https://sourceware.org/bugzilla/show_bug.cgi?id=10079 new_args.extend(['-x', os.path.join(_path, 'cmdlang.gdb')]) # issue command to continue execution from sleep function new_args.extend(['-ex', 'c']) # then run the user defined args if any new_args.extend([x.literal_value for x in const_args]) cmdlang = [cgctx.insert_const_string(mod, x) for x in new_args] # insert getpid, getpid is always successful, call without concern! fnty = ir.FunctionType(int32_t, tuple()) getpid = mod.get_or_insert_function(fnty, "getpid") # insert snprintf # int snprintf(char *str, size_t size, const char *format, ...); fnty = ir.FunctionType(int32_t, (char_ptr, intp_t, char_ptr), var_arg=True) snprintf = mod.get_or_insert_function(fnty, "snprintf") # insert fork fnty = ir.FunctionType(int32_t, tuple()) fork = mod.get_or_insert_function(fnty, "fork") # insert execl fnty = ir.FunctionType(int32_t, (char_ptr, char_ptr), var_arg=True) execl = mod.get_or_insert_function(fnty, "execl") # insert sleep fnty = ir.FunctionType(int32_t, (int32_t, )) sleep = mod.get_or_insert_function(fnty, "sleep") # insert break point fnty = ir.FunctionType(ir.VoidType(), tuple()) breakpoint = mod.get_or_insert_function(fnty, "numba_gdb_breakpoint") # do the work parent_pid = builder.call(getpid, tuple()) builder.store(parent_pid, pid) pidstr_ptr = builder.gep(pidstr, [zero_i32t], inbounds=True) pid_val = builder.load(pid) # call snprintf to write the pid into a char * stat = builder.call(snprintf, (pidstr_ptr, intp_t(12), intfmt, pid_val)) invalid_write = builder.icmp_signed('>', stat, int32_t(12)) with builder.if_then(invalid_write, likely=False): msg = "Internal error: `snprintf` buffer would have overflowed." cgctx.call_conv.return_user_exc(builder, RuntimeError, (msg, )) # fork, check pids etc child_pid = builder.call(fork, tuple()) fork_failed = builder.icmp_signed('==', child_pid, int32_t(-1)) with builder.if_then(fork_failed, likely=False): msg = "Internal error: `fork` failed." cgctx.call_conv.return_user_exc(builder, RuntimeError, (msg, )) is_child = builder.icmp_signed('==', child_pid, zero_i32t) with builder.if_else(is_child) as (then, orelse): with then: # is child nullptr = ir.Constant(char_ptr, None) gdb_str_ptr = builder.gep(gdb_str, [zero_i32t], inbounds=True) attach_str_ptr = builder.gep(attach_str, [zero_i32t], inbounds=True) cgutils.printf(builder, "Attaching to PID: %s\n", pidstr) buf = (gdb_str_ptr, gdb_str_ptr, attach_str_ptr, pidstr_ptr) buf = buf + tuple(cmdlang) + (nullptr, ) builder.call(execl, buf) with orelse: # is parent builder.call(sleep, (int32_t(10), )) # if breaking is desired, break now if do_break is True: builder.call(breakpoint, tuple())
def alloc_boolean_result(builder, name='ret'): """ Allocate an uninitialized boolean result slot. """ ret = cgutils.alloca_once(builder, llvmlite.ir.IntType(1), name=name) return ret
def real_divmod_func_body(context, builder, vx, wx): # Reference Objects/floatobject.c # # float_divmod(PyObject *v, PyObject *w) # { # double vx, wx; # double div, mod, floordiv; # CONVERT_TO_DOUBLE(v, vx); # CONVERT_TO_DOUBLE(w, wx); # mod = fmod(vx, wx); # /* fmod is typically exact, so vx-mod is *mathematically* an # exact multiple of wx. But this is fp arithmetic, and fp # vx - mod is an approximation; the result is that div may # not be an exact integral value after the division, although # it will always be very close to one. # */ # div = (vx - mod) / wx; # if (mod) { # /* ensure the remainder has the same sign as the denominator */ # if ((wx < 0) != (mod < 0)) { # mod += wx; # div -= 1.0; # } # } # else { # /* the remainder is zero, and in the presence of signed zeroes # fmod returns different results across platforms; ensure # it has the same sign as the denominator; we'd like to do # "mod = wx * 0.0", but that may get optimized away */ # mod *= mod; /* hide "mod = +0" from optimizer */ # if (wx < 0.0) # mod = -mod; # } # /* snap quotient to nearest integral value */ # if (div) { # floordiv = floor(div); # if (div - floordiv > 0.5) # floordiv += 1.0; # } # else { # /* div is zero - get the same sign as the true quotient */ # div *= div; /* hide "div = +0" from optimizers */ # floordiv = div * vx / wx; /* zero w/ sign of vx/wx */ # } # return Py_BuildValue("(dd)", floordiv, mod); # } pmod = cgutils.alloca_once(builder, vx.type) pdiv = cgutils.alloca_once(builder, vx.type) pfloordiv = cgutils.alloca_once(builder, vx.type) mod = builder.frem(vx, wx) div = builder.fdiv(builder.fsub(vx, mod), wx) builder.store(mod, pmod) builder.store(div, pdiv) # Note the use of negative zero for proper negating with `ZERO - x` ZERO = vx.type(0.0) NZERO = vx.type(-0.0) ONE = vx.type(1.0) mod_istrue = builder.fcmp_unordered('!=', mod, ZERO) wx_ltz = builder.fcmp_ordered('<', wx, ZERO) mod_ltz = builder.fcmp_ordered('<', mod, ZERO) with builder.if_else(mod_istrue, likely=True) as (if_nonzero_mod, if_zero_mod): with if_nonzero_mod: # `mod` is non-zero or NaN # Ensure the remainder has the same sign as the denominator wx_ltz_ne_mod_ltz = builder.icmp(lc.ICMP_NE, wx_ltz, mod_ltz) with builder.if_then(wx_ltz_ne_mod_ltz): builder.store(builder.fsub(div, ONE), pdiv) builder.store(builder.fadd(mod, wx), pmod) with if_zero_mod: # `mod` is zero, select the proper sign depending on # the denominator's sign mod = builder.select(wx_ltz, NZERO, ZERO) builder.store(mod, pmod) del mod, div div = builder.load(pdiv) div_istrue = builder.fcmp(lc.FCMP_ONE, div, ZERO) with builder.if_then(div_istrue): realtypemap = {'float': types.float32, 'double': types.float64} realtype = realtypemap[str(wx.type)] floorfn = context.get_function(math.floor, typing.signature(realtype, realtype)) floordiv = floorfn(builder, [div]) floordivdiff = builder.fsub(div, floordiv) floordivincr = builder.fadd(floordiv, ONE) HALF = Constant.real(wx.type, 0.5) pred = builder.fcmp(lc.FCMP_OGT, floordivdiff, HALF) floordiv = builder.select(pred, floordivincr, floordiv) builder.store(floordiv, pfloordiv) with cgutils.ifnot(builder, div_istrue): div = builder.fmul(div, div) builder.store(div, pdiv) floordiv = builder.fdiv(builder.fmul(div, vx), wx) builder.store(floordiv, pfloordiv) return builder.load(pfloordiv), builder.load(pmod)
def codegen(context, builder, signature, args): array_type, idx_type, axis_type, extent_type = signature.args array, idx, axis, extent = args array = context.make_array(array_type)(context, builder, array) zero = context.get_constant(types.intp, 0) llvm_intp_t = context.get_value_type(types.intp) ndim = array_type.ndim view_shape = cgutils.alloca_once(builder, llvm_intp_t) view_stride = cgutils.alloca_once(builder, llvm_intp_t) # Final array indexes. We only know the slicing index at runtime # so we need to recreate idx but with zero at the slicing axis indices = cgutils.alloca_once(builder, llvm_intp_t, size=array_type.ndim) for ax in range(array_type.ndim): llvm_ax = context.get_constant(types.intp, ax) predicate = builder.icmp_unsigned("!=", llvm_ax, axis) with builder.if_else(predicate) as (not_equal, equal): with not_equal: # If this is not the slicing axis, # use the appropriate tuple index value = builder.extract_value(idx, ax) builder.store(value, builder.gep(indices, [llvm_ax])) with equal: # If this is the slicing axis, # store zero as the index. # Also record the stride and shape builder.store(zero, builder.gep(indices, [llvm_ax])) size = builder.extract_value(array.shape, ax) stride = builder.extract_value(array.strides, ax) if have_extent: ext_predicate = builder.icmp_signed(">=", extent, size) size = builder.select(ext_predicate, size, extent) builder.store(size, view_shape) builder.store(stride, view_stride) # Build a python list from indices tmp_indices = [] for i in range(ndim): i = context.get_constant(types.intp, i) tmp_indices.append(builder.load(builder.gep(indices, [i]))) # Get the data pointer obtained from indexing the array dataptr = cgutils.get_item_pointer(context, builder, array_type, array, tmp_indices, wraparound=True, boundscheck=True) # Set up the shape and stride. There'll only be one # dimension, corresponding to the axis along which we slice view_shapes = [builder.load(view_shape)] view_strides = [builder.load(view_stride)] # Make a view with the data pointer, shapes and strides retary = make_view(context, builder, array_type, array, return_type, dataptr, view_shapes, view_strides) result = retary._getvalue() return impl_ret_borrowed(context, builder, return_type, result)
def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, can_dynsized=False): elemcount = reduce(operator.mul, shape, 1) # Check for valid shape for this type of allocation. # Only 1d arrays can be dynamic. dynamic_smem = elemcount <= 0 and can_dynsized and len(shape) == 1 if elemcount <= 0 and not dynamic_smem: raise ValueError("array length <= 0") # Check that we support the requested dtype other_supported_type = isinstance(dtype, (types.Record, types.Boolean)) if dtype not in types.number_domain and not other_supported_type: raise TypeError("unsupported type: %s" % dtype) lldtype = context.get_data_type(dtype) laryty = Type.array(lldtype, elemcount) if addrspace == nvvm.ADDRSPACE_LOCAL: # Special case local address space allocation to use alloca # NVVM is smart enough to only use local memory if no register is # available dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name) else: lmod = builder.module # Create global variable in the requested address space gvmem = lmod.add_global_variable(laryty, symbol_name, addrspace) # Specify alignment to avoid misalignment bug align = context.get_abi_sizeof(lldtype) # Alignment is required to be a power of 2 for shared memory. If it is # not a power of 2 (e.g. for a Record array) then round up accordingly. gvmem.align = 1 << (align - 1).bit_length() if dynamic_smem: gvmem.linkage = lc.LINKAGE_EXTERNAL else: ## Comment out the following line to workaround a NVVM bug ## which generates a invalid symbol name when the linkage ## is internal and in some situation. ## See _get_unique_smem_id() # gvmem.linkage = lc.LINKAGE_INTERNAL gvmem.initializer = lc.Constant.undef(laryty) # Convert to generic address-space conv = nvvmutils.insert_addrspace_conv(lmod, Type.int(8), addrspace) addrspaceptr = gvmem.bitcast(Type.pointer(Type.int(8), addrspace)) dataptr = builder.call(conv, [addrspaceptr]) targetdata = _get_target_data(context) lldtype = context.get_data_type(dtype) itemsize = lldtype.get_abi_size(targetdata) # Compute strides laststride = itemsize rstrides = [] for i, lastsize in enumerate(reversed(shape)): rstrides.append(laststride) laststride *= lastsize strides = [s for s in reversed(rstrides)] kstrides = [context.get_constant(types.intp, s) for s in strides] # Compute shape if dynamic_smem: # Compute the shape based on the dynamic shared memory configuration. # Unfortunately NVVM does not provide an intrinsic for the # %dynamic_smem_size register, so we must read it using inline # assembly. get_dynshared_size = InlineAsm.get(Type.function(Type.int(), []), "mov.u32 $0, %dynamic_smem_size;", '=r', side_effect=True) dynsmem_size = builder.zext(builder.call(get_dynshared_size, []), Type.int(width=64)) # Only 1-D dynamic shared memory is supported so the following is a # sufficient construction of the shape kitemsize = context.get_constant(types.intp, itemsize) kshape = [builder.udiv(dynsmem_size, kitemsize)] else: kshape = [context.get_constant(types.intp, s) for s in shape] # Create array object ndim = len(shape) aryty = types.Array(dtype=dtype, ndim=ndim, layout='C') ary = context.make_array(aryty)(context, builder) context.populate_array(ary, data=builder.bitcast(dataptr, ary.data.type), shape=kshape, strides=kstrides, itemsize=context.get_constant(types.intp, itemsize), meminfo=None) return ary._getvalue()
def codegen(context, builder, sig, args): vtablety = ir.LiteralStructType([ ll_voidptr_type, # equal ll_voidptr_type, # key incref ll_voidptr_type, # key decref ll_voidptr_type, # val incref ll_voidptr_type, # val decref ]) setmethod_fnty = ir.FunctionType( ir.VoidType(), [ll_dict_type, vtablety.as_pointer()] ) setmethod_fn = ir.Function( builder.module, setmethod_fnty, name='numba_dict_set_method_table', ) dp = args[0] vtable = cgutils.alloca_once(builder, vtablety, zfill=True) # install key incref/decref key_equal_ptr = cgutils.gep_inbounds(builder, vtable, 0, 0) key_incref_ptr = cgutils.gep_inbounds(builder, vtable, 0, 1) key_decref_ptr = cgutils.gep_inbounds(builder, vtable, 0, 2) val_incref_ptr = cgutils.gep_inbounds(builder, vtable, 0, 3) val_decref_ptr = cgutils.gep_inbounds(builder, vtable, 0, 4) dm_key = context.data_model_manager[keyty.instance_type] if dm_key.contains_nrt_meminfo(): equal = _get_equal(context, builder.module, dm_key, 'dict') key_incref, key_decref = _get_incref_decref( context, builder.module, dm_key, 'dict' ) builder.store( builder.bitcast(equal, key_equal_ptr.type.pointee), key_equal_ptr, ) builder.store( builder.bitcast(key_incref, key_incref_ptr.type.pointee), key_incref_ptr, ) builder.store( builder.bitcast(key_decref, key_decref_ptr.type.pointee), key_decref_ptr, ) dm_val = context.data_model_manager[valty.instance_type] if dm_val.contains_nrt_meminfo(): val_incref, val_decref = _get_incref_decref( context, builder.module, dm_val, 'dict' ) builder.store( builder.bitcast(val_incref, val_incref_ptr.type.pointee), val_incref_ptr, ) builder.store( builder.bitcast(val_decref, val_decref_ptr.type.pointee), val_decref_ptr, ) builder.call(setmethod_fn, [dp, vtable])
def enqueue_kernel_and_copy_back(self, dim_bounds, sycl_queue_val): """ enqueue_kernel_and_copy_back(dim_bounds, sycl_queue_val) Submits the kernel to the specified queue, waits and then copies back any results to the host. Args: dim_bounds : An array of three tuple representing the starting offset, end offset and the stride (step) for each dimension of the input arrays. Every array in a parfor is of the same dimensionality and shape, thus ensuring the bounds are the same. sycl_queue_val : The SYCL queue on which the kernel is submitted. """ submit_fn = DpctlCAPIFnBuilder.get_dpctl_queue_submit_range( builder=self.builder, context=self.context) queue_wait_fn = DpctlCAPIFnBuilder.get_dpctl_queue_wait( builder=self.builder, context=self.context) event_del_fn = DpctlCAPIFnBuilder.get_dpctl_event_delete( builder=self.builder, context=self.context) memcpy_fn = DpctlCAPIFnBuilder.get_dpctl_queue_memcpy( builder=self.builder, context=self.context) free_fn = DpctlCAPIFnBuilder.get_dpctl_free_with_queue( builder=self.builder, context=self.context) event_wait_fn = DpctlCAPIFnBuilder.get_dpctl_event_wait( builder=self.builder, context=self.context) # the assumption is loop_ranges will always be less than or equal to 3 # dimensions num_dim = len(dim_bounds) if len(dim_bounds) < 4 else 3 # form the global range global_range = cgutils.alloca_once( self.builder, utils.get_llvm_type(context=self.context, type=types.uintp), size=self.context.get_constant(types.uintp, num_dim), name="global_range", ) intp_t = utils.get_llvm_type(context=self.context, type=types.intp) intp_ptr_t = utils.get_llvm_ptr_type(intp_t) for i in range(num_dim): start, stop, step = dim_bounds[i] if stop.type != utils.LLVMTypes.int64_t: stop = self.builder.sext(stop, utils.LLVMTypes.int64_t) # we reverse the global range to account for how sycl and opencl # range differs self.builder.store( stop, self.builder.gep( global_range, [ self.context.get_constant(types.uintp, (num_dim - 1) - i) ], ), ) args = [ self.builder.inttoptr( self.context.get_constant(types.uintp, self.kernel_addr), utils.get_llvm_type(context=self.context, type=types.voidptr), ), self.builder.load(sycl_queue_val), self.kernel_arg_array, self.kernel_arg_ty_array, self.context.get_constant(types.uintp, self.total_kernel_args), self.builder.bitcast(global_range, intp_ptr_t), self.context.get_constant(types.uintp, num_dim), self.builder.bitcast( utils.create_null_ptr(builder=self.builder, context=self.context), utils.get_llvm_type(context=self.context, type=types.voidptr), ), self.context.get_constant(types.uintp, 0), ] # Submit the kernel event_ref = self.builder.call(submit_fn, args) # Add a wait on the queue self.builder.call(queue_wait_fn, [self.builder.load(sycl_queue_val)]) # Note that the dpctl_queue_wait call waits on the event and then # decrements the ref count of the sycl::event C++ object. However, the # event object returned by the get_dpctl_queue_submit_range call still # needs to be explicitly deleted to free up the event object properly. self.builder.call(event_del_fn, [event_ref]) # read buffers back to host for write_buff in self.write_buffs: buffer_ptr, total_size, data_member = write_buff args = [ self.builder.load(sycl_queue_val), self.builder.bitcast( self.builder.load(data_member), utils.get_llvm_type(context=self.context, type=types.voidptr), ), self.builder.load(buffer_ptr), self.builder.load(total_size), ] # FIXME: In future, when the DctlQueue_Memcpy is made non-blocking # the returned event should be explicitly freed by calling # get_dpctl_event_delete. event_ref = self.builder.call(memcpy_fn, args) self.builder.call(event_wait_fn, [event_ref]) self.builder.call(event_del_fn, [event_ref]) self.builder.call( free_fn, [ self.builder.load(buffer_ptr), self.builder.load(sycl_queue_val), ], ) for read_buff in self.read_only_buffs: buffer_ptr, total_size, data_member = read_buff self.builder.call( free_fn, [ self.builder.load(buffer_ptr), self.builder.load(sycl_queue_val), ], )
def unicode_to_unicode_charseq(context, builder, fromty, toty, val): uni_str = cgutils.create_struct_proxy(fromty)(context, builder, value=val) src1 = builder.bitcast(uni_str.data, ir.IntType(8).as_pointer()) src2 = builder.bitcast(uni_str.data, ir.IntType(16).as_pointer()) src4 = builder.bitcast(uni_str.data, ir.IntType(32).as_pointer()) kind1 = builder.icmp_unsigned("==", uni_str.kind, ir.Constant(uni_str.kind.type, 1)) kind2 = builder.icmp_unsigned("==", uni_str.kind, ir.Constant(uni_str.kind.type, 2)) kind4 = builder.icmp_unsigned("==", uni_str.kind, ir.Constant(uni_str.kind.type, 4)) src_length = uni_str.length lty = context.get_value_type(toty) dstint_t = ir.IntType(8 * unicode_byte_width) dst_ptr = cgutils.alloca_once(builder, lty) dst = builder.bitcast(dst_ptr, dstint_t.as_pointer()) dst_length = ir.Constant(src_length.type, toty.count) is_shorter_value = builder.icmp_unsigned("<", src_length, dst_length) count = builder.select(is_shorter_value, src_length, dst_length) with builder.if_then(is_shorter_value): cgutils.memset( builder, dst, ir.Constant(src_length.type, toty.count * unicode_byte_width), 0, ) with builder.if_then(kind1): with cgutils.for_range(builder, count) as loop: in_ptr = builder.gep(src1, [loop.index]) in_val = builder.zext(builder.load(in_ptr), dstint_t) builder.store(in_val, builder.gep(dst, [loop.index])) with builder.if_then(kind2): if unicode_byte_width >= 2: with cgutils.for_range(builder, count) as loop: in_ptr = builder.gep(src2, [loop.index]) in_val = builder.zext(builder.load(in_ptr), dstint_t) builder.store(in_val, builder.gep(dst, [loop.index])) else: context.call_conv.return_user_exc( builder, ValueError, "cannot cast 16-bit unicode_type to %s-bit %s" % (unicode_byte_width * 8, toty), ) with builder.if_then(kind4): if unicode_byte_width >= 4: with cgutils.for_range(builder, count) as loop: in_ptr = builder.gep(src4, [loop.index]) in_val = builder.zext(builder.load(in_ptr), dstint_t) builder.store(in_val, builder.gep(dst, [loop.index])) else: context.call_conv.return_user_exc( builder, ValueError, "cannot cast 32-bit unicode_type to %s-bit %s" % (unicode_byte_width * 8, toty), ) return builder.load(dst_ptr)
def codegen(context, builder, sig, args): lir_res_type = context.get_value_type(number_dtype) res_ptr = cgutils.alloca_once(builder, lir_res_type) return builder.bitcast(res_ptr, cgutils.voidptr_t)
def from_data(self, builder, value): stack = cgutils.alloca_once(builder, value.type) builder.store(value, stack) return stack
def _randrange_impl(context, builder, start, stop, step, state): state_ptr = get_state_ptr(context, builder, state) ty = stop.type zero = ir.Constant(ty, 0) one = ir.Constant(ty, 1) nptr = cgutils.alloca_once(builder, ty, name="n") # n = stop - start builder.store(builder.sub(stop, start), nptr) with builder.if_then(builder.icmp_signed("<", step, zero)): # n = (n + step + 1) // step w = builder.add(builder.add(builder.load(nptr), step), one) n = builder.sdiv(w, step) builder.store(n, nptr) with builder.if_then(builder.icmp_signed(">", step, one)): # n = (n + step - 1) // step w = builder.sub(builder.add(builder.load(nptr), step), one) n = builder.sdiv(w, step) builder.store(n, nptr) n = builder.load(nptr) with cgutils.if_unlikely(builder, builder.icmp_signed("<=", n, zero)): # n <= 0 msg = "empty range for randrange()" context.call_conv.return_user_exc(builder, ValueError, (msg, )) fnty = ir.FunctionType(ty, [ty, cgutils.true_bit.type]) fn = builder.function.module.get_or_insert_function( fnty, "llvm.ctlz.%s" % ty) # Since the upper bound is exclusive, we need to subtract one before # calculating the number of bits. This leads to a special case when # n == 1; there's only one possible result, so we don't need bits from # the PRNG. This case is handled separately towards the end of this # function. CPython's implementation is simpler and just runs another # iteration of the while loop when the resulting number is too large # instead of subtracting one, to avoid needing to handle a special # case. Thus, we only perform this subtraction for the NumPy case. nm1 = builder.sub(n, one) if state == "np" else n nbits = builder.trunc(builder.call(fn, [nm1, cgutils.true_bit]), int32_t) nbits = builder.sub(ir.Constant(int32_t, ty.width), nbits) rptr = cgutils.alloca_once(builder, ty, name="r") def get_num(): bbwhile = builder.append_basic_block("while") bbend = builder.append_basic_block("while.end") builder.branch(bbwhile) builder.position_at_end(bbwhile) r = get_next_int(context, builder, state_ptr, nbits, state == "np") r = builder.trunc(r, ty) too_large = builder.icmp_signed(">=", r, n) builder.cbranch(too_large, bbwhile, bbend) builder.position_at_end(bbend) builder.store(r, rptr) if state == "np": # Handle n == 1 case, per previous comment. with builder.if_else(builder.icmp_signed("==", n, one)) as (is_one, is_not_one): with is_one: builder.store(zero, rptr) with is_not_one: get_num() else: get_num() return builder.add(start, builder.mul(builder.load(rptr), step))