def _make_array(context, builder, dataptr, dtype, shape, layout='C'):
    ndim = len(shape)

    # Create array object
    aryty = types.Array(dtype=dtype, ndim=ndim, layout='C')
    ary = context.make_array(aryty)(context, builder)

    targetdata = _get_target_data(context)
    lldtype = context.get_data_type(dtype)
    itemsize = lldtype.get_abi_size(targetdata)

    # Compute strides
    rstrides = [itemsize]
    for i, lastsize in enumerate(reversed(shape[1:])):
        rstrides.append(lastsize * rstrides[-1])
    strides = [s for s in reversed(rstrides)]

    kshape = [context.get_constant(types.intp, s) for s in shape]
    kstrides = [context.get_constant(types.intp, s) for s in strides]

    context.populate_array(ary,
                           data=builder.bitcast(dataptr, ary.data.type),
                           shape=cgutils.pack_array(builder, kshape),
                           strides=cgutils.pack_array(builder, kstrides),
                           itemsize=context.get_constant(types.intp, itemsize),
                           meminfo=None)
    return ary._getvalue()
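# Illustration (not part of the original module): the stride loop above simply
# builds ordinary C-contiguous strides from the item size. A minimal
# pure-Python sketch of the same arithmetic, using a hypothetical helper name:
def _c_contig_strides_example(shape, itemsize):
    # The last axis advances by one item; each earlier axis advances by the
    # product of all later extents times the item size.
    rstrides = [itemsize]
    for lastsize in reversed(shape[1:]):
        rstrides.append(lastsize * rstrides[-1])
    return tuple(reversed(rstrides))

# e.g. _c_contig_strides_example((2, 3, 4), 8) == (96, 32, 8)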
def _shape_and_strides(self, context, builder):
    # Set shape and strides for a 1D size 1 array
    one = context.get_constant(types.intp, 1)
    zero = context.get_constant(types.intp, 0)
    shape = cgutils.pack_array(builder, [one])
    strides = cgutils.pack_array(builder, [zero])
    return shape, strides
def cuda_gridsize(context, builder, sig, args):
    restype = sig.return_type
    nx = _nthreads_for_dim(builder, 'x')

    if restype == types.int32:
        return nx
    elif isinstance(restype, types.UniTuple):
        ny = _nthreads_for_dim(builder, 'y')

        if restype.count == 2:
            return cgutils.pack_array(builder, (nx, ny))
        elif restype.count == 3:
            nz = _nthreads_for_dim(builder, 'z')
            return cgutils.pack_array(builder, (nx, ny, nz))

    # Fallthrough to here indicates unexpected return type or tuple length
    raise ValueError('Unexpected return type %s of cuda.gridsize' % restype)
def make_constant_array(self, builder, aryty, arr):
    """
    Unlike the parent version, this returns a pointer in the constant
    addrspace.
    """
    lmod = builder.module

    constvals = [
        self.get_constant(types.byte, i)
        for i in iter(arr.tobytes(order='A'))
    ]
    constary = lc.Constant.array(Type.int(8), constvals)

    addrspace = nvvm.ADDRSPACE_CONSTANT
    gv = lmod.add_global_variable(constary.type, name="_cudapy_cmem",
                                  addrspace=addrspace)
    gv.linkage = lc.LINKAGE_INTERNAL
    gv.global_constant = True
    gv.initializer = constary

    # Preserve the underlying alignment
    lldtype = self.get_data_type(aryty.dtype)
    align = self.get_abi_sizeof(lldtype)
    gv.align = 2 ** (align - 1).bit_length()

    # Convert to generic address-space
    conv = nvvmutils.insert_addrspace_conv(lmod, Type.int(8), addrspace)
    addrspaceptr = gv.bitcast(Type.pointer(Type.int(8), addrspace))
    genptr = builder.call(conv, [addrspaceptr])

    # Create array object
    ary = self.make_array(aryty)(self, builder)
    kshape = [self.get_constant(types.intp, s) for s in arr.shape]
    kstrides = [self.get_constant(types.intp, s) for s in arr.strides]
    self.populate_array(ary,
                        data=builder.bitcast(genptr, ary.data.type),
                        shape=cgutils.pack_array(builder, kshape),
                        strides=cgutils.pack_array(builder, kstrides),
                        itemsize=ary.itemsize,
                        parent=ary.parent,
                        meminfo=None)
    return ary._getvalue()
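# Illustration (not from the original source): the expression
# 2 ** (align - 1).bit_length() used above rounds the element's ABI size up to
# the next power of two before assigning it as the global's alignment. A
# hypothetical helper showing the same arithmetic:
def _round_up_to_pow2_example(nbytes):
    return 2 ** (nbytes - 1).bit_length()

# e.g. _round_up_to_pow2_example(8) == 8, _round_up_to_pow2_example(12) == 16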
def cuda_grid(context, builder, sig, args):
    restype = sig.return_type
    if restype == types.int32:
        return nvvmutils.get_global_id(builder, dim=1)
    elif isinstance(restype, types.UniTuple):
        ids = nvvmutils.get_global_id(builder, dim=restype.count)
        return cgutils.pack_array(builder, ids)
    else:
        raise ValueError('Unexpected return type %s from cuda.grid' % restype)
def unituple_constant(context, builder, ty, pyval):
    """
    Create a homogeneous tuple constant.
    """
    consts = [context.get_constant_generic(builder, ty.dtype, v)
              for v in pyval]
    return impl_ret_borrowed(
        context, builder, ty, cgutils.pack_array(builder, consts),
    )
def ptx_gridsize2d(context, builder, sig, args):
    assert len(args) == 1
    ntidx = nvvmutils.call_sreg(builder, "ntid.x")
    nctaidx = nvvmutils.call_sreg(builder, "nctaid.x")

    ntidy = nvvmutils.call_sreg(builder, "ntid.y")
    nctaidy = nvvmutils.call_sreg(builder, "nctaid.y")

    r1 = builder.mul(ntidx, nctaidx)
    r2 = builder.mul(ntidy, nctaidy)
    return cgutils.pack_array(builder, [r1, r2])
def core(context, builder, sig, args):
    lmod = builder.module

    fargtys = []
    for arg in prototype_args:
        ty = context.get_value_type(arg.ty)
        if arg.is_ptr:
            ty = ty.as_pointer()
        fargtys.append(ty)

    fretty = context.get_value_type(retty)

    fnty = Type.function(fretty, fargtys)
    fn = lmod.get_or_insert_function(fnty, name=func)

    # For returned values that are returned through a pointer, we need to
    # allocate variables on the stack and pass a pointer to them.
    actual_args = []
    virtual_args = []
    arg_idx = 0
    for arg in prototype_args:
        if arg.is_ptr:
            # Allocate space for return value and add to args
            tmp_arg = cgutils.alloca_once(builder,
                                          context.get_value_type(arg.ty))
            actual_args.append(tmp_arg)
            virtual_args.append(tmp_arg)
        else:
            actual_args.append(args[arg_idx])
            arg_idx += 1

    ret = builder.call(fn, actual_args)

    # Following the call, we need to assemble the returned values into a
    # tuple for returning back to the caller.
    tuple_args = []
    if retty != types.void:
        tuple_args.append(ret)
    for arg in virtual_args:
        tuple_args.append(builder.load(arg))

    if isinstance(nb_retty, types.UniTuple):
        return cgutils.pack_array(builder, tuple_args)
    else:
        return cgutils.pack_struct(builder, tuple_args)
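# Illustration (not from the original source): this wrapper handles libdevice
# functions that return results through pointer arguments. For example, the
# libdevice routine
#   void __nv_sincosf(float x, float* sptr, float* cptr)
# takes one value argument and two pointer out-arguments; under this scheme it
# is exposed to the caller as a single-argument function whose results are
# loaded from the stack slots and packed into a tuple, roughly
#   sin_x, cos_x = sincosf(x)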
def real_divmod_impl(context, builder, sig, args, loc=None):
    x, y = args
    quot = cgutils.alloca_once(builder, x.type, name="quot")
    rem = cgutils.alloca_once(builder, x.type, name="rem")

    with builder.if_else(cgutils.is_scalar_zero(builder, y), likely=False
                         ) as (if_zero, if_non_zero):
        with if_zero:
            if not context.error_model.fp_zero_division(
                    builder, ("modulo by zero",), loc):
                # No exception raised => compute the nan result,
                # and set the FP exception word for Numpy warnings.
                q = builder.fdiv(x, y)
                r = builder.frem(x, y)
                builder.store(q, quot)
                builder.store(r, rem)
        with if_non_zero:
            q, r = real_divmod(context, builder, x, y)
            builder.store(q, quot)
            builder.store(r, rem)

    return cgutils.pack_array(builder,
                              (builder.load(quot), builder.load(rem)))
def as_data(self, builder, value):
    values = [builder.load(cgutils.gep_inbounds(builder, value, i))
              for i in range(self._fe_type.count)]
    return cgutils.pack_array(builder, values)
def _shape_and_strides(self, context, builder):
    shape = cgutils.pack_array(builder, self.shape)
    strides = cgutils.pack_array(builder, self.strides)
    return shape, strides
def int_divmod_impl(context, builder, sig, args):
    quot, rem = _int_divmod_impl(context, builder, sig, args,
                                 "integer divmod by zero")
    return cgutils.pack_array(builder,
                              (builder.load(quot), builder.load(rem)))
def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
                   can_dynsized=False):
    elemcount = reduce(operator.mul, shape)

    # Check for valid shape for this type of allocation
    dynamic_smem = elemcount <= 0 and can_dynsized
    if elemcount <= 0 and not dynamic_smem:
        raise ValueError("array length <= 0")

    # Check that we support the requested dtype
    other_supported_type = isinstance(dtype, (types.Record, types.Boolean))
    if dtype not in types.number_domain and not other_supported_type:
        raise TypeError("unsupported type: %s" % dtype)

    lldtype = context.get_data_type(dtype)
    laryty = Type.array(lldtype, elemcount)

    if addrspace == nvvm.ADDRSPACE_LOCAL:
        # Special case local address space allocation to use alloca
        # NVVM is smart enough to only use local memory if no register is
        # available
        dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
    else:
        lmod = builder.module

        # Create global variable in the requested address space
        gvmem = lmod.add_global_variable(laryty, symbol_name, addrspace)
        # Specify alignment to avoid misalignment bug
        align = context.get_abi_sizeof(lldtype)
        # Alignment is required to be a power of 2 for shared memory. If it is
        # not a power of 2 (e.g. for a Record array) then round up accordingly.
        gvmem.align = 1 << (align - 1).bit_length()

        if dynamic_smem:
            gvmem.linkage = lc.LINKAGE_EXTERNAL
        else:
            ## Comment out the following line to work around an NVVM bug
            ## which generates an invalid symbol name when the linkage
            ## is internal and in some situations.
            ## See _get_unique_smem_id()
            # gvmem.linkage = lc.LINKAGE_INTERNAL

            gvmem.initializer = lc.Constant.undef(laryty)

        # Convert to generic address-space
        conv = nvvmutils.insert_addrspace_conv(lmod, Type.int(8), addrspace)
        addrspaceptr = gvmem.bitcast(Type.pointer(Type.int(8), addrspace))
        dataptr = builder.call(conv, [addrspaceptr])

    targetdata = _get_target_data(context)
    lldtype = context.get_data_type(dtype)
    itemsize = lldtype.get_abi_size(targetdata)

    # Compute strides
    rstrides = [itemsize]
    for i, lastsize in enumerate(reversed(shape[1:])):
        rstrides.append(lastsize * rstrides[-1])
    strides = [s for s in reversed(rstrides)]
    kstrides = [context.get_constant(types.intp, s) for s in strides]

    # Compute shape
    if dynamic_smem:
        # Compute the shape based on the dynamic shared memory configuration.
        # Unfortunately NVVM does not provide an intrinsic for the
        # %dynamic_smem_size register, so we must read it using inline
        # assembly.
        get_dynshared_size = InlineAsm.get(
            Type.function(Type.int(), []),
            "mov.u32 $0, %dynamic_smem_size;",
            "=r", side_effect=True,
        )
        dynsmem_size = builder.zext(builder.call(get_dynshared_size, []),
                                    Type.int(width=64))
        # Only 1-D dynamic shared memory is supported so the following is a
        # sufficient construction of the shape
        kitemsize = context.get_constant(types.intp, itemsize)
        kshape = [builder.udiv(dynsmem_size, kitemsize)]
    else:
        kshape = [context.get_constant(types.intp, s) for s in shape]

    # Create array object
    ndim = len(shape)
    aryty = types.Array(dtype=dtype, ndim=ndim, layout="C")
    ary = context.make_array(aryty)(context, builder)

    context.populate_array(
        ary,
        data=builder.bitcast(dataptr, ary.data.type),
        shape=cgutils.pack_array(builder, kshape),
        strides=cgutils.pack_array(builder, kstrides),
        itemsize=context.get_constant(types.intp, itemsize),
        meminfo=None,
    )
    return ary._getvalue()
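# Illustration (not from the original source): for dynamic shared memory the
# 1-D shape is simply %dynamic_smem_size divided by the item size. Worked
# example of that arithmetic, assuming an 8-byte (float64) element and a
# hypothetical launch configured with 4096 bytes of dynamic shared memory:
dynsmem_bytes_example = 4096  # hypothetical dynamic shared memory size
itemsize_example = 8          # np.dtype(np.float64).itemsize
shape_example = (dynsmem_bytes_example // itemsize_example,)  # == (512,)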
def ptx_grid3d(context, builder, sig, args):
    assert len(args) == 1
    r1, r2, r3 = nvvmutils.get_global_id(builder, dim=3)
    return cgutils.pack_array(builder, [r1, r2, r3])