def insert_addrspace_conv(self, builder, ptr, addrspace): """ Perform addrspace conversion according to the NVVM spec """ lmod = builder.module base_type = ptr.type.pointee conv = nvvmutils.insert_addrspace_conv(lmod, base_type, addrspace) return builder.call(conv, [ptr])
def make_constant_array(self, builder, aryty, arr): """ Unlike the parent version. This returns a a pointer in the constant addrspace. """ lmod = builder.module constvals = [ self.get_constant(types.byte, i) for i in iter(arr.tobytes(order='A')) ] constaryty = ir.ArrayType(ir.IntType(8), len(constvals)) constary = ir.Constant(constaryty, constvals) addrspace = nvvm.ADDRSPACE_CONSTANT gv = cgutils.add_global_variable(lmod, constary.type, "_cudapy_cmem", addrspace=addrspace) gv.linkage = 'internal' gv.global_constant = True gv.initializer = constary # Preserve the underlying alignment lldtype = self.get_data_type(aryty.dtype) align = self.get_abi_sizeof(lldtype) gv.align = 2**(align - 1).bit_length() # Convert to generic address-space conv = nvvmutils.insert_addrspace_conv(lmod, ir.IntType(8), addrspace) addrspaceptr = gv.bitcast(ir.PointerType(ir.IntType(8), addrspace)) genptr = builder.call(conv, [addrspaceptr]) # Create array object ary = self.make_array(aryty)(self, builder) kshape = [self.get_constant(types.intp, s) for s in arr.shape] kstrides = [self.get_constant(types.intp, s) for s in arr.strides] self.populate_array(ary, data=builder.bitcast(genptr, ary.data.type), shape=kshape, strides=kstrides, itemsize=ary.itemsize, parent=ary.parent, meminfo=None) return ary._getvalue()
def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, can_dynsized=False): elemcount = reduce(operator.mul, shape) lldtype = context.get_data_type(dtype) laryty = Type.array(lldtype, elemcount) if addrspace == nvvm.ADDRSPACE_LOCAL: # Special case local addrespace allocation to use alloca # NVVM is smart enough to only use local memory if no register is # available dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name) else: lmod = builder.module # Create global variable in the requested address-space gvmem = lmod.add_global_variable(laryty, symbol_name, addrspace) # Specify alignment to avoid misalignment bug align = context.get_abi_sizeof(lldtype) # Alignment is required to be a power of 2 for shared memory. If it is # not a power of 2 (e.g. for a Record array) then round up accordingly. gvmem.align = 1 << (align - 1 ).bit_length() if elemcount <= 0: if can_dynsized: # dynamic shared memory gvmem.linkage = lc.LINKAGE_EXTERNAL else: raise ValueError("array length <= 0") else: ## Comment out the following line to workaround a NVVM bug ## which generates a invalid symbol name when the linkage ## is internal and in some situation. ## See _get_unique_smem_id() # gvmem.linkage = lc.LINKAGE_INTERNAL gvmem.initializer = lc.Constant.undef(laryty) other_supported_type = isinstance(dtype, (types.Record, types.Boolean)) if dtype not in types.number_domain and not other_supported_type: raise TypeError("unsupported type: %s" % dtype) # Convert to generic address-space conv = nvvmutils.insert_addrspace_conv(lmod, Type.int(8), addrspace) addrspaceptr = gvmem.bitcast(Type.pointer(Type.int(8), addrspace)) dataptr = builder.call(conv, [addrspaceptr]) return _make_array(context, builder, dataptr, dtype, shape)
def ptx_cmem_arylike(context, builder, sig, args): lmod = builder.module [arr] = args aryty = sig.return_type constvals = [ context.get_constant(types.byte, i) for i in iter(arr.tobytes(order="A")) ] constary = lc.Constant.array(Type.int(8), constvals) addrspace = nvvm.ADDRSPACE_CONSTANT gv = lmod.add_global_variable(constary.type, name="_cudapy_cmem", addrspace=addrspace) gv.linkage = lc.LINKAGE_INTERNAL gv.global_constant = True gv.initializer = constary # Preserve the underlying alignment lldtype = context.get_data_type(aryty.dtype) align = context.get_abi_sizeof(lldtype) gv.align = 2**(align - 1).bit_length() # Convert to generic address-space conv = nvvmutils.insert_addrspace_conv(lmod, Type.int(8), addrspace) addrspaceptr = gv.bitcast(Type.pointer(Type.int(8), addrspace)) genptr = builder.call(conv, [addrspaceptr]) # Create array object ary = context.make_array(aryty)(context, builder) kshape = [context.get_constant(types.intp, s) for s in arr.shape] kstrides = [context.get_constant(types.intp, s) for s in arr.strides] context.populate_array( ary, data=builder.bitcast(genptr, ary.data.type), shape=cgutils.pack_array(builder, kshape), strides=cgutils.pack_array(builder, kstrides), itemsize=ary.itemsize, parent=ary.parent, meminfo=None, ) return ary._getvalue()
def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, can_dynsized=False): elemcount = reduce(operator.mul, shape, 1) # Check for valid shape for this type of allocation. # Only 1d arrays can be dynamic. dynamic_smem = elemcount <= 0 and can_dynsized and len(shape) == 1 if elemcount <= 0 and not dynamic_smem: raise ValueError("array length <= 0") # Check that we support the requested dtype other_supported_type = isinstance(dtype, (types.Record, types.Boolean)) if dtype not in types.number_domain and not other_supported_type: raise TypeError("unsupported type: %s" % dtype) lldtype = context.get_data_type(dtype) laryty = Type.array(lldtype, elemcount) if addrspace == nvvm.ADDRSPACE_LOCAL: # Special case local address space allocation to use alloca # NVVM is smart enough to only use local memory if no register is # available dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name) else: lmod = builder.module # Create global variable in the requested address space gvmem = lmod.add_global_variable(laryty, symbol_name, addrspace) # Specify alignment to avoid misalignment bug align = context.get_abi_sizeof(lldtype) # Alignment is required to be a power of 2 for shared memory. If it is # not a power of 2 (e.g. for a Record array) then round up accordingly. gvmem.align = 1 << (align - 1).bit_length() if dynamic_smem: gvmem.linkage = lc.LINKAGE_EXTERNAL else: ## Comment out the following line to workaround a NVVM bug ## which generates a invalid symbol name when the linkage ## is internal and in some situation. ## See _get_unique_smem_id() # gvmem.linkage = lc.LINKAGE_INTERNAL gvmem.initializer = lc.Constant.undef(laryty) # Convert to generic address-space conv = nvvmutils.insert_addrspace_conv(lmod, Type.int(8), addrspace) addrspaceptr = gvmem.bitcast(Type.pointer(Type.int(8), addrspace)) dataptr = builder.call(conv, [addrspaceptr]) targetdata = _get_target_data(context) lldtype = context.get_data_type(dtype) itemsize = lldtype.get_abi_size(targetdata) # Compute strides laststride = itemsize rstrides = [] for i, lastsize in enumerate(reversed(shape)): rstrides.append(laststride) laststride *= lastsize strides = [s for s in reversed(rstrides)] kstrides = [context.get_constant(types.intp, s) for s in strides] # Compute shape if dynamic_smem: # Compute the shape based on the dynamic shared memory configuration. # Unfortunately NVVM does not provide an intrinsic for the # %dynamic_smem_size register, so we must read it using inline # assembly. get_dynshared_size = InlineAsm.get(Type.function(Type.int(), []), "mov.u32 $0, %dynamic_smem_size;", '=r', side_effect=True) dynsmem_size = builder.zext(builder.call(get_dynshared_size, []), Type.int(width=64)) # Only 1-D dynamic shared memory is supported so the following is a # sufficient construction of the shape kitemsize = context.get_constant(types.intp, itemsize) kshape = [builder.udiv(dynsmem_size, kitemsize)] else: kshape = [context.get_constant(types.intp, s) for s in shape] # Create array object ndim = len(shape) aryty = types.Array(dtype=dtype, ndim=ndim, layout='C') ary = context.make_array(aryty)(context, builder) context.populate_array(ary, data=builder.bitcast(dataptr, ary.data.type), shape=kshape, strides=kstrides, itemsize=context.get_constant(types.intp, itemsize), meminfo=None) return ary._getvalue()