def __init__(self, shape, strides, dtype, stream=0, writeback=None,
             gpu_data=None):
    """
    Args
    ----

    shape
        Array shape; a bare int is treated as a 1-tuple.
    strides
        Array strides; a bare int is treated as a 1-tuple.
    dtype
        Data type, given as anything ``np.dtype`` can coerce.
    stream
        CUDA stream.
    writeback
        Deprecated.
    gpu_data
        User-provided device memory for the ndarray data buffer. When it
        is omitted and the array is non-empty, a buffer is allocated from
        the current context. Empty arrays always get a NULL pointer.

    Raises
    ------
    ValueError
        If ``shape`` and ``strides`` do not have the same length.
    """
    # A scalar shape/stride is shorthand for a one-dimensional array.
    shape = (shape,) if isinstance(shape, int) else shape
    strides = (strides,) if isinstance(strides, int) else strides
    dtype = np.dtype(dtype)

    self.ndim = len(shape)
    if self.ndim != len(strides):
        raise ValueError('strides not match ndim')

    self._dummy = dummyarray.Array.from_desc(0, shape, strides,
                                             dtype.itemsize)
    self.shape = tuple(shape)
    self.strides = tuple(strides)
    self.dtype = dtype
    self.size = int(functools.reduce(operator.mul, self.shape, 1))

    # Set up the device buffer backing this array.
    if not self.size > 0:
        # Nothing to store: represent the buffer with a NULL pointer.
        gpu_data = _driver.MemoryPointer(context=devices.get_context(),
                                         pointer=c_void_p(0), size=0)
        self.alloc_size = 0
    elif gpu_data is None:
        # No buffer supplied: size it from the layout and allocate it.
        self.alloc_size = _driver.memory_size_from_info(
            self.shape, self.strides, self.dtype.itemsize)
        gpu_data = devices.get_context().memalloc(self.alloc_size)
    else:
        # Buffer supplied by the caller: just record how large it is.
        self.alloc_size = _driver.device_memory_size(gpu_data)

    self.gpu_data = gpu_data
    self.__writeback = writeback    # should deprecate the use of this
    self.stream = stream
def open(self):
    """
    Open the IPC handle in the current context and wrap the resulting
    device pointer in a new *DeviceNDArray* described by the stored array
    descriptor. The new array shares the allocation made by the original
    process.

    Must not be used on the original process.
    """
    handle = self._ipc_handle
    device_pointer = handle.open(devices.get_context())
    array_desc = self._array_desc
    return DeviceNDArray(gpu_data=device_pointer, **array_desc)
def _do_setitem(self, key, value, stream=0):
    """
    Assign *value* to the record field named *key*.

    The field's dtype and byte offset are looked up in ``self.dtype.fields``,
    a view of the device buffer at that offset is wrapped as the
    destination, and the value (converted to the field's scalar type) is
    copied into it device-to-device.
    """
    stream = self._default_stream(stream)

    # With neither a default stream on the record nor a user-supplied one,
    # fall back to the context's default stream and wait for the copy to
    # finish before returning.
    synchronous = not stream
    if synchronous:
        stream = devices.get_context().get_default_stream()

    # (1) destination: a view of our buffer positioned at the field
    field_type, field_offset = self.dtype.fields[key]
    field_mem = self.gpu_data.view(field_offset)
    lhs = type(self)(dtype=field_type, stream=stream, gpu_data=field_mem)

    # (2) source: the value coerced to the field's scalar type, on device
    host_scalar = lhs.dtype.type(value)
    rhs, _ = auto_device(host_scalar, stream=stream)

    # (3) copy one item's worth of bytes
    _driver.device_to_device(lhs, rhs, rhs.dtype.itemsize, stream)

    if synchronous:
        stream.synchronize()
def ndarray_device_allocate_data(ary):
    """
    Allocate a gpu data buffer sized by ``driver.host_memory_size(ary)``
    from the current context and return it.
    """
    nbytes = driver.host_memory_size(ary)
    return devices.get_context().memalloc(nbytes)
def setUp(self):
    """
    Require at least one GPU, cache the current context, and pick the PTX
    source matching the device's major compute capability.
    """
    gpu_count = len(devices.gpus)
    self.assertTrue(gpu_count > 0)

    self.context = devices.get_context()
    major, _minor = self.context.device.compute_capability
    # ptx2 for compute capability 2.x and newer, ptx1 otherwise.
    self.ptx = ptx2 if major >= 2 else ptx1
def get_ipc_handle(self):
    """
    Build an *IpcArrayHandle* for this array's device allocation. The
    handle is safe to serialize and send to another process, which can
    then share the local allocation.

    Note: this feature is only available on Linux.
    """
    context = devices.get_context()
    ipch = context.get_ipc_handle(self.gpu_data)
    # Everything the receiving side needs to rebuild the array layout.
    desc = {
        'shape': self.shape,
        'strides': self.strides,
        'dtype': self.dtype,
    }
    return IpcArrayHandle(ipc_handle=ipch, array_desc=desc)
def _do_setitem(self, key, value, stream=0):
    """
    Assign *value* to the part of this array selected by *key*.

    The selection is wrapped as a device array sharing this array's buffer.
    *value* is moved to the device, padded with leading length-1 axes up to
    the selection's rank, and written by the assignment kernel.

    Raises ValueError if *value* has more dimensions than the selection, or
    if one of its axes has a length that is neither 1 nor the length of the
    matching selection axis.
    """
    stream = self._default_stream(stream)

    # With neither a default stream on the array nor a user-supplied one,
    # run the assignment kernel on the context's default stream and wait
    # for it to finish before returning.
    synchronous = not stream
    if synchronous:
        stream = devices.get_context().get_default_stream()

    # (1) destination: a view over the indexed region of our buffer
    indexed = self._dummy.__getitem__(key)
    newdata = self.gpu_data.view(*indexed.extent)

    if isinstance(indexed, dummyarray.Element):
        # a single element is treated as a 0d array
        shape, strides = (), ()
    else:
        shape, strides = indexed.shape, indexed.strides

    lhs = type(self)(shape=shape, strides=strides, dtype=self.dtype,
                     gpu_data=newdata, stream=stream)

    # (2) source: bring the value onto the device and line up its rank
    rhs, _ = auto_device(value, stream=stream, user_explicit=True)
    if rhs.ndim > lhs.ndim:
        raise ValueError("Can't assign %s-D array to %s-D self" % (
            rhs.ndim,
            lhs.ndim))

    padded_shape = np.ones(lhs.ndim, dtype=np.int64)
    # Slice from the front offset; negative indices would not work if
    # rhs.ndim == 0.
    padded_shape[lhs.ndim - rhs.ndim:] = rhs.shape
    rhs = rhs.reshape(*padded_shape)

    for axis, (lhs_dim, rhs_dim) in enumerate(zip(lhs.shape, rhs.shape)):
        if rhs_dim == 1 or lhs_dim == rhs_dim:
            continue
        raise ValueError("Can't copy sequence with size %d to array "
                         "axis %d with dimension %d" % (
                             rhs_dim,
                             axis,
                             lhs_dim))

    # (3) launch the assignment kernel over every destination element
    n_elements = functools.reduce(operator.mul, lhs.shape, 1)
    kernel = _assign_kernel(lhs.ndim)
    kernel.forall(n_elements, stream=stream)(lhs, rhs)

    if synchronous:
        stream.synchronize()
def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
    """
    Record the launch parameters and, when
    ``config.CUDA_LOW_OCCUPANCY_WARNINGS`` is set, emit a
    ``NumbaPerformanceWarning`` if the total number of blocks in the grid
    is below twice the device's multiprocessor count.
    """
    self.dispatcher = dispatcher
    self.griddim = griddim
    self.blockdim = blockdim
    self.stream = stream
    self.sharedmem = sharedmem

    if not config.CUDA_LOW_OCCUPANCY_WARNINGS:
        return

    smcount = get_context().device.MULTIPROCESSOR_COUNT
    threshold = 2 * smcount
    grid_size = griddim[0] * griddim[1] * griddim[2]
    if grid_size < threshold:
        template = ("Grid size ({grid}) < 2 * SM count ({sm}) "
                    "will likely result in GPU under utilization due "
                    "to low occupancy.")
        warn(NumbaPerformanceWarning(
            template.format(grid=grid_size, sm=threshold)))
def _compute_thread_per_block(self, kernel):
    """
    Return the number of threads per block to launch *kernel* with.

    A non-zero ``self.thread_per_block`` is returned as-is. Otherwise the
    context is asked for the maximum potential block size of the kernel's
    cufunc, capped at 1024 and using ``self.sharedmem`` as the memory size.
    """
    requested = self.thread_per_block
    if requested != 0:
        # An explicit user setting always wins.
        return requested

    ctx = get_context()
    # Kernel is specialized, so it has exactly one definition; take it to
    # reach the cufunc held by its code library.
    defn = next(iter(kernel.overloads.values()))
    _, tpb = ctx.get_max_potential_block_size(
        func=defn._codelibrary.get_cufunc(),
        b2d_func=0,     # dynamic-shared memory is constant to blksz
        memsize=self.sharedmem,
        blocksizelimit=1024,
    )
    return tpb
def test_device_get_uuid(self):
    """Check that the device UUID has the expected textual form."""
    # Expected form:
    #
    #     GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643
    #
    # i.e. "GPU-" followed by lowercase hex groups of 8, 4, 4, 4 and 12
    # digits. Only this shape is checked: device UUIDs may not conform to
    # the version and variant parts of the UUID specification (RFC 4122),
    # so those bits are not extracted or validated.
    hex_run = '[0-9a-f]{%d}'
    groups = '-'.join(hex_run % width for width in (8, 4, 4, 4, 12))
    uuid_format = f'^GPU-{groups}$'

    device = devices.get_context().device
    self.assertRegex(device.uuid, uuid_format)
def max_cooperative_grid_blocks(self, blockdim, dynsmemsize=0):
    '''
    Calculates the maximum number of blocks that can be launched for this
    kernel in a cooperative grid in the current context, for the given
    block and dynamic shared memory sizes.

    The result is the number of active blocks per multiprocessor reported
    by the context, multiplied by the device's multiprocessor count.

    :param blockdim: Block dimensions, either as a scalar for a 1D block,
                     or a tuple for 2D or 3D blocks.
    :param dynsmemsize: Dynamic shared memory size in bytes.
    :return: The maximum number of blocks in the grid.
    '''
    ctx = get_context()
    cufunc = self._codelibrary.get_cufunc()

    # Collapse a tuple of dimensions into the product of its entries.
    flat_blockdim = blockdim
    if isinstance(blockdim, tuple):
        flat_blockdim = functools.reduce(lambda acc, dim: acc * dim,
                                         blockdim)

    blocks_per_sm = ctx.get_active_blocks_per_multiprocessor(
        cufunc, flat_blockdim, dynsmemsize)
    return blocks_per_sm * ctx.device.MULTIPROCESSOR_COUNT
def cc_X_or_above(major, minor):
    """
    Return whether the current device's compute capability is at least
    ``(major, minor)``. Always true when ``config.ENABLE_CUDASIM`` is set.
    """
    if config.ENABLE_CUDASIM:
        return True
    device = devices.get_context().device
    return device.compute_capability >= (major, minor)
def setUp(self):
    """Run the parent setUp, then cache the current device context."""
    super().setUp()
    current_context = devices.get_context()
    self.context = current_context
def setUp(self):
    """Cache the current device context on the test case."""
    current_context = devices.get_context()
    self.context = current_context