def test_d2d(self):
    hst = np.arange(100, dtype=np.uint32)
    hst2 = np.empty_like(hst)
    sz = hst.size * hst.dtype.itemsize
    dev1 = self.context.memalloc(sz)
    dev2 = self.context.memalloc(sz)
    driver.host_to_device(dev1, hst, sz)
    driver.device_to_device(dev2, dev1, sz)
    driver.device_to_host(hst2, dev2, sz)
    self.assertTrue(np.all(hst == hst2))

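# For comparison, a small sketch of the same device-to-device round trip using
# the public DeviceNDArray API instead of the raw driver functions (assumes a
# CUDA-capable device is present):
def d2d_roundtrip_sketch():
    import numpy as np
    from numba import cuda

    hst = np.arange(100, dtype=np.uint32)
    d1 = cuda.to_device(hst)            # host -> device
    d2 = cuda.device_array_like(d1)     # uninitialised device buffer
    d2.copy_to_device(d1)               # device -> device; source may be host or device
    assert np.all(d2.copy_to_host() == hst)
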
def gpu_slice(arr, col):
    """
    Missing feature in NumbaPro
    """
    from numba.cuda.cudadrv.driver import device_to_host
    view, size = gpu_slice_view(arr, col)
    host = np.empty(shape=arr.shape[0], dtype=arr.dtype)
    device_to_host(host, view, size)
    return host

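# `gpu_slice_view` is not shown above; the following is a hypothetical sketch of
# what it might do, assuming `arr` is a Fortran-ordered (column-major) device
# array so that each column is one contiguous block of shape[0] * itemsize bytes.
# The helper name and the layout assumption are illustrative, not from the source.
def gpu_slice_view(arr, col):
    nrows = arr.shape[0]
    size = nrows * arr.dtype.itemsize               # bytes in one column
    start = col * size
    view = arr.gpu_data.view(start, start + size)   # byte-offset view of the device allocation
    return view, size
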
def test_memcpy(self):
    hstary = np.arange(100, dtype=np.uint32)
    hstary2 = np.arange(100, dtype=np.uint32)
    sz = hstary.size * hstary.dtype.itemsize
    devary = self.context.memalloc(sz)
    driver.host_to_device(devary, hstary, sz)
    driver.device_to_host(hstary2, devary, sz)
    self.assertTrue(np.all(hstary == hstary2))

def test_memset(self):
    dtype = np.dtype('uint32')
    n = 10
    sz = dtype.itemsize * n
    devary = self.context.memalloc(sz)
    driver.device_memset(devary, 0xab, sz)
    hstary = np.empty(n, dtype=dtype)
    driver.device_to_host(hstary, devary, sz)
    hstary2 = np.array([0xabababab] * n, dtype=dtype)
    self.assertTrue(np.all(hstary == hstary2))

def copy_to_host(self, ary=None, stream=0):
    """Copy ``self`` to ``ary`` or create a new Numpy ndarray
    if ``ary`` is ``None``.

    If a CUDA ``stream`` is given, then the transfer will be made
    asynchronously as part of the given stream.  Otherwise, the transfer is
    synchronous: the function returns after the copy is finished.

    Always returns the host array.

    Example::

        import numpy as np
        from numba import cuda

        arr = np.arange(1000)
        d_arr = cuda.to_device(arr)

        my_kernel[100, 100](d_arr)

        result_array = d_arr.copy_to_host()
    """
    if any(s < 0 for s in self.strides):
        msg = "D->H copy not implemented for negative strides: {}"
        raise NotImplementedError(msg.format(self.strides))
    assert self.alloc_size >= 0, "Negative memory size"
    stream = self._default_stream(stream)
    if ary is None:
        hostary = np.empty(shape=self.alloc_size, dtype=np.byte)
    else:
        check_array_compatibility(self, ary)
        hostary = ary

    if self.alloc_size != 0:
        _driver.device_to_host(hostary, self, self.alloc_size,
                               stream=stream)

    if ary is None:
        if self.size == 0:
            hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                 buffer=hostary)
        else:
            hostary = np.ndarray(shape=self.shape, dtype=self.dtype,
                                 strides=self.strides, buffer=hostary)
    return hostary

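# Usage note (a small sketch with the public API shown above): a preallocated host
# buffer can be reused across copies by passing it as ``ary``; the same array is
# also returned.
def copy_to_host_reuse_sketch():
    import numpy as np
    from numba import cuda

    arr = np.arange(1000)
    d_arr = cuda.to_device(arr)

    out = np.empty_like(arr)      # reusable host buffer
    d_arr.copy_to_host(out)       # copies into `out` and returns it
    assert np.all(out == arr)
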
def test_cuda_driver_basic(self):
    module = self.context.create_module_ptx(self.ptx)
    function = module.get_function('_Z10helloworldPi')
    array = (c_int * 100)()
    memory = self.context.memalloc(sizeof(array))
    host_to_device(memory, array, sizeof(array))
    function = function.configure((1,), (100,))
    function(memory)
    device_to_host(array, memory, sizeof(array))
    for i, v in enumerate(array):
        self.assertEqual(i, v)
    module.unload()

def test_cuda_driver_stream(self):
    module = self.context.create_module_ptx(self.ptx)
    function = module.get_function('_Z10helloworldPi')
    array = (c_int * 100)()
    stream = self.context.create_stream()

    with stream.auto_synchronize():
        memory = self.context.memalloc(sizeof(array))
        host_to_device(memory, array, sizeof(array), stream=stream)
        function = function.configure((1,), (100,), stream=stream)
        function(memory)
        device_to_host(array, memory, sizeof(array), stream=stream)

    for i, v in enumerate(array):
        self.assertEqual(i, v)

def test_host_alloc_driver(self):
    n = 32
    mem = cuda.current_context().memhostalloc(n, mapped=True)

    dtype = np.dtype(np.uint8)
    ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype, buffer=mem)

    magic = 0xab
    driver.device_memset(mem, magic, n)
    self.assertTrue(np.all(ary == magic))

    ary.fill(n)

    recv = np.empty_like(ary)
    driver.device_to_host(recv, mem, ary.size)
    self.assertTrue(np.all(ary == recv))
    self.assertTrue(np.all(recv == n))

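# The mapped host allocation above also has a higher-level counterpart; a small
# sketch using cuda.mapped_array, which allocates page-locked host memory mapped
# into the device address space (assumes the device supports mapped memory):
def mapped_array_sketch():
    import numpy as np
    from numba import cuda

    m = cuda.mapped_array(32, dtype=np.uint8)   # pinned + device-mapped host memory
    m[:] = 0xAB                                 # written from the host
    # A kernel launched on `m` sees the same bytes without an explicit copy.
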
def test_cuda_driver_basic(self):
    module = self.context.create_module_ptx(self.ptx)
    function = module.get_function('_Z10helloworldPi')
    array = (c_int * 100)()
    memory = self.context.memalloc(sizeof(array))
    host_to_device(memory, array, sizeof(array))

    launch_kernel(function.handle,  # Kernel
                  1, 1, 1,          # gx, gy, gz
                  100, 1, 1,        # bx, by, bz
                  0,                # dynamic shared mem
                  0,                # stream
                  [memory])         # arguments

    device_to_host(array, memory, sizeof(array))
    for i, v in enumerate(array):
        self.assertEqual(i, v)
    module.unload()

def _do_getitem(self, item, stream=0):
    stream = self._default_stream(stream)
    # Look up the field's dtype and byte offset within the record.
    typ, offset = self.dtype.fields[item]
    newdata = self.gpu_data.view(offset)

    if typ.shape == ():
        if typ.names is not None:
            # Nested record: return a device-side view, no copy.
            return DeviceRecord(dtype=typ, stream=stream,
                                gpu_data=newdata)
        else:
            # Scalar field: copy the single element back to the host.
            hostary = np.empty(1, dtype=typ)
            _driver.device_to_host(dst=hostary, src=newdata,
                                   size=typ.itemsize, stream=stream)
        return hostary[0]
    else:
        # Sub-array field: wrap the device view in a DeviceNDArray.
        shape, strides, dtype = \
            prepare_shape_strides_dtype(typ.shape, None,
                                        typ.subdtype[0], 'C')
        return DeviceNDArray(shape=shape, strides=strides, dtype=dtype,
                             gpu_data=newdata, stream=stream)

def test_cuda_driver_stream_operations(self):
    module = self.context.create_module_ptx(self.ptx)
    function = module.get_function('_Z10helloworldPi')
    array = (c_int * 100)()
    stream = self.context.create_stream()

    with stream.auto_synchronize():
        memory = self.context.memalloc(sizeof(array))
        host_to_device(memory, array, sizeof(array), stream=stream)

        launch_kernel(function.handle,  # Kernel
                      1, 1, 1,          # gx, gy, gz
                      100, 1, 1,        # bx, by, bz
                      0,                # dynamic shared mem
                      stream.handle,    # stream
                      [memory])         # arguments

        device_to_host(array, memory, sizeof(array), stream=stream)

    for i, v in enumerate(array):
        self.assertEqual(i, v)

def _do_getitem(self, item, stream=0):
    stream = self._default_stream(stream)
    arr = self._dummy.__getitem__(item)
    extents = list(arr.iter_contiguous_extent())
    cls = type(self)
    if len(extents) == 1:
        newdata = self.gpu_data.view(*extents[0])

        if not arr.is_array:
            # Element indexing
            hostary = np.empty(1, dtype=self.dtype)
            _driver.device_to_host(dst=hostary, src=newdata,
                                   size=self._dummy.itemsize,
                                   stream=stream)
            return hostary[0]
        else:
            return cls(shape=arr.shape, strides=arr.strides,
                       dtype=self.dtype, gpu_data=newdata, stream=stream)
    else:
        newdata = self.gpu_data.view(*arr.extent)
        return cls(shape=arr.shape, strides=arr.strides,
                   dtype=self.dtype, gpu_data=newdata, stream=stream)

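# For illustration, a small sketch of how the element-indexing path above is hit
# from user code: scalar indexing triggers a one-element device-to-host copy, while
# slicing returns a new device-side view without copying (assumes a CUDA device).
def device_indexing_sketch():
    import numpy as np
    from numba import cuda

    d_arr = cuda.to_device(np.arange(10, dtype=np.float32))

    x = d_arr[3]          # host scalar: single-element device_to_host transfer
    sub = d_arr[2:6]      # DeviceNDArray view over the same GPU allocation
    assert x == 3.0
    assert np.all(sub.copy_to_host() == np.arange(2, 6, dtype=np.float32))
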
def load_symbol(name):
    # Read a per-kernel global (e.g. the "tid"/"ctaid" slots written on error)
    # back from device memory into a host c_int.
    mem, sz = cufunc.module.get_global_symbol(
        "%s__%s__" % (cufunc.name, name))
    val = ctypes.c_int()
    driver.device_to_host(ctypes.addressof(val), mem, sz)
    return val.value

def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
    # Prepare kernel
    cufunc = self._codelibrary.get_cufunc()

    if self.debug:
        excname = cufunc.name + "__errcode__"
        excmem, excsz = cufunc.module.get_global_symbol(excname)
        assert excsz == ctypes.sizeof(ctypes.c_int)
        excval = ctypes.c_int()
        excmem.memset(0, stream=stream)

    # Prepare arguments
    retr = []  # hold functors for writeback
    kernelargs = []
    for t, v in zip(self.argument_types, args):
        self._prepare_args(t, v, stream, retr, kernelargs)

    if driver.USE_NV_BINDING:
        zero_stream = driver.binding.CUstream(0)
    else:
        zero_stream = None

    stream_handle = stream and stream.handle or zero_stream

    # Invoke kernel
    driver.launch_kernel(cufunc.handle,
                         *griddim,
                         *blockdim,
                         sharedmem,
                         stream_handle,
                         kernelargs,
                         cooperative=self.cooperative)

    if self.debug:
        driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
        if excval.value != 0:
            # An error occurred
            def load_symbol(name):
                mem, sz = cufunc.module.get_global_symbol(
                    "%s__%s__" % (cufunc.name, name))
                val = ctypes.c_int()
                driver.device_to_host(ctypes.addressof(val), mem, sz)
                return val.value

            tid = [load_symbol("tid" + i) for i in 'zyx']
            ctaid = [load_symbol("ctaid" + i) for i in 'zyx']
            code = excval.value
            exccls, exc_args, loc = self.call_helper.get_exception(code)
            # Prefix the exception message with the source location
            if loc is None:
                locinfo = ''
            else:
                sym, filepath, lineno = loc
                filepath = os.path.abspath(filepath)
                locinfo = 'In function %r, file %s, line %s, ' % (
                    sym, filepath, lineno,
                )
            # Prefix the exception message with the thread position
            prefix = "%stid=%s ctaid=%s" % (locinfo, tid, ctaid)
            if exc_args:
                exc_args = ("%s: %s" % (prefix, exc_args[0]),) + \
                    exc_args[1:]
            else:
                exc_args = prefix,
            raise exccls(*exc_args)

    # retrieve auto converted arrays
    for wb in retr:
        wb()

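# For context, the error-reporting branch above only runs for kernels compiled with
# debugging enabled; a minimal sketch (assuming a CUDA-capable device) of how an
# exception raised on the device surfaces with the thread-position prefix built in
# `launch`:
def debug_exception_sketch():
    import numpy as np
    from numba import cuda

    @cuda.jit(debug=True, opt=False)   # opt=False avoids the debug/opt warning
    def guarded(arr):
        i = cuda.grid(1)
        if i >= arr.size:
            raise IndexError("thread out of bounds")  # recorded via __errcode__
        arr[i] += 1

    d_a = cuda.to_device(np.zeros(8, dtype=np.float32))
    # Launching more threads than elements makes the out-of-range threads raise;
    # the host side then re-raises with an "In function ..., tid=... ctaid=..." prefix.
    guarded[1, 16](d_a)
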