def test_copy_from_device(self):
    """Round-trip an int through two allocations and compare the values.

    The value 100 is copied from host memory into ``a_gpu``, from ``a_gpu``
    into ``b_gpu``, and finally from ``b_gpu`` back into host memory.
    """
    a_gpu = memory.alloc(4)
    a_cpu = ctypes.c_int(100)
    # Host addresses are handed over as ctypes.c_void_p rather than as the
    # bare byref() argument object, which is the pointer type the copy
    # methods are documented to accept.
    a_gpu.copy_from(ctypes.cast(ctypes.byref(a_cpu), ctypes.c_void_p), 4)

    b_gpu = memory.alloc(4)
    b_gpu.copy_from(a_gpu, 4)

    b_cpu = ctypes.c_int()
    b_gpu.copy_to_host(ctypes.cast(ctypes.byref(b_cpu), ctypes.c_void_p), 4)
    self.assertEqual(b_cpu.value, a_cpu.value)
def test_copy_from_device(self):
    """Copy host -> first buffer -> second buffer -> host and compare."""
    nbytes = 4

    def as_void_p(c_obj):
        # Address of a ctypes object, expressed as a ctypes.c_void_p.
        return ctypes.cast(ctypes.byref(c_obj), ctypes.c_void_p)

    first = memory.alloc(nbytes)
    host_in = ctypes.c_int(100)
    first.copy_from(as_void_p(host_in), nbytes)

    second = memory.alloc(nbytes)
    second.copy_from(first, nbytes)

    host_out = ctypes.c_int()
    second.copy_to_host(as_void_p(host_out), nbytes)

    assert host_out.value == host_in.value
def test_copy_from_device_async_using_raw_ptr(self):
    """Round-trip an int using the async copy methods and raw addresses.

    Host pointers are passed as plain integers (``c_void_p.value``). The
    copies are issued with ``stream=self.stream``; the stream is then
    synchronized so the host value is only read once the copies are done.
    """
    # Function-scope import so this test does not rely on a module-level
    # alias being present in the file.
    from cupy.cuda import stream as stream_module

    a_gpu = memory.alloc(4)
    a_cpu = ctypes.c_int(100)
    a_cpu_ptr = ctypes.cast(ctypes.byref(a_cpu), ctypes.c_void_p)
    a_gpu.copy_from_async(a_cpu_ptr.value, 4, stream=self.stream)

    b_gpu = memory.alloc(4)
    b_gpu.copy_from_async(a_gpu, 4, stream=self.stream)

    b_cpu = ctypes.c_int()
    b_cpu_ptr = ctypes.cast(ctypes.byref(b_cpu), ctypes.c_void_p)
    b_gpu.copy_to_host_async(b_cpu_ptr.value, 4, stream=self.stream)

    # The async calls may return before the data has landed in b_cpu, so
    # wait for the stream before asserting. With self.stream set to None
    # the copies are taken to run on the current stream.
    if self.stream is not None:
        self.stream.synchronize()
    else:
        stream_module.get_current_stream().synchronize()

    assert b_cpu.value == a_cpu.value
def test_can_use_cub_oversize_input3(self):
    """``can_use`` must return None for a reduction over sys.maxsize items.

    The input array is declared over a 100-byte allocation; presumably
    only its shape matters to ``can_use`` and the data is never touched.
    """
    # full reduction with 2^63-1 elements (sys.maxsize on 64-bit builds)
    n_elems = sys.maxsize
    backing = memory.alloc(100)
    src = cupy.ndarray((n_elems, ), dtype=cupy.int8, memptr=backing)
    dst = cupy.empty((), dtype=cupy.int8)

    verdict = self.can_use([src], [dst], (0, ), ())
    assert verdict is None
def test_copy_from_device_async_using_raw_ptr(self):
    """Async round-trip of an int, passing host addresses as integers."""
    nbytes = 4
    stream = self.stream

    def raw_address(c_obj):
        # Integer address of a ctypes object.
        return ctypes.cast(ctypes.byref(c_obj), ctypes.c_void_p).value

    first = memory.alloc(nbytes)
    host_in = ctypes.c_int(100)
    first.copy_from_async(raw_address(host_in), nbytes, stream=stream)

    second = memory.alloc(nbytes)
    second.copy_from_async(first, nbytes, stream=stream)

    host_out = ctypes.c_int()
    second.copy_to_host_async(raw_address(host_out), nbytes, stream=stream)

    # Wait for the queued copies before reading the host value; fall back
    # to the current stream when no explicit stream was configured.
    to_sync = stream if stream is not None else (
        stream_module.get_current_stream())
    to_sync.synchronize()

    assert host_out.value == host_in.value
def test_memset(self):
    """Fill four bytes with 1 via ``memset`` and read each byte back."""
    a_gpu = memory.alloc(4)
    a_gpu.memset(1, 4)
    a_cpu = ctypes.c_ubyte()
    # The destination is handed over as ctypes.c_void_p rather than as the
    # bare byref() argument object, which is the pointer type the copy
    # methods are documented to accept. It does not change between
    # iterations, so build it once.
    a_cpu_ptr = ctypes.cast(ctypes.byref(a_cpu), ctypes.c_void_p)
    for _ in range(4):
        a_gpu.copy_to_host(a_cpu_ptr, 1)
        self.assertEqual(a_cpu.value, 1)
        # Advance the pointer by one byte for the next read.
        a_gpu += 1
def test_copy_to_and_from_host(self):
    """Copy an int from host into an allocation and straight back out."""
    nbytes = 4
    host_in = ctypes.c_int(100)
    host_out = ctypes.c_int()
    in_ptr = ctypes.cast(ctypes.byref(host_in), ctypes.c_void_p)
    out_ptr = ctypes.cast(ctypes.byref(host_out), ctypes.c_void_p)

    buf = memory.alloc(nbytes)
    buf.copy_from(in_ptr, nbytes)
    buf.copy_to_host(out_ptr, nbytes)

    self.assertEqual(host_out.value, host_in.value)
def test_memset(self):
    """Set every byte of a 4-byte allocation to 1 and verify byte by byte."""
    nbytes = 4
    cursor = memory.alloc(nbytes)
    cursor.memset(1, nbytes)

    byte_out = ctypes.c_ubyte()
    remaining = nbytes
    while remaining > 0:
        dst = ctypes.cast(ctypes.byref(byte_out), ctypes.c_void_p)
        cursor.copy_to_host(dst, 1)
        assert byte_out.value == 1
        # Step the pointer forward one byte.
        cursor += 1
        remaining -= 1
def test_copy_to_and_from_host_async(self):
    """Copy an int to an allocation and back using the async methods.

    Both copies are issued with ``stream=self.stream``; the stream is
    synchronized before the host value is compared.
    """
    # Function-scope import so this test does not rely on a module-level
    # alias being present in the file.
    from cupy.cuda import stream as stream_module

    a_gpu = memory.alloc(4)
    a_cpu = ctypes.c_int(100)
    a_gpu.copy_from_async(
        ctypes.cast(ctypes.byref(a_cpu), ctypes.c_void_p), 4,
        stream=self.stream)

    b_cpu = ctypes.c_int()
    a_gpu.copy_to_host_async(
        ctypes.cast(ctypes.byref(b_cpu), ctypes.c_void_p), 4,
        stream=self.stream)

    # The async calls may return before b_cpu has been written, so wait
    # for the stream first. With self.stream set to None the copies are
    # taken to run on the current stream.
    if self.stream is not None:
        self.stream.synchronize()
    else:
        stream_module.get_current_stream().synchronize()

    assert b_cpu.value == a_cpu.value
def test_stream3(self):
    """Release an allocation after the stream it was made under is gone."""
    # Check: destroying the stream does not affect memory deallocation
    stream = cupy.cuda.Stream()
    with stream:
        ptr = memory.alloc(100)

    # Drop the stream and force a collection before freeing the memory.
    del stream
    gc.collect()
    del ptr
def test_stream5(self):
    """Allocate under one stream and release under a different one."""
    # Check: free on another stream
    alloc_stream = cupy.cuda.Stream()
    with alloc_stream:
        ptr = memory.alloc(100)
    del alloc_stream

    free_stream = cupy.cuda.Stream()
    with free_stream:
        del ptr
def test_copy_to_and_from_host_using_raw_ptr(self):
    """Host round-trip where host pointers are given as integer addresses."""
    nbytes = 4
    host_in = ctypes.c_int(100)
    host_out = ctypes.c_int()
    # .value of a c_void_p is the address as a plain Python int.
    in_addr = ctypes.cast(ctypes.byref(host_in), ctypes.c_void_p).value
    out_addr = ctypes.cast(ctypes.byref(host_out), ctypes.c_void_p).value

    buf = memory.alloc(nbytes)
    buf.copy_from(in_addr, nbytes)
    buf.copy_to_host(out_addr, nbytes)

    assert host_out.value == host_in.value
def test_raw_pointer(self):
    """Launch ``my_func`` with a memory pointer as a kernel argument.

    The kernel source (``test_cast``) is defined elsewhere; judging by the
    final comparison it presumably rewrites each element x as 3*x - 8.
    """
    count = 100
    module = cupy.RawModule(code=test_cast, backend=self.backend)
    kernel = module.get_function('my_func')

    expected = cupy.ones((count, ), dtype=cupy.float64)
    nbytes = count * expected.dtype.itemsize
    memptr = memory.alloc(nbytes)
    memptr.copy_from(expected.data, nbytes)  # one-initialize
    actual = cupy.ndarray((count, ), cupy.float64, memptr=memptr)

    # The pointer itself (not the ndarray) is passed to the kernel.
    kernel((1, ), (count, ), (memptr, count))

    expected = 3. * expected - 8.
    assert (expected == actual).all()
def test_copy_to_and_from_host_async(self):
    """Async host round-trip, synchronized before the comparison."""
    nbytes = 4
    stream = self.stream

    buf = memory.alloc(nbytes)
    host_in = ctypes.c_int(100)
    in_ptr = ctypes.cast(ctypes.byref(host_in), ctypes.c_void_p)
    buf.copy_from_async(in_ptr, nbytes, stream=stream)

    host_out = ctypes.c_int()
    out_ptr = ctypes.cast(ctypes.byref(host_out), ctypes.c_void_p)
    buf.copy_to_host_async(out_ptr, nbytes, stream=stream)

    # Wait for the queued copies; use the current stream when no explicit
    # stream was configured on the test.
    to_sync = stream if stream is not None else (
        stream_module.get_current_stream())
    to_sync.synchronize()

    assert host_out.value == host_in.value
def test_can_use_cub_oversize_input2(self):
    """``can_use`` must accept a full reduction over 2**6 * 1024**3 int8s.

    The input array is declared over a 100-byte allocation; presumably
    only its shape matters to ``can_use`` and the data is never touched.
    """
    # full reduction with array size = 64 GB should work!
    n_elems = 2**6 * 1024**3
    backing = memory.alloc(100)
    src = cupy.ndarray((n_elems, ), dtype=cupy.int8, memptr=backing)
    dst = cupy.empty((), dtype=cupy.int8)

    verdict = self.can_use([src], [dst], (0, ), ())
    assert verdict is not None
def __init__(self, handle, seed):
    """Allocate the dropout state buffer and create its descriptor.

    Args:
        handle: Handle passed to ``cudnn.dropoutGetStatesSize`` and to
            ``create_dropout_descriptor``.
        seed: Seed forwarded to ``create_dropout_descriptor``.
    """
    nbytes = cudnn.dropoutGetStatesSize(handle)
    states = memory.alloc(nbytes)
    # Keep the allocation on the instance; the descriptor only receives
    # its raw pointer.
    self._states = states
    self._desc = create_dropout_descriptor(
        handle, 0., states.ptr, nbytes, seed)
def test_stream4(self):
    """Allocate and release memory while the same stream is active."""
    # Check: free on the same stream
    stream = cupy.cuda.Stream()
    with stream:
        ptr = memory.alloc(100)
        del ptr
def test_stream2(self):
    """An allocation made under a stream records that stream's pointer."""
    # Check: the memory was allocated on the right stream
    stream = cupy.cuda.Stream()
    with stream:
        ptr = memory.alloc(100)
        recorded = ptr.mem.stream
        assert recorded == stream.ptr
def test_can_use_cub_oversize_input4(self):
    """``can_use`` must return None for a (2**31, 8) reduction over axis 1.

    The input array is declared over a 100-byte allocation; presumably
    only its shape matters to ``can_use`` and the data is never touched.
    """
    # partial reduction with too many (2^31) blocks
    shape = (2**31, 8)
    backing = memory.alloc(100)
    src = cupy.ndarray(shape, dtype=cupy.int8, memptr=backing)
    dst = cupy.empty((), dtype=cupy.int8)

    verdict = self.can_use([src], [dst], (1, ), (0, ))
    assert verdict is None