def test_concatenate_large_different_devices(self):
    """Concatenating arrays created under two devices raises ValueError.

    Ten arrays are allocated, alternating between the contexts of
    ``cuda.Device(0)`` and ``cuda.Device(1)``.
    """
    arrays = []
    for index in range(10):
        device = cuda.Device(index % 2)
        with device:
            arrays.append(cupy.empty((2, 3, 4)))

    with pytest.raises(ValueError):
        cupy.concatenate(arrays)
def copy(a):
    """Returns a copy of an array placed on the current device.

    The result is always allocated on the current device. When the source
    array lives on another device, its contents are transferred across
    devices.

    Args:
        a (cupy.ndarray): The source array.

    Returns:
        cupy.ndarray: The copy of ``a`` on the current device.

    See: :func:`numpy.copy`, :meth:`cupy.ndarray.copy`

    """
    # TODO(beam2d): Support ordering option
    if a.size == 0:
        return cupy.empty_like(a)

    source = a
    if not source.flags.c_contiguous:
        source = ascontiguousarray(source)
        # NOTE(review): presumably ascontiguousarray already produced a fresh
        # array here, so it can be handed back as-is when it sits on the
        # current device -- confirm against ascontiguousarray.
        if source.data.device == cuda.Device():
            return source

    # Allocate on the current device and copy the raw bytes over.
    duplicate = cupy.empty_like(source)
    duplicate.data.copy_from(source.data, source.nbytes)
    return duplicate
def test_copy_multi_device_non_contiguous(self, order):
    """Copy a strided array while device 1 is current.

    The copy must report device 1 as its device and compare equal to the
    source array.
    """
    strided = core.ndarray((20, ))[::2]
    target = cuda.Device(1)
    with target:
        duplicate = strided.copy(order)
        assert duplicate.device == target
        testing.assert_array_equal(strided, duplicate)
def test_device_attributes(self):
    """``Device.attributes`` is a dict of ints with a well-known key."""
    attrs = cuda.Device().attributes
    assert isinstance(attrs, dict)

    non_int_names = [
        name for name, value in attrs.items() if not isinstance(value, int)]
    assert not non_int_names

    # test a specific attribute that would be present on any supported GPU
    assert 'MaxThreadsPerBlock' in attrs
def t1_seq():
    # Second thread's sequence: wait for ``t0_setup``, enter ``dev1`` and then
    # ``dev0``, signal ``t1_setup``, wait for ``t0_first_exit``, and record the
    # id of the current device in ``t1_exit_device``.
    #
    # NOTE(review): the original indentation was not available. The layout
    # below assumes the device id is recorded after leaving the inner ``dev0``
    # context but while still inside ``dev1`` -- confirm against the
    # assertion the enclosing test makes on ``t1_exit_device``.
    t0_setup.wait()
    with dev1:
        with dev0:
            t1_setup.set()
            t0_first_exit.wait()
        t1_exit_device.append(cuda.Device().id)
def inv_gpu(b):
    """Inverts a batch of matrices on the GPU with batched cuBLAS LU calls.

    The input is converted with ``_as_batch_mat`` and copied, because the
    factorization overwrites its operand in place. The status array filled
    by cuBLAS is allocated but not inspected by this function.

    Args:
        b: Matrix or batch of matrices accepted by ``_as_batch_mat``.

    Returns:
        cupy.ndarray: Newly allocated array, shaped like the batched copy of
        ``b``, that receives the output of ``sgetriBatched``.

    """
    # Work on a batched copy: the elements are modified in-place.
    work = _as_batch_mat(b).copy()
    dim = work.shape[1]
    batch_size = len(work)

    # Pivot indices produced by the LU step.
    pivots = cupy.empty((dim, batch_size), dtype=np.int32)
    # Destination of the inverted matrices.
    inverse = cupy.empty_like(work)
    # Per-matrix status: execution success, or whether the matrix was
    # singular.
    status = cupy.empty(batch_size, dtype=np.int32)

    work_ptrs = _mat_ptrs(work)
    inverse_ptrs = _mat_ptrs(inverse)
    _unused, lda = _get_ld(work)
    _unused, ldc = _get_ld(inverse)

    cublas_handle = cuda.Device().cublas_handle
    cuda.cublas.sgetrfBatched(
        cublas_handle, dim, work_ptrs.data.ptr, lda, pivots.data.ptr,
        status.data.ptr, batch_size)
    cuda.cublas.sgetriBatched(
        cublas_handle, dim, work_ptrs.data.ptr, lda, pivots.data.ptr,
        inverse_ptrs.data.ptr, ldc, status.data.ptr, batch_size)
    return inverse
def to_gpu(array, device=None, stream=None):
    """Sends the given CPU array(s) to a CUDA device.

    Args:
        array (*array*, None, list or tuple): Array or arrays to be sent to
            GPU.
        device: CUDA device specifier. If ``None`` or
            :data:`cuda.DummyDevice`, the arrays will be copied to the
            current CUDA device.
        stream (~cupy.cuda.Stream): *(deprecated since v3.0.0)* CUDA stream.
            If not ``None``, the copy runs asynchronously.

    Returns:
        cupy.ndarray, list or tuple: Array or arrays on GPU.

        Arrays that are already on GPU are returned as they are, without
        any copy. A `None` among the inputs is returned as `None`.

    """
    if stream is not None:
        warnings.warn(
            'The stream option is deprecated in chainer.backends.cuda.to_gpu. '
            'Please remove it.',
            DeprecationWarning)

    check_cuda_available()

    # The dummy device stands for "whatever device is current".
    target = (
        cuda.Device() if device is DummyDevice
        else _get_device_or_current(device))

    def _send(arr):
        return _array_to_gpu(arr, target, stream)

    return _backend._convert_arrays(array, _send)
def get_handle():
    """Returns the cuDNN handle cached for the current device.

    A handle is created with ``cudnn.create()`` and stored in ``_handles``
    the first time a device id is seen.
    """
    device_id = cuda.Device().id
    cached = _handles.get(device_id)
    if cached is not None:
        return cached

    created = cudnn.create()
    _handles[device_id] = created
    return created
def run_send_recv(rank, n_workers, dtype):
    """Worker body: ``send_recv`` with every peer index and verify the data.

    ``dtype`` is accepted but not used; the arrays are always created with
    dtype ``'f'``.
    """
    cuda.Device(rank).use()
    backend = NCCLBackend(n_workers, rank)
    payload = cupy.arange(10, dtype='f')
    for peer in range(n_workers):
        received = cupy.zeros((10, ), dtype='f')
        backend.send_recv(payload, received, peer)
        testing.assert_allclose(received, payload)
def run_all_reduce(rank, n_workers, dtype):
    """Worker body: all-reduce a (2, 3, 4) array and expect twice the input.

    ``dtype`` is accepted but not used; the arrays are always created with
    dtype ``'f'``.
    """
    cuda.Device(rank).use()
    backend = NCCLBackend(n_workers, rank)
    shape = (2, 3, 4)
    send = cupy.arange(2 * 3 * 4, dtype='f').reshape(*shape)
    recv = cupy.zeros(shape, dtype='f')
    backend.all_reduce(send, recv)
    testing.assert_allclose(recv, 2 * send)
def check_args_device(args):
    """Verifies that every ``cupy.ndarray`` in ``args`` is on the current device.

    Arguments that are not ``cupy.ndarray`` instances are ignored.

    Args:
        args: Iterable of arguments to inspect.

    Raises:
        ValueError: If an array's device differs from the current device.

    """
    current = cuda.Device()
    for candidate in args:
        if not isinstance(candidate, cupy.ndarray):
            continue
        owner = candidate.data.device
        if owner != current:
            raise ValueError('Array device must be same as the current '
                             'device: array device = %d while current = %d'
                             % (owner.id, current.id))
def ret(*args, **kwargs):
    # Memoizing wrapper around ``f``. The cache key is built from the
    # positional arguments and the keyword items, prefixed with the current
    # device id when ``for_each_device`` is set.
    key = (args, frozenset(kwargs.items()))
    if for_each_device:
        key = (cuda.Device().id, key)

    # ``none`` is the cache-miss marker passed as the lookup default.
    cached = memo.get(key, none)
    if cached is not none:
        return cached

    computed = f(*args, **kwargs)
    memo[key] = computed
    return computed
def run_reduce_scatter(rank, n_workers, dtype):
    """Worker body: reduce-scatter and expect twice this rank's input row.

    ``dtype`` is accepted but not used; the arrays are always created with
    dtype ``'f'``.
    """
    cuda.Device(rank).use()
    backend = NCCLBackend(n_workers, rank)
    send = 1 + cupy.arange(n_workers * 10, dtype='f').reshape(n_workers, 10)
    recv = cupy.zeros((10, ), dtype='f')
    backend.reduce_scatter(send, recv, 10)
    testing.assert_allclose(recv, 2 * send[rank])
def run_init(rank, n_workers):
    """Worker body: create a communicator and sanity-check it by broadcast.

    Rank 0 holds a one-element array of ones and every other rank holds
    zeros; after broadcasting from rank 0 each rank must hold ones.
    """
    cuda.Device(rank).use()
    communicator = init_process_group(n_workers, rank)

    # Do a simple call to verify we got a valid comm
    buf = cupy.zeros(1)
    if rank == 0:
        buf = buf + 1
    communicator.broadcast(buf, 0)
    testing.assert_allclose(buf, cupy.ones(1))
def test_copy_multi_device_with_stream(self):
    """Copy an array across devices while a kernel on it is still pending.

    A kernel is launched on device 0 against a zero-filled array, then the
    array is copied under device 1 with a separate stream; the copy is
    expected to still contain zeros.
    """
    # NOTE(review): the original indentation was not available; the nesting
    # of the ``with`` blocks and the placement of the final assertion are
    # reconstructed -- confirm.

    # Kernel that takes long enough then finally writes values.
    kern = cupy.RawKernel(
        _test_copy_multi_device_with_stream_src, 'wait_and_write')

    # Allocates a memory and launches the kernel on a device with its
    # stream.
    with cuda.Device(0):
        with cuda.Stream():
            a = cupy.zeros((2,), dtype=numpy.uint64)
            kern((1,), (1,), a)

    # D2D copy to another device with another stream should get the
    # original values of the memory before the kernel on the first device
    # finally makes the write.
    with cuda.Device(1):
        with cuda.Stream():
            b = a.copy()
        testing.assert_array_equal(
            b, numpy.array([0, 0], dtype=numpy.uint64))
def test_concatenate_large_different_devices(self):
    """Concatenate arrays allocated alternately under devices 0 and 1.

    When ``deviceCanAccessPeer(0, 1)`` reports 1 the call must emit a
    ``PerformanceWarning``; otherwise it must raise ``ValueError``.
    """
    arrays = []
    for index in range(10):
        with cuda.Device(index % 2):
            arrays.append(cupy.empty((2, 3, 4)))

    peer_access = cuda.runtime.deviceCanAccessPeer(0, 1) == 1
    if peer_access:
        expectation = pytest.warns(cupy._util.PerformanceWarning)
    else:
        expectation = pytest.raises(ValueError)

    with expectation:
        cupy.concatenate(arrays)
def run_send_and_recv(rank, n_workers, dtype):
    """Worker body: rank 0 sends to rank 1; other ranks receive from rank 0.

    Only the receiving side checks the data, since the sender's output
    buffer stays zero-filled. ``dtype`` is accepted but not used; the arrays
    are always created with dtype ``'f'``.
    """
    cuda.Device(rank).use()
    backend = NCCLBackend(n_workers, rank)
    payload = cupy.arange(10, dtype='f')
    received = cupy.zeros((10, ), dtype='f')

    if rank == 0:
        backend.send(payload, 1)
        return

    backend.recv(received, 0)
    testing.assert_allclose(received, payload)
def run_all_to_all(rank, n_workers, dtype):
    """Worker body: all-to-all exchange of an ``(n_workers, 10)`` array.

    Every output row is expected to equal ``arange(10) + 10 * rank``.
    ``dtype`` is accepted but not used; the arrays are always created with
    dtype ``'f'``.
    """
    cuda.Device(rank).use()
    backend = NCCLBackend(n_workers, rank)
    shape = (n_workers, 10)
    send = cupy.arange(n_workers * 10, dtype='f').reshape(*shape)
    recv = cupy.zeros(shape, dtype='f')
    backend.all_to_all(send, recv)

    base_rows = cupy.broadcast_to(cupy.arange(10, dtype='f'), shape)
    expected = (10 * rank) + base_rows
    testing.assert_allclose(recv, expected)
def run_barrier(rank, n_workers):
    """Worker body: check that a barrier holds every rank until rank 0 arrives.

    After an initial barrier, rank 0 sleeps for two seconds before the
    second barrier; the whole-second time elapsed between the two barriers
    must be 2 on every rank.
    """
    cuda.Device(rank).use()
    backend = NCCLBackend(n_workers, rank)

    backend.barrier()
    started = time.time()
    if rank == 0:
        time.sleep(2)
    backend.barrier()
    elapsed = time.time() - started

    assert int(elapsed) == 2
def run_broadcast(rank, n_workers, root, dtype):
    """Worker body: broadcast a (2, 3, 4) array from ``root`` and verify it.

    The root rank starts with the expected values and the other ranks with
    zeros; after the broadcast every rank must hold the expected values.
    """
    cuda.Device(rank).use()
    backend = NCCLBackend(n_workers, rank)
    shape = (2, 3, 4)
    expected = cupy.arange(2 * 3 * 4, dtype=dtype).reshape(shape)

    buf = expected if rank == root else cupy.zeros(shape, dtype=dtype)
    backend.broadcast(buf, root)
    testing.assert_allclose(buf, expected)
def test_single_proc_single_dev(self):
    """All-reduce (sum) within one NCCL group created by ``initAll(1)``.

    The receive buffer must end up close to the send buffer.
    """
    nccl = cuda.nccl
    communicators = nccl.NcclCommunicator.initAll(1)

    nccl.groupStart()
    for communicator in communicators:
        cuda.Device(communicator.device_id()).use()
        send = cupy.arange(10)
        recv = cupy.zeros_like(send)
        communicator.allReduce(
            send.data.ptr, recv.data.ptr, 10,
            nccl.NCCL_INT64, nccl.NCCL_SUM, cuda.Stream.null.ptr)
    nccl.groupEnd()

    assert cupy.allclose(send, recv)
def run_gather(rank, n_workers, root, dtype):
    """Worker body: gather ``(rank + 1) * arange(10)`` from all ranks.

    Only the root rank checks the result: row ``i`` of the gathered array
    must equal ``(i + 1) * arange(10)``. ``dtype`` is accepted but not used;
    the arrays are always created with dtype ``'f'``.
    """
    cuda.Device(rank).use()
    backend = NCCLBackend(n_workers, rank)
    send = (rank + 1) * cupy.arange(10, dtype='f')
    recv = cupy.zeros((n_workers, 10), dtype='f')
    backend.gather(send, recv, root)

    if rank != root:
        return

    row_scale = 1 + cupy.arange(n_workers).reshape(n_workers, 1)
    base_rows = cupy.broadcast_to(
        cupy.arange(10, dtype='f'), (n_workers, 10))
    expected = row_scale * base_rows
    testing.assert_allclose(recv, expected)
def _check_args(args):
    """Validates kernel arguments.

    Each argument must be a ``cupy.ndarray`` whose device is either ``None``
    or the current device, or an instance of ``_scalar_type``.

    Args:
        args: Iterable of arguments to validate.

    Raises:
        ValueError: If an array belongs to a device other than the current
            one.
        TypeError: If an argument is neither an array nor a scalar type.

    """
    current = cuda.Device()
    # Bind the types once so the loop does not repeat the lookups.
    ndarray_cls = cupy.ndarray
    scalar_cls = _scalar_type
    for item in args:
        if isinstance(item, ndarray_cls):
            owner = item.device
            if owner is not None and owner != current:
                raise ValueError('Array device must be same as the current '
                                 'device: array device = %d while current = %d'
                                 % (owner.id, current.id))
            continue
        if not isinstance(item, scalar_cls):
            raise TypeError('Unsupported type %s' % type(item))
def test_copy_multi_device_with_stream(self):
    """Copy an array across devices while a kernel on it is still pending.

    A kernel is launched on device 0 against a zero-filled array, then the
    array is copied under device 1 with a separate stream; the copy is
    expected to still contain zeros.
    """
    # NOTE(review): the original indentation was not available; the nesting
    # of the ``with`` blocks and the placement of the final assertion are
    # reconstructed -- confirm.

    # Kernel that takes long enough then finally writes values.
    src = _test_copy_multi_device_with_stream_src
    # On HIP with a reported build version of at least 5_00_00000, the
    # kernel source is prefixed with an include of <ctime>.
    if runtime.is_hip and driver.get_build_version() >= 5_00_00000:
        src = '#include <ctime>\n' + src
    kern = cupy.RawKernel(src, 'wait_and_write')

    # Allocates a memory and launches the kernel on a device with its
    # stream.
    with cuda.Device(0):
        # Keep this stream alive over the D2D copy below for HIP
        with cuda.Stream() as s1:  # NOQA
            a = cupy.zeros((2,), dtype=numpy.uint64)
            kern((1,), (1,), a)

    # D2D copy to another device with another stream should get the
    # original values of the memory before the kernel on the first device
    # finally makes the write.
    with cuda.Device(1):
        with cuda.Stream():
            b = a.copy()
        testing.assert_array_equal(
            b, numpy.array([0, 0], dtype=numpy.uint64))
def ret(*args, **kwargs):
    # Memoizing wrapper around ``f``.
    #
    # The cache key is built from the positional arguments and the keyword
    # items, prefixed with the current device id when ``for_each_device`` is
    # set. The cache itself is stored on ``f`` as ``_cupy_memo``; the first
    # time it is created, ``f`` is registered in ``_memoized_funcs``.
    arg_key = (args, frozenset(kwargs.items()))
    if for_each_device:
        arg_key = (cuda.Device().id, arg_key)

    memo = getattr(f, '_cupy_memo', None)
    if memo is None:
        memo = f._cupy_memo = {}
        _memoized_funcs.append(f)

    # Look the key up directly instead of comparing against ``None``, so that
    # a legitimately cached ``None`` result is returned rather than being
    # mistaken for a cache miss and recomputed on every call.
    try:
        return memo[arg_key]
    except KeyError:
        pass

    result = f(*args, **kwargs)
    memo[arg_key] = result
    return result
def test_context_and_use(self):
    """Mix ``Device.use()`` calls with nested ``with`` device contexts.

    Asserts the id of the current device at each step.
    """
    # NOTE(review): the original indentation was not available. The layout
    # below assumes the two trailing ``assert 0 == ...`` checks run after
    # leaving the inner and the outer context respectively -- confirm
    # against the device-restoration rules of the CuPy version under test.
    dev0 = cuda.Device(0)
    dev1 = cuda.Device(1)
    dev1.use()
    with dev0:
        assert 0 == cuda.Device().id
        dev1.use()
        with dev1:
            assert 1 == cuda.Device().id
        assert 0 == cuda.Device().id
    assert 0 == cuda.Device().id
def get_random_state():
    """Returns the random number generator state of the current device.

    When no state exists yet for the current device, a new
    :class:`RandomState` is created, stored for that device, and returned.

    Returns:
        RandomState: The state of the random number generator for the
        device.

    """
    device_id = cuda.Device().id
    state = _random_states.get(device_id)
    if state is not None:
        return state

    state = RandomState()
    _random_states[device_id] = state
    return state
def get_random_state():
    """Returns the random number generator state of the current device.

    When no state exists yet for the current device, a new
    :class:`RandomState` is created and stored for that device. Its seed is
    read from the ``CUPY_SEED`` environment variable, falling back to
    ``CHAINER_SEED`` when ``CUPY_SEED`` is not set.

    Returns:
        RandomState: The state of the random number generator for the
        device.

    """
    device_id = cuda.Device().id
    state = _random_states.get(device_id)
    if state is not None:
        return state

    seed = os.getenv('CUPY_SEED', os.getenv('CHAINER_SEED'))
    # setdefault keeps whichever state is already registered for this device
    # id at insertion time.
    return _random_states.setdefault(device_id, RandomState(seed))
def test_copy_multi_device_non_contiguous_K(self):
    """Copying a strided array with order 'K' under device 1 must raise.

    The expected exception is ``NotImplementedError``.
    """
    strided = core.ndarray((20,))[::2]
    with cuda.Device(1), self.assertRaises(NotImplementedError):
        strided.copy('K')
def test_deepcopy_multi_device(self):
    """Deep-copy an array while device 1 is current.

    The copy must pass ``_check_deepcopy`` and report the same device as
    the source array.
    """
    original = core.ndarray(self.shape)
    with cuda.Device(1):
        cloned = copy.deepcopy(original)

    self._check_deepcopy(original, cloned)
    assert cloned.device == original.device