Example #1
def test_concatenate_large_different_devices(self):
    arrs = []
    for i in range(10):
        with cuda.Device(i % 2):
            arrs.append(cupy.empty((2, 3, 4)))
    with pytest.raises(ValueError):
        cupy.concatenate(arrs)
Example #2
def copy(a):
    """Creates a copy of a given array on the current device.

    This function allocates the new array on the current device. If the given
    array is allocated on a different device, this function copies its
    contents across devices.

    Args:
        a (cupy.ndarray): The source array.

    Returns:
        cupy.ndarray: The copy of ``a`` on the current device.

    See: :func:`numpy.copy`, :meth:`cupy.ndarray.copy`

    """
    # If the current device is different from the device of ``a``, this
    # function allocates a new array on the current device and copies the
    # contents across devices.
    # TODO(beam2d): Support ordering option
    if a.size == 0:
        return cupy.empty_like(a)

    if not a.flags.c_contiguous:
        a = ascontiguousarray(a)
        if a.data.device == cuda.Device():
            return a
    newarray = cupy.empty_like(a)
    newarray.data.copy_from(a.data, a.nbytes)
    return newarray
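A minimal usage sketch for the ``copy`` routine above, which appears to back the public ``cupy.copy`` function, assuming at least two GPUs are visible: the result is always allocated on the current device, even when the source array lives on another one.

import cupy

with cupy.cuda.Device(1):
    src = cupy.arange(6).reshape(2, 3)   # allocated on device 1

with cupy.cuda.Device(0):
    dst = cupy.copy(src)                 # the copy lands on device 0

assert dst.device.id == 0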
Example #3
def test_copy_multi_device_non_contiguous(self, order):
    arr = core.ndarray((20, ))[::2]
    dev1 = cuda.Device(1)
    with dev1:
        arr2 = arr.copy(order)
    assert arr2.device == dev1
    testing.assert_array_equal(arr, arr2)
Example #4
def test_device_attributes(self):
    d = cuda.Device()
    attributes = d.attributes
    assert isinstance(attributes, dict)
    assert all(isinstance(a, int) for a in attributes.values())
    # test a specific attribute that would be present on any supported GPU
    assert 'MaxThreadsPerBlock' in attributes
Example #5
def t1_seq():
    t0_setup.wait()
    with dev1:
        with dev0:
            t1_setup.set()
            t0_first_exit.wait()
        t1_exit_device.append(cuda.Device().id)
Example #6
def inv_gpu(b):
    # We do a batched LU decomposition on the GPU to compute the inverse.
    # Reshape the array into a size-1 minibatch if necessary.
    # Also copy the matrix, as its elements will be modified in-place.
    a = _as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array
    p = cupy.empty((n, n_matrices), dtype=np.int32)
    # Output array
    c = cupy.empty_like(a)
    # These arrays hold information on the execution success
    # or if the matrix was singular
    info = cupy.empty(n_matrices, dtype=np.int32)
    ap = _mat_ptrs(a)
    cp = _mat_ptrs(c)
    _, lda = _get_ld(a)
    _, ldc = _get_ld(c)
    handle = cuda.Device().cublas_handle
    cuda.cublas.sgetrfBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, info.data.ptr, n_matrices)
    cuda.cublas.sgetriBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, cp.data.ptr, ldc,
        info.data.ptr, n_matrices)
    return c
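A small usage sketch for ``inv_gpu`` as defined above, assuming the helper functions ``_as_batch_mat``, ``_mat_ptrs`` and ``_get_ld`` from the same module are available. The cuBLAS calls are single precision, so the check uses float32 inputs and a loose tolerance.

import numpy as np
import cupy

# Well-conditioned float32 batch of five 4x4 matrices.
a = cupy.random.rand(5, 4, 4).astype(np.float32) + 4 * cupy.eye(4, dtype=np.float32)
a_inv = inv_gpu(a)
identity = cupy.broadcast_to(cupy.eye(4, dtype=np.float32), (5, 4, 4))
cupy.testing.assert_allclose(a @ a_inv, identity, atol=1e-3)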
Example #7
def to_gpu(array, device=None, stream=None):
    """Copies the given CPU array to the specified device.

    Args:
        array (*array*, None, list or tuple):
            Array or arrays to be sent to GPU.
        device: CUDA device specifier. If ``None`` or :data:`cuda.DummyDevice`,
            the arrays will be copied to the current CUDA device.
        stream (~cupy.cuda.Stream): *(deprecated since v3.0.0)*
            CUDA stream. If not ``None``, the copy runs asynchronously.

    Returns:
        cupy.ndarray, list or tuple: Array or arrays on GPU.

        If some of the arrays are already on GPU, then this function just
        returns those arrays without performing any copy.

        If an input array is ``None``, it is returned as ``None`` as is.

    """
    if stream is not None:
        warnings.warn(
            'The stream option is deprecated in chainer.backends.cuda.to_gpu. '
            'Please remove it.', DeprecationWarning)

    check_cuda_available()
    if device is DummyDevice:
        device = cuda.Device()
    else:
        device = _get_device_or_current(device)

    return _backend._convert_arrays(
        array, lambda arr: _array_to_gpu(arr, device, stream))
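A minimal usage sketch for ``to_gpu``, assuming Chainer was built with CUDA support and a second GPU is present; arrays already on the target GPU are returned without a copy, and ``None`` passes through unchanged.

import numpy as np
from chainer.backends import cuda

x_cpu = np.arange(6, dtype=np.float32)
x_gpu = cuda.to_gpu(x_cpu, device=1)     # copied onto GPU 1
assert x_gpu.device.id == 1
assert cuda.to_gpu(None) is None         # None is returned as is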
Example #8
def get_handle():
    global _handles
    device = cuda.Device()
    handle = _handles.get(device.id, None)
    if handle is None:
        handle = cudnn.create()
        _handles[device.id] = handle
    return handle
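A usage sketch for ``get_handle`` as defined above (assuming ``_handles`` starts out as an empty dict and two GPUs are present): the cuDNN handle is created lazily and cached per device id, so repeated calls on the same device return the same handle.

with cuda.Device(0):
    h0 = get_handle()
    assert get_handle() is h0   # cached for device 0
with cuda.Device(1):
    h1 = get_handle()
    assert h1 != h0             # device 1 gets its own handle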
Example #9
def run_send_recv(rank, n_workers, dtype):
    dev = cuda.Device(rank)
    dev.use()
    comm = NCCLBackend(n_workers, rank)
    in_array = cupy.arange(10, dtype='f')
    for i in range(n_workers):
        out_array = cupy.zeros((10, ), dtype='f')
        comm.send_recv(in_array, out_array, i)
        testing.assert_allclose(out_array, in_array)
Example #10
    def run_all_reduce(rank, n_workers, dtype):
        dev = cuda.Device(rank)
        dev.use()
        comm = NCCLBackend(n_workers, rank)
        in_array = cupy.arange(2 * 3 * 4, dtype='f').reshape(2, 3, 4)
        out_array = cupy.zeros((2, 3, 4), dtype='f')

        comm.all_reduce(in_array, out_array)
        testing.assert_allclose(out_array, 2 * in_array)
Example #11
def check_args_device(args):
    dev = cuda.Device()
    for arg in args:
        if isinstance(arg, cupy.ndarray):
            arg_dev = arg.data.device
            if arg_dev != dev:
                raise ValueError('Array device must be same as the current '
                                 'device: array device = %d while current = %d'
                                 % (arg_dev.id, dev.id))
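A small usage sketch for ``check_args_device`` (assuming two GPUs and that ``cupy`` and ``pytest`` are imported): arrays must live on the device that is current at call time.

with cuda.Device(1):
    arr_on_dev1 = cupy.empty((3,))

with cuda.Device(0):
    check_args_device([cupy.empty((3,))])   # fine: allocated on device 0
    with pytest.raises(ValueError):
        check_args_device([arr_on_dev1])    # lives on device 1, current is 0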
Example #12
        def ret(*args, **kwargs):
            arg_key = (args, frozenset(kwargs.items()))
            if for_each_device:
                arg_key = (cuda.Device().id, arg_key)

            result = memo.get(arg_key, none)
            if result is none:
                result = f(*args, **kwargs)
                memo[arg_key] = result
            return result
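The ``ret`` closure above relies on ``memo``, ``none`` and ``for_each_device`` from an enclosing scope. A hypothetical sketch of a decorator that would produce it (the names ``memoize`` and ``decorator`` are illustrative, not taken from the source):

def memoize(for_each_device=False):
    def decorator(f):
        memo = {}
        none = object()   # sentinel, so a cached result of None still counts as a hit

        def ret(*args, **kwargs):
            arg_key = (args, frozenset(kwargs.items()))
            if for_each_device:
                arg_key = (cuda.Device().id, arg_key)

            result = memo.get(arg_key, none)
            if result is none:
                result = f(*args, **kwargs)
                memo[arg_key] = result
            return result
        return ret
    return decorator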
Example #13
    def run_reduce_scatter(rank, n_workers, dtype):
        dev = cuda.Device(rank)
        dev.use()
        comm = NCCLBackend(n_workers, rank)
        in_array = 1 + cupy.arange(n_workers * 10, dtype='f').reshape(
            n_workers, 10)
        out_array = cupy.zeros((10, ), dtype='f')

        comm.reduce_scatter(in_array, out_array, 10)
        testing.assert_allclose(out_array, 2 * in_array[rank])
Example #14
def run_init(rank, n_workers):
    dev = cuda.Device(rank)
    dev.use()
    comm = init_process_group(n_workers, rank)
    # Do a simple call to verify we got a valid comm
    in_array = cupy.zeros(1)
    if rank == 0:
        in_array = in_array + 1
    comm.broadcast(in_array, 0)
    testing.assert_allclose(in_array, cupy.ones(1))
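The ``run_*`` functions in these examples are written to be executed once per rank. A minimal launcher sketch (an assumption about the surrounding harness, not code from the test suite), using the 'spawn' start method so each child process initializes its own CUDA context:

import multiprocessing

def launch(func, n_workers, *args):
    ctx = multiprocessing.get_context('spawn')
    procs = [ctx.Process(target=func, args=(rank, n_workers) + args)
             for rank in range(n_workers)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

# e.g. launch(run_init, 2)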
Example #15
    def test_copy_multi_device_with_stream(self):
        # Kernel that runs for a while and then finally writes the values.
        kern = cupy.RawKernel(
            _test_copy_multi_device_with_stream_src, 'wait_and_write')

        # Allocate memory and launch the kernel on a device with its own
        # stream.
        with cuda.Device(0):
            with cuda.Stream():
                a = cupy.zeros((2,), dtype=numpy.uint64)
                kern((1,), (1,), a)

        # The D2D copy to another device on another stream should see the
        # original values of the memory before the kernel on the first device
        # finally performs the write.
        with cuda.Device(1):
            with cuda.Stream():
                b = a.copy()
                testing.assert_array_equal(
                    b, numpy.array([0, 0], dtype=numpy.uint64))
Example #16
def test_concatenate_large_different_devices(self):
    arrs = []
    for i in range(10):
        with cuda.Device(i % 2):
            arrs.append(cupy.empty((2, 3, 4)))
    if cuda.runtime.deviceCanAccessPeer(0, 1) == 1:
        with pytest.warns(cupy._util.PerformanceWarning):
            cupy.concatenate(arrs)
    else:
        with pytest.raises(ValueError):
            cupy.concatenate(arrs)
Example #17
def run_send_and_recv(rank, n_workers, dtype):
    dev = cuda.Device(rank)
    dev.use()
    comm = NCCLBackend(n_workers, rank)
    in_array = cupy.arange(10, dtype='f')
    out_array = cupy.zeros((10, ), dtype='f')
    if rank == 0:
        comm.send(in_array, 1)
    else:
        comm.recv(out_array, 0)
        testing.assert_allclose(out_array, in_array)
Example #18
def run_all_to_all(rank, n_workers, dtype):
    dev = cuda.Device(rank)
    dev.use()
    comm = NCCLBackend(n_workers, rank)
    in_array = cupy.arange(n_workers * 10,
                           dtype='f').reshape(n_workers, 10)
    out_array = cupy.zeros((n_workers, 10), dtype='f')
    comm.all_to_all(in_array, out_array)
    expected = (10 * rank) + cupy.broadcast_to(cupy.arange(10, dtype='f'),
                                               (n_workers, 10))
    testing.assert_allclose(out_array, expected)
Example #19
def run_barrier(rank, n_workers):
    dev = cuda.Device(rank)
    dev.use()
    comm = NCCLBackend(n_workers, rank)
    comm.barrier()
    before = time.time()
    if rank == 0:
        time.sleep(2)
    comm.barrier()
    after = time.time()
    assert int(after - before) == 2
Example #20
def run_broadcast(rank, n_workers, root, dtype):
    dev = cuda.Device(rank)
    dev.use()
    comm = NCCLBackend(n_workers, rank)
    expected = cupy.arange(2 * 3 * 4, dtype=dtype).reshape((2, 3, 4))
    if rank == root:
        in_array = expected
    else:
        in_array = cupy.zeros((2, 3, 4), dtype=dtype)
    comm.broadcast(in_array, root)
    testing.assert_allclose(in_array, expected)
Example #21
def test_single_proc_single_dev(self):
    comms = cuda.nccl.NcclCommunicator.initAll(1)
    cuda.nccl.groupStart()
    for comm in comms:
        cuda.Device(comm.device_id()).use()
        sendbuf = cupy.arange(10)
        recvbuf = cupy.zeros_like(sendbuf)
        comm.allReduce(sendbuf.data.ptr, recvbuf.data.ptr, 10,
                       cuda.nccl.NCCL_INT64, cuda.nccl.NCCL_SUM,
                       cuda.Stream.null.ptr)
    cuda.nccl.groupEnd()
    assert cupy.allclose(sendbuf, recvbuf)
Example #22
def run_gather(rank, n_workers, root, dtype):
    dev = cuda.Device(rank)
    dev.use()
    comm = NCCLBackend(n_workers, rank)
    in_array = (rank + 1) * cupy.arange(10, dtype='f')
    out_array = cupy.zeros((n_workers, 10), dtype='f')
    comm.gather(in_array, out_array, root)
    if rank == root:
        expected = 1 + cupy.arange(n_workers).reshape(n_workers, 1)
        expected = expected * cupy.broadcast_to(cupy.arange(10, dtype='f'),
                                                (n_workers, 10))
        testing.assert_allclose(out_array, expected)
Example #23
def _check_args(args):
    dev = cuda.Device()
    cp_array = cupy.ndarray
    scalar_type = _scalar_type
    for arg in args:
        if isinstance(arg, cp_array):
            arr_dev = arg.device
            if arr_dev is not None and arr_dev != dev:
                raise ValueError('Array device must be same as the current '
                                 'device: array device = %d while current = %d'
                                 % (arr_dev.id, dev.id))
        elif not isinstance(arg, scalar_type):
            raise TypeError('Unsupported type %s' % type(arg))
Example #24
    def test_copy_multi_device_with_stream(self):
        # Kernel that runs for a while and then finally writes the values.
        src = _test_copy_multi_device_with_stream_src
        if runtime.is_hip and driver.get_build_version() >= 5_00_00000:
            src = '#include <ctime>\n' + src
        kern = cupy.RawKernel(src, 'wait_and_write')

        # Allocate memory and launch the kernel on a device with its own
        # stream.
        with cuda.Device(0):
            # Keep this stream alive over the D2D copy below for HIP
            with cuda.Stream() as s1:  # NOQA
                a = cupy.zeros((2,), dtype=numpy.uint64)
                kern((1,), (1,), a)

        # The D2D copy to another device on another stream should see the
        # original values of the memory before the kernel on the first device
        # finally performs the write.
        with cuda.Device(1):
            with cuda.Stream():
                b = a.copy()
                testing.assert_array_equal(
                    b, numpy.array([0, 0], dtype=numpy.uint64))
Example #25
        def ret(*args, **kwargs):
            global _memoized_funcs

            arg_key = (args, frozenset(kwargs.items()))
            if for_each_device:
                arg_key = (cuda.Device().id, arg_key)

            memo = getattr(f, '_cupy_memo', None)
            if memo is None:
                memo = f._cupy_memo = {}
                _memoized_funcs.append(f)

            result = memo.get(arg_key, None)
            if result is None:
                result = f(*args, **kwargs)
                memo[arg_key] = result
            return result
Example #26
    def test_context_and_use(self):
        dev0 = cuda.Device(0)
        dev1 = cuda.Device(1)

        dev1.use()
        with dev0:
            assert 0 == cuda.Device().id
            dev1.use()
            with dev1:
                assert 1 == cuda.Device().id
            assert 0 == cuda.Device().id
        assert 0 == cuda.Device().id
Example #27
def get_random_state():
    """Gets the state of the random number generator for the current device.

    If the state for the current device is not created yet, this function
    creates a new one, initializes it, and stores it as the state for the
    current device.

    Returns:
        RandomState: The state of the random number generator for the
        device.

    """
    global _random_states
    dev = cuda.Device()
    rs = _random_states.get(dev.id, None)
    if rs is None:
        rs = RandomState()
        _random_states[dev.id] = rs
    return rs
Example #28
def get_random_state():
    """Gets the state of the random number generator for the current device.

    If the state for the current device is not created yet, this function
    creates a new one, initializes it, and stores it as the state for the
    current device.

    Returns:
        RandomState: The state of the random number generator for the
        device.

    """
    dev = cuda.Device()
    rs = _random_states.get(dev.id, None)
    if rs is None:
        seed = os.getenv('CUPY_SEED')
        if seed is None:
            seed = os.getenv('CHAINER_SEED')
        rs = RandomState(seed)
        rs = _random_states.setdefault(dev.id, rs)
    return rs
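A usage sketch for ``get_random_state`` (assuming ``_random_states`` starts out empty and two GPUs are available): the state is created lazily and cached per device id, so each device keeps its own ``RandomState`` and repeated calls on a device return the same object.

with cuda.Device(0):
    rs0 = get_random_state()
    assert get_random_state() is rs0   # cached for device 0
with cuda.Device(1):
    rs1 = get_random_state()
    assert rs1 is not rs0              # device 1 gets its own state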
Example #29
def test_copy_multi_device_non_contiguous_K(self):
    arr = core.ndarray((20,))[::2]
    with cuda.Device(1):
        with self.assertRaises(NotImplementedError):
            arr.copy('K')
Example #30
def test_deepcopy_multi_device(self):
    arr = core.ndarray(self.shape)
    with cuda.Device(1):
        arr2 = copy.deepcopy(arr)
    self._check_deepcopy(arr, arr2)
    assert arr2.device == arr.device