Example #1
def copy(a):
    """Creates a copy of a given array on the current device.

    This function allocates the new array on the current device. If the given
    array is allocated on a different device, then this function tries to
    copy the contents across devices.

    Args:
        a (cupy.ndarray): The source array.

    Returns:
        cupy.ndarray: The copy of ``a`` on the current device.

    See: :func:`numpy.copy`, :meth:`cupy.ndarray.copy`

    """
    # If the current device is different from the device of ``a``, then this
    # function allocates a new array on the current device, and copies the
    # contents across devices.
    # TODO(beam2d): Support ordering option
    if a.size == 0:
        return cupy.empty_like(a)

    if not a.flags.c_contiguous:
        a = ascontiguousarray(a)
        if a.data.device == cuda.Device():
            return a
    newarray = cupy.empty_like(a)
    newarray.data.copy_from(a.data, a.nbytes)
    return newarray
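A minimal usage sketch, assuming the public cupy.copy behaves like the helper above (the result likewise lives on the current device):

import cupy

a = cupy.arange(5)
b = cupy.copy(a)              # allocated on the current device
b[0] = 100
print(int(a[0]), int(b[0]))   # 0 100 -- the copy does not alias the source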
Example #2
def _batched_inv(a):

    assert a.ndim >= 3
    _util._assert_cupy_array(a)
    _util._assert_stacked_square(a)
    dtype, out_dtype = _util.linalg_common_type(a)

    if dtype == cupy.float32:
        getrf = cupy.cuda.cublas.sgetrfBatched
        getri = cupy.cuda.cublas.sgetriBatched
    elif dtype == cupy.float64:
        getrf = cupy.cuda.cublas.dgetrfBatched
        getri = cupy.cuda.cublas.dgetriBatched
    elif dtype == cupy.complex64:
        getrf = cupy.cuda.cublas.cgetrfBatched
        getri = cupy.cuda.cublas.cgetriBatched
    elif dtype == cupy.complex128:
        getrf = cupy.cuda.cublas.zgetrfBatched
        getri = cupy.cuda.cublas.zgetriBatched
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    if 0 in a.shape:
        return cupy.empty_like(a, dtype=out_dtype)
    a_shape = a.shape

    # copy is necessary to prevent `a` from being overwritten.
    a = a.astype(dtype, order='C').reshape(-1, a_shape[-2], a_shape[-1])

    handle = device.get_cublas_handle()
    batch_size = a.shape[0]
    n = a.shape[1]
    lda = n
    step = n * lda * a.itemsize
    start = a.data.ptr
    stop = start + step * batch_size
    a_array = cupy.arange(start, stop, step, dtype=cupy.uintp)
    pivot_array = cupy.empty((batch_size, n), dtype=cupy.int32)
    info_array = cupy.empty((batch_size, ), dtype=cupy.int32)

    getrf(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr,
          info_array.data.ptr, batch_size)
    cupy.linalg._util._check_cublas_info_array_if_synchronization_allowed(
        getrf, info_array)

    c = cupy.empty_like(a)
    ldc = lda
    step = n * ldc * c.itemsize
    start = c.data.ptr
    stop = start + step * batch_size
    c_array = cupy.arange(start, stop, step, dtype=cupy.uintp)

    getri(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr,
          c_array.data.ptr, ldc, info_array.data.ptr, batch_size)
    cupy.linalg._util._check_cublas_info_array_if_synchronization_allowed(
        getri, info_array)

    return c.reshape(a_shape).astype(out_dtype, copy=False)
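A hedged batched-inverse sketch, assuming a CuPy version whose cupy.linalg.inv accepts stacked inputs (for which it may use the batched cuBLAS path shown above):

import cupy

a = cupy.random.rand(4, 3, 3)             # batch of 4 random 3x3 matrices
a_inv = cupy.linalg.inv(a)
eye = cupy.broadcast_to(cupy.eye(3), a.shape)
print(bool(cupy.allclose(a @ a_inv, eye, atol=1e-8)))  # True (almost surely)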
Example #3
 def _init_params(self, n):
     # CuPy setup
     self.x = cp.linspace(0, 2, n, dtype=cp.double)
     self.y = cp.linspace(0, 2, n, dtype=cp.double)
     self.X, self.Y = cp.meshgrid(self.x, self.y)
     self.u = cp.zeros((n, n), dtype=cp.double)
     self.v = cp.zeros((n, n), dtype=cp.double)
     self.p = cp.zeros((n, n), dtype=cp.double)
     self.un = cp.empty_like(self.u, dtype=cp.double)
     self.vn = cp.empty_like(self.v, dtype=cp.double)
     self.pn = cp.empty_like(self.p, dtype=cp.double)
     self.b = cp.zeros((n, n), dtype=cp.double)
Example #4
def _ndimage_mean_kernel_2(input, labels, index, batch_size=4,
                           return_count=False):
    sum_val = cupy.empty_like(index, dtype=cupy.float64)
    count = cupy.empty_like(index, dtype=cupy.uint64)
    for i in range(0, index.size, batch_size):
        matched = labels == index[i:i + batch_size].reshape(
            (-1,) + (1,) * input.ndim)
        mean_axes = tuple(range(1, 1 + input.ndim))
        count[i:i + batch_size] = matched.sum(axis=mean_axes)
        sum_val[i:i + batch_size] = cupy.where(matched, input, 0).sum(
            axis=mean_axes)
    if return_count:
        return sum_val / count, count
    return sum_val / count
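A usage sketch of the labelled mean; cupyx.scipy.ndimage.mean is presumably the public entry point backed by helpers like the one above:

import cupy
from cupyx.scipy import ndimage

img = cupy.array([[1.0, 2.0, 3.0],
                  [4.0, 5.0, 6.0]])
labels = cupy.array([[1, 1, 2],
                     [1, 2, 2]])
# mean of values with label 1 -> (1+2+4)/3, label 2 -> (3+5+6)/3
print(ndimage.mean(img, labels=labels, index=cupy.array([1, 2])))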
Example #5
def precompute_cc_factors(ad, bd, radius, mode="constant"):

    # factors = cp.zeros((5,) + ad.shape, dtype=ad.dtype)
    factors = [None] * 5
    sum_h = cp.ones((2 * radius + 1, ), dtype=ad.dtype)
    h_tuple = (sum_h, ) * ad.ndim
    kwargs = dict(mode=mode)
    sum_a = convolve_separable(ad, h_tuple, **kwargs)
    sum_b = convolve_separable(bd, h_tuple, **kwargs)
    sum_ab = convolve_separable(ad * bd, h_tuple, **kwargs)
    sum_aa = convolve_separable(ad * ad, h_tuple, **kwargs)
    sum_bb = convolve_separable(bd * bd, h_tuple, **kwargs)
    if mode != "constant":
        cnt = (2 * radius + 1)**ad.ndim
    else:
        cnt = convolve_separable(cp.ones_like(ad), (sum_h, ) * ad.ndim,
                                 **kwargs).astype(cp.int32)

    if True:
        factors[0] = cp.empty_like(ad)
        factors[1] = cp.empty_like(ad)
        factors[2] = cp.empty_like(ad)
        factors[3] = cp.empty_like(ad)
        factors[4] = cp.empty_like(ad)
        _cc_precompute(
            ad,
            bd,
            sum_a,
            sum_b,
            sum_ab,
            sum_aa,
            sum_bb,
            cnt,
            factors[0],
            factors[1],
            factors[2],
            factors[3],
            factors[4],
        )
    else:
        a_mean = sum_a / cnt
        b_mean = sum_b / cnt
        factors[0] = ad - a_mean
        factors[1] = bd - b_mean
        factors[2] = sum_ab - b_mean * sum_a - a_mean * sum_b + sum_a * b_mean
        factors[3] = sum_aa - (a_mean + a_mean) * sum_a + sum_a * a_mean
        factors[4] = sum_bb - (b_mean + b_mean) * sum_b + sum_b * b_mean

    return factors
Example #6
 def test_getitem_int(self):
     x = cupy.arange(24).reshape((2, 3, 4)).astype('i')
     y = cupy.empty_like(x)
     y = cupy.ElementwiseKernel(
         'raw T x', 'int32 y', 'y = x[i]', 'test_carray_getitem_int',
     )(x, y)
     testing.assert_array_equal(y, x)
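For context, a minimal (non-raw) ElementwiseKernel sketch; the test above uses the 'raw' qualifier so that x can be indexed with i explicitly:

import cupy

squared_diff = cupy.ElementwiseKernel(
    'T x, T y',                 # typed inputs
    'T z',                      # typed output
    'z = (x - y) * (x - y)',    # per-element body
    'squared_diff')

a = cupy.arange(5, dtype=cupy.float32)
b = cupy.ones(5, dtype=cupy.float32)
print(squared_diff(a, b))       # [1. 0. 1. 4. 9.]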
Example #7
    def clip(self, indices, in_place=False, remove=False):
        '''
        Removes the items of list, header, and data
        that do not have index in the given indices.

        Parameters
        ----------
        indices : array-like
            indices of items to leave
        in_place : bool, default False
            If True, perform in-place array manipulations for ndarray.
            This saves memory, but the values of the original ndarray
            are not preserved.
        remove : bool, default False
            If True, ``indices`` specifies the items to remove instead of
            the items to keep.
        '''
        if remove:
            indices = sorted(set(range(len(self))) - set(indices))
        else:
            indices = sorted(indices)

        list = self.list
        head = self.header
        data = self.data

        self.list = (list[i] for i in indices)
        self.header = (head[i] for i in indices)

        tmpd = data[:len(indices)]
        if not in_place:
            tmpd = cp.empty_like(tmpd)
        for dst, j in zip(tmpd, indices):
            cp.copyto(dst, data[j])
        self.data = tmpd
Example #8
def apply_diffusion(in_field, out_field, alpha, num_halo, num_iter=1):
    """ Integrate 4th-order diffusion equation by a certain number of iterations.

    Parameters
    ----------
    in_field : array-like
        Input field (nz x ny x nx with halo in x- and y-direction).
    out_field : array-like
        Result (must be same size as ``in_field``).
    alpha : float
        Diffusion coefficient (dimensionless).
    num_halo : int
        Number of halo points in the x- and y-directions.
    num_iter : `int`, optional
        Number of iterations to execute.
    """
    tmp_field = xp.empty_like(in_field)

    for n in range(num_iter):
        halo_update(in_field, num_halo)

        laplacian(in_field, tmp_field, num_halo=num_halo, extend=1)
        laplacian(tmp_field, out_field, num_halo=num_halo, extend=0)

        out_field[:, num_halo:-num_halo, num_halo:-num_halo] = (
            in_field[:, num_halo:-num_halo, num_halo:-num_halo] -
            alpha * out_field[:, num_halo:-num_halo, num_halo:-num_halo])

        if n < num_iter - 1:
            in_field, out_field = out_field, in_field
        else:
            halo_update(out_field, num_halo)
Example #9
def _pack(binary):

    data_size = binary.dtype.itemsize * binary.shape[0]
    out_size = data_size

    out = cp.empty_like(binary, dtype=cp.ubyte, shape=out_size)

    threadsperblock, blockspergrid = _get_tpb_bpg()

    k_type = "pack"

    _populate_kernel_cache(out.dtype, k_type)

    kernel = _get_backend_kernel(
        out.dtype,
        blockspergrid,
        threadsperblock,
        k_type,
    )

    kernel(out_size, binary, out)

    _print_atts(kernel)

    # Remove binary data
    del binary

    return out
Example #10
def _reshuffle_dct3(y, n, axis, dst):
    """Reorder entries to allow computation of DCT/DST-II via FFT."""
    x = cupy.empty_like(y)
    n_half = (n + 1) // 2

    # Store first half of y in the even entries of the output
    sl_even = [slice(None)] * y.ndim
    sl_even[axis] = slice(0, None, 2)
    sl_even = tuple(sl_even)

    sl_half = [slice(None)] * y.ndim
    sl_half[axis] = slice(0, n_half)
    x[sl_even] = y[tuple(sl_half)]

    # Store the second half of y in the odd entries of the output
    sl_odd = [slice(None)] * y.ndim
    sl_odd[axis] = slice(1, None, 2)
    sl_odd = tuple(sl_odd)

    sl_half[axis] = slice(-1, n_half - 1, -1)
    if dst:
        x[sl_odd] = -y[tuple(sl_half)]
    else:
        x[sl_odd] = y[tuple(sl_half)]
    return x
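A hedged sketch of the user-facing path that relies on this reshuffling; depending on the CuPy version, the DCT transforms are exposed via cupyx.scipy.fft:

import cupy
import cupyx.scipy.fft as cpfft

x = cupy.random.rand(8)
y = cpfft.dct(x, type=3, norm='ortho')
x_back = cpfft.idct(y, type=3, norm='ortho')
print(bool(cupy.allclose(x, x_back)))  # True: the round trip recovers x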
Example #11
    def test_static_array(self):
        code = '''
struct double5 {
    double value[5];
    __device__ const double& operator[](size_t i) const { return value[i]; }
};

extern "C" __global__ void test_kernel(const double* a, double5 b, double* x) {
  int i = threadIdx.x;
  x[i] = a[i] + b[0] + b[1] + b[2] + b[3] + b[4];
}
'''

        a_cpu = numpy.arange(24, dtype=numpy.float64)
        a = cupy.array(a_cpu)
        x = cupy.empty_like(a)

        func = _compile_func('test_kernel', code)

        # We cannot pass np.ndarray kernel arguments of size > 1
        b = numpy.arange(5).astype(numpy.float64)
        with pytest.raises(TypeError):
            func.linear_launch(a.size, (a, b, x))

        double5 = numpy.dtype({
            'names': ['dummy'],
            'formats': [(numpy.float64, (5, ))]
        })
        func.linear_launch(a.size, (a, b.view(double5), x))

        expected = a_cpu + b.sum()
        testing.assert_array_equal(x, expected)
Example #12
def copy(array, out=None, out_device=None, stream=None):
    """Copies a cupy.ndarray object using the default stream.

    This function can copy the device array to the destination array on another
    device.

    Args:
        array (cupy.ndarray): Array to be copied.
        out (cupy.ndarray): Destination array.
            If it is not ``None``, then ``out_device`` argument is ignored.
        out_device: Destination device specifier. Actual device object is
            obtained by passing this value to :func:`get_device`.
        stream (cupy.cuda.Stream): CUDA stream.

    Returns:
        cupy.ndarray: Copied array.

        If ``out`` is not specified, then the array is allocated on the device
        specified by ``out_device`` argument.

    """
    check_cuda_available()
    assert stream is None  # TODO(beam2d): FIX IT

    if out is None:
        if out_device is None:
            out_device = array
        with get_device(out_device):
            out = cupy.empty_like(array)

    with get_device(array):
        cupy.copyto(out, array)

    return out
Example #13
 def test_getitem_int(self):
     x = cupy.arange(24).reshape((2, 3, 4)).astype('i')
     y = cupy.empty_like(x)
     y = cupy.ElementwiseKernel(
         'raw T x', 'int32 y', 'y = x[i]', 'test_carray_getitem_int',
     )(x, y)
     testing.assert_array_equal(y, x)
Example #14
 def test_uniform_filter1d(self):
     d = cp.random.randn(5000)
     os = cp.empty((4, d.size))
     ot = cp.empty_like(os)
     self.check_func_serial(4, sndi.uniform_filter1d, (d, 5), os)
     self.check_func_thread(4, sndi.uniform_filter1d, (d, 5), ot)
     assert_array_equal(os, ot)
Example #15
def activation_backward(x, y, gy, mode):
    x = cupy.ascontiguousarray(x)
    gy = cupy.ascontiguousarray(gy)

    gx = cupy.empty_like(x)
    dtype = "d" if x.dtype == "d" else "f"
    one = numpy.array(1, dtype=dtype).ctypes
    zero = numpy.array(0, dtype=dtype).ctypes
    handle = get_handle()
    y_mat = _as4darray(y)
    desc = create_tensor_descriptor(y_mat)
    cudnn.activationBackward_v3(
        handle,
        mode,
        one.data,
        desc.value,
        y.data.ptr,
        desc.value,
        gy.data.ptr,
        desc.value,
        x.data.ptr,
        zero.data,
        desc.value,
        gx.data.ptr,
    )
    return gx
Example #16
def to_gpu(array, device=None, stream=None):
    """Copies the given CPU array to the specified device.

    Args:
        array: Array to be sent to GPU.
        device: Device specifier.
        stream (~cupy.cuda.Stream): *(deprecated since v3.0.0)*
            CUDA stream. If not ``None``, the copy runs asynchronously.

    Returns:
        cupy.ndarray: Array on GPU.

        If ``array`` is already on the GPU device specified by ``device``,
        this function just returns ``array`` without performing any copy.

    """
    if stream is not None:
        warnings.warn(
            'The stream option is deprecated in chainer.cuda.to_gpu. '
            'Please remove it.', DeprecationWarning)

    check_cuda_available()
    if not isinstance(array, (cupy.ndarray, numpy.ndarray)):
        raise TypeError(
            'The array sent to gpu must be numpy.ndarray or cupy.ndarray.'
            '\nActual type: {0}.'.format(type(array)))
    with _get_device(device):
        array_dev = get_device_from_array(array)
        if array_dev.id == cupy.cuda.device.get_device_id():
            return array

        if stream is not None and stream.ptr != 0:
            ret = cupy.empty_like(array)
            if array_dev.id == -1:
                # cpu to gpu
                mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
                src = numpy.frombuffer(mem, array.dtype,
                                       array.size).reshape(array.shape)
                src[...] = array
                ret.set(src, stream)
                cupy.cuda.pinned_memory._add_to_watch_list(
                    stream.record(), mem)
            else:
                # gpu to gpu
                with array_dev:
                    src = array.copy()
                    event = Stream.null.record()
                stream.wait_event(event)
                ret.data.copy_from_device_async(src.data, src.nbytes, stream)

                # to hold a reference until the end of the asynchronous
                # memcpy
                stream.add_callback(lambda *x: None, (src, ret))
            return ret

        if array_dev.id == -1:
            return cupy.asarray(array)

        # Need to make a copy when an array is copied to another device
        return cupy.array(array, copy=True)
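A usage sketch for the helper above, assuming it is exposed as chainer.cuda.to_gpu and a GPU is available:

import numpy
from chainer import cuda

x_cpu = numpy.arange(6, dtype=numpy.float32).reshape(2, 3)
x_gpu = cuda.to_gpu(x_cpu)     # copied to the current device
same = cuda.to_gpu(x_gpu)      # already on that device: returned as-is
print(same is x_gpu)           # True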
Example #17
    def test_backward_fft(self, dtype):
        t = dtype
        idtype = odtype = edtype = cupy.dtype(t)
        shape = self.shape
        length = cupy.core.internal.prod(shape[1:])

        a = testing.shaped_random(shape, cupy, dtype)
        out = cupy.empty_like(a)

        plan = cufft.XtPlanNd(shape[1:],
                              shape[1:],
                              1,
                              length,
                              idtype,
                              shape[1:],
                              1,
                              length,
                              odtype,
                              shape[0],
                              edtype,
                              order='C',
                              last_axis=-1,
                              last_size=None)
        plan.fft(a, out, cufft.CUFFT_INVERSE)

        if len(shape) <= 2:
            out_cp = cupy.fft.ifft(a)
        else:
            out_cp = cupy.fft.ifftn(a, axes=(-1, -2))
        testing.assert_allclose(out / length, out_cp)
Example #18
def _terrain_gpu(height_map, seed, x_range=(0, 1), y_range=(0, 1)):

    NOISE_LAYERS = ((1 / 2**i, (2**i, 2**i)) for i in range(16))

    noise = cupy.empty_like(height_map, dtype=np.float32)

    griddim, blockdim = cuda_args(height_map.shape)

    for i, (m, (xfreq, yfreq)) in enumerate(NOISE_LAYERS):

        # cupy.random.seed(seed+i)
        # p = cupy.random.permutation(2**20)

        # use numpy.random then transfer data to GPU to ensure the same result
        # when running numpy backed and cupy backed data array.
        np.random.seed(seed + i)
        p = cupy.asarray(np.random.permutation(2**20))
        p = cupy.append(p, p)

        _perlin_gpu[griddim,
                    blockdim](p, x_range[0] * xfreq, x_range[1] * xfreq,
                              y_range[0] * yfreq, y_range[1] * yfreq, m, noise)

        height_map += noise

    height_map /= (1.00 + 0.50 + 0.25 + 0.13 + 0.06 + 0.03)
    height_map = height_map**3
    return height_map
Example #19
def test_cross_correlate_masked_over_axes():
    """Masked normalized cross-correlation over axes should be
    equivalent to a loop over non-transform axes."""
    # Seed the random number generator for reproducible results
    np.random.seed(23)

    arr1 = np.random.random((8, 8, 5))
    arr2 = np.random.random((8, 8, 5))

    m1 = np.random.choice([True, False], arr1.shape)
    m2 = np.random.choice([True, False], arr2.shape)

    arr1 = cp.asarray(arr1)
    arr2 = cp.asarray(arr2)
    m1 = cp.asarray(m1)
    m2 = cp.asarray(m2)

    # Loop over last axis
    with_loop = cp.empty_like(arr1, dtype=np.complex128)
    for index in range(arr1.shape[-1]):
        with_loop[:, :, index] = cross_correlate_masked(arr1[:, :, index],
                                                        arr2[:, :, index],
                                                        m1[:, :, index],
                                                        m2[:, :, index],
                                                        axes=(0, 1),
                                                        mode='same')

    over_axes = cross_correlate_masked(
        arr1, arr2, m1, m2, axes=(0, 1), mode='same')

    cp.testing.assert_array_almost_equal(with_loop, over_axes)
Example #20
def _unpack(binary, dtype, endianness):

    data_size = cp.dtype(dtype).itemsize // binary.dtype.itemsize

    out_size = binary.shape[0] // data_size

    out = cp.empty_like(binary, dtype=dtype, shape=out_size)

    if endianness == "B":
        little = False
    else:
        little = True

    threadsperblock, blockspergrid = _get_tpb_bpg()

    k_type = "unpack"

    _populate_kernel_cache(out.dtype, k_type)

    kernel = _get_backend_kernel(
        out.dtype,
        blockspergrid,
        threadsperblock,
        k_type,
    )

    kernel(out_size, little, binary, out)

    _print_atts(kernel)

    # Remove binary data
    del binary

    return out
Example #21
def activation_backward(x, y, gy, mode):
    x = cupy.ascontiguousarray(x)
    gy = cupy.ascontiguousarray(gy)

    gx = cupy.empty_like(x)
    dtype = 'd' if x.dtype == 'd' else 'f'
    one = numpy.array(1, dtype=dtype).ctypes
    zero = numpy.array(0, dtype=dtype).ctypes
    handle = get_handle()
    y_mat = _as4darray(y)
    desc = create_tensor_descriptor(y_mat)
    if _cudnn_version >= 4000:
        act_desc = Descriptor(cudnn.createActivationDescriptor(),
                              cudnn.destroyActivationDescriptor)
        cudnn.setActivationDescriptor(
            act_desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0)
        cudnn.activationBackward_v4(
            handle, act_desc.value, one.data, desc.value, y.data.ptr,
            desc.value, gy.data.ptr, desc.value, x.data.ptr,
            zero.data, desc.value, gx.data.ptr)
    else:
        cudnn.activationBackward_v3(
            handle, mode, one.data, desc.value, y.data.ptr,
            desc.value, gy.data.ptr, desc.value, x.data.ptr,
            zero.data, desc.value, gx.data.ptr)
    return gx
Example #22
def main():
    N = 8
    module = cupy.RawModule(code=code,
                            options=('-std=c++11', ),
                            name_expressions=('kernel<float>',
                                              'kernel<double>'))

    # The kernel computes out = A*B+C where A, B and C are 4x4 matrices.
    # A and B are arrays of N such matrices and C is a matrix kernel parameter.

    for (ctype, dtype) in zip(('float', 'double'),
                              (numpy.float32, numpy.float64)):

        A = cupy.random.rand(16 * N, dtype=dtype).reshape(N, 4, 4)
        B = cupy.random.rand(16 * N, dtype=dtype).reshape(N, 4, 4)
        C = numpy.random.rand(16).astype(dtype).reshape(4, 4)
        out = cupy.empty_like(A)

        Matrix = numpy.dtype({
            'names': ['value'],
            'formats': [(dtype, (4, 4))]
        })

        kernel = module.get_function('kernel<{}>'.format(ctype))
        args = (A, B, C.ravel().view(Matrix), out)
        kernel((1, ), (N, ), args)

        expected = cupy.matmul(A, B) + cupy.asarray(C[None, :, :])

        cupy.testing.assert_array_almost_equal(expected, out)
        print("Kernel output matches expected value for "
              "type '{}'.".format(ctype))
Example #23
def ascontiguousarray(a, dtype=None):
    """Returns a C-contiguous array.

    Args:
        a (cupy.ndarray): Source array.
        dtype: Data type specifier.

    Returns:
        cupy.ndarray: If no copy is required, it returns ``a``. Otherwise, it
        returns a copy of ``a``.

    .. seealso:: :func:`numpy.ascontiguousarray`

    """
    if dtype is None:
        dtype = a.dtype
    else:
        dtype = numpy.dtype(dtype)

    if dtype == a.dtype and a.flags.c_contiguous:
        return a
    else:
        newarray = cupy.empty_like(a, dtype)
        elementwise.copy(a, newarray)
        return newarray
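A short usage sketch of ascontiguousarray: a transposed view is not C-contiguous, so a copy is made, while an already-contiguous array is returned unchanged:

import cupy

x = cupy.arange(12).reshape(3, 4)
xt = x.T                                   # a view; not C-contiguous
y = cupy.ascontiguousarray(xt)             # fresh C-contiguous copy
print(xt.flags.c_contiguous, y.flags.c_contiguous)   # False True
print(cupy.ascontiguousarray(x) is x)      # True: no copy needed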
Example #24
def copy(array, out=None, out_device=None, stream=None):
    """Copies a :class:`cupy.ndarray` object using the default stream.

    This function can copy the device array to the destination array on another
    device.

    Args:
        array (cupy.ndarray): Array to be copied.
        out (cupy.ndarray): Destination array.
            If it is not ``None``, then ``out_device`` argument is ignored.
        out_device: Destination device specifier. Actual device object is
            obtained by passing this value to :func:`get_device`.
        stream (cupy.cuda.Stream): CUDA stream.

    Returns:
        cupy.ndarray: Copied array.

        If ``out`` is not specified, then the array is allocated on the device
        specified by ``out_device`` argument.

    """
    check_cuda_available()
    assert stream is None  # TODO(beam2d): FIX IT

    if out is None:
        if out_device is None:
            out_device = array
        with get_device(out_device):
            out = cupy.empty_like(array)

    with get_device(array):
        cupy.copyto(out, array)

    return out
Example #25
def inv_gpu(b):
    # We do a batched LU decomposition on the GPU to compute the inverse
    # Change the shape of the array to be size=1 minibatch if necessary
    # Also copy the matrix as the elements will be modified in-place
    a = _as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array
    p = cupy.empty((n, n_matrices), dtype=np.int32)
    # Output array
    c = cupy.empty_like(a)
    # These arrays hold information on the execution success
    # or if the matrix was singular
    info = cupy.empty(n_matrices, dtype=np.int32)
    ap = _mat_ptrs(a)
    cp = _mat_ptrs(c)
    _, lda = _get_ld(a)
    _, ldc = _get_ld(c)
    handle = cuda.Device().cublas_handle
    cuda.cublas.sgetrfBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, info.data.ptr, n_matrices)
    cuda.cublas.sgetriBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, cp.data.ptr, ldc,
        info.data.ptr, n_matrices)
    return c
Example #26
def pressure_poisson(u,v,p,b, nit, dx, dy):

    pn = cp.empty_like(p)
    pn = p.copy()
    # pn holds all boundary-related elements after the first time loop.
    # The loop below propagates the pressure terms from the boundary to the
    # whole surface via finite-difference schemes in time; in effect, the
    # boundary conditions are applied and the whole surface is obtained
    # discretely.

    for q in range(nit):

        # The (n-1)th time pressure is used to calculate the nth time pressure.
        pn = p.copy()

        # optimized python code for our PP equation
        p[1:-1, 1:-1] = (((pn[1:-1, 2:] + pn[1:-1, 0:-2]) * dy**2 +
                          (pn[2:, 1:-1] + pn[0:-2, 1:-1]) * dx**2) /
                          (2 * (dx**2 + dy**2)) -
                          dx**2 * dy**2 / (2 * (dx**2 + dy**2)) *
                          b[1:-1,1:-1])

        # boundary conditions:
        # a backward-difference scheme is used and set equal to 0

        p[:, -1] = p[:, -2] # dp/dx = 0 at x = 2
        p[0, :] = p[1, :]   # dp/dy = 0 at y = 0
        p[:, 0] = p[:, 1]   # dp/dx = 0 at x = 0
        p[-1, :] = 0        # p = 0 at y = 2

    return p
Example #27
def ascontiguousarray(a, dtype=None):
    """Returns a C-contiguous array.

    Args:
        a (cupy.ndarray): Source array.
        dtype: Data type specifier.

    Returns:
        cupy.ndarray: If no copy is required, it returns ``a``. Otherwise, it
        returns a copy of ``a``.

    .. seealso:: :func:`numpy.ascontiguousarray`

    """
    if dtype is None:
        dtype = a.dtype
    else:
        dtype = numpy.dtype(dtype)

    if dtype == a.dtype and a.flags.c_contiguous:
        return a
    else:
        newarray = cupy.empty_like(a, dtype)
        elementwise.copy(a, newarray)
        return newarray
Example #28
def activation_backward(x, y, gy, mode):
    x = cupy.ascontiguousarray(x)
    gy = cupy.ascontiguousarray(gy)

    gx = cupy.empty_like(x)
    dtype = 'd' if x.dtype == 'd' else 'f'
    one = numpy.array(1, dtype=dtype).ctypes
    zero = numpy.array(0, dtype=dtype).ctypes
    handle = get_handle()
    y_mat = _as4darray(y)
    desc = create_tensor_descriptor(y_mat)
    if _cudnn_version >= 4000:
        act_desc = Descriptor(cudnn.createActivationDescriptor(),
                              cudnn.destroyActivationDescriptor)
        cudnn.setActivationDescriptor(
            act_desc.value, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, 0.0)
        cudnn.activationBackward_v4(
            handle, act_desc.value, one.data, desc.value, y.data.ptr,
            desc.value, gy.data.ptr, desc.value, x.data.ptr,
            zero.data, desc.value, gx.data.ptr)
    else:
        cudnn.activationBackward_v3(
            handle, mode, one.data, desc.value, y.data.ptr,
            desc.value, gy.data.ptr, desc.value, x.data.ptr,
            zero.data, desc.value, gx.data.ptr)
    return gx
Example #29
 def test_correlate1d(self):
     d = cp.random.randn(5000)
     os = cp.empty((4, d.size))
     ot = cp.empty_like(os)
     self.check_func_serial(4, sndi.correlate1d, (d, cp.arange(5)), os)
     self.check_func_thread(4, sndi.correlate1d, (d, cp.arange(5)), ot)
     assert_array_equal(os, ot)
Example #30
def cavity_flow(nt, u, v, p, b, vis, rho, nx, ny, nit, dt, dx, dy):
    un = cp.empty_like(u)
    vn = cp.empty_like(v)
    b = cp.zeros((ny, nx), dtype = np.float64)

    for n in range(nt):
        un = u.copy()
        vn = v.copy()

        #b = build_up_b(b, rho, dt, u, v, dx, dy)
        #p = pressure_poisson(p, dx, dy, b)

        b = build_up_b(b, u, v, rho, dt, dx, dy)
        p = pressure_poisson(un,vn,p,b, nit, dx, dy)

        u[1:-1, 1:-1] = (un[1:-1, 1:-1]-
                         un[1:-1, 1:-1] * dt / dx *
                        (un[1:-1, 1:-1] - un[1:-1, 0:-2]) -
                         vn[1:-1, 1:-1] * dt / dy *
                        (un[1:-1, 1:-1] - un[0:-2, 1:-1]) -
                         dt / (2 * rho * dx) * (p[1:-1, 2:] - p[1:-1, 0:-2]) +
                         vis * (dt / dx**2 *
                        (un[1:-1, 2:] - 2 * un[1:-1, 1:-1] + un[1:-1, 0:-2]) +
                         dt / dy**2 *
                        (un[2:, 1:-1] - 2 * un[1:-1, 1:-1] + un[0:-2, 1:-1])))

        v[1:-1,1:-1] = (vn[1:-1, 1:-1] -
                        un[1:-1, 1:-1] * dt / dx *
                       (vn[1:-1, 1:-1] - vn[1:-1, 0:-2]) -
                        vn[1:-1, 1:-1] * dt / dy *
                       (vn[1:-1, 1:-1] - vn[0:-2, 1:-1]) -
                        dt / (2 * rho * dy) * (p[2:, 1:-1] - p[0:-2, 1:-1]) +
                        vis * (dt / dx**2 *
                       (vn[1:-1, 2:] - 2 * vn[1:-1, 1:-1] + vn[1:-1, 0:-2]) +
                        dt / dy**2 *
                       (vn[2:, 1:-1] - 2 * vn[1:-1, 1:-1] + vn[0:-2, 1:-1])))

        u[0, :]  = 0
        u[:, 0]  = 0
        u[:, -1] = 0
        u[-1, :] = 1    # set velocity on cavity lid equal to 1
        v[0, :]  = 0
        v[-1, :] = 0
        v[:, 0]  = 0
        v[:, -1] = 0

    return u,v,p
Example #31
def solve(a, b):
    """Solves a linear matrix equation.

    It computes the exact solution of ``x`` in ``ax = b``,
    where ``a`` is a square and full rank matrix.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(..., M, M)``.
        b (cupy.ndarray): The matrix with dimension ``(...,M)`` or
            ``(..., M, K)``.

    Returns:
        cupy.ndarray:
            The matrix with dimension ``(..., M)`` or ``(..., M, K)``.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.solve`
    """
    if a.ndim > 2 and a.shape[-1] <= get_batched_gesv_limit():
        # Note: There is a low performance issue in batched_gesv when matrix is
        # large, so it is not used in such cases.
        return batched_gesv(a, b)

    # TODO(kataoka): Move the checks to the beginning
    _util._assert_cupy_array(a, b)
    _util._assert_stacked_2d(a)
    _util._assert_stacked_square(a)

    if not ((a.ndim == b.ndim or a.ndim == b.ndim + 1)
            and a.shape[:-1] == b.shape[:a.ndim - 1]):
        raise ValueError(
            'a must have (..., M, M) shape and b must have (..., M) '
            'or (..., M, K)')

    dtype, out_dtype = _util.linalg_common_type(a, b)
    if a.ndim == 2:
        # prevent 'a' and 'b' from being overwritten
        a = a.astype(dtype, copy=True, order='F')
        b = b.astype(dtype, copy=True, order='F')
        cupyx.lapack.gesv(a, b)
        return b.astype(out_dtype, copy=False)

    # prevent 'a' from being overwritten
    a = a.astype(dtype, copy=True, order='C')
    x = cupy.empty_like(b, dtype=out_dtype)
    shape = a.shape[:-2]
    for i in range(numpy.prod(shape)):
        index = numpy.unravel_index(i, shape)
        # prevent 'b' from being overwritten
        bi = b[index].astype(dtype, copy=True, order='F')
        cupyx.lapack.gesv(a[index], bi)
        x[index] = bi
    return x
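A minimal usage sketch of cupy.linalg.solve on a small well-conditioned system:

import cupy

a = cupy.array([[3.0, 1.0],
                [1.0, 2.0]])
b = cupy.array([9.0, 8.0])
x = cupy.linalg.solve(a, b)               # solves a @ x == b
print(bool(cupy.allclose(a @ x, b)))      # True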
Example #32
 def matvec(x):
     y = cupy.empty_like(x)
     desc_x = cusparse.DnVecDescriptor.create(x)
     desc_y = cusparse.DnVecDescriptor.create(y)
     _cusparse.spMV(handle, op_a, alpha.ctypes.data, desc_A.desc,
                    desc_x.desc, beta.ctypes.data, desc_y.desc,
                    cuda_dtype, alg, buff.data.ptr)
     return y
Example #33
def _compute_cc_step(grad_static,
                     factors,
                     radius,
                     forward=True,
                     zero_borders=True,
                     coord_axis=-1):
    out = cp.empty_like(grad_static)
    ndim = out.ndim - 1
    Ii = factors[0]
    Ji = factors[1]
    sfm = factors[2]
    sff = factors[3]
    smm = factors[4]

    result = cp.empty_like(sfm)
    _cc_local_correlation(sfm, sff, smm, 1e-5, result)
    energy = result.sum()

    if forward:
        cc_kernel = _cc_compute_forward
    else:
        cc_kernel = _cc_compute_backward
    # can reuse result for the output array
    cc_kernel(sfm, sff, smm, Ji, Ii, result)

    if coord_axis == -1:
        result = result[..., cp.newaxis]
    else:
        result = result[cp.newaxis, :]
    result = result * grad_static

    if zero_borders and radius > 0:
        if coord_axis not in [0, -1]:
            raise ValueError("coord_axis must be 0 or -1.")
        slices = [slice(None)] * (ndim + 1)
        if coord_axis == -1:
            axes = range(ndim)
        else:
            axes = range(1, ndim + 1)
        for ax in axes:
            slices[ax] = slice(0, radius)
            result[tuple(slices)] = 0
            slices[ax] = slice(-radius, None)
            result[tuple(slices)] = 0
            slices[ax] = slice(None)
    return result, energy
Example #34
 def test_correlate(self):
     d = cp.random.randn(500, 500)
     k = cp.random.randn(10, 10)
     os = cp.empty([4] + list(d.shape))
     ot = cp.empty_like(os)
     self.check_func_serial(4, sndi.correlate, (d, k), os)
     self.check_func_thread(4, sndi.correlate, (d, k), ot)
     assert_array_equal(os, ot)
Example #35
    def test_empty_like_reshape_cupy_only(self, dtype, order):
        a = testing.shaped_arange((2, 3, 4), cupy, dtype)
        b = cupy.empty_like(a, shape=self.shape)
        b.fill(0)
        c = cupy.empty(self.shape, order=order, dtype=dtype)
        c.fill(0)

        testing.assert_array_equal(b, c)
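A small sketch of the CuPy-specific shape override exercised by this test; empty_like keeps the dtype of the prototype while allocating a new shape:

import cupy

a = cupy.arange(6, dtype=cupy.float32).reshape(2, 3)
b = cupy.empty_like(a, shape=(4, 3))      # same dtype, different shape
print(b.shape, b.dtype)                   # (4, 3) float32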
Example #36
def to_gpu(array, device=None, stream=None):
    """Copies the given CPU array to specified device.

    Args:
        array: Array to be sent to GPU.
        device: Device specifier.
        stream (cupy.cuda.Stream): CUDA stream. If not ``None``, the copy runs
            asynchronously.

    Returns:
        cupy.ndarray: Array on GPU.

        If ``array`` is already on the GPU, then this function just returns
        ``array`` without performing any copy. Note that this function does
        not copy :class:`cupy.ndarray` into the specified device.

    """
    check_cuda_available()
    with _get_device(device):
        array_dev = get_device_from_array(array)
        if array_dev.id == cupy.cuda.device.get_device_id():
            return array

        if stream is not None:
            warnings.warn(
                'The stream option is deprecated in chainer.cuda.to_gpu. '
                'Please remove it.', DeprecationWarning)
            if stream.ptr != 0:
                ret = cupy.empty_like(array)
                if array_dev.id == -1:
                    # cpu to gpu
                    mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
                    src = numpy.frombuffer(
                        mem, array.dtype, array.size).reshape(array.shape)
                    src[...] = array
                    ret.set(src, stream)
                    cupy.cuda.pinned_memory._add_to_watch_list(
                        stream.record(), mem)
                else:
                    # gpu to gpu
                    with array_dev:
                        src = array.copy()
                        event = Stream.null.record()
                    stream.wait_event(event)
                    ret.data.copy_from_device_async(
                        src.data, src.nbytes, stream)

                    # to hold a reference until the end of the asynchronous
                    # memcpy
                    stream.add_callback(lambda *x: None, (src, ret))
                return ret

        if array_dev.id == -1:
            return cupy.asarray(array)

        # Need to make a copy when an array is copied to another device
        return cupy.array(array, copy=True)
Example #37
 def test_getitem_idx(self):
     x = cupy.arange(24).reshape((2, 3, 4)).astype('i')
     y = cupy.empty_like(x)
     y = cupy.ElementwiseKernel(
         'raw T x', 'int32 y',
         'int idx[] = {i / 12, i / 4 % 3, i % 4}; y = x[idx]',
         'test_carray_getitem_idx',
     )(x, y)
     testing.assert_array_equal(y, x)
Example #38
def activation_forward(x, mode):
    x = cupy.ascontiguousarray(x)
    y = cupy.empty_like(x)

    dtype = "d" if x.dtype == "d" else "f"
    one = numpy.array(1, dtype=dtype).ctypes
    zero = numpy.array(0, dtype=dtype).ctypes
    handle = get_handle()
    x_mat = _as4darray(x)
    desc = create_tensor_descriptor(x_mat)
    cudnn.activationForward_v3(
        handle, mode, one.data, desc.value, x_mat.data.ptr,
        zero.data, desc.value, y.data.ptr)
    return y
Example #39
def _array_to_gpu(array, device, stream):
    assert device is DummyDevice or isinstance(device, Device)
    if array is None:
        return None

    if isinstance(array, (numpy.number, numpy.bool_)):
        array = numpy.asarray(array)
    elif isinstance(array, intel64.mdarray):
        array = numpy.asarray(array)

    if not isinstance(array, (cupy.ndarray, numpy.ndarray)):
        raise TypeError(
            'The array sent to gpu must be an array or a NumPy scalar.'
            '\nActual type: {0}.'.format(type(array)))

    array_dev = get_device_from_array(array)
    if array_dev.id == cupy.cuda.device.get_device_id():
        return array

    if stream is not None and stream.ptr != 0:
        ret = cupy.empty_like(array)
        if array_dev.id == -1:
            # cpu to gpu
            mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
            src = numpy.frombuffer(
                mem, array.dtype, array.size).reshape(array.shape)
            src[...] = array
            ret.set(src, stream)
            cupy.cuda.pinned_memory._add_to_watch_list(
                stream.record(), mem)
        else:
            # gpu to gpu
            with array_dev:
                src = array.copy()
                event = Stream.null.record()
            stream.wait_event(event)
            ret.data.copy_from_device_async(
                src.data, src.nbytes, stream)

            # to hold a reference until the end of the asynchronous
            # memcpy
            stream.add_callback(lambda *x: None, (src, ret))
        return ret

    if array_dev.id == -1:
        return cupy.asarray(array)

    # Need to make a copy when an array is copied to another device
    return cupy.array(array, copy=True)
Example #40
def roll(a, shift, axis=None):
    """Roll array elements along a given axis.

    Args:
        a (~cupy.ndarray): Array to be rolled.
        shift (int): The number of places by which elements are shifted.
        axis (int or None): The axis along which elements are shifted.
            If ``axis`` is ``None``, the array is flattened before shifting,
            and after that it is reshaped to the original shape.

    Returns:
        ~cupy.ndarray: Output array.

    .. seealso:: :func:`numpy.roll`

    """
    if axis is None:
        if a.size == 0:
            return a
        size = a.size
        ra = a.ravel()
        shift %= size
        res = cupy.empty((size,), a.dtype)
        res[:shift] = ra[size - shift:]
        res[shift:] = ra[:size - shift]
        return res.reshape(a.shape)
    else:
        axis = int(axis)
        if axis < 0:
            axis += a.ndim
        if not 0 <= axis < a.ndim:
            raise ValueError('axis must be >= %d and < %d' % (-a.ndim, a.ndim))
        size = a.shape[axis]
        if size == 0:
            return a
        shift %= size
        prev = (slice(None),) * axis
        rest = (slice(None),) * (a.ndim - axis - 1)
        # Roll only the dimension at the given axis
        # ind1 is [:, ..., size-shift:, ..., :]
        # ind2 is [:, ..., :size-shift, ..., :]
        ind1 = prev + (slice(size - shift, None, None),) + rest
        ind2 = prev + (slice(None, size - shift, None),) + rest
        r_ind1 = prev + (slice(None, shift, None),) + rest
        r_ind2 = prev + (slice(shift, None, None),) + rest
        res = cupy.empty_like(a)
        res[r_ind1] = a[ind1]
        res[r_ind2] = a[ind2]
        return res
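A short usage sketch of roll, covering both the flattened (axis=None) and the per-axis case:

import cupy

x = cupy.arange(6).reshape(2, 3)
print(cupy.roll(x, 1))            # [[5 0 1]
                                  #  [2 3 4]]  (flattened, then reshaped)
print(cupy.roll(x, 1, axis=1))    # [[2 0 1]
                                  #  [5 3 4]]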
Example #41
def empty_like(array):
    """Creates an uninitialized GPU array like the given one.

    Args:
        array (cupy.ndarray or numpy.ndarray): Base array.

    Returns:
        cupy.ndarray: GPU array of the same shape and dtype as `array`.

    """
    warnings.warn("chainer.cuda.empty_like is deprecated. Use cupy.empty_like instead.", DeprecationWarning)
    check_cuda_available()
    if isinstance(array, cupy.ndarray):
        return cupy.empty_like(array)
    return cupy.empty(array.shape, dtype=array.dtype)
Example #42
def to_gpu(array, device=None, stream=None):
    """Copies the given CPU array to specified device.

    Args:
        array: Array to be sent to GPU.
        device: Device specifier.
        stream (cupy.cuda.Stream): CUDA stream. If not ``None``, the copy runs
            asynchronously.

    Returns:
        cupy.ndarray: Array on GPU.

        If ``array`` is already on the GPU, then this function just returns
        ``array`` without performing any copy. Note that this function does
        not copy :class:`cupy.ndarray` into the specified device.

    """
    check_cuda_available()
    with get_device(device):
        array_dev = get_device(array)
        if array_dev.id == cupy.cuda.device.get_device_id():
            return array

        if stream is not None:
            ret = cupy.empty_like(array)
            if array_dev.id == -1:
                # cpu to gpu
                src = array.copy(order='C')
                ret.set(src, stream)
            else:
                # gpu to gpu
                with array_dev:
                    src = array.copy()
                ret.data.copy_from_device_async(src.data, src.nbytes, stream)

            # to hold a reference until the end of the asynchronous memcpy
            stream.add_callback(lambda *x: None, (src, ret))
            return ret

        if array_dev.id == -1:
            return cupy.asarray(array)

        # Need to make a copy when an array is copied to another device
        return cupy.array(array, copy=True)
Example #43
def packbits(myarray):
    """Packs the elements of a binary-valued array into bits in a uint8 array.

    This function currently does not support ``axis`` option.

    Args:
        myarray (cupy.ndarray): Input array.

    Returns:
        cupy.ndarray: The packed array.

    .. note::
        When the input array is empty, this function returns a copy of it,
        i.e., the type of the output array is not necessarily always uint8.
        This exactly follows NumPy's behaviour (as of version 1.11),
        although it is inconsistent with the documentation.

    .. seealso:: :func:`numpy.packbits`
    """
    if myarray.dtype.kind not in 'biu':
        raise TypeError(
            'Expected an input array of integer or boolean data type')

    if myarray.size == 0:
        return cupy.empty_like(myarray)

    myarray = myarray.ravel()
    packed_size = (myarray.size + 7) // 8
    packed = cupy.zeros((packed_size,), dtype=cupy.uint8)

    cupy.ElementwiseKernel(
        'raw T myarray, raw int32 myarray_size', 'uint8 packed',
        '''for (int j = 0; j < 8; ++j) {
            int k = i * 8 + j;
            int bit = k < myarray_size && myarray[k] != 0;
            packed |= bit << (7 - j);
        }''',
        'packbits_kernel'
    )(myarray, myarray.size, packed)

    return packed
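A minimal usage sketch of packbits; nine input bits are packed into two uint8 values, with the last byte padded with zeros:

import cupy

bits = cupy.array([1, 0, 1, 1, 0, 0, 1, 0, 1], dtype=cupy.uint8)
packed = cupy.packbits(bits)
print(packed)   # [178 128]  (0b10110010, 0b10000000)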
Example #44
def _array_to_gpu(array, device, stream):
    if array is None:
        return None

    if isinstance(array, chainerx.ndarray):
        # TODO(niboshi): Update this logic once both CuPy and ChainerX support
        # the array interface.
        if array.device.backend.name == 'cuda':
            # Convert to cupy.ndarray on the same device as source array
            array = cupy.ndarray(
                array.shape,
                array.dtype,
                cupy.cuda.MemoryPointer(
                    cupy.cuda.UnownedMemory(
                        array.data_ptr + array.offset,
                        array.data_size,
                        array,
                        array.device.index),
                    0),
                strides=array.strides)
        else:
            array = chainerx.to_numpy(array)
    elif isinstance(array, (numpy.number, numpy.bool_)):
        array = numpy.asarray(array)
    elif isinstance(array, intel64.mdarray):
        array = numpy.asarray(array)

    if isinstance(array, ndarray):
        if array.device == device:
            return array
        is_numpy = False
    elif isinstance(array, numpy.ndarray):
        is_numpy = True
    else:
        raise TypeError(
            'The array sent to gpu must be an array or a NumPy scalar.'
            '\nActual type: {0}.'.format(type(array)))

    if stream is not None and stream.ptr != 0:
        ret = cupy.empty_like(array)
        if is_numpy:
            # cpu to gpu
            mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
            src = numpy.frombuffer(
                mem, array.dtype, array.size).reshape(array.shape)
            src[...] = array
            ret.set(src, stream)
            cupy.cuda.pinned_memory._add_to_watch_list(
                stream.record(), mem)
        else:
            # gpu to gpu
            with array.device:
                src = array.copy()
                event = Stream.null.record()
            stream.wait_event(event)
            ret.data.copy_from_device_async(
                src.data, src.nbytes, stream)

            # to hold a reference until the end of the asynchronous
            # memcpy
            stream.add_callback(lambda *x: None, (src, ret))
        return ret

    with device:
        if is_numpy:
            return cupy.asarray(array)
        # Need to make a copy when an array is copied to another device
        return cupy.array(array, copy=True)