Beispiel #1
0
def cholesky(a):
    '''Cholesky decomposition.

    Factorizes a two-dimensional square matrix as ``L * L.T`` where ``L`` is
    lower-triangular and ``.T`` denotes the conjugate transpose. Only real
    input is handled: float32 and float64 are used as-is, any other dtype is
    first promoted through :func:`numpy.find_common_type` with float32.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(N, N)``

    Returns:
        cupy.ndarray: A new array holding the factor after ``_tril`` has been
        applied to it; ``a`` itself is not modified.

    Raises:
        RuntimeError: If ``cuda.cusolver_enabled`` is false.
        numpy.linalg.LinAlgError: If the solver reports a non-zero status.

    .. seealso:: :func:`numpy.linalg.cholesky`
    '''
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    _assert_cupy_array(a)
    _assert_rank2(a)
    _assert_nd_squareness(a)

    # Pick the working precision: keep float32/float64, promote the rest.
    kind = a.dtype.char
    if kind not in ('f', 'd'):
        kind = numpy.find_common_type((a.dtype.char, 'f'), ()).char

    # Work on a private copy; the solver overwrites its input in place.
    factor = a.astype(kind, copy=True)
    order = len(a)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # Select the single- or double-precision routines once.
    if kind == 'f':
        query_buffer = cusolver.spotrf_bufferSize
        work_dtype = numpy.float32
        potrf = cusolver.spotrf
    else:  # kind == 'd'
        query_buffer = cusolver.dpotrf_bufferSize
        work_dtype = numpy.float64
        potrf = cusolver.dpotrf

    # NOTE(review): UPPER fill mode is presumably used because the solver
    # reads the row-major buffer as column-major -- confirm.
    lwork = query_buffer(
        handle, cublas.CUBLAS_FILL_MODE_UPPER, order, factor.data.ptr, order)
    workspace = cupy.empty(lwork, dtype=work_dtype)
    potrf(
        handle, cublas.CUBLAS_FILL_MODE_UPPER, order, factor.data.ptr, order,
        workspace.data.ptr, lwork, dev_info.data.ptr)

    info = int(dev_info[0])
    if info > 0:
        raise linalg.LinAlgError(
            'The leading minor of order {} '
            'is not positive definite'.format(info))
    if info < 0:
        raise linalg.LinAlgError(
            'Parameter error (maybe caused by a bug in cupy.linalg?)')
    _tril(factor, k=0)
    return factor
Beispiel #2
0
def tile(A, reps):
    """Builds an array by repeating ``A`` as many times as ``reps`` says.

    Args:
        A (cupy.ndarray): Array to transform.
        reps (int or tuple): The number of repeats, either one count or one
            count per axis.

    Returns:
        cupy.ndarray: Transformed array with repeats.

    .. seealso:: :func:`numpy.tile`

    """
    try:
        counts = tuple(reps)
    except TypeError:
        # A single (non-iterable) repeat count.
        counts = (reps,)
    n_counts = len(counts)

    if all(rep == 1 for rep in counts) and isinstance(A, cupy.ndarray):
        # With every repeat equal to 1 the result would otherwise alias the
        # input, so an explicit copy is returned.
        return cupy.array(A, copy=True, ndmin=n_counts)

    # No copy is taken here; zero-sized arrays carry no data, so there is no
    # risk of an inadvertent overwrite.
    src = cupy.array(A, copy=False, ndmin=n_counts)
    missing = src.ndim - n_counts
    if missing > 0:
        # Left-pad the repeat counts so there is one per source axis.
        counts = (1,) * missing + counts

    out_shape = tuple(dim * rep for dim, rep in zip(src.shape, counts))
    if src.size == 0:
        return cupy.empty(out_shape, dtype=src.dtype)

    # Axes that are really repeated (rep != 1 on a dim != 1) are split in
    # two: a broadcast axis of length ``rep`` followed by the original axis.
    src_view_shape = []
    tiled_shape = []
    for dim, rep in zip(src.shape, counts):
        if rep != 1 and dim != 1:
            src_view_shape += [1, dim]
            tiled_shape += [rep, dim]
        else:
            src_view_shape.append(dim)
            tiled_shape.append(dim if rep == 1 else rep)

    tiled = cupy.empty(tiled_shape, dtype=src.dtype)
    if tiled.size:
        # The assignment broadcasts the source along the repeat axes.
        tiled[...] = src.reshape(src_view_shape)
    return tiled.reshape(out_shape)
Beispiel #3
0
 def _generate_normal(self, func, size, dtype, *args):
     """Fills a new array of shape ``size`` using the generator ``func``.

     The curand functions below don't support odd size, so an odd element
     count is rounded up by one and the surplus sample is discarded.
     * curand.generateNormal
     * curand.generateNormalDouble
     * curand.generateLogNormal
     * curand.generateLogNormalDouble

     Args:
         func: Callable invoked as
             ``func(self._generator, ptr, count, *args)``.
         size: Requested shape, normalized through ``core.get_size``.
         dtype: Element type of the allocated array.
         *args: Extra arguments forwarded to ``func``.

     Returns:
         cupy.ndarray: Array of shape ``size`` filled by ``func``.
     """
     size = core.get_size(size)
     count = six.moves.reduce(operator.mul, size, 1)
     if count % 2:
         # Odd count: generate one extra element, then trim and reshape.
         padded = cupy.empty((count + 1,), dtype=dtype)
         func(self._generator, padded.data.ptr, padded.size, *args)
         return padded[:count].reshape(size)
     result = cupy.empty(size, dtype=dtype)
     func(self._generator, result.data.ptr, result.size, *args)
     return result
def _get_crossentropyloss_gpu(probs, t):
    """Launches the cross-entropy loss kernel and returns its output buffer.

    Args:
        probs: Two-dimensional array; its shape is unpacked as ``(N, M)``.
        t: Second kernel argument (presumably the targets -- verify against
            the kernel source).

    Returns:
        One-element float32 array handed to the kernel as its third argument.
    """
    launch = _crossentropyloss_kernel()
    n_rows, n_cols = probs.shape
    # NOTE(review): the buffer is allocated uninitialized; this is only safe
    # if the kernel overwrites rather than accumulates -- confirm.
    loss_buf = cp.empty((1,), dtype=np.float32)
    kernel_args = (probs, t, loss_buf, np.int32(n_rows), np.int32(n_cols))
    launch(grid=(n_rows, 1, 1), block=(32, 1, 1), args=kernel_args)
    return loss_buf
 def check_copy(self, dtype, src_id, dst_id):
     """Copies an array created under device ``src_id`` into one created
     under device ``dst_id`` and asserts both hold the same values."""
     shape = (2, 3, 4)
     with cuda.Device(src_id):
         source = testing.shaped_arange(shape, dtype=dtype)
     with cuda.Device(dst_id):
         target = cupy.empty(shape, dtype=dtype)
     core.elementwise_copy(source, target)
     testing.assert_allclose(source, target)
    def test_copy_orders(self, order):
        """Checks that ``cupy.copy`` yields the same strides as
        ``numpy.copy`` for the given memory ``order``."""
        shape = (2, 3, 4)

        gpu_copy = cupy.copy(cupy.empty(shape), order)
        cpu_copy = numpy.copy(numpy.empty(shape), order)

        self.assertEqual(gpu_copy.strides, cpu_copy.strides)
Beispiel #7
0
def _pyfftw_rfftn_empty_aligned(shape, axes, dtype, order='C', n=None):
    """Patched version of :func:`sporco.linalg.pyfftw_rfftn_empty_aligned`.

    Allocates an uninitialized complex array whose shape equals ``shape``
    except along the last entry of ``axes``, which is shrunk to
    ``len // 2 + 1``. The ``n`` argument is accepted but not used.
    """

    out_shape = list(shape)
    last_axis = axes[-1]
    out_shape[last_axis] = out_shape[last_axis] // 2 + 1
    return cp.empty(out_shape, _complex_dtype(dtype), order)
Beispiel #8
0
def take(a, indices, axis=None, out=None):
    """Takes elements of an array at specified indices along an axis.

    This is an implementation of "fancy indexing" at single axis.

    This function does not support ``mode`` option.

    Args:
        a (cupy.ndarray): Array to extract elements.
        indices (int or array-like): Indices of elements that this function
            takes.
        axis (int): The axis along which to select indices. Negative values
            count from the last axis. The flattened input is used by default.
        out (cupy.ndarray): Output array. If provided, it should be of
            appropriate shape and dtype.

    Returns:
        cupy.ndarray: The result of fancy indexing.

    Raises:
        ValueError: If ``axis`` is out of range, or ``out`` has a shape other
            than the expected one.
        TypeError: If ``out`` has a dtype different from that of ``a``.

    .. seealso:: :func:`numpy.take`

    """
    if axis is None:
        a = a.ravel()
        # The flattened array has exactly one axis. Using it explicitly keeps
        # the scalar-index path below from rolling the axis ``None``.
        axis = 0
        lshape = ()
        rshape = ()
    else:
        if axis < 0:
            # Normalize so the shape slices below split at the right place.
            axis += a.ndim
        if not 0 <= axis < a.ndim:
            raise ValueError('Axis overrun')
        lshape = a.shape[:axis]
        rshape = a.shape[axis + 1:]

    if numpy.isscalar(indices):
        # Move the selected axis to the front so plain indexing picks along it.
        a = cupy.rollaxis(a, axis)
        if out is None:
            return a[indices].copy()
        else:
            out[:] = a[indices]
            return out
    elif not isinstance(indices, cupy.ndarray):
        indices = cupy.array(indices, dtype=int)

    out_shape = lshape + indices.shape + rshape
    if out is None:
        out = cupy.empty(out_shape, dtype=a.dtype)
    else:
        if out.dtype != a.dtype:
            raise TypeError('Output dtype mismatch')
        if out.shape != out_shape:
            raise ValueError('Output shape mismatch')

    cdim = indices.size
    rdim = internal.prod(rshape)
    # Pad the index array with unit axes so it lines up with the output.
    indices = cupy.reshape(
        indices, (1,) * len(lshape) + indices.shape + (1,) * len(rshape))
    return _take_kernel(a, indices, cdim, rdim, out)
Beispiel #9
0
def _get_out_args(out_args, out_types, out_shape):
    """Returns validated output arrays, allocating them when none are given.

    Args:
        out_args: Output arrays supplied by the caller; may be empty.
        out_types: Element types used to allocate outputs when ``out_args``
            is empty.
        out_shape: Shape every output array must have.

    Returns:
        ``out_args`` itself when it is non-empty, otherwise a list of new
        uninitialized arrays, one per entry of ``out_types``.

    Raises:
        TypeError: If a supplied output is not a ``cupy.ndarray``.
        ValueError: If a supplied output does not have shape ``out_shape``.
    """
    if out_args:
        for arr in out_args:
            if not isinstance(arr, cupy.ndarray):
                raise TypeError(
                    'Output arguments type must be cupy.ndarray')
            if arr.shape != out_shape:
                raise ValueError('Out shape is mismatched')
        return out_args

    return [cupy.empty(out_shape, t) for t in out_types]
Beispiel #10
0
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None):
    """Returns an array with evenly-spaced values within a given interval.

    Instead of specifying the step width like :func:`cupy.arange`, this
    function requires the total number of elements specified.

    Args:
        start: Start of the interval.
        stop: End of the interval.
        num: Number of elements. Must not be negative.
        endpoint (bool): If True, the stop value is included as the last
            element. Otherwise, the stop value is omitted.
        retstep (bool): If True, this function returns (array, step).
            Otherwise, it returns only the array.
        dtype: Data type specifier. ``float`` is used when it is omitted.

    Returns:
        cupy.ndarray: The 1-D array of ranged values. With ``retstep`` the
        step is returned as well; it is NaN when ``num`` is 0 or 1.

    Raises:
        ValueError: If ``num`` is negative.

    """
    if num < 0:
        raise ValueError('linspace with num<0 is not supported')

    if dtype is None:
        # In actual implementation, only float is used
        dtype = float

    ret = cupy.empty((num,), dtype=dtype)

    # No meaningful step exists for zero or one element.
    step = float('nan')
    if num == 1:
        ret.fill(start)
    elif num != 0:
        intervals = (num - 1) if endpoint else num
        step = float(stop - start) / intervals
        stop = float(stop)

        if step != 0.0:
            _linspace_ufunc(start, step, ret, casting='unsafe')
        else:
            # The step underflowed to zero; use the kernel that takes the
            # full span and the interval count instead.
            _linspace_ufunc_underflow(start, stop - start, intervals, ret,
                                      casting='unsafe')

        if endpoint:
            # Write the exact stop value into the last slot.
            ret[-1] = stop

    if retstep:
        return ret, step
    return ret
Beispiel #11
0
def concatenate(tup, axis=0):
    """Joins arrays along an axis.

    Args:
        tup (sequence of arrays): Arrays to be joined. All of these should have
            same dimensionalities except the specified axis.
        axis (int): The axis to join arrays along.

    Returns:
        cupy.ndarray: Joined array.

    Raises:
        TypeError: If an element is not a ``cupy.ndarray`` or is
            zero-dimensional.
        ValueError: If ``tup`` is empty or the arrays disagree in ``ndim`` or
            in any extent other than along ``axis``.

    .. seealso:: :func:`numpy.concatenate`

    """
    ndim = None
    shape = None
    for arr in tup:
        if not isinstance(arr, cupy.ndarray):
            raise TypeError('Only cupy arrays can be concatenated')
        if arr.ndim == 0:
            raise TypeError('zero-dimensional arrays cannot be concatenated')

        if ndim is None:
            # The first array fixes the reference rank, shape and axis.
            ndim = arr.ndim
            shape = list(arr.shape)
            axis = _get_positive_axis(arr.ndim, axis)
        else:
            if arr.ndim != ndim:
                raise ValueError(
                    'All arrays to concatenate must have the same ndim')
            for i in six.moves.range(ndim):
                if i != axis and shape[i] != arr.shape[i]:
                    raise ValueError(
                        'All arrays must have same shape except the axis to '
                        'concatenate')
            shape[axis] += arr.shape[axis]

    if ndim is None:
        raise ValueError('Cannot concatenate from empty tuple')

    common = numpy.find_common_type([arr.dtype for arr in tup], [])
    ret = cupy.empty(shape, dtype=common)

    # Copy each input into its slot along ``axis``.
    leading = (slice(None),) * axis
    start = 0
    for arr in tup:
        stop = start + arr.shape[axis]
        ret[leading + (slice(start, stop),)] = arr
        start = stop

    return ret
Beispiel #12
0
def arange(start, stop=None, step=1, dtype=None):
    """Returns an array with evenly spaced values within a given interval.

    Values are generated within the half-open interval [start, stop). The first
    three arguments are mapped like the ``range`` built-in function, i.e. start
    and step are optional.

    Args:
        start: Start of the interval.
        stop: End of the interval.
        step: Step width between each pair of consecutive values.
        dtype: Data type specifier. It is inferred from other arguments by
            default: ``float`` if any of them is a float, ``int`` otherwise.

    Returns:
        cupy.ndarray: The 1-D array of range values. It is empty when the
        interval contains no value.

    .. seealso:: :func:`numpy.arange`

    """
    if dtype is None:
        if any(numpy.dtype(type(val)).kind == 'f'
               for val in (start, stop, step)):
            dtype = float
        else:
            dtype = int

    if stop is None:
        stop = start
        start = 0
    # Force true division: with integer arguments a flooring ``/`` would
    # drop the last element whenever the span is not a multiple of the step.
    size = int(numpy.ceil(float(stop - start) / step))
    if size <= 0:
        return cupy.empty((0,), dtype=dtype)

    ret = cupy.empty((size,), dtype=dtype)
    typ = numpy.dtype(dtype).type
    _arange_ufunc(typ(start), typ(step), ret, dtype=dtype)
    return ret
Beispiel #13
0
def _get_out_args_with_params(out_args, out_types, out_shape, out_params):
    """Returns validated output arrays, honoring ``raw`` output parameters.

    Args:
        out_args: Output arrays supplied by the caller; may be empty.
        out_types: Element types used to allocate outputs when ``out_args``
            is empty.
        out_shape: Shape expected of every non-raw output.
        out_params: Parameter descriptors exposing a ``raw`` attribute, paired
            with ``out_args`` in order.

    Returns:
        ``out_args`` itself when it is non-empty, otherwise a list of new
        uninitialized arrays, one per entry of ``out_types``.

    Raises:
        ValueError: If outputs must be allocated but some parameter is raw,
            or a supplied non-raw output has a shape other than ``out_shape``.
        TypeError: If a supplied output is not a ``cupy.ndarray``.
    """
    if out_args:
        for arr, param in six_zip(out_args, out_params):
            if not isinstance(arr, cupy.ndarray):
                raise TypeError(
                    'Output arguments type must be cupy.ndarray')
            # Raw outputs are exempt from the shape check.
            if arr.shape != out_shape and not param.raw:
                raise ValueError('Out shape is mismatched')
        return out_args

    # Nothing supplied: a raw output has no shape to allocate with.
    if any(param.raw for param in out_params):
        raise ValueError('Output array size is Undecided')
    return [cupy.empty(out_shape, t) for t in out_types]
Beispiel #14
0
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None):
    """Returns an array with evenly-spaced values within a given interval.

    Instead of specifying the step width like :func:`cupy.arange`, this
    function requires the total number of elements specified.

    Args:
        start: Start of the interval.
        stop: End of the interval.
        num: Number of elements. Must be positive.
        endpoint (bool): If True, the stop value is included as the last
            element. Otherwise, the stop value is omitted.
        retstep (bool): If True, this function returns (array, step).
            Otherwise, it returns only the array.
        dtype: Data type specifier. It is inferred from the start and stop
            arguments by default: ``float`` if either is a float, ``int``
            otherwise.

    Returns:
        cupy.ndarray: The 1-D array of ranged values. With ``retstep`` the
        step is returned as well; it is NaN when ``num`` is 1 because a single
        sample defines no spacing.

    Raises:
        ValueError: If ``num`` is not positive.

    """
    if num <= 0:
        # TODO(beam2d): Return zero-sized array
        raise ValueError('linspace with num<=0 is not supported')

    if dtype is None:
        if any(numpy.dtype(type(val)).kind == 'f' for val in (start, stop)):
            dtype = float
        else:
            dtype = int

    ret = cupy.empty((num,), dtype=dtype)
    if num == 1:
        ret.fill(start)
        # Fall through to the common return so ``retstep`` is honored here
        # as well, instead of returning a bare array.
        step = float('nan')
    else:
        # ``float`` forces true division even for integer arguments.
        if endpoint:
            step = float(stop - start) / (num - 1)
        else:
            step = float(stop - start) / num
            # Last generated value when the stop itself is excluded.
            stop = start + step * (num - 1)

        typ = numpy.dtype(dtype).type
        _linspace_ufunc(typ(start), stop - start, num - 1, ret)

    if retstep:
        return ret, step
    else:
        return ret
Beispiel #15
0
def empty(shape, dtype=numpy.float32):
    """Creates an uninitialized cupy.ndarray object.

    Deprecated: emits a ``DeprecationWarning`` and forwards to
    ``cupy.empty`` after checking that CUDA is available.

    Args:
        shape (tuple of ints): The shape of array.
        dtype (numpy.dtype): Element type.

    Returns:
        cupy.ndarray: Uninitialized GPU array allocated by the memory pool.

    """
    message = "chainer.cuda.empty is deprecated. Use cupy.empty instead."
    warnings.warn(message, DeprecationWarning)
    check_cuda_available()
    return cupy.empty(shape, dtype)
Beispiel #16
0
def roll(a, shift, axis=None):
    """Roll array elements along a given axis.

    Args:
        a (~cupy.ndarray): Array to be rolled.
        shift (int): The number of places by which elements are shifted.
        axis (int or None): The axis along which elements are shifted.
            If ``axis`` is ``None``, the array is flattened before shifting,
            and after that it is reshaped to the original shape.

    Returns:
        ~cupy.ndarray: Output array. When there is nothing to roll (zero
        elements, or zero length along ``axis``) ``a`` itself is returned.

    Raises:
        ValueError: If ``axis`` is outside ``[-a.ndim, a.ndim)``.

    .. seealso:: :func:`numpy.roll`

    """
    if axis is None:
        total = a.size
        if total == 0:
            return a
        flat = a.ravel()
        shift %= total
        cut = total - shift
        rolled = cupy.empty((total,), a.dtype)
        # The tail of the input becomes the head of the output.
        rolled[:shift] = flat[cut:]
        rolled[shift:] = flat[:cut]
        return rolled.reshape(a.shape)

    axis = int(axis)
    if axis < 0:
        axis += a.ndim
    if axis < 0 or axis >= a.ndim:
        raise ValueError('axis must be >= %d and < %d' % (-a.ndim, a.ndim))

    length = a.shape[axis]
    if length == 0:
        return a
    shift %= length
    cut = length - shift

    lead = (slice(None),) * axis
    trail = (slice(None),) * (a.ndim - axis - 1)

    def along_axis(start, stop):
        # Full slices everywhere except [start:stop] on the rolled axis.
        return lead + (slice(start, stop, None),) + trail

    rolled = cupy.empty_like(a)
    rolled[along_axis(None, shift)] = a[along_axis(cut, None)]
    rolled[along_axis(shift, None)] = a[along_axis(None, cut)]
    return rolled
Beispiel #17
0
def diagonal(a, offset=0, axis1=0, axis2=1):
    """Returns specified diagonals.

    This function extracts the diagonals along two specified axes. The other
    axes are not changed. This function returns a writable view of this array
    as NumPy 1.10 will do.

    Args:
        a (cupy.ndarray): Array from which the diagonals are taken.
        offset (int): Index of the diagonals. Zero indicates the main
            diagonals, a positive value upper diagonals, and a negative value
            lower diagonals.
        axis1 (int): The first axis to take diagonals from. Negative values
            count from the last axis.
        axis2 (int): The second axis to take diagonals from. Negative values
            count from the last axis.

    Returns:
        cupy.ndarray: A view of the diagonals of ``a``. If the requested
        diagonal is empty, a new zero-sized array is returned instead.

    Raises:
        IndexError: If ``axis1`` or ``axis2`` is out of range.
        ValueError: If ``axis1`` and ``axis2`` refer to the same axis.

    .. seealso:: :func:`numpy.diagonal`

    """
    ndim = a.ndim
    # Normalize negative axes first: the deletions below only remove the
    # intended entries when both axes are non-negative positions.
    if axis1 < 0:
        axis1 += ndim
    if axis2 < 0:
        axis2 += ndim
    if not (0 <= axis1 < ndim and 0 <= axis2 < ndim):
        raise IndexError('axis out of range')
    if axis1 == axis2:
        raise ValueError('axis1 and axis2 cannot be the same')

    if axis1 < axis2:
        min_axis, max_axis = axis1, axis2
    else:
        min_axis, max_axis = axis2, axis1

    # Permutation that moves the two diagonal axes to the end. The larger
    # position is deleted first so the smaller one stays valid.
    tr = list(range(ndim))
    del tr[max_axis]
    del tr[min_axis]
    if offset >= 0:
        a = cupy.transpose(a, tr + [axis1, axis2])
    else:
        # A lower diagonal is an upper diagonal of the swapped axes.
        a = cupy.transpose(a, tr + [axis2, axis1])
        offset = -offset

    diag_size = max(0, min(a.shape[-2], a.shape[-1] - offset))
    ret_shape = a.shape[:-2] + (diag_size,)
    if diag_size == 0:
        return cupy.empty(ret_shape, dtype=a.dtype)

    a = a[..., :diag_size, offset:offset + diag_size]

    # Build the view by hand: stepping along the diagonal advances both of
    # the trailing axes at once, hence the summed stride.
    ret = a.view()
    ret._shape = a.shape[:-2] + (diag_size,)
    ret._strides = a.strides[:-2] + (a.strides[-1] + a.strides[-2],)
    ret._size = internal.prod(ret._shape)
    ret._c_contiguous = -1
    ret._f_contiguous = -1
    return ret
Beispiel #18
0
def empty_like(array):
    """Creates an uninitialized GPU array like the given one.

    Deprecated: emits a ``DeprecationWarning`` and checks that CUDA is
    available before allocating.

    Args:
        array (cupy.ndarray or numpy.ndarray): Base array.

    Returns:
        cupy.ndarray: GPU array of the same shape and dtype as `array`.

    """
    message = ("chainer.cuda.empty_like is deprecated. "
               "Use cupy.empty_like instead.")
    warnings.warn(message, DeprecationWarning)
    check_cuda_available()
    if not isinstance(array, cupy.ndarray):
        # Anything that is not a GPU array: allocate from shape and dtype.
        return cupy.empty(array.shape, dtype=array.dtype)
    return cupy.empty_like(array)
Beispiel #19
0
def _get_out_args(in_args, out_args, out_types, out_shape, out_params=None):
    """Returns validated output arrays, allocating them when none are given.

    Args:
        in_args: Accepted but not used by this function.
        out_args: Output arrays supplied by the caller; may be empty.
        out_types: Element types, one per output.
        out_shape: Shape expected of every non-raw output.
        out_params: Optional parameter descriptors exposing a ``raw``
            attribute, indexed in step with ``out_args``.

    Returns:
        ``out_args`` itself when it is non-empty, otherwise a list of new
        uninitialized arrays, one per entry of ``out_types``.

    Raises:
        ValueError: If outputs must be allocated but some parameter is raw,
            or a supplied non-raw output has a shape other than ``out_shape``.
        TypeError: If a supplied output is not a ``cupy.ndarray``.
    """
    if len(out_args) != 0:
        assert len(out_args) == len(out_types)
        for index, arr in enumerate(out_args):
            if not isinstance(arr, cupy.ndarray):
                raise TypeError(
                    'Output arguments type must be cupy.ndarray')
            if arr.shape == out_shape:
                continue
            # A mismatching shape is tolerated only for raw outputs.
            if out_params is None or not out_params[index].raw:
                raise ValueError('Out shape is mismatched')
        return out_args

    if out_params is not None and any(p.raw for p in out_params):
        raise ValueError('Output array size is Undecided')
    return [cupy.empty(shape=out_shape, dtype=t) for t in out_types]
Beispiel #20
0
    def lognormal(self, mean=0.0, sigma=1.0, size=None, dtype=float):
        """Returns an array of samples drawn from a log normal distribution.

        The single-precision generator is used when ``dtype`` resolves to
        float32 (char ``'f'``), the double-precision one otherwise.

        .. seealso::
            :func:`cupy.random.lognormal` for full documentation,
            :meth:`numpy.random.RandomState.lognormal`

        """
        dtype = _check_and_get_dtype(dtype)
        out = cupy.empty(size, dtype=dtype)
        if dtype.char == 'f':
            func = curand.generateLogNormal
        else:
            func = curand.generateLogNormalDouble

        count = out.size
        if count % 2 == 0:
            func(self._generator, out.data.ptr, count, mean, sigma)
            return out

        # The cuRAND log-normal generators do not support an odd number of
        # elements: generate one extra sample and drop it.
        padded = cupy.empty((count + 1,), dtype=dtype)
        func(self._generator, padded.data.ptr, padded.size, mean, sigma)
        return padded[:count].reshape(out.shape)
Beispiel #21
0
    def normal(self, loc=0.0, scale=1.0, size=None, dtype=float):
        """Returns an array of normally distributed samples.

        The single-precision generator is used when ``dtype`` resolves to
        float32 (char ``'f'``), the double-precision one otherwise.

        .. seealso::
            :func:`cupy.random.normal` for full documentation,
            :meth:`numpy.random.RandomState.normal`

        """
        dtype = _check_and_get_dtype(dtype)
        out = cupy.empty(size, dtype=dtype)
        if dtype.char == 'f':
            func = curand.generateNormal
        else:
            func = curand.generateNormalDouble

        count = out.size
        if count % 2 == 0:
            func(self._generator, out.data.ptr, count, loc, scale)
            return out

        # The cuRAND normal generators do not support an odd number of
        # elements: generate one extra sample and drop it.
        padded = cupy.empty((count + 1,), dtype=dtype)
        func(self._generator, padded.data.ptr, padded.size, loc, scale)
        return padded[:count].reshape(out.shape)
Beispiel #22
0
def asfortranarray(a, dtype=None):
    """Return an array laid out in Fortran order in memory.

    A new array is always allocated: it is created with the reversed shape
    and transposed, which gives a column-major buffer of the original shape,
    and the values of ``a`` are then copied into it.

    Args:
        a (~cupy.ndarray): The input array.
        dtype (str or dtype object, optional): By default, the data-type is
            inferred from the input data.

    Returns:
        ~cupy.ndarray: The input `a` in Fortran, or column-major, order.

    .. seealso:: :func:`numpy.asfortranarray`

    """
    out_dtype = dtype
    if out_dtype is None:
        out_dtype = a.dtype
    reversed_shape = a.shape[::-1]
    result = cupy.empty(reversed_shape, out_dtype).T
    result[...] = a
    return result
Beispiel #23
0
    def random_sample(self, size=None, dtype=float):
        """Returns an array of random values over the interval ``[0, 1)``.

        The array is filled by the single- or double-precision cuRAND uniform
        generator and then passed through ``RandomState._1m_kernel`` in place.

        .. seealso::
            :func:`cupy.random.random_sample` for full documentation,
            :meth:`numpy.random.RandomState.random_sample`

        """
        dtype = _check_and_get_dtype(dtype)
        out = cupy.empty(size, dtype=dtype)
        single_precision = dtype.char == 'f'
        generate = (curand.generateUniform if single_precision
                    else curand.generateUniformDouble)
        generate(self._generator, out.data.ptr, out.size)
        # NOTE(review): presumably maps the generator's range onto [0, 1)
        # -- confirm against the kernel definition.
        RandomState._1m_kernel(out)
        return out
Beispiel #24
0
 def forward_gpu(self, inputs):
     """Computes the forward pass with a single ``sgemm`` call.

     Args:
         inputs: Sequence ``(x, W)`` or ``(x, W, b)``. ``x`` is indexed on
             three axes and ``W`` is two-dimensional; when ``b`` is present
             it is added to the product in place.

     Returns:
         tuple: One-element tuple holding a float32 array of shape
         ``(x.shape[0], x.shape[1], W.shape[1])``.
     """
     x, W = inputs[0], inputs[1]
     # Prepare BLAS call
     handle = cuda.Device().cublas_handle
     in_dim, out_dim = W.shape
     # The first two axes of x are folded into one row dimension. The third
     # axis is still read, so an input with fewer axes fails right here.
     rows, _ = x.shape[0] * x.shape[1], x.shape[2]
     # Leading dimensions are clamped to at least 1.
     ld_x = max(1, x.shape[-1])
     ld_w = max(1, W.strides[0] // W.dtype.itemsize)
     ld_out = max(1, out_dim)
     product = cupy.empty((x.shape[0], x.shape[1], W.shape[1]),
                          dtype=numpy.float32)
     sgemm(handle, False, False, out_dim, rows, in_dim, 1, W.data.ptr, ld_w,
           x.data.ptr, ld_x, 0, product.data.ptr, ld_out)
     if len(inputs) > 2:
         bias = inputs[2]
         product += bias
     return product,
def _forward_gpu(x):
    """Runs the forward kernel on ``x`` and returns its output array.

    ``x`` is indexed as ``(T, N, M)``; the output is allocated as a float32
    array of shape ``(N, T, M)``, i.e. with the first two extents swapped.
    The launch configuration helper is chosen from ``N`` and ``M``.
    """
    n_steps, n_rows, n_cols = x.shape[0], x.shape[1], x.shape[2]
    y = cp.empty((n_rows, n_steps, n_cols), dtype=np.float32)

    # Select the launch-dimension helper that fits the problem shape.
    if n_rows == 1:
        block_dim, grid_dim = gpu.utils.Get_bdim_and_gdimRowVec(n_cols)
    elif n_cols >= (n_rows * 4):
        block_dim, grid_dim = gpu.utils.Get_bdim_and_gdimSmallNBigM(
            n_rows, n_cols)
    else:
        block_dim, grid_dim = gpu.utils.Get_bdim_and_gdim2D(n_rows, n_cols)

    kernel = _GetForward_kernel()
    kernel(grid=grid_dim, block=block_dim,
           args=(x, y, n_steps, n_rows, n_cols))
    return y
Beispiel #26
0
def hotdot(a, indices, out=None, dont_add=False):
    """Multiplies a K-hot encoded matrix by ``a.T`` on the GPU.

    Args:
        a: GPU array of shape ``(H, D)``.
        indices: Hot indices of a K-hot encoded matrix ``x``, of shape
            ``(N, K)``.
        out: Optional output array for the result. When omitted, a new
            float32 array of shape ``(N, H)`` is allocated.
        dont_add (bool): Selects the flag ``B`` handed to the kernel (1 if
            true, 0 otherwise). NOTE(review): presumably 1 means "overwrite
            ``out``" and 0 means "accumulate into it" -- confirm against the
            kernel source.

    Returns:
        The output array holding ``x.dot(a.T)``, for every value of ``K``.
    """
    HotDot1, HotDot2 = _get_HotDot_kernels()
    H, D = a.shape
    N, K = indices.shape

    # Select the launch-dimension helper that fits the problem shape.
    if N == 1:
        bdim, gdim = Get_bdim_and_gdimRowVec(H)
    elif H >= (N*4):
        bdim, gdim = Get_bdim_and_gdim2D(N, H) if False else \
            Get_bdim_and_gdimSmallNBigM(N, H)
    else:
        bdim, gdim = Get_bdim_and_gdim2D(N, H)

    if out is None:
        # A freshly allocated buffer is uninitialized, so the flag is always
        # forced to 1 in this case.
        out = cp.empty((N, H), dtype=np.float32)
        B = np.int32(1)
    elif dont_add:
        B = np.int32(1)
    else:
        B = np.int32(0)

    if K > 1:
        HotDot1(grid=gdim, block=bdim,
                args=(a, out, indices,
                      np.int32(K), np.int32(N), np.int32(H), np.int32(D),
                      np.int32(B)))
    else:
        HotDot2(grid=gdim, block=bdim,
                args=(a, out, indices,
                      np.int32(N), np.int32(H), np.int32(D), np.int32(B)))
    # Returned for both kernels; previously only the K <= 1 branch returned.
    return out
Beispiel #27
0
def asfortranarray(a, dtype=None):
    """Return an array laid out in Fortran order in memory.

    A new array is always allocated. For a C-contiguous two-dimensional
    float32/float64 input with no ``dtype`` override the data is written by a
    cuBLAS ``geam`` call; every other case uses a plain element-wise copy.

    Args:
        a (~cupy.ndarray): The input array.
        dtype (str or dtype object, optional): By default, the data-type is
            inferred from the input data.

    Returns:
        ~cupy.ndarray: The input `a` in Fortran, or column-major, order.

    .. seealso:: :func:`numpy.asfortranarray`

    """
    out_dtype = a.dtype if dtype is None else dtype
    ret = cupy.empty(a.shape[::-1], out_dtype).T

    use_geam = (a.flags.c_contiguous and
                (a.dtype == numpy.float32 or a.dtype == numpy.float64) and
                a.ndim == 2 and
                dtype is None)
    if not use_geam:
        ret[...] = a
        return ret

    rows, cols = a.shape
    # Pick the single- or double-precision routine.
    if a.dtype == numpy.float32:
        geam = cupy.cuda.cublas.sgeam
    else:
        geam = cupy.cuda.cublas.dgeam
    geam(
        cupy.cuda.Device().cublas_handle,
        1,  # transpose a
        1,  # transpose ret
        rows, cols, 1., a.data.ptr, cols, 0., a.data.ptr, cols,
        ret.data.ptr, rows)
    return ret
def _backward_gpu(gy):
    """Runs the backward kernel on ``gy`` and returns its output array.

    ``gy`` is indexed as ``(N, T, M)``; the output is allocated as a float32
    array of shape ``(T, N, M)``, i.e. with the first two extents swapped.
    The launch configuration helper is chosen from ``N`` and ``M``.
    """
    n_rows, n_steps, n_cols = gy.shape[0], gy.shape[1], gy.shape[2]
    gx = cp.empty((n_steps, n_rows, n_cols), dtype=np.float32)

    # Select the launch-dimension helper that fits the problem shape.
    if n_rows == 1:
        block_dim, grid_dim = gpu.utils.Get_bdim_and_gdimRowVec(n_cols)
    elif n_cols >= (n_rows * 4):
        block_dim, grid_dim = gpu.utils.Get_bdim_and_gdimSmallNBigM(
            n_rows, n_cols)
    else:
        block_dim, grid_dim = gpu.utils.Get_bdim_and_gdim2D(n_rows, n_cols)

    kernel = _GetBackward_kernel()
    kernel(grid=grid_dim, block=block_dim,
           args=(gy, gx, n_steps, n_rows, n_cols))
    return gx
Beispiel #29
0
def _empty_aligned(shape, dtype, order='C', n=None):
    """Patched version of :func:`sporco.fft.empty_aligned`.

    Forwards ``shape``, ``dtype`` and ``order`` to ``cp.empty``; the ``n``
    argument is accepted but not used.
    """

    allocated = cp.empty(shape, dtype, order)
    return allocated
Beispiel #30
0
def add_buffers_gpu(species, float_recv_left, float_recv_right, uint_recv_left,
                    uint_recv_right):
    """
    Add the particles stored in recv_left and recv_right
    to the existing particle in species.

    For every particle quantity a new array is allocated and filled, in this
    order, with the particles received from the left, the particles already
    present, and the particles received from the right. The new array then
    replaces the old attribute, and ``species.Ntot`` is updated last.

    Parameters
    ----------
    species: a Particles object
        Contain the particles that stayed on the present processors

    float_recv_left, float_recv_right, uint_recv_left, uint_recv_right:
        arrays of shape (n_float,Nptcl) and (n_int,Nptcl) where Nptcl
        is the number of particles that are received to the left
        proc and right proc respectively, and where n_float and n_int
        are the number of float and integer quantities respectively
        These arrays are always on the CPU (since they were used for MPI)
    """
    # Get the new number of particles
    old_Ntot = species.Ntot
    n_left = float_recv_left.shape[1]
    n_right = float_recv_right.shape[1]
    new_Ntot = old_Ntot + n_left + n_right

    # Launch configuration for each of the three copies
    n_left_grid, n_left_block = cuda_tpb_bpg_1d(n_left)
    n_right_grid, n_right_block = cuda_tpb_bpg_1d(n_right)
    n_old_grid, n_old_block = cuda_tpb_bpg_1d(old_Ntot)

    def merge_quantities(targets, recv_left, recv_right, dtype):
        # Merge one family of quantities; row i of the receive buffers
        # belongs to the i-th (owner, attribute name) pair of ``targets``.
        for row, (owner, attr_name) in enumerate(targets):
            # Copy the proper buffers to the GPU
            left_buffer = cupy.asarray(recv_left[row])
            right_buffer = cupy.asarray(recv_right[row])
            # Initialize the new particle array
            merged = cupy.empty((new_Ntot, ), dtype=dtype)
            # Merge the arrays on the GPU; empty parts launch no kernel
            stay_buffer = getattr(owner, attr_name)
            if n_left != 0:
                copy_particles[n_left_grid, n_left_block](
                    n_left, left_buffer, 0, merged, 0)
            if old_Ntot != 0:
                copy_particles[n_old_grid, n_old_block](
                    old_Ntot, stay_buffer, 0, merged, n_left)
            if n_right != 0:
                copy_particles[n_right_grid, n_right_block](
                    n_right, right_buffer, 0, merged, n_left + old_Ntot)
            # Replace the old particle data array by the merged one
            setattr(owner, attr_name, merged)

    # Float quantities
    float_targets = [(species, name) for name in
                     ('x', 'y', 'z', 'ux', 'uy', 'uz', 'inv_gamma', 'w')]
    if species.ionizer is not None:
        float_targets.append((species.ionizer, 'w_times_level'))
    merge_quantities(float_targets, float_recv_left, float_recv_right,
                     np.float64)

    # Integer quantities
    uint_targets = []
    if species.tracker is not None:
        uint_targets.append((species.tracker, 'id'))
    if species.ionizer is not None:
        uint_targets.append((species.ionizer, 'ionization_level'))
    merge_quantities(uint_targets, uint_recv_left, uint_recv_right,
                     np.uint64)

    # Adapt the total number of particles
    species.Ntot = new_Ntot
Beispiel #31
0
def eigsh(a,
          k=6,
          *,
          which='LM',
          ncv=None,
          maxiter=None,
          tol=0,
          return_eigenvectors=True):
    """Finds ``k`` eigenvalues and eigenvectors of the real symmetric matrix.

    Solves ``Ax = wx``, the standard eigenvalue problem for ``w`` eigenvalues
    with corresponding eigenvectors ``x``.

    Args:
        a (ndarray, spmatrix or LinearOperator): A symmetric square matrix with
            dimension ``(n, n)``. ``a`` must be :class:`cupy.ndarray`,
            :class:`cupyx.scipy.sparse.spmatrix` or
            :class:`cupyx.scipy.sparse.linalg.LinearOperator`. Its dtype must
            be float32, float64, complex64 or complex128.
        k (int): The number of eigenvalues and eigenvectors to compute. Must be
            ``1 <= k < n``.
        which (str): 'LM' or 'LA'. 'LM': finds ``k`` largest (in magnitude)
            eigenvalues. 'LA': finds ``k`` largest (algebraic) eigenvalues.
        ncv (int): The number of Lanczos vectors generated. Intended range is
            ``k + 1 < ncv < n``; a given value is clamped to
            ``[k + 2, n - 1]``. If ``None``, ``min(max(2 * k, k + 32), n - 1)``
            is used.
        maxiter (int): Maximum number of Lanczos update iterations.
            If ``None``, ``10 * n`` is used.
        tol (float): Tolerance for residuals ``||Ax - wx||``. If ``0``, machine
            precision of ``a.dtype`` is used.
        return_eigenvectors (bool): If ``True``, returns eigenvectors in
            addition to eigenvalues.

    Returns:
        tuple:
            If ``return_eigenvectors is True``, it returns ``w`` and ``x``
            where ``w`` is eigenvalues and ``x`` is eigenvectors. Otherwise,
            it returns only ``w``. Eigenvalues are sorted in ascending order.

    Raises:
        ValueError: If ``a`` is not a square 2-D matrix, if ``k`` is outside
            ``1 <= k < n`` or if ``which`` is not ``'LM'`` or ``'LA'``.
        TypeError: If the dtype of ``a`` is not supported.

    .. seealso:: :func:`scipy.sparse.linalg.eigsh`

    .. note::
        This function uses the thick-restart Lanczos methods
        (https://sdm.lbl.gov/~kewu/ps/trlan.html).

    """
    # Validate the shape before reading ``a.shape[0]`` so that inputs with
    # fewer than two dimensions get the intended ValueError.
    if a.ndim != 2 or a.shape[0] != a.shape[1]:
        raise ValueError('expected square matrix (shape: {})'.format(a.shape))
    n = a.shape[0]
    if a.dtype.char not in 'fdFD':
        raise TypeError('unsupported dtype (actual: {})'.format(a.dtype))
    if k <= 0:
        raise ValueError('k must be greater than 0 (actual: {})'.format(k))
    if k >= n:
        raise ValueError('k must be smaller than n (actual: {})'.format(k))
    if which not in ('LM', 'LA'):
        raise ValueError('which must be \'LM\' or \'LA\' (actual: {})'
                         ''.format(which))
    if ncv is None:
        ncv = min(max(2 * k, k + 32), n - 1)
    else:
        ncv = min(max(ncv, k + 2), n - 1)
    if maxiter is None:
        maxiter = 10 * n
    if tol == 0:
        tol = numpy.finfo(a.dtype).eps

    # ``alpha``/``beta`` receive the coefficients written by the Lanczos
    # routine; ``beta`` always uses the real counterpart of ``a.dtype``.
    alpha = cupy.zeros((ncv, ), dtype=a.dtype)
    beta = cupy.zeros((ncv, ), dtype=a.dtype.char.lower())
    V = cupy.empty((ncv, n), dtype=a.dtype)

    # Set initial vector
    u = cupy.random.random((n, )).astype(a.dtype)
    V[0] = u / cublas.nrm2(u)

    # Choose Lanczos implementation, unconditionally use 'fast' for now
    update_impl = 'fast'
    if update_impl == 'fast':
        lanczos = _lanczos_fast(a, n, ncv)
    else:
        lanczos = _lanczos_asis

    # Lanczos iteration
    lanczos(a, V, u, alpha, beta, 0, ncv)

    n_iter = ncv
    w, s = _eigsh_solve_ritz(alpha, beta, None, k, which)
    x = V.T @ s

    # Compute residual
    beta_k = beta[-1] * s[-1, :]
    res = cublas.nrm2(beta_k)

    while res > tol and n_iter < maxiter:
        # Setup for thick-restart: keep the k current Ritz pairs as the
        # leading part of the basis.
        beta[:k] = 0
        alpha[:k] = w
        V[:k] = x.T

        # Orthogonalize ``u`` against the kept vectors and use it as the
        # next basis vector.
        u -= u.T @ V[:k].conj().T @ V[:k]
        V[k] = u / cublas.nrm2(u)

        u[...] = a @ V[k]
        cublas.dotc(V[k], u, out=alpha[k])
        u -= alpha[k] * V[k]
        u -= V[:k].T @ beta_k
        cublas.nrm2(u, out=beta[k])
        V[k + 1] = u / beta[k]

        # Lanczos iteration
        lanczos(a, V, u, alpha, beta, k + 1, ncv)

        n_iter += ncv - k
        w, s = _eigsh_solve_ritz(alpha, beta, beta_k, k, which)
        x = V.T @ s

        # Compute residual
        beta_k = beta[-1] * s[-1, :]
        res = cublas.nrm2(beta_k)

    if return_eigenvectors:
        idx = cupy.argsort(w)
        return w[idx], x[:, idx]
    else:
        return cupy.sort(w)
Beispiel #32
0
    def aux(A, V, u, alpha, beta, i_start, i_end):
        """Run Lanczos steps ``i_start`` .. ``i_end - 1``, updating in place.

        For each step ``i`` the routine multiplies ``A`` by the current
        vector ``v`` into ``u``, stores the ``dotc`` result of ``v`` and ``u``
        in ``alpha[i]``, orthogonalizes ``u`` with two ``gemm`` calls, stores
        the ``nrm2`` result of ``u`` in ``beta[i]`` and, except on the last
        step, calls ``_kernel_normalize``.

        ``v``, ``uu``, ``one``, ``zero``, ``mone``, ``n``, ``outer_A``, the
        cuBLAS/cuSPARSE handles, the ``spmv_*`` settings and the
        ``dotc``/``nrm2``/``gemm`` routines are free variables that must be
        bound by the enclosing scope (not visible from this fragment).

        Args:
            A: The operator; must be the very object the enclosing scope
                stored as ``outer_A``.
            V: Device array holding the basis vectors; read at ``V[i_start]``
                and passed by pointer to ``gemm`` and ``_kernel_normalize``.
            u: Device work vector, overwritten on every step.
            alpha: Device array; element ``i`` receives the ``dotc`` result.
            beta: Device array; element ``i`` receives the ``nrm2`` result.
            i_start (int): First step index.
            i_end (int): One past the last step index.
        """
        # The closure is specialised for one operator only.
        assert A is outer_A

        # Get ready for spmv if enabled
        if cusparse_handle is not None:
            # Note: I would like to reuse descriptors and working buffer
            # on the next update, but I gave it up because it sometimes
            # caused illegal memory access error.
            spmv_desc_A = cusparse.SpMatDescriptor.create(A)
            spmv_desc_v = cusparse.DnVecDescriptor.create(v)
            spmv_desc_u = cusparse.DnVecDescriptor.create(u)
            buff_size = _cusparse.spMV_bufferSize(
                cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data,
                spmv_desc_A.desc, spmv_desc_v.desc, spmv_beta.ctypes.data,
                spmv_desc_u.desc, spmv_cuda_dtype, spmv_alg)
            spmv_buff = cupy.empty(buff_size, cupy.int8)

        # Seed the work vector with the starting basis vector.
        v[...] = V[i_start]
        for i in range(i_start, i_end):
            # Matrix-vector multiplication
            if cusparse_handle is None:
                u[...] = A @ v
            else:
                # Same product through cuSPARSE, using the descriptors of
                # ``v`` (input) and ``u`` (output) created above.
                _cusparse.spMV(cusparse_handle, spmv_op_a,
                               spmv_alpha.ctypes.data, spmv_desc_A.desc,
                               spmv_desc_v.desc, spmv_beta.ctypes.data,
                               spmv_desc_u.desc, spmv_cuda_dtype, spmv_alg,
                               spmv_buff.data.ptr)

            # Call dotc
            # The result address points into ``alpha`` on the device, so the
            # handle is switched to device pointer mode for this call and
            # restored afterwards even if the call raises.
            _cublas.setPointerMode(cublas_handle,
                                   _cublas.CUBLAS_POINTER_MODE_DEVICE)
            try:
                dotc(cublas_handle, n, v.data.ptr, 1, u.data.ptr, 1,
                     alpha.data.ptr + i * alpha.itemsize)
            finally:
                _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

            # Orthogonalize
            # First gemm (1 x (i + 1) result, inner dimension n): fills
            # ``uu`` with the products of conjugate-transposed ``u`` and the
            # first ``i + 1`` length-``n`` columns found at ``V.data.ptr``.
            # Second gemm (n x 1 result, scale ``mone``, accumulate ``one``):
            # subtracts those columns times conjugate-transposed ``uu`` from
            # ``u``.
            # NOTE(review): this reads ``V`` as column-major with leading
            # dimension ``n``, i.e. presumably a C-contiguous ``(ncv, n)``
            # array whose rows are the basis vectors -- confirm at the caller.
            gemm(cublas_handle, _cublas.CUBLAS_OP_C, _cublas.CUBLAS_OP_N, 1,
                 i + 1, n, one.ctypes.data, u.data.ptr, n, V.data.ptr, n,
                 zero.ctypes.data, uu.data.ptr, 1)
            gemm(cublas_handle, _cublas.CUBLAS_OP_N, _cublas.CUBLAS_OP_C, n, 1,
                 i + 1, mone.ctypes.data, V.data.ptr, n, uu.data.ptr, 1,
                 one.ctypes.data, u.data.ptr, n)

            # Call nrm2
            # As for dotc: the result is written straight into ``beta[i]`` on
            # the device, hence the temporary pointer-mode switch.
            _cublas.setPointerMode(cublas_handle,
                                   _cublas.CUBLAS_POINTER_MODE_DEVICE)
            try:
                nrm2(cublas_handle, n, u.data.ptr, 1,
                     beta.data.ptr + i * beta.itemsize)
            finally:
                _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

            # Break here as the normalization below touches V[i+1]
            if i >= i_end - 1:
                break

            # Normalize
            _kernel_normalize(u, beta, i, n, v, V)
Beispiel #33
0
def _lanczos_fast(A, n, ncv):
    """Create a Lanczos update routine that calls cuBLAS/cuSPARSE directly.

    Args:
        A: The operator. ``A.dtype.char`` must be ``'f'``, ``'d'``, ``'F'``
            or ``'D'``. When ``A`` is a CSR matrix and cuSPARSE ``spmv`` is
            available, the matrix-vector product goes through cuSPARSE.
        n (int): Length of the work vector allocated here; also the vector
            length handed to the cuBLAS routines.
        ncv (int): Length of the scratch vector that receives the
            orthogonalization coefficients.

    Returns:
        callable: ``update(A, V, u, alpha, beta, i_start, i_end)``, which
        runs steps ``i_start`` .. ``i_end - 1`` and writes into ``u``,
        ``alpha`` and ``beta`` in place. It must be called with the same
        ``A`` object that was given here.

    Raises:
        TypeError: If the dtype of ``A`` is not one of the four above.
    """
    cublas_handle = device.get_cublas_handle()
    cublas_pointer_mode = _cublas.getPointerMode(cublas_handle)

    # dtype char -> names of the (dot, norm, gemm) cuBLAS routines.
    routine_names = {
        'f': ('sdot', 'snrm2', 'sgemm'),
        'd': ('ddot', 'dnrm2', 'dgemm'),
        'F': ('cdotc', 'scnrm2', 'cgemm'),
        'D': ('zdotc', 'dznrm2', 'zgemm'),
    }
    selected = routine_names.get(A.dtype.char)
    if selected is None:
        raise TypeError('invalid dtype ({})'.format(A.dtype))
    dotc = getattr(_cublas, selected[0])
    nrm2 = getattr(_cublas, selected[1])
    gemm = getattr(_cublas, selected[2])

    # Settings for the cuSPARSE product; only defined when that path is used.
    cusparse_handle = None
    if csr.isspmatrix_csr(A) and cusparse.check_availability('spmv'):
        cusparse_handle = device.get_cusparse_handle()
        spmv_op_a = _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        spmv_alpha = numpy.array(1.0, A.dtype)
        spmv_beta = numpy.array(0.0, A.dtype)
        spmv_cuda_dtype = cusparse._dtype_to_DataType(A.dtype)
        spmv_alg = _cusparse.CUSPARSE_MV_ALG_DEFAULT

    # Device work buffers and host-side scalar constants for gemm.
    v = cupy.empty((n, ), dtype=A.dtype)
    uu = cupy.empty((ncv, ), dtype=A.dtype)
    one = numpy.array(1.0, dtype=A.dtype)
    zero = numpy.array(0.0, dtype=A.dtype)
    mone = numpy.array(-1.0, dtype=A.dtype)

    outer_A = A

    def call_with_device_result(routine, *args):
        # Run a cuBLAS routine whose result pointer lives on the device:
        # switch the pointer mode for the call and always restore it.
        _cublas.setPointerMode(cublas_handle,
                               _cublas.CUBLAS_POINTER_MODE_DEVICE)
        try:
            routine(cublas_handle, *args)
        finally:
            _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

    def update(A, V, u, alpha, beta, i_start, i_end):
        assert A is outer_A

        if cusparse_handle is None:
            def matvec():
                u[...] = A @ v
        else:
            # Note: descriptors and the working buffer are rebuilt on every
            # call; reusing them across updates sometimes caused illegal
            # memory access errors.
            desc_A = cusparse.SpMatDescriptor.create(A)
            desc_v = cusparse.DnVecDescriptor.create(v)
            desc_u = cusparse.DnVecDescriptor.create(u)
            buff_size = _cusparse.spMV_bufferSize(
                cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data,
                desc_A.desc, desc_v.desc, spmv_beta.ctypes.data,
                desc_u.desc, spmv_cuda_dtype, spmv_alg)
            spmv_buff = cupy.empty(buff_size, cupy.int8)

            def matvec():
                _cusparse.spMV(cusparse_handle, spmv_op_a,
                               spmv_alpha.ctypes.data, desc_A.desc,
                               desc_v.desc, spmv_beta.ctypes.data,
                               desc_u.desc, spmv_cuda_dtype, spmv_alg,
                               spmv_buff.data.ptr)

        last_step = i_end - 1
        v[...] = V[i_start]
        for i in range(i_start, i_end):
            # u <- A v
            matvec()

            # alpha[i] <- dotc(v, u), written directly on the device
            call_with_device_result(
                dotc, n, v.data.ptr, 1, u.data.ptr, 1,
                alpha.data.ptr + i * alpha.itemsize)

            # Orthogonalize u: coefficients into uu, then subtract
            gemm(cublas_handle, _cublas.CUBLAS_OP_C, _cublas.CUBLAS_OP_N, 1,
                 i + 1, n, one.ctypes.data, u.data.ptr, n, V.data.ptr, n,
                 zero.ctypes.data, uu.data.ptr, 1)
            gemm(cublas_handle, _cublas.CUBLAS_OP_N, _cublas.CUBLAS_OP_C, n, 1,
                 i + 1, mone.ctypes.data, V.data.ptr, n, uu.data.ptr, 1,
                 one.ctypes.data, u.data.ptr, n)

            # beta[i] <- nrm2(u), written directly on the device
            call_with_device_result(
                nrm2, n, u.data.ptr, 1,
                beta.data.ptr + i * beta.itemsize)

            # The normalization touches V[i + 1], so skip it on the last step
            if i < last_step:
                _kernel_normalize(u, beta, i, n, v, V)

    return update
Beispiel #34
0
def remove_particles_gpu(species, fld, n_guard, left_proc, right_proc):
    """
    Remove the particles that are outside of the physical domain (i.e.
    in the guard cells). Store them in sending buffers, which are returned.

    As a side effect, every particle attribute of `species` (and of its
    tracker / ionizer, when present) is replaced by a new GPU array that
    only holds the remaining particles, and `species.Ntot` is updated.

    Parameters
    ----------
    species: a Particles object
        Contains the data of this species

    fld: a Fields object
        Contains information about the dimension of the grid,
        and the prefix sum (when using the GPU)

    n_guard: int
        Number of guard cells

    left_proc, right_proc: int or None
        Indicate whether there is a left or right processor or if the
        boundary is open (None).

    Returns
    -------
    float_send_left, float_send_right, uint_send_left, uint_send_right:
        arrays of shape (n_float,Nptcl) and (n_int,Nptcl) where Nptcl
        is the number of particles that are sent to the left
        proc and right proc respectively, and where n_float and n_int
        are the number of float and integer quantities respectively.
        On an open boundary (proc is None) Nptcl is 0 for that side.
    """
    # Check if particles are sorted
    # (The particles are usually expected to be sorted from the previous
    # iteration at this point - except at the first iteration of `step`.)
    if not species.sorted:
        species.sort_particles(fld=fld)
        species.sorted = True

    # Get the particle indices between which to remove the particles
    # (Take into account the fact that the moving window may have
    # shifted the grid since the particles were last sorted: prefix_sum_shift)
    prefix_sum = species.prefix_sum
    Nz = fld.Nz
    Nr = fld.Nr
    # Find the z index of the first cell for which particles are kept
    iz_min = max(n_guard + species.prefix_sum_shift, 0)
    # Find the z index of the first cell for which particles are removed again
    iz_max = min(Nz - n_guard + species.prefix_sum_shift + 1, Nz)
    # Find the corresponding indices in the particle array
    # Reminder: prefix_sum[i] is the cumulative sum of the number of particles
    # in cells 0 to i (where cell i is included)
    if iz_min * (Nr + 1) - 1 >= 0:
        i_min = int(prefix_sum[iz_min * (Nr + 1) - 1])
    else:
        i_min = 0
    i_max = int(prefix_sum[iz_max * (Nr + 1) - 1])

    # Total number of particles in each particle group
    N_send_l = i_min
    new_Ntot = i_max - i_min
    N_send_r = species.Ntot - i_max

    # Allocate the sending buffers on the CPU
    # (empty along the particle axis when the boundary is open)
    n_float = species.n_float_quantities
    n_int = species.n_integer_quantities
    n_out_l = N_send_l if left_proc is not None else 0
    n_out_r = N_send_r if right_proc is not None else 0
    float_send_left = np.empty((n_float, n_out_l), dtype=np.float64)
    float_send_right = np.empty((n_float, n_out_r), dtype=np.float64)
    uint_send_left = np.empty((n_int, n_out_l), dtype=np.uint64)
    uint_send_right = np.empty((n_int, n_out_r), dtype=np.uint64)

    # Get the threads per block and the blocks per grid
    dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d(species.Ntot)

    def split_attributes(attr_list, n_attr, dtype, send_left, send_right):
        # Split the first `n_attr` attributes of `attr_list` into
        # left / stay / right parts and download the outgoing parts.
        for i_attr in range(n_attr):
            owner, name = attr_list[i_attr]
            # Allocate fresh buffers for each attribute: `stay_buffer`
            # becomes the new attribute array and must not be reused.
            left_buffer = cupy.empty((N_send_l, ), dtype=dtype)
            right_buffer = cupy.empty((N_send_r, ), dtype=dtype)
            stay_buffer = cupy.empty((new_Ntot, ), dtype=dtype)
            # Check that the buffers are still on GPU
            # (safeguard against automatic memory management)
            assert not isinstance(left_buffer, np.ndarray)
            assert not isinstance(right_buffer, np.ndarray)
            assert not isinstance(stay_buffer, np.ndarray)
            # Split the particle array into the 3 buffers on the GPU
            particle_array = getattr(owner, name)
            split_particles_to_buffers[dim_grid_1d,
                                       dim_block_1d](particle_array,
                                                     left_buffer,
                                                     stay_buffer,
                                                     right_buffer,
                                                     i_min, i_max)
            # Assign the stay_buffer to the initial particle data array
            # and fill the sending buffers (if needed for MPI)
            setattr(owner, name, stay_buffer)
            if left_proc is not None:
                left_buffer.get(out=send_left[i_attr])
            if right_proc is not None:
                right_buffer.get(out=send_right[i_attr])

    # Float quantities:
    float_attrs = [(species, 'x'), (species, 'y'), (species, 'z'),
                   (species, 'ux'), (species, 'uy'), (species, 'uz'),
                   (species, 'inv_gamma'), (species, 'w')]
    if species.ionizer is not None:
        float_attrs.append((species.ionizer, 'w_times_level'))
    split_attributes(float_attrs, n_float, np.float64,
                     float_send_left, float_send_right)

    # Integer quantities:
    # (always start from an empty list, so that float attributes can never
    # leak into the integer pass)
    uint_attrs = []
    if species.tracker is not None:
        uint_attrs.append((species.tracker, 'id'))
    if species.ionizer is not None:
        uint_attrs.append((species.ionizer, 'ionization_level'))
    split_attributes(uint_attrs, n_int, np.uint64,
                     uint_send_left, uint_send_right)

    # Change the new total number of particles
    species.Ntot = new_Ntot

    # Return the sending buffers
    return (float_send_left, float_send_right, uint_send_left, uint_send_right)
Beispiel #35
0
def geam(transa, transb, alpha, a, beta, b, out=None):
    """Computes alpha * op(a) + beta * op(b)

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.
    op(b) = b if transb is 'N', op(b) = b.T if transb is 'T',
    op(b) = b.T.conj() if transb is 'H'.

    ``a`` and ``b`` must be 2-D arrays of the same dtype (float32, float64,
    complex64 or complex128) with matching ``op`` shapes. If ``out`` is
    given it must be a 2-D array of that dtype with the result shape; it is
    filled and returned. Otherwise a new Fortran-ordered array is returned.
    The cuBLAS pointer mode of the handle is restored after the call.
    """
    assert a.ndim == b.ndim == 2
    assert a.dtype == b.dtype
    dtype = a.dtype.char
    # dtype char -> name of the cuBLAS geam routine
    routine_name = {
        'f': 'sgeam',
        'd': 'dgeam',
        'F': 'cgeam',
        'D': 'zgeam',
    }.get(dtype)
    if routine_name is None:
        raise TypeError('invalid dtype')
    func = getattr(cublas, routine_name)

    transa = _trans_to_cublas_op(transa)
    transb = _trans_to_cublas_op(transb)
    # (m, n) is the shape of op(a), hence of the result
    m, n = a.shape if transa == cublas.CUBLAS_OP_N else a.shape[::-1]
    expected_b_shape = (m, n) if transb == cublas.CUBLAS_OP_N else (n, m)
    assert b.shape == expected_b_shape
    if out is None:
        out = cupy.empty((m, n), dtype=dtype, order='F')
    else:
        assert out.ndim == 2
        assert out.shape == (m, n)
        assert out.dtype == dtype

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    alpha_on_device = isinstance(alpha, cupy.ndarray)
    beta_on_device = isinstance(beta, cupy.ndarray)
    if alpha_on_device or beta_on_device:
        # Both scalars must live in the same memory space: move the one
        # that is still on the host to the device.
        if not alpha_on_device:
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not beta_on_device:
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        pointer_mode = cublas.CUBLAS_POINTER_MODE_DEVICE
    else:
        pointer_mode = cublas.CUBLAS_POINTER_MODE_HOST
    cublas.setPointerMode(handle, pointer_mode)

    def launch(op_a, op_b, rows, cols, dest, ldc):
        # Run the routine on the current ``a``/``b``/``lda``/``ldb`` and
        # put the original pointer mode back, whatever happens.
        try:
            func(handle, op_a, op_b, rows, cols, alpha_ptr, a.data.ptr, lda,
                 beta_ptr, b.data.ptr, ldb, dest.data.ptr, ldc)
        finally:
            cublas.setPointerMode(handle, orig_mode)

    lda, transa = _decide_ld_and_trans(a, transa)
    ldb, transb = _decide_ld_and_trans(b, transb)
    if lda is not None and ldb is not None:
        # Inputs are usable as they are; write straight into ``out`` when
        # its layout allows it.
        if out._f_contiguous:
            launch(transa, transb, m, n, out, m)
            return out
        if out._c_contiguous:
            # Computes alpha * a.T + beta * b.T
            launch(1 - transa, 1 - transb, n, m, out, n)
            return out

    # General path: fix the input layouts and go through a Fortran-ordered
    # staging array when ``out`` itself is not Fortran-contiguous.
    a, lda = _change_order_if_necessary(a, lda)
    b, ldb = _change_order_if_necessary(b, ldb)
    needs_staging = not out._f_contiguous
    c = out.copy(order='F') if needs_staging else out
    launch(transa, transb, m, n, c, m)
    if needs_staging:
        out[...] = c
    return out
Beispiel #36
0
def create_dropout_states(handle):
    """Allocate an uninitialized device buffer for cuDNN dropout states.

    The buffer is a 1-D ``int8`` array whose length is the value reported
    by ``cudnn.dropoutGetStatesSize`` for ``handle``.
    """
    n_state_bytes = cudnn.dropoutGetStatesSize(handle)
    states = cupy.empty((n_state_bytes, ), dtype='b')
    return states
Beispiel #37
0
 def test_empty_zero_sized_array_strides(self, order):
     """``cupy.empty`` reports NumPy's strides for a zero-sized shape."""
     shape = (1, 0, 2)
     expected = numpy.empty(shape, dtype='d', order=order)
     actual = cupy.empty(shape, dtype='d', order=order)
     self.assertEqual(actual.strides, expected.strides)
Beispiel #38
0
def solve(a, b):
    '''Solves a linear matrix equation.

    It computes the exact solution of ``x`` in ``ax = b``,
    where ``a`` is a square and full rank matrix.

    The computation runs in float32 when ``a`` is float32, in float64 when
    ``a`` is float64, and otherwise in the dtype obtained by promoting the
    dtype of ``a`` with float32.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(M, M)``
        b (cupy.ndarray): The vector with ``M`` elements, or
            the matrix with dimension ``(M, K)``

    Returns:
        cupy.ndarray:
            The vector with ``M`` elements, or the matrix with dimension
            ``(M, K)``.

    Raises:
        RuntimeError: If cuSOLVER is not available.
        numpy.linalg.LinAlgError: If ``b`` has more than two dimensions or
            its number of rows differs from that of ``a``.

    .. seealso:: :func:`numpy.linalg.solve`
    '''
    # NOTE: Since cusolver in CUDA 8.0 does not support gesv,
    #       we manually solve a linear system with QR decomposition.
    #       For details, please see the following:
    #       http://docs.nvidia.com/cuda/cusolver/index.html#qr_examples
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    util._assert_cupy_array(a, b)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)
    if 2 < b.ndim:
        raise linalg.LinAlgError('{}-dimensional array given. Array must be '
                                 'one or two-dimensional'.format(b.ndim))
    if len(a) != len(b):
        raise linalg.LinAlgError('The number of rows of array a must be '
                                 'the same as that of array b')

    # Cast to float32 or float64
    # (``numpy.find_common_type`` is deprecated since NumPy 1.25 and removed
    # in NumPy 2.0; ``numpy.promote_types`` is its documented replacement.)
    if a.dtype.char == 'f' or a.dtype.char == 'd':
        dtype = a.dtype.char
    else:
        dtype = numpy.promote_types(a.dtype.char, 'f').char
    # NOTE(review): any promoted dtype other than 'f' takes the double
    # precision branch below; complex or long double input is presumably
    # unsupported -- confirm whether it should be rejected explicitly.

    m, k = (b.size, 1) if b.ndim == 1 else b.shape
    # The transposed C-contiguous copies have the memory layout of the
    # original matrices in column-major order, which is what the
    # cuSOLVER / cuBLAS routines below read. They are also private copies,
    # so the in-place factorization does not modify the caller's arrays.
    a = a.transpose().astype(dtype, order='C', copy=True)
    b = b.transpose().astype(dtype, order='C', copy=True)
    cusolver_handle = device.get_cusolver_handle()
    cublas_handle = device.get_cublas_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    if dtype == 'f':
        geqrf = cusolver.sgeqrf
        geqrf_bufferSize = cusolver.sgeqrf_bufferSize
        ormqr = cusolver.sormqr
        trsm = cublas.strsm
    else:  # dtype == 'd'
        geqrf = cusolver.dgeqrf
        geqrf_bufferSize = cusolver.dgeqrf_bufferSize
        ormqr = cusolver.dormqr
        trsm = cublas.dtrsm

    # 1. QR decomposition (A = Q * R)
    buffersize = geqrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    tau = cupy.empty(m, dtype=dtype)
    geqrf(cusolver_handle, m, m, a.data.ptr, m, tau.data.ptr,
          workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)
    # 2. ormqr (Q^T * B)
    #    (reuses the workspace that was sized for geqrf)
    ormqr(cusolver_handle, cublas.CUBLAS_SIDE_LEFT, cublas.CUBLAS_OP_T, m, k,
          m, a.data.ptr, m, tau.data.ptr, b.data.ptr, m, workspace.data.ptr,
          buffersize, dev_info.data.ptr)
    _check_status(dev_info)
    # 3. trsm (X = R^{-1} * (Q^T * B))
    trsm(cublas_handle, cublas.CUBLAS_SIDE_LEFT, cublas.CUBLAS_FILL_MODE_UPPER,
         cublas.CUBLAS_OP_N, cublas.CUBLAS_DIAG_NON_UNIT, m, k, 1, a.data.ptr,
         m, b.data.ptr, m)
    # ``b`` now holds the solution in the transposed layout; undo it.
    return b.transpose()
Beispiel #39
0
def warp_coords(coord_map, shape, dtype=np.float64):
    """Build the source coordinates for the output of a 2-D image warp.

    Parameters
    ----------
    coord_map : callable like GeometricTransform.inverse
        Return input coordinates for given output coordinates.
        It is called once with a ``(P, 2)`` array holding one coordinate
        pair per output pixel and must return an array of the same layout.
    shape : tuple
        Shape of output image ``(rows, cols[, bands])``.
    dtype : np.dtype or string
        dtype for return value (sane choices: float32 or float64).

    Returns
    -------
    coords : (ndim, rows, cols[, bands]) array of dtype `dtype`
        Coordinate array in the layout expected by ``map_coordinates``.
        ``coords[0]`` and ``coords[1]`` are filled from the second and first
        component of the mapped pairs respectively; for a 3-entry `shape`,
        ``coords[2]`` holds the band index.

    Notes
    -----
    This is a lower-level routine that produces the source coordinates for
    2-D images used by `warp()`.

    It is provided separately from `warp` to give additional flexibility to
    users who would like, for example, to re-use a particular coordinate
    mapping, to use specific dtypes at various points along the
    image-warping process, or to implement different post-processing logic
    than `warp` performs after the call to `ndi.map_coordinates`.

    Examples
    --------
    Produce a coordinate map that shifts an image up and to the right:

    >>> import cupy as cp
    >>>
    >>> def shift_up10_left20(xy):
    ...     return xy - cp.array([-20, 10])[None, :]
    >>>
    >>> coords = warp_coords(shift_up10_left20, (512, 512, 3))

    """
    shape = safe_as_int(shape)
    n_rows, n_cols = shape[0], shape[1]
    has_bands = len(shape) == 3

    out_shape = [len(shape), n_rows, n_cols]
    if has_bands:
        out_shape.append(shape[2])
    coords = cp.empty(out_shape, dtype=dtype)

    # One coordinate pair per output pixel, as a (P, 2) array
    grid = cp.indices((n_cols, n_rows), dtype=dtype)
    pairs = grid.reshape(2, -1).T

    # Let the user-provided mapping translate them to source coordinates
    mapped = coord_map(pairs)

    # Back to a (2, rows, cols) grid
    mapped = mapped.T.reshape((-1, n_cols, n_rows))
    mapped = mapped.swapaxes(1, 2)

    # First mapped component goes to coords[1], second one to coords[0]
    _stackcopy(coords[1, ...], mapped[0, ...])
    _stackcopy(coords[0, ...], mapped[1, ...])

    if has_bands:
        coords[2, ...] = cp.arange(shape[2], dtype=coords.dtype)

    return coords
Beispiel #40
0
def binopt_csr(a, b, op_name):
    """Apply an element-wise binary operation to two CSR matrices.

    The result has shape ``(max(a_m, b_m), max(a_n, b_n))``; the shapes are
    first validated by ``check_shape_for_pointwise_op``.

    Args:
        a: First CSR operand.
        b: Second CSR operand.
        op_name (str): One of ``'_maximum_'``, ``'_minimum_'``, ``'_eq_'``,
            ``'_ne_'``, ``'_lt_'``, ``'_gt_'``, ``'_le_'`` or ``'_ge_'``.

    Returns:
        csr_matrix: The result. Its data dtype is the promoted dtype of
        ``a`` and ``b`` for maximum/minimum and boolean for the comparisons.

    Raises:
        ValueError: If ``op_name`` is not one of the names listed above.
    """
    check_shape_for_pointwise_op(a.shape, b.shape)
    a_m, a_n = a.shape
    b_m, b_n = b.shape
    m, n = max(a_m, b_m), max(a_n, b_n)
    # Number of stored entries of each operand once it is replicated to the
    # result shape.
    a_nnz = a.nnz * (m // a_m) * (n // a_n)
    b_nnz = b.nnz * (m // b_m) * (n // b_n)

    a_info = cupy.zeros(a_nnz + 1, dtype=a.indices.dtype)
    b_info = cupy.zeros(b_nnz + 1, dtype=b.indices.dtype)
    a_valid = cupy.zeros(a_nnz, dtype=numpy.int8)
    b_valid = cupy.zeros(b_nnz, dtype=numpy.int8)
    c_indptr = cupy.zeros(m + 1, dtype=a.indptr.dtype)
    in_dtype = numpy.promote_types(a.dtype, b.dtype)
    a_data = a.data.astype(in_dtype, copy=False)
    b_data = b.data.astype(in_dtype, copy=False)

    # Operation name -> kernel preamble implementing it.
    op_preambles = {
        '_maximum_': _BINOPT_MAX_,
        '_minimum_': _BINOPT_MIN_,
        '_eq_': _BINOPT_EQ_,
        '_ne_': _BINOPT_NE_,
        '_lt_': _BINOPT_LT_,
        '_gt_': _BINOPT_GT_,
        '_le_': _BINOPT_LE_,
        '_ge_': _BINOPT_GE_,
    }
    if op_name not in op_preambles:
        raise ValueError('invalid op_name: {}'.format(op_name))
    funcs = _GET_ROW_ID_ + op_preambles[op_name]
    if op_name in ('_maximum_', '_minimum_'):
        out_dtype = in_dtype
    else:
        # ``numpy.bool_`` exists in every NumPy release, whereas the
        # ``numpy.bool`` alias is missing from NumPy 1.24-1.26.
        out_dtype = numpy.bool_

    a_tmp_data = cupy.empty(a_nnz, dtype=out_dtype)
    b_tmp_data = cupy.empty(b_nnz, dtype=out_dtype)
    a_tmp_indices = cupy.empty(a_nnz, dtype=a.indices.dtype)
    b_tmp_indices = cupy.empty(b_nnz, dtype=b.indices.dtype)
    _size = a_nnz + b_nnz
    # Step 1 fills the per-entry ``*_info`` / ``*_valid`` / ``*_tmp_*``
    # buffers and ``c_indptr``.
    cupy_binopt_csr_step1(op_name, preamble=funcs)(
        m, n,
        a.indptr, a.indices, a_data, a_m, a_n, a.nnz, a_nnz,
        b.indptr, b.indices, b_data, b_m, b_n, b.nnz, b_nnz,
        a_info, a_valid, a_tmp_indices, a_tmp_data,
        b_info, b_valid, b_tmp_indices, b_tmp_data,
        c_indptr, size=_size)
    # Turn the values written by step 1 into running totals; the last entry
    # of ``c_indptr`` is then the number of entries of the result.
    a_info = cupy.cumsum(a_info, dtype=a_info.dtype)
    b_info = cupy.cumsum(b_info, dtype=b_info.dtype)
    c_indptr = cupy.cumsum(c_indptr, dtype=c_indptr.dtype)
    c_nnz = int(c_indptr[-1])
    c_indices = cupy.empty(c_nnz, dtype=a.indices.dtype)
    c_data = cupy.empty(c_nnz, dtype=out_dtype)
    # Step 2 writes the result's indices and data.
    cupy_binopt_csr_step2(op_name)(
        a_info, a_valid, a_tmp_indices, a_tmp_data, a_nnz,
        b_info, b_valid, b_tmp_indices, b_tmp_data, b_nnz,
        c_indices, c_data, size=_size)
    return csr_matrix((c_data, c_indices, c_indptr), shape=(m, n))
Beispiel #41
0
def cross(a, b, axisa=-1, axisb=-1, axisc=-1, axis=None):
    """Compute the cross product of two (arrays of) vectors.

    The vectors are read along ``axisa`` of ``a`` and ``axisb`` of ``b`` and
    must have 2 or 3 components.  A 2-component vector is treated as having
    a zero third component.  When both inputs hold 2-component vectors only
    the z-component of the product is computed, so the result carries no
    vector axis.

    Args:
        a (cupy.ndarray): Components of the first vector(s).
        b (cupy.ndarray): Components of the second vector(s).
        axisa (int, optional):
            Axis of ``a`` holding the vector components.
            Defaults to the last axis.
        axisb (int, optional):
            Axis of ``b`` holding the vector components.
            Defaults to the last axis.
        axisc (int, optional):
            Axis of the result that receives the product components.
            Not used when both inputs have 2 components.
            Defaults to the last axis.
        axis (int, optional):
            When given, replaces ``axisa``, ``axisb`` and ``axisc`` at once.

    Returns:
        cupy.ndarray :
            Vector cross product(s).

    Raises:
        ValueError: If the vector axis of ``a`` or ``b`` does not have
            length 2 or 3.

    .. seealso:: :func:`numpy.cross`

    """

    if axis is not None:
        axisa = axisb = axisc = axis
    a = cupy.asarray(a)
    b = cupy.asarray(b)
    # Validate the requested vector axes against each operand's rank.
    axisa = internal._normalize_axis_index(axisa, a.ndim)
    axisb = internal._normalize_axis_index(axisb, b.ndim)

    # Bring the vector components to the trailing axis of each operand.
    a = cupy.moveaxis(a, axisa, -1)
    b = cupy.moveaxis(b, axisb, -1)
    dim_a = a.shape[-1]
    dim_b = b.shape[-1]
    if dim_a not in (2, 3) or dim_b not in (2, 3):
        raise ValueError('incompatible dimensions for cross product\n'
                         '(dimension must be 2 or 3)')

    # Allocate the result: broadcast of the leading axes, plus a trailing
    # axis of length 3 unless both operands are 2-component vectors.
    out_shape = cupy.broadcast(a[..., 0], b[..., 0]).shape
    if dim_a == 3 or dim_b == 3:
        out_shape += (3, )
        # axisc only matters (and is only validated) when a vector axis
        # exists in the result.
        axisc = internal._normalize_axis_index(axisc, len(out_shape))
    result = cupy.empty(out_shape, cupy.promote_types(a.dtype, b.dtype))

    # Component views; the third component exists only for 3-vectors.
    a0, a1 = a[..., 0], a[..., 1]
    if dim_a == 3:
        a2 = a[..., 2]
    b0, b1 = b[..., 0], b[..., 1]
    if dim_b == 3:
        b2 = b[..., 2]
    if result.ndim != 0 and result.shape[-1] == 3:
        r0 = result[..., 0]
        r1 = result[..., 1]
        r2 = result[..., 2]

    if dim_a == 2 and dim_b == 2:
        # Only the z-component: a0 * b1 - a1 * b0
        cupy.multiply(a0, b1, out=result)
        result -= a1 * b0
        return result

    if dim_a == 2:
        assert dim_b == 3
        # With a2 = 0:
        #   r0 = a1 * b2
        #   r1 = -(a0 * b2)
        #   r2 = a0 * b1 - a1 * b0
        cupy.multiply(a1, b2, out=r0)
        cupy.multiply(a0, b2, out=r1)
        cupy.negative(r1, out=r1)
        cupy.multiply(a0, b1, out=r2)
        r2 -= a1 * b0
    elif dim_b == 3:
        assert dim_a == 3
        #   r0 = a1 * b2 - a2 * b1
        #   r1 = a2 * b0 - a0 * b2
        #   r2 = a0 * b1 - a1 * b0
        # A single scratch array is reused for every subtracted product.
        cupy.multiply(a1, b2, out=r0)
        scratch = a2 * b1
        r0 -= scratch
        cupy.multiply(a2, b0, out=r1)
        cupy.multiply(a0, b2, out=scratch)
        r1 -= scratch
        cupy.multiply(a0, b1, out=r2)
        cupy.multiply(a1, b0, out=scratch)
        r2 -= scratch
    else:
        assert dim_a == 3
        assert dim_b == 2
        # With b2 = 0:
        #   r0 = -(a2 * b1)
        #   r1 = a2 * b0
        #   r2 = a0 * b1 - a1 * b0
        cupy.multiply(a2, b1, out=r0)
        cupy.negative(r0, out=r0)
        cupy.multiply(a2, b0, out=r1)
        cupy.multiply(a0, b1, out=r2)
        r2 -= a1 * b0

    # Move the product components from the trailing axis to ``axisc``.
    return cupy.moveaxis(result, -1, axisc)
Beispiel #42
0
    def sum_duplicates(self):
        """Merge entries sharing a ``(row, col)`` index by summing their data.

        Returns immediately when the matrix is already flagged as canonical.
        Otherwise ``self.data``, ``self.row`` and ``self.col`` are replaced
        by sorted, de-duplicated arrays and the canonical-format flag is set.

        .. seealso::
           :meth:`scipy.sparse.coo_matrix.sum_duplicates`

        """
        if self._has_canonical_format:
            return
        if self.data.size == 0:
            # No entries at all: just record the flag.
            self._has_canonical_format = True
            return

        # Sort the entries so that equal (row, col) pairs become adjacent.
        perm = cupy.lexsort(cupy.stack([self.row, self.col]))
        sorted_data = self.data[perm]
        sorted_row = self.row[perm]
        sorted_col = self.col[perm]

        # starts[i] is 1 where entry i differs from its predecessor and 0
        # otherwise; the very first entry is always reported as 0.
        mark_starts = cupy.ElementwiseKernel(
            'raw int32 row, raw int32 col', 'int32 diff', '''
            int index;
            if (i == 0 || row[i - 1] == row[i] && col[i - 1] == col[i]) {
              diff = 0;
            } else {
              diff = 1;
            }
            ''', 'sum_duplicates_diff')
        starts = mark_starts(sorted_row, sorted_col, size=self.row.size)

        if starts[1:].all():
            # Every entry has a distinct index: the sorted arrays are final.
            data, row, col = sorted_data, sorted_row, sorted_col
        else:
            # The running count of group starts is each entry's output slot.
            slot = cupy.cumsum(starts, dtype='i')
            n_unique = int(slot[-1]) + 1
            data = cupy.zeros(n_unique, dtype=self.data.dtype)
            row = cupy.empty(n_unique, dtype='i')
            col = cupy.empty(n_unique, dtype='i')
            kind = self.data.dtype.kind
            if kind == 'f':
                accumulate = cupy.ElementwiseKernel(
                    'T src_data, int32 src_row, int32 src_col, int32 index',
                    'raw T data, raw int32 row, raw int32 col', '''
                    atomicAdd(&data[index], src_data);
                    row[index] = src_row;
                    col[index] = src_col;
                    ''', 'sum_duplicates_assign')
                accumulate(sorted_data, sorted_row, sorted_col, slot,
                           data, row, col)
            elif kind == 'c':
                # Complex values are accumulated as separate real and
                # imaginary parts.
                accumulate = cupy.ElementwiseKernel(
                    'T src_real, T src_imag, int32 src_row, int32 src_col, '
                    'int32 index',
                    'raw T real, raw T imag, raw int32 row, raw int32 col', '''
                    atomicAdd(&real[index], src_real);
                    atomicAdd(&imag[index], src_imag);
                    row[index] = src_row;
                    col[index] = src_col;
                    ''',
                    'sum_duplicates_assign_complex')
                accumulate(sorted_data.real, sorted_data.imag, sorted_row,
                           sorted_col, slot, data.real, data.imag, row, col)

        self.data = data
        self.row = row
        self.col = col
        self._has_canonical_format = True
Beispiel #43
0
def inv(a):
    """Computes the inverse of a matrix.

    This function computes matrix ``a_inv`` from n-dimensional regular matrix
    ``a`` such that ``dot(a, a_inv) == eye(n)``.

    Inputs with three or more dimensions are forwarded to ``_batched_inv``.
    Two-dimensional inputs are inverted with a cuSOLVER LU factorization
    (``getrf``) followed by a solve against the identity (``getrs``).
    float32, float64, complex64 and complex128 are used as-is; any other
    dtype is first promoted with float32.

    Args:
        a (cupy.ndarray): The regular matrix

    Returns:
        cupy.ndarray: The inverse of a matrix.

    Raises:
        RuntimeError: If cuSOLVER is not available.
        ValueError: If the promoted dtype is not one of the four supported
            floating-point types.

    .. seealso:: :func:`numpy.linalg.inv`
    """
    if a.ndim >= 3:
        return _batched_inv(a)

    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # Validate before paying for a device copy.
    util._assert_cupy_array(a)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)

    # support float32, float64, complex64, and complex128
    if a.dtype.char in 'fdFD':
        dtype = a.dtype.char
    else:
        # promote_types replaces numpy.find_common_type, which is no longer
        # available in NumPy 2.0.
        dtype = numpy.promote_types(a.dtype, 'f').char

    # Work on a private C-ordered copy so the caller's array is not
    # overwritten by the in-place factorization, and make sure the buffer
    # handed to cuSOLVER really has the compute dtype: without the cast an
    # integer or float16 input would be reinterpreted bit-for-bit.
    a = a.copy().astype(dtype, copy=False)

    cusolver_handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    ipiv = cupy.empty((a.shape[0], 1), dtype=numpy.intc)

    if dtype == 'f':
        getrf = cusolver.sgetrf
        getrf_bufferSize = cusolver.sgetrf_bufferSize
        getrs = cusolver.sgetrs
    elif dtype == 'd':
        getrf = cusolver.dgetrf
        getrf_bufferSize = cusolver.dgetrf_bufferSize
        getrs = cusolver.dgetrs
    elif dtype == 'F':
        getrf = cusolver.cgetrf
        getrf_bufferSize = cusolver.cgetrf_bufferSize
        getrs = cusolver.cgetrs
    elif dtype == 'D':
        getrf = cusolver.zgetrf
        getrf_bufferSize = cusolver.zgetrf_bufferSize
        getrs = cusolver.zgetrs
    else:
        raise ValueError('unsupported dtype')

    m = a.shape[0]

    buffersize = getrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)

    # LU factorization (in place, pivots go to ``ipiv``)
    getrf(cusolver_handle, m, m, a.data.ptr, m, workspace.data.ptr,
          ipiv.data.ptr, dev_info.data.ptr)

    # The right-hand side starts as the identity and is overwritten with
    # the solution, i.e. the inverse.
    b = cupy.eye(m, dtype=dtype)

    # solve for the inverse (second argument 0: no transpose requested)
    # NOTE(review): dev_info is never inspected, so a singular input is not
    # reported here -- confirm whether callers rely on that.
    getrs(cusolver_handle, 0, m, m, a.data.ptr, m, ipiv.data.ptr, b.data.ptr,
          m, dev_info.data.ptr)

    return b
Beispiel #44
0
def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9,
                   verbose=False, log=False, to_numpy=True, **kwargs):
    r"""
    Solve the entropic regularization optimal transport on GPU

    If the input matrices are in numpy format, they will be uploaded to the
    GPU first which can incur significant time overhead.

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)

        s.t. \gamma 1 = a

             \gamma^T 1= b

             \gamma\geq 0
    where :

    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [2]_


    Parameters
    ----------
    a : np.ndarray (ns,)
        samples weights in the source domain; an empty array selects
        uniform weights
    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
        samples in the target domain, compute sinkhorn with multiple targets
        and fixed M if b is a matrix (return OT loss + dual variables in log);
        an empty array selects uniform weights
    M : np.ndarray (ns,nt)
        loss matrix
    reg : float
        Regularization term >0
    numItermax : int, optional
        Max number of iterations
    stopThr : float, optional
        Stop threshold on error (>0)
    verbose : bool, optional
        Print information along iterations
    log : bool, optional
        record log if True
    to_numpy : boolean, optional (default True)
        If true convert back the GPU array result to numpy format.


    Returns
    -------
    gamma : (ns x nt) ndarray
        Optimal transportation matrix for the given parameters
        (when ``b`` is a matrix, the vector of ``nbb`` transport losses is
        returned instead)
    log : dict
        log dictionary return only if log==True in parameters

    Examples
    --------

    >>> import ot
    >>> a=[.5,.5]
    >>> b=[.5,.5]
    >>> M=[[0.,1.],[1.,0.]]
    >>> ot.sinkhorn(a,b,M,1)
    array([[ 0.36552929,  0.13447071],
           [ 0.13447071,  0.36552929]])


    References
    ----------

    .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013


    See Also
    --------
    ot.lp.emd : Unregularized OT
    ot.optim.cg : General regularized OT

    """

    a = cp.asarray(a)
    b = cp.asarray(b)
    M = cp.asarray(M)

    # Empty weight vectors are replaced by uniform weights.
    if len(a) == 0:
        a = np.ones((M.shape[0],)) / M.shape[0]
    if len(b) == 0:
        b = np.ones((M.shape[1],)) / M.shape[1]

    # init data
    Nini = len(a)
    Nfin = len(b)

    # nbb > 0 means several target histograms are solved at once.
    if len(b.shape) > 1:
        nbb = b.shape[1]
    else:
        nbb = 0

    if log:
        # From here on ``log`` is the (truthy) dictionary being filled.
        log = {'err': []}

    # we assume that no distances are null except those of the diagonal of
    # distances
    if nbb:
        u = np.ones((Nini, nbb)) / Nini
        v = np.ones((Nfin, nbb)) / Nfin
    else:
        u = np.ones(Nini) / Nini
        v = np.ones(Nfin) / Nfin

    # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute
    K = np.empty(M.shape, dtype=M.dtype)
    np.divide(M, -reg, out=K)
    np.exp(K, out=K)

    # Kp = diag(1/a) K, precomputed once for the u-update.
    Kp = (1 / a).reshape(-1, 1) * K
    cpt = 0
    err = 1
    while (err > stopThr and cpt < numItermax):
        uprev = u
        vprev = v

        KtransposeU = np.dot(K.T, u)
        v = np.divide(b, KtransposeU)
        u = 1. / np.dot(Kp, v)

        if (np.any(KtransposeU == 0) or
                np.any(np.isnan(u)) or np.any(np.isnan(v)) or
                np.any(np.isinf(u)) or np.any(np.isinf(v))):
            # we have reached the machine precision
            # come back to previous solution and quit loop
            print('Warning: numerical errors at iteration', cpt)
            u = uprev
            v = vprev
            break
        if cpt % 10 == 0:
            # we can speed up the process by checking for the error only all
            # the 10th iterations
            if nbb:
                err = np.sum((u - uprev)**2) / np.sum((u)**2) + \
                    np.sum((v - vprev)**2) / np.sum((v)**2)
            else:
                # compute right marginal tmp2= (diag(u)Kdiag(v))^T1
                # (equivalent to np.einsum('i,ij,j->j', u, K, v))
                tmp2 = np.sum(u[:, None] * K * v[None, :], 0)
                err = np.linalg.norm(tmp2 - b)**2  # violation of marginal
            if log:
                log['err'].append(err)

            if verbose:
                if cpt % 200 == 0:
                    print(
                        '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
                print('{:5d}|{:8e}|'.format(cpt, err))
        cpt = cpt + 1
    if log:
        log['u'] = u
        log['v'] = v

    if nbb:  # return only loss
        # One target at a time: the single-call form
        # np.einsum('ik,ij,jk,ij->k', u, K, v, M) explodes cupy memory.
        res = np.empty(nbb)
        for i in range(nbb):
            res[i] = np.sum(u[:, None, i] * (K * M) * v[None, :, i])
        if to_numpy:
            res = utils.to_np(res)
        if log:
            return res, log
        else:
            return res

    else:  # return OT matrix
        res = u.reshape((-1, 1)) * K * v.reshape((1, -1))
        if to_numpy:
            res = utils.to_np(res)
        if log:
            return res, log
        else:
            return res
Beispiel #45
0
def f3():
    """Allocate one ``'b'``-typed buffer per entry of the global ``sizes``.

    All buffers are kept referenced until every allocation has been made;
    nothing is returned.
    """
    buffers = []
    for size in sizes:
        buffers.append(cupy.empty((size,), dtype='b'))
Beispiel #46
0
 def test_sum_out_wrong_shape(self):
     """``sum`` must raise ``ValueError`` for a mis-shaped ``out`` array."""
     source = testing.shaped_arange((2, 3, 4))
     # Assuming ``source`` has shape (2, 3, 4), summing over axis 1 yields
     # (2, 4), so a (2, 3) output buffer cannot receive the result.
     wrong_out = cupy.empty((2, 3))
     with self.assertRaises(ValueError):
         source.sum(axis=1, out=wrong_out)
Beispiel #47
0
    def make_data(self, parse=False):
        """Generate Poisson-sampled frames and store them in ``self.out_file``.

        The mask (and the background mask, when ``self.bg_count`` is set) is
        prepared first if needed.  Previously stored datasets are removed
        from the HDF5 file and ``self.num_data`` frames are then generated
        and written in a sparse form: positions of pixels with exactly one
        count, positions of pixels with more than one count, and the counts
        at the latter positions.

        Args:
            parse (bool): When true the masks are obtained with
                ``self.parse_mask``; otherwise with ``self.make_mask``.
        """
        build_mask = self.parse_mask if parse else self.make_mask
        if self.mask_sum == 0.:
            build_mask()
        if self.bg_count is not None:
            build_mask(bg=True)

        with h5py.File(self.out_file, 'a') as fptr:
            # Remove stale datasets so they can be recreated below.
            for name in ('ones', 'multi', 'place_ones', 'place_multi',
                         'count_multi', 'num_pix', 'true_angles', 'bg'):
                if name in fptr:
                    del fptr[name]

            if self.bgmask_sum > 0:
                fptr['bg'] = self.bgmask.get()
            fptr['num_pix'] = np.array([self.size**2])

            # Per-frame lists have different lengths, hence the
            # variable-length int32 element type.
            vlen_i4 = h5py.special_dtype(vlen=np.dtype('i4'))
            frame_count = (self.num_data, )
            place_ones = fptr.create_dataset('place_ones', frame_count,
                                             dtype=vlen_i4)
            place_multi = fptr.create_dataset('place_multi', frame_count,
                                              dtype=vlen_i4)
            count_multi = fptr.create_dataset('count_multi', frame_count,
                                              dtype=vlen_i4)
            ones = fptr.create_dataset('ones', frame_count, dtype='i4')
            multi = fptr.create_dataset('multi', frame_count, dtype='i4')

            # One random in-plane angle in [0, 2*pi) per frame.
            ang = np.random.rand(self.num_data).astype('f8') * 2. * cp.pi
            fptr['true_angles'] = ang

            if self.fluence == 'gamma':
                # NOTE(review): an existing 'scale' dataset is deleted here
                # but the new values are not written back in this method --
                # confirm whether that is intended.
                if 'scale' in fptr:
                    del fptr['scale']
                scale = np.random.gamma(2., 0.5, self.num_data)
            else:
                scale = np.ones(self.num_data, dtype='f8')

            rot_mask = cp.empty(self.size**2, dtype='f8')
            # Launch geometry: 32x32 threads per block, enough blocks to
            # cover ``self.size`` along each axis.
            bsize_model = int(np.ceil(self.size / 32.))
            grid = (bsize_model, ) * 2
            block = (32, ) * 2
            stime = time.time()
            for i in range(self.num_data):
                # presumably fills ``rot_mask`` with the expected intensity
                # of frame i -- verify against kernels.slice_gen
                kernels.slice_gen(grid, block,
                                  (self.mask, ang[i], scale[i], self.size,
                                   self.bgmask, 0, rot_mask))
                frame = cp.random.poisson(rot_mask, dtype='i4').ravel()
                is_multi = frame > 1
                place_ones[i] = cp.where(frame == 1)[0].get()
                place_multi[i] = cp.where(is_multi)[0].get()
                count_multi[i] = frame[is_multi].get()
                ones[i] = place_ones[i].shape[0]
                multi[i] = place_multi[i].shape[0]
                sys.stderr.write('\rWritten %d/%d frames (%d)  ' %
                                 (i + 1, self.num_data, int(frame.sum())))
            etime = time.time()
            sys.stderr.write('\nTime taken (make_data): %f s\n' %
                             (etime - stime))
Beispiel #48
0
def interp(x, xp, fp, left=None, right=None, period=None):
    """ Linearly interpolate one-dimensional sample data.

    Args:
        x (cupy.ndarray): points at which the interpolated values are
            requested. Must be C-contiguous.
        xp (cupy.ndarray): a 1D array with the coordinates of the known
            samples.
        fp (cupy.ndarray): a 1D array with the sample values at ``xp``;
            must have the same length as ``xp``.
        left (float or complex): value used for ``x < xp[0]``. Defaults to
            ``fp[0]``.
        right (float or complex): value used for ``x > xp[-1]``. Defaults to
            ``fp[-1]``.
        period (None or float): period of the x-coordinates. When given,
            ``left`` and ``right`` are ignored. Default is ``None``.

    Returns:
        cupy.ndarray: The interpolated values, same shape as ``x``. The
        dtype is complex128 when ``fp`` is complex and float64 otherwise.

    Raises:
        ValueError: If ``xp`` or ``fp`` is not 1D, if their sizes differ,
            if they are empty, or if ``period`` is zero.
        NotImplementedError: If ``x`` is not C-contiguous.
        TypeError: If the common type of ``x`` and ``xp`` cannot be cast to
            float64.

    .. note::
        This function may synchronize if ``left`` or ``right`` is not already
        on the device.

    .. seealso:: :func:`numpy.interp`

    """

    # --- argument validation -------------------------------------------
    if xp.ndim != 1 or fp.ndim != 1:
        raise ValueError('xp and fp must be 1D arrays')
    if xp.size != fp.size:
        raise ValueError('fp and xp are not of the same length')
    if xp.size == 0:
        raise ValueError('array of sample points is empty')
    if not x.flags.c_contiguous:
        raise NotImplementedError('Non-C-contiguous x is currently not '
                                  'supported')
    x_dtype = cupy.common_type(x, xp)
    if not cupy.can_cast(x_dtype, cupy.float64):
        raise TypeError('Cannot cast array data from'
                        ' {} to {} according to the rule \'safe\''.format(
                            x_dtype, cupy.float64))

    # --- periodic coordinates (adapted from NumPy's handling) -----------
    if period is not None:
        if period == 0:
            raise ValueError("period must be a non-zero value")
        period = abs(period)
        # Out-of-range fill values make no sense for periodic data.
        left = right = None

        # astype returns fresh arrays here, so the in-place modulo below
        # works on local copies.
        x = x.astype(cupy.float64)
        xp = xp.astype(cupy.float64)

        # Fold everything into one period and sort the samples.
        x %= period
        xp %= period
        order = cupy.argsort(xp)
        xp = xp[order]
        fp = fp[order]
        # Pad with the wrapped-around last and first samples so that every
        # folded x lies between two known points.
        xp = cupy.concatenate((xp[-1:] - period, xp, xp[0:1] + period))
        fp = cupy.concatenate((fp[-1:], fp, fp[0:1]))
        assert xp.flags.c_contiguous
        assert fp.flags.c_contiguous

    # --- interpolation ---------------------------------------------------
    # NumPy always returns float64 or complex128, so every value is upcast
    # on the fly inside the kernel.
    is_complex = fp.dtype.kind == 'c'
    out_dtype = 'D' if is_complex else 'd'
    output = cupy.empty(x.shape, dtype=out_dtype)
    # Position of each x relative to the sample coordinates.
    idx = cupy.searchsorted(xp, x, side='right')
    if left is None:
        left = fp[0]
    else:
        left = cupy.array(left, fp.dtype)
    if right is None:
        right = fp[-1]
    else:
        right = cupy.array(right, fp.dtype)
    kernel = _get_interp_kernel(is_complex)
    kernel(x, idx, xp, fp, xp.size, left, right, output)
    return output
Beispiel #49
0
def add_buffers_to_particles(species, float_recv_left, float_recv_right,
                             uint_recv_left, uint_recv_right):
    """
    Add the particles stored in recv_left and recv_right
    to the existing particle in species.

    Resize the auxiliary arrays of the particles Ex, Ey, Ez, Bx, By, Bz,
    as well as (on the GPU only) cell_idx, sorted_idx and the sorting
    buffers. The resized arrays are uninitialized. Finally, flag the
    species as unsorted.

    Parameters
    ----------
    species: a Particles object
        Contain the particles that stayed on the present processors

    float_recv_left, float_recv_right, uint_recv_left, uint_recv_right:
        arrays of shape (n_float,Nptcl) and (n_int,Nptcl) where Nptcl
        is the number of particles that are received to the left
        proc and right proc respectively, and where n_float and n_int
        are the number of float and integer quantities respectively
        These arrays are always on the CPU (since they were used for MPI)
    """
    # Copy the buffers to an enlarged array
    if species.use_cuda:
        add_buffers_gpu(species, float_recv_left, float_recv_right,
                        uint_recv_left, uint_recv_right)
    else:
        add_buffers_cpu(species, float_recv_left, float_recv_right,
                        uint_recv_left, uint_recv_right)

    # Reallocate the particles auxiliary arrays. This needs to be done,
    # as the total number of particles in this domain has changed.
    # Listing the field names once guarantees each array is allocated
    # exactly once on either device.
    field_names = ('Ex', 'Ey', 'Ez', 'Bx', 'By', 'Bz')
    if species.use_cuda:
        shape = (species.Ntot, )
        # Reallocate empty field-on-particle arrays on the GPU
        for name in field_names:
            setattr(species, name, cupy.empty(shape, dtype=np.float64))
        # Reallocate empty auxiliary sorting arrays on the GPU
        species.cell_idx = cupy.empty(shape, dtype=np.int32)
        species.sorted_idx = cupy.empty(shape, dtype=np.intp)
        species.sorting_buffer = cupy.empty(shape, dtype=np.float64)
        if species.n_integer_quantities > 0:
            species.int_sorting_buffer = cupy.empty(shape, dtype=np.uint64)
    else:
        # Reallocate empty field-on-particle arrays on the CPU
        for name in field_names:
            setattr(species, name, np.empty(species.Ntot, dtype=np.float64))

    # The particles are unsorted after adding new particles.
    species.sorted = False
Beispiel #50
0
    def sum_duplicates(self):
        """Merge entries sharing a ``(row, col)`` index by summing their data.

        Returns immediately when ``has_canonical_format`` is already set.
        Otherwise ``data``, ``row`` and ``col`` are replaced by sorted,
        de-duplicated arrays and ``has_canonical_format`` is set.

        .. warning::
            The entries are sorted following the cuSPARSE convention, which
            differs from SciPy's, so the order of the resulting indices may
            not match SciPy:

            .. code-block:: python

                >>> #     1 0 0
                >>> # A = 1 1 0
                >>> #     1 1 1
                >>> data = cupy.array([1, 1, 1, 1, 1, 1], 'f')
                >>> row = cupy.array([0, 1, 1, 2, 2, 2], 'i')
                >>> col = cupy.array([0, 0, 1, 0, 1, 2], 'i')
                >>> A = cupyx.scipy.sparse.coo_matrix((data, (row, col)),
                ...                                   shape=(3, 3))
                >>> a = A.get()
                >>> A.sum_duplicates()
                >>> a.sum_duplicates()  # a is scipy.sparse.coo_matrix
                >>> A.row
                array([0, 1, 1, 2, 2, 2], dtype=int32)
                >>> a.row
                array([0, 1, 2, 1, 2, 2], dtype=int32)
                >>> A.col
                array([0, 0, 1, 0, 1, 2], dtype=int32)
                >>> a.col
                array([0, 0, 0, 1, 1, 2], dtype=int32)

        .. warning::
            Calling this function might synchronize the device.

        .. seealso::
           :meth:`scipy.sparse.coo_matrix.sum_duplicates`

        """
        if self.has_canonical_format:
            return
        # Sort by row first and by column within a row (row-major). This is
        # the cuSPARSE convention, not SciPy's; cuSPARSE routines such as
        # cusparseSpMV() assume it.
        # See https://docs.nvidia.com/cuda/cusparse/index.html#coo-format
        # (lexsort treats the LAST key as the primary one.)
        perm = cupy.lexsort(cupy.stack([self.col, self.row]))
        sorted_data = self.data[perm]
        sorted_row = self.row[perm]
        sorted_col = self.col[perm]
        starts = self._sum_duplicates_diff(
            sorted_row, sorted_col, size=self.row.size)

        if starts[1:].all():
            # Every entry has a distinct index: the sorted arrays are final.
            data, row, col = sorted_data, sorted_row, sorted_col
        else:
            # TODO(leofang): move the kernels outside this method
            # The running sum of ``starts`` is each entry's output slot.
            slot = cupy.cumsum(starts, dtype='i')
            n_unique = int(slot[-1]) + 1
            data = cupy.zeros(n_unique, dtype=self.data.dtype)
            row = cupy.empty(n_unique, dtype='i')
            col = cupy.empty(n_unique, dtype='i')
            kind = self.data.dtype.kind
            if kind == 'f':
                accumulate = cupy.ElementwiseKernel(
                    'T src_data, int32 src_row, int32 src_col, int32 index',
                    'raw T data, raw int32 row, raw int32 col',
                    '''
                    atomicAdd(&data[index], src_data);
                    row[index] = src_row;
                    col[index] = src_col;
                    ''',
                    'sum_duplicates_assign'
                )
                accumulate(sorted_data, sorted_row, sorted_col, slot,
                           data, row, col)
            elif kind == 'c':
                # Complex values are accumulated as separate real and
                # imaginary parts.
                accumulate = cupy.ElementwiseKernel(
                    'T src_real, T src_imag, int32 src_row, int32 src_col, '
                    'int32 index',
                    'raw T real, raw T imag, raw int32 row, raw int32 col',
                    '''
                    atomicAdd(&real[index], src_real);
                    atomicAdd(&imag[index], src_imag);
                    row[index] = src_row;
                    col[index] = src_col;
                    ''',
                    'sum_duplicates_assign_complex'
                )
                accumulate(sorted_data.real, sorted_data.imag, sorted_row,
                           sorted_col, slot, data.real, data.imag, row, col)

        self.data = data
        self.row = row
        self.col = col
        self.has_canonical_format = True
Beispiel #51
0
 def test_concatenate_wrong_shape(self):
     """``concatenate`` must raise ``ValueError`` for mismatched shapes."""
     # The third array differs from the others along axis 1 (4 vs 3).
     shapes = ((2, 3, 4), (3, 3, 4), (4, 4, 4))
     first = cupy.empty(shapes[0])
     second = cupy.empty(shapes[1])
     third = cupy.empty(shapes[2])
     with self.assertRaises(ValueError):
         cupy.concatenate((first, second, third))
Beispiel #52
0
def batched_gesv(a, b):
    """Solve a batch of linear systems ``ax = b`` with batched cuBLAS LU.

    Each system is factorized with ``cublas<t>getrfBatched`` and solved with
    ``cublas<t>getrsBatched``, where ``<t>`` is selected from the common
    dtype of ``a`` and ``b``.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(..., M, M)``.
        b (cupy.ndarray): The matrix with dimension ``(..., M)`` or
            ``(..., M, K)``.

    Returns:
        cupy.ndarray:
            The matrix with dimension ``(..., M)`` or ``(..., M, K)``.

    Raises:
        ValueError: If the shapes of ``a`` and ``b`` are incompatible.
        TypeError: If the common dtype is none of ``f``, ``d``, ``F``, ``D``.
        linalg.LinAlgError: If ``getrsBatched`` reports a non-zero ``info``.
    """
    _util._assert_cupy_array(a, b)
    _util._assert_nd_squareness(a)

    rank_ok = a.ndim == b.ndim or a.ndim == b.ndim + 1
    if not (rank_ok and a.shape[:-1] == b.shape[:a.ndim - 1]):
        raise ValueError(
            'a must have (..., M, M) shape and b must have (..., M) '
            'or (..., M, K)')

    # Pick the cuBLAS routine prefix matching the compute dtype.
    dtype, out_dtype = _util.linalg_common_type(a, b)
    if dtype == 'f':
        prefix = 's'
    elif dtype == 'd':
        prefix = 'd'
    elif dtype == 'F':
        prefix = 'c'
    elif dtype == 'D':
        prefix = 'z'
    else:
        raise TypeError('invalid dtype')
    getrf = getattr(cublas, prefix + 'getrfBatched')
    getrs = getattr(cublas, prefix + 'getrsBatched')

    # Flatten all leading axes into a single batch axis.
    bs = numpy.prod(a.shape[:-2]) if a.ndim > 2 else 1
    n = a.shape[-1]
    nrhs = b.shape[-1] if a.ndim == b.ndim else 1
    result_shape = b.shape

    # Remember where the caller's buffers live: if the conversion below
    # ends up at the same address, the routines (which overwrite their
    # operands) would clobber the inputs, so a copy is taken.
    src_a_ptr = a.data.ptr
    src_b_ptr = b.data.ptr
    # Swapping the last two axes and making the result contiguous stores
    # every matrix column after column.
    a = cupy.ascontiguousarray(
        a.reshape(bs, n, n).transpose(0, 2, 1), dtype=dtype)
    b = cupy.ascontiguousarray(
        b.reshape(bs, n, nrhs).transpose(0, 2, 1), dtype=dtype)
    if a.data.ptr == src_a_ptr:
        a = a.copy()
    if b.data.ptr == src_b_ptr:
        b = b.copy()

    if n > get_batched_gesv_limit():
        warnings.warn('The matrix size ({}) exceeds the set limit ({})'.format(
            n, get_batched_gesv_limit()))

    handle = device.get_cublas_handle()

    # Device arrays holding the address of every matrix in the batch.
    lda = n
    a_stride = lda * n * a.itemsize
    a_ptrs = cupy.arange(
        a.data.ptr, a.data.ptr + a_stride * bs, a_stride, dtype=cupy.uintp)
    ldb = n
    b_stride = ldb * nrhs * b.itemsize
    b_ptrs = cupy.arange(
        b.data.ptr, b.data.ptr + b_stride * bs, b_stride, dtype=cupy.uintp)

    pivot = cupy.empty((bs, n), dtype=numpy.int32)
    dinfo = cupy.empty((bs, ), dtype=numpy.int32)  # per-matrix, on device
    info = numpy.empty((1, ), dtype=numpy.int32)   # single value, on host

    # LU factorization (A = L * U)
    getrf(handle, n, a_ptrs.data.ptr, lda, pivot.data.ptr, dinfo.data.ptr, bs)
    _util._check_cublas_info_array_if_synchronization_allowed(getrf, dinfo)

    # Solves Ax = b
    getrs(handle, cublas.CUBLAS_OP_N, n, nrhs, a_ptrs.data.ptr, lda,
          pivot.data.ptr, b_ptrs.data.ptr, ldb, info.ctypes.data, bs)
    status = info[0]
    if status != 0:
        msg = 'Error reported by {} in cuBLAS. '.format(getrs.__name__)
        if status < 0:
            msg += 'The {}-th parameter had an illegal value.'.format(-status)
        raise linalg.LinAlgError(msg)

    # Undo the axis swap and restore the caller's shape and result dtype.
    solution = b.transpose(0, 2, 1).reshape(result_shape)
    return solution.astype(out_dtype, copy=False)
Beispiel #53
0
    def interval(self, mx, size):
        """Generate multiple integers independently sampled uniformly from ``[0, mx]``.

        Samples are drawn by rejection: random words are masked down to the
        bit length of ``mx`` and values above ``mx`` are discarded until
        enough samples have been collected.

        Args:
            mx (int): Upper bound of the interval
            size (None or int or tuple): Shape of the array or the scalar
                returned.
        Returns:
            int or cupy.ndarray: If ``None``, an :class:`cupy.ndarray` with
            shape ``()`` is returned.
            If ``int``, 1-D array of length size is returned.
            If ``tuple``, multi-dimensional array with shape
            ``size`` is returned.
            Currently, only 32 bit integers can be sampled.
            If 0 :math:`\\leq` ``mx`` :math:`\\leq` 0x7fffffff,
            a ``numpy.int32`` array is returned.
            If 0x80000000 :math:`\\leq` ``mx`` :math:`\\leq` 0xffffffff,
            a ``numpy.uint32`` array is returned.
        Raises:
            ValueError: If ``mx`` is negative or larger than 0xffffffff.
        """
        if size is None:
            return self.interval(mx, 1).reshape(())
        elif isinstance(size, int):
            size = (size, )

        if mx == 0:
            return cupy.zeros(size, dtype=numpy.int32)

        if mx < 0:
            raise ValueError('mx must be non-negative (actual: {})'.format(mx))
        elif mx <= 0x7fffffff:
            dtype = numpy.int32
        elif mx <= 0xffffffff:
            dtype = numpy.uint32
        else:
            raise ValueError(
                'mx must be within uint32 range (actual: {})'.format(mx))

        n = functools.reduce(operator.mul, size, 1)
        if n == 0:
            # Nothing to sample. Without this guard the loop below never
            # runs, ``ret`` stays ``None`` and the final reshape fails.
            return cupy.empty(size, dtype=dtype)

        # Smallest all-ones bit mask that covers ``mx``.
        mask = (1 << mx.bit_length()) - 1
        mask = cupy.array(mask, dtype=dtype)

        sample = cupy.empty((n, ), dtype=dtype)
        n_rem = n  # The number of remaining elements to sample
        ret = None
        while n_rem > 0:
            curand.generate(self._generator, sample.data.ptr, sample.size)
            # Drop the samples that exceed the upper limit
            sample &= mask
            success = sample <= mx

            if ret is None:
                # If the sampling has finished in the first iteration,
                # just return the sample.
                if success.all():
                    n_rem = 0
                    ret = sample
                    break

                # Allocate the return array.
                ret = cupy.empty((n, ), dtype=dtype)

            # Append the accepted samples after those already collected,
            # never taking more than are still missing.
            n_succ = min(n_rem, int(success.sum()))
            ret[n - n_rem:n - n_rem + n_succ] = sample[success][:n_succ]
            n_rem -= n_succ

        assert n_rem == 0
        return ret.reshape(size)
Beispiel #54
0
 def test_vstack_wrong_ndim(self):
     """``vstack`` must raise ``ValueError`` for these incompatible inputs."""
     vector = cupy.empty((3, ))
     column = cupy.empty((3, 1))
     with self.assertRaises(ValueError):
         cupy.vstack((vector, column))
Beispiel #55
0
def create_dropout_states(handle):
    """Allocate the buffer that holds cuDNN dropout states.

    Args:
        handle: cuDNN handle passed to ``cudnn.dropoutGetStatesSize``.

    Returns:
        cupy.ndarray: One-dimensional array of dtype ``"b"`` created with
        ``cupy.empty``; its length is the size reported by
        ``cudnn.dropoutGetStatesSize`` for ``handle``.
    """
    n_states = cudnn.dropoutGetStatesSize(handle)
    states = cupy.empty((n_states,), dtype="b")
    return states
Beispiel #56
0
def f4():
    """Allocate one byte buffer per entry of the module-level ``sizes``.

    Every buffer is appended to a local list. On every tenth iteration
    (``i = 0, 10, 20, ...``) the list slot ``i // 10`` is overwritten with
    ``None``, dropping the list's reference to that earlier buffer.
    Nothing is returned.
    """
    held = []
    for index, nbytes in enumerate(sizes):
        # Allocate directly inside append() so that the list is the only
        # holder of the new buffer.
        held.append(cupy.empty((nbytes,), dtype='b'))
        slot, remainder = divmod(index, 10)
        if remainder == 0:
            held[slot] = None
Beispiel #57
0
def _tensordot_core(a, b, out, n, m, k, ret_shape):
    """Contract ``a`` and ``b`` over a shared axis of length ``k``.

    The operands are viewed as ``(k, n)`` and ``(k, m)`` matrices and the
    product ``a^T * b`` is written into an ``(n, m)`` view of the output
    buffer. The operands are cast to a compute dtype first (float32 or
    float64 for the cuBLAS paths); if that differs from the dtype of the
    returned array, the result is copied (and thereby cast) at the end.

    Args:
        a (cupy.ndarray): Left operand; reshaped to ``(k, n)`` if needed.
        b (cupy.ndarray): Right operand; reshaped to ``(k, m)`` if needed.
        out (cupy.ndarray or None): Array that receives the result, or
            ``None`` to allocate a new one.
        n (int): Number of columns of ``a`` in its ``(k, n)`` view.
        m (int): Number of columns of ``b`` in its ``(k, m)`` view.
        k (int): Length of the contracted axis.
        ret_shape (tuple of ints): Shape of the array to return.

    Returns:
        cupy.ndarray: ``out`` when it was given; otherwise a new array of
        shape ``ret_shape`` whose dtype is the common type of ``a`` and
        ``b``.

    Raises:
        ValueError: If exactly one of ``a`` and ``b`` has zero size.
    """
    ret_dtype = a.dtype.char
    if ret_dtype != b.dtype.char:
        ret_dtype = numpy.find_common_type((ret_dtype, b.dtype), ()).char

    # Cast to float32 or float64
    if ret_dtype == 'f' or ret_dtype == 'd':
        dtype = ret_dtype
    else:
        dtype = numpy.find_common_type((ret_dtype, 'f'), ()).char

    a = a.astype(dtype, copy=False)
    b = b.astype(dtype, copy=False)

    if not a.size or not b.size:
        if a.size or b.size:
            raise ValueError('cannot dot zero-sized and non-zero-sized arrays')
        # Both operands are empty: the result is all zeros.
        if out is None:
            return cupy.zeros(ret_shape, dtype=ret_dtype)
        else:
            out.fill(0)
            return out

    # From here on ``out`` is the buffer the kernels write to (always of the
    # compute dtype) and ``ret`` is the array handed back to the caller.
    # They are the same object whenever no dtype conversion is required.
    if out is None:
        out = cupy.empty(ret_shape, dtype)
        if dtype == ret_dtype:
            ret = out
        else:
            ret = cupy.empty(ret_shape, ret_dtype)
    else:
        ret = out
        if out.dtype != dtype:
            out = cupy.empty(ret_shape, dtype)

    # It copies the operands if needed
    if a.shape != (k, n):
        a = cupy.reshape(a, (k, n))
    if b.shape != (k, m):
        b = cupy.reshape(b, (k, m))
    c = out
    if c.shape != (n, m):
        # Reshape a view so that ``out`` keeps its own shape.
        c = c.view()
        c.shape = (n, m)

    # Be careful that cuBLAS uses the FORTRAN-order matrix representation.
    if k == 1:
        if n == 1:
            # Scalar-vector product
            cupy.multiply(a, b, c)
        elif m == 1:
            # Scalar-vector product
            cupy.multiply(a.T, b, c)
        else:
            # Outer product A^T * B
            # c is C-contiguous while cuBLAS requires F-contiguous arrays, so
            # we compute C^T = B^T * A here.
            handle = cuda.Device().cublas_handle
            # ger accumulates into c, so it has to start from zero.
            c.fill(0)
            a, inca = _to_cublas_vector(a, 1)
            b, incb = _to_cublas_vector(b, 1)
            if dtype == 'f':
                ger = cublas.sger
            elif dtype == 'd':
                ger = cublas.dger
            ger(handle, m, n, 1, b.data.ptr, incb, a.data.ptr, inca,
                c.data.ptr, m)

        # Copy whenever the compute buffer is not the returned array. This
        # also covers a caller-supplied ``out`` whose dtype differs from the
        # compute dtype while ``ret_dtype == dtype``; comparing the dtypes
        # alone would leave that ``out`` unfilled.
        if out is not ret:
            elementwise.copy(out, ret)
        return ret

    handle = cuda.Device().cublas_handle
    if n == 1:
        if m == 1:
            # Inner product
            a, inca = _to_cublas_vector(a, 0)
            b, incb = _to_cublas_vector(b, 0)
            # The scalar result is written to device memory (c), so switch
            # the pointer mode temporarily and always restore it afterwards.
            mode = cublas.getPointerMode(handle)
            cublas.setPointerMode(handle,
                                  cublas.CUBLAS_POINTER_MODE_DEVICE)
            if dtype == 'f':
                dot = cublas.sdot
            elif dtype == 'd':
                dot = cublas.ddot
            try:
                dot(handle, k, a.data.ptr, inca, b.data.ptr, incb, c.data.ptr)
            finally:
                cublas.setPointerMode(handle, mode)
        else:
            # Matrix-vector product B^T * A
            a, inca = _to_cublas_vector(a, 0)
            b, transb, ldb = _mat_to_cublas_contiguous(b, 1)
            if transb:
                # gemv requires (m, k) as the original matrix dimensions
                # rather than the transposed dimensions.
                m, k = k, m
            if dtype == 'f':
                gemv = cublas.sgemv
            elif dtype == 'd':
                gemv = cublas.dgemv
            gemv(handle, transb, m, k, 1, b.data.ptr, ldb, a.data.ptr, inca,
                 0, c.data.ptr, 1)
    elif m == 1:
        # Matrix-vector product A^T * B
        a, transa, lda = _mat_to_cublas_contiguous(a, 1)
        b, incb = _to_cublas_vector(b, 0)
        if transa:
            # gemv requires (n, k) as the original matrix dimensions rather
            # than the transposed dimensions.
            n, k = k, n
        if dtype == 'f':
            gemv = cublas.sgemv
        elif dtype == 'd':
            gemv = cublas.dgemv
        gemv(handle, transa, n, k, 1, a.data.ptr, lda, b.data.ptr, incb, 0,
             c.data.ptr, 1)
    else:
        # Matrix-Matrix product A^T * B
        # c is C-contiguous while cuBLAS assumes F-contiguous inputs, so we
        # compute C^T = B^T * A here.
        a, transa, lda = _mat_to_cublas_contiguous(a, 0)
        b, transb, ldb = _mat_to_cublas_contiguous(b, 1)
        if dtype == 'f':
            gemm = cublas.sgemm
        elif dtype == 'd':
            gemm = cublas.dgemm
        gemm(handle, transb, transa, m, n, k, 1, b.data.ptr, ldb, a.data.ptr,
             lda, 0, c.data.ptr, m)

    # Same rule as in the k == 1 branch: copy iff a separate compute buffer
    # was used.
    if out is not ret:
        elementwise.copy(out, ret)
    return ret
Beispiel #58
0
 def test_concatenate_wrong_ndim(self):
     # Concatenating a 2-D array of shape (2, 3) with a 1-D array of
     # shape (2,) is expected to raise ValueError.
     matrix = cupy.empty((2, 3))
     vector = cupy.empty((2, ))
     operands = (matrix, vector)
     with self.assertRaises(ValueError):
         cupy.concatenate(operands)
Beispiel #59
0
def _batched_inv(a):
    """Invert a stack of square matrices with the cuBLAS batched routines.

    Each matrix is LU-factorized with ``getrfBatched`` and then inverted
    with ``getriBatched``.

    Args:
        a (cupy.ndarray): Array with at least three dimensions whose last
            two axes hold the square matrices. Supported dtypes are float32,
            float64, complex64 and complex128. ``a`` is not modified; the
            routines run on a copy.

    Returns:
        cupy.ndarray: Array with the shape of ``a`` containing the inverses.
        If ``a`` has a zero-length axis, the result of
        ``cupy.empty_like(a)`` is returned.

    Raises:
        ValueError: If the dtype of ``a`` is not one of the supported ones.
        RuntimeError: If ``getrf`` or ``getri`` reports a failure for any
            matrix of the batch.
    """
    assert (a.ndim >= 3)
    util._assert_cupy_array(a)
    util._assert_nd_squareness(a)

    if a.dtype == cupy.float32:
        getrf = cupy.cuda.cublas.sgetrfBatched
        getri = cupy.cuda.cublas.sgetriBatched
    elif a.dtype == cupy.float64:
        getrf = cupy.cuda.cublas.dgetrfBatched
        getri = cupy.cuda.cublas.dgetriBatched
    elif a.dtype == cupy.complex64:
        getrf = cupy.cuda.cublas.cgetrfBatched
        getri = cupy.cuda.cublas.cgetriBatched
    elif a.dtype == cupy.complex128:
        getrf = cupy.cuda.cublas.zgetrfBatched
        getri = cupy.cuda.cublas.zgetriBatched
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    if 0 in a.shape:
        return cupy.empty_like(a)
    a_shape = a.shape

    # copy is necessary to prevent `a` from being overwritten.
    a = a.copy().reshape(-1, a_shape[-2], a_shape[-1])

    handle = device.get_cublas_handle()
    batch_size = a.shape[0]
    n = a.shape[1]
    lda = n
    # The batched routines take an array of device pointers, one per matrix;
    # consecutive matrices are ``step`` bytes apart.
    step = n * lda * a.itemsize
    start = a.data.ptr
    stop = start + step * batch_size
    a_array = cupy.arange(start, stop, step, dtype=cupy.uintp)
    pivot_array = cupy.empty((batch_size, n), dtype=cupy.int32)
    info_array = cupy.empty((batch_size, ), dtype=cupy.int32)

    getrf(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr,
          info_array.data.ptr, batch_size)

    # Fetch all status codes in one transfer rather than reading the device
    # array element by element inside the loop.
    err_detail = ''
    for i, info in enumerate(info_array.get()):
        info = int(info)
        if info < 0:
            err_detail += ('\tmatrix[{}]: illegal value at {}-the parameter.'
                           '\n'.format(i, -info))
        if info > 0:
            err_detail += '\tmatrix[{}]: matrix is singular.\n'.format(i)
    if err_detail:
        raise RuntimeError('matrix inversion failed at getrf.\n' + err_detail)

    c = cupy.empty_like(a)
    ldc = lda
    step = n * ldc * c.itemsize
    start = c.data.ptr
    stop = start + step * batch_size
    c_array = cupy.arange(start, stop, step, dtype=cupy.uintp)

    getri(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr,
          c_array.data.ptr, ldc, info_array.data.ptr, batch_size)

    # Only positive status codes are treated as errors after getri.
    for i, info in enumerate(info_array.get()):
        if int(info) > 0:
            err_detail += '\tmatrix[{}]: matrix is singular.\n'.format(i)
    if err_detail:
        raise RuntimeError('matrix inversion failed at getri.\n' + err_detail)

    return c.reshape(a_shape)
Beispiel #60
0
def unique(ar, return_index=False, return_inverse=False,
           return_counts=False, axis=None):
    """Find the unique elements of an array.

    Returns the sorted unique elements of an array. There are three optional
    outputs in addition to the unique elements:

    * the indices of the input array that give the unique values
    * the indices of the unique array that reconstruct the input array
    * the number of times each unique value comes up in the input array

    Args:
        ar(array_like): Input array. This will be flattened if it is not
            already 1-D. It may be empty.
        return_index(bool, optional): If True, also return the indices of `ar`
            (in the flattened array) that result in the unique array.
        return_inverse(bool, optional): If True, also return the indices of the
            unique array that can be used to reconstruct the flattened `ar`.
        return_counts(bool, optional): If True, also return the number of times
            each unique item appears in `ar`.
        axis(int or None, optional): Not supported yet.

    Returns:
        cupy.ndarray or tuple:
            If there are no optional outputs, it returns the
            :class:`cupy.ndarray` of the sorted unique values. Otherwise, it
            returns a tuple which contains the sorted unique values followed
            by the requested outputs, in this order:

            * The indices of the first occurrences of the unique values in the
              original array. Only provided if `return_index` is True.
            * The indices to reconstruct the original array from the
              unique array. Only provided if `return_inverse` is True.
            * The number of times each of the unique values comes up in the
              original array. Only provided if `return_counts` is True.

    Raises:
        NotImplementedError: If ``axis`` is not None.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.unique`
    """
    if axis is not None:
        raise NotImplementedError('axis option is not supported yet.')

    # flatten() returns a copy, so the in-place sort below never touches the
    # caller's array.
    ar = cupy.asarray(ar).flatten()

    if return_index or return_inverse:
        # The permutation is needed to map sorted positions back to the
        # original positions.
        perm = ar.argsort()
        aux = ar[perm]
    else:
        ar.sort()
        aux = ar

    # mask[i] is True where a new run of equal values starts in the sorted
    # data. A slice is used for the first element so that this assignment is
    # a no-op for an empty input, whereas ``mask[0]`` would raise IndexError.
    mask = cupy.empty(aux.shape, dtype=cupy.bool_)
    mask[:1] = True
    mask[1:] = aux[1:] != aux[:-1]

    ret = aux[mask]
    if not return_index and not return_inverse and not return_counts:
        return ret

    ret = (ret,)
    if return_index:
        ret += (perm[mask],)
    if return_inverse:
        # Running count of run starts gives, for every sorted element, the
        # position of its value in the unique array.
        imask = cupy.cumsum(mask) - 1
        inv_idx = cupy.empty(mask.shape, dtype=cupy.intp)
        inv_idx[perm] = imask
        ret += (inv_idx,)
    if return_counts:
        nonzero = cupy.nonzero(mask)[0]  # may synchronize
        # Run starts plus the total length as an end marker; the counts are
        # the differences between consecutive entries.
        idx = cupy.empty((nonzero.size + 1,), nonzero.dtype)
        idx[:-1] = nonzero
        idx[-1] = mask.size
        ret += (idx[1:] - idx[:-1],)
    return ret