Example #1
def e1z(z_gpu, dev):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.
        
    Returns
    -------
    e_gpu : GPUArray
        GPUArray containing the exponential integrals of
        the entries of `z_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = e1z(z_gpu, pycuda.autoinit.device)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True

    """

    if z_gpu.dtype == np.complex64:
        use_double = 0
    elif z_gpu.dtype == np.complex128:
        use_double = 1
    else:
        raise ValueError("unsupported type")

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, z_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    e1z_mod = SourceModule(
        e1z_mod_template.substitute(
            use_double=use_double, max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid
        ),
        cache_dir=cache_dir,
        options=["-I", install_headers],
    )
    e1z_func = e1z_mod.get_function("e1z")

    e_gpu = gpuarray.empty_like(z_gpu)
    e1z_func(z_gpu.gpudata, e_gpu.gpudata, np.uint32(z_gpu.size), block=block_dim, grid=grid_dim)
    return e_gpu
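
All of the examples on this page delegate launch-geometry selection to `get_dev_attrs` and `select_block_grid_sizes`, whose definitions are not included in these snippets. Below is a minimal sketch of what such helpers could look like, assuming one thread per array element and PyCUDA's device-attribute API; the `_sketch` names and the exact tiling policy are illustrative assumptions, not the library's actual implementation.

import numpy as np
import pycuda.driver as drv

def get_dev_attrs_sketch(dev):
    # Query the launch limits that the examples need from the device:
    attrs = dev.get_attributes()
    max_threads_per_block = attrs[drv.device_attribute.MAX_THREADS_PER_BLOCK]
    max_block_dim = (attrs[drv.device_attribute.MAX_BLOCK_DIM_X],
                     attrs[drv.device_attribute.MAX_BLOCK_DIM_Y],
                     attrs[drv.device_attribute.MAX_BLOCK_DIM_Z])
    max_grid_dim = (attrs[drv.device_attribute.MAX_GRID_DIM_X],
                    attrs[drv.device_attribute.MAX_GRID_DIM_Y])
    return max_threads_per_block, max_block_dim, max_grid_dim

def select_block_grid_sizes_sketch(dev, shape):
    # One thread per element: 1D blocks, spread over a 2D grid when a
    # single grid row cannot cover the whole array.
    max_threads, _, max_grid = get_dev_attrs_sketch(dev)
    n = int(np.prod(shape))
    blocks_needed = (n + max_threads - 1) // max_threads
    grid_x = min(blocks_needed, max_grid[0])
    grid_y = (blocks_needed + grid_x - 1) // grid_x
    return (max_threads, 1, 1), (grid_x, grid_y)

The returned block/grid tuples are what the kernel calls above pass as `block=` and `grid=`.
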
Example #2
def sici(x_gpu, dev):
    """
    Sine/Cosine integral.

    Computes the sine and cosine integral of every element in the
    input matrix.

    Parameters
    ----------
    x_gpu : GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.
        
    Returns
    -------
    (si_gpu, ci_gpu) : tuple of GPUArrays
        Tuple of GPUArrays containing the sine integrals and cosine
        integrals of the entries of `x_gpu`.
        
    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> x = np.array([[1, 2], [3, 4]], np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> (si_gpu, ci_gpu) = sici(x_gpu, pycuda.autoinit.device)
    >>> (si, ci) = scipy.special.sici(x)
    >>> np.allclose(si, si_gpu.get())
    True
    >>> np.allclose(ci, ci_gpu.get())
    True
    
    """

    if x_gpu.dtype == np.float32:
        use_double = 0
    elif x_gpu.dtype == np.float64:
        use_double = 1
    else:
        raise ValueError("unsupported type")

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, x_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    sici_mod = SourceModule(
        sici_mod_template.substitute(
            use_double=use_double, max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid
        ),
        cache_dir=cache_dir,
        options=["-I", install_headers],
    )
    sici_func = sici_mod.get_function("sici_array")

    si_gpu = gpuarray.empty_like(x_gpu)
    ci_gpu = gpuarray.empty_like(x_gpu)
    sici_func(x_gpu.gpudata, si_gpu.gpudata, ci_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim)
    return (si_gpu, ci_gpu)
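
The `sici_mod_template` and `e1z_mod_template` strings referenced above are defined elsewhere in the module and are not reproduced here. To illustrate the pattern they follow, here is a deliberately invented elementwise template; it assumes only Python's `string.Template` and PyCUDA's `SourceModule`, and the real sici/e1z kernels are substantially more involved.

from string import Template

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

# Invented template showing how ${use_double}, ${max_threads_per_block} and
# ${max_blocks_per_grid} get baked into the CUDA source at compile time:
scale_mod_template = Template("""
#if ${use_double}
#define FLOAT double
#else
#define FLOAT float
#endif
__global__ void scale(FLOAT *x, unsigned int N) {
    unsigned int idx = blockIdx.y*${max_blocks_per_grid}*${max_threads_per_block} +
                       blockIdx.x*${max_threads_per_block} + threadIdx.x;
    if (idx < N)
        x[idx] *= 2;
}
""")

x = np.arange(8, dtype=np.float32)
x_gpu = gpuarray.to_gpu(x)
scale_mod = SourceModule(scale_mod_template.substitute(
    use_double=0, max_threads_per_block=128, max_blocks_per_grid=1))
scale = scale_mod.get_function("scale")
scale(x_gpu.gpudata, np.uint32(x_gpu.size), block=(128, 1, 1), grid=(1, 1))
assert np.allclose(x_gpu.get(), 2*x)

The point is just that precision selection and launch-geometry constants are substituted into the CUDA source before compilation, which is what the `.substitute(...)` calls in `sici` and `e1z` are doing.
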
Example #3
def pinv(a_gpu, dev, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.
    rcond : float
        Singular values smaller than `rcond*max(singular_values)`
        are set to zero.
        
    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.
        
    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = pinv(a_gpu, pycuda.autoinit.device)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = pinv(b_gpu, pycuda.autoinit.device)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True

    """

    # Check input dtype because the SVD can only be computed in single
    # precision:
    if a_gpu.dtype not in [np.float32, np.complex64]:
        raise ValueError('unsupported type')

    # Compute SVD:
    u_gpu, s_gpu, vh_gpu = svd(a_gpu, 0)
    uh_gpu = transpose(u_gpu, dev)

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, s_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Suppress very small singular values:
    cutoff_invert_s_mod = SourceModule(
        cutoff_invert_s_mod_template.substitute(
            max_threads_per_block=max_threads_per_block,
            max_blocks_per_grid=max_blocks_per_grid,
        )
    )
    cutoff_invert_s = cutoff_invert_s_mod.get_function('cutoff_invert_s')
    cutoff_gpu = gpuarray.max(s_gpu)*rcond
    cutoff_invert_s(s_gpu.gpudata, cutoff_gpu.gpudata,
                    np.uint32(s_gpu.size),
                    block=block_dim, grid=grid_dim)
    
    # The singular values must have the same data type as uh_gpu:
    if s_gpu.dtype == uh_gpu.dtype:
        s_diag_gpu = diag(s_gpu, dev)
    else:
        s_diag_gpu = diag(s_gpu.astype(uh_gpu.dtype), dev)

    # Finish pinv computation:
    v_gpu = transpose(vh_gpu, dev)
    suh_gpu = dot(s_diag_gpu, uh_gpu)
    return dot(v_gpu, suh_gpu)
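
For reference, the pipeline above can be restated on the host with NumPy. The `cutoff_invert_s` kernel source is not shown, but from the host code it presumably replaces each singular value by its reciprocal when it exceeds the cutoff and by zero otherwise; the two final `dot` calls then assemble V * diag(1/s) * U^H. A sketch under those assumptions:

import numpy as np

def pinv_reference(a, rcond=1e-15):
    # Host-side restatement of the GPU pipeline above (assumed semantics):
    u, s, vh = np.linalg.svd(a, full_matrices=False)
    cutoff = s.max()*rcond
    s_inv = np.where(s > cutoff, 1.0/s, 0.0)          # cutoff_invert_s
    return vh.conj().T @ np.diag(s_inv) @ u.conj().T  # dot(v, dot(diag(s), uh))

a = np.random.rand(8, 4).astype(np.float32)
assert np.allclose(pinv_reference(a), np.linalg.pinv(a), atol=1e-4)
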
Example #4
def diag(v_gpu, dev):
    """
    Construct a diagonal matrix.

    Constructs a matrix in device memory whose diagonal elements
    correspond to the elements in the specified array; all
    non-diagonal elements are set to 0.

    Parameters
    ----------
    v_gpu : pycuda.gpuarray.GPUArray
        Input array of length `n`.
    dev : pycuda.driver.Device
        Device object to be used.

    Returns
    -------
    d_gpu : pycuda.gpuarray.GPUArray
        Diagonal matrix of shape `(n, n)`.
        
    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> v = np.array([1, 2, 3, 4, 5, 6], np.float32)
    >>> v_gpu = gpuarray.to_gpu(v)
    >>> d_gpu = diag(v_gpu, pycuda.autoinit.device)
    >>> np.all(d_gpu.get() == np.diag(v))
    True
    >>> v = np.array([1j, 2j, 3j, 4j, 5j, 6j], np.complex64)
    >>> v_gpu = gpuarray.to_gpu(v)
    >>> d_gpu = diag(v_gpu, pycuda.autoinit.device)
    >>> np.all(d_gpu.get() == np.diag(v))
    True
    
    """

    if v_gpu.dtype not in [np.float32, np.float64, np.complex64,
                           np.complex128]:
        raise ValueError('unrecognized type')

    if len(v_gpu.shape) > 1:
        raise ValueError('input array cannot be multidimensional')
    
    use_double = int(v_gpu.dtype in [np.float64, np.complex128])
    use_complex = int(v_gpu.dtype in [np.complex64, np.complex128])

    # Allocate output matrix:
    d_gpu = gpuarray.empty((v_gpu.size, v_gpu.size), v_gpu.dtype)
    
    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, d_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    diag_mod = SourceModule(
        diag_mod_template.substitute(
            use_double=use_double,
            use_complex=use_complex,
            max_threads_per_block=max_threads_per_block,
            max_blocks_per_grid=max_blocks_per_grid,
            cols=v_gpu.size,
        ),
        cache_dir=cache_dir,
    )

    diag = diag_mod.get_function("diag")
    diag(v_gpu.gpudata, d_gpu.gpudata, np.uint32(d_gpu.size),
         block=block_dim,
         grid=grid_dim)
    
    return d_gpu
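
The `diag_mod_template` kernel itself is defined elsewhere. Given the launch above (one thread per element of the n x n output, with `cols` fixed to `v_gpu.size` at compile time), each thread presumably applies a rule along the following lines; this host-side restatement is an assumption read off from the host code, not the kernel source.

import numpy as np

def diag_reference(v):
    # One "thread" per output element: write v[r] on the diagonal, 0 elsewhere.
    n = v.size
    d = np.empty((n, n), v.dtype)
    for idx in range(d.size):      # idx plays the role of the thread index
        r, c = divmod(idx, n)      # row-major flattening; cols == n
        d[r, c] = v[r] if r == c else 0
    return d

v = np.array([1, 2, 3, 4, 5, 6], np.float32)
assert np.all(diag_reference(v) == np.diag(v))
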
Example #5
def conj(a_gpu, dev):
    """
    Complex conjugate.
    
    Compute the complex conjugate of the matrix in device memory.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.

    Notes
    -----
    The input matrix is modified in place.

    This function assumes that the input matrix contains complex
    numbers; undefined behavior may occur for other types.
    
    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> conj(a_gpu, pycuda.autoinit.device)
    >>> np.all(a == np.conj(a_gpu.get()))
    True
    
    """

    # Don't attempt to process non-complex matrix types:
    if a_gpu.dtype in [np.float32, np.float64]:
        return

    if a_gpu.dtype == np.complex64:
        use_double = 0
    elif a_gpu.dtype == np.complex128:
        use_double = 1
    else:
        raise ValueError('unsupported type')

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, a_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    conj_mod = SourceModule(
        conj_mod_template.substitute(
            use_double=use_double,
            max_threads_per_block=max_threads_per_block,
            max_blocks_per_grid=max_blocks_per_grid,
        ),
        cache_dir=cache_dir,
    )

    conj = conj_mod.get_function("conj")
    conj(a_gpu.gpudata,
         np.uint32(a_gpu.size),
         block=block_dim,
         grid=grid_dim)
Example #6
def transpose(a_gpu, dev):
    """
    Matrix transpose.
    
    Transpose a matrix in device memory and return an object
    representing the transposed matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.

    Returns
    -------
    at_gpu : pycuda.gpuarray.GPUArray
        Transposed matrix of shape `(n, m)`.

    Notes
    -----
    If the input matrix is complex, the function returns the
    conjugate (Hermitian) transpose of the input matrix.
    
    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> at_gpu = transpose(a_gpu, pycuda.autoinit.device)
    >>> np.all(a.T == at_gpu.get())
    True
    >>> b = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> bt_gpu = transpose(b_gpu, pycuda.autoinit.device)
    >>> np.all(np.conj(b.T) == bt_gpu.get())
    True

    """

    if a_gpu.dtype not in [np.float32, np.float64, np.complex64,
                           np.complex128]:
        raise ValueError('unrecognized type')

    use_double = int(a_gpu.dtype in [np.float64, np.complex128])
    use_complex = int(a_gpu.dtype in [np.complex64, np.complex128])

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, a_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    transpose_mod = SourceModule(
        transpose_mod_template.substitute(
            use_double=use_double,
            use_complex=use_complex,
            max_threads_per_block=max_threads_per_block,
            max_blocks_per_grid=max_blocks_per_grid,
            cols=a_gpu.shape[1],
            rows=a_gpu.shape[0],
        ),
        cache_dir=cache_dir,
    )

    transpose = transpose_mod.get_function("transpose")
    at_gpu = gpuarray.empty(a_gpu.shape[::-1], a_gpu.dtype)
    transpose(at_gpu.gpudata, a_gpu.gpudata,
              np.uint32(a_gpu.size),
              block=block_dim,
              grid=grid_dim)
                    
    return at_gpu
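
As with `diag`, the `transpose_mod_template` source lives elsewhere. Given the launch geometry (one thread per input element, with `rows` and `cols` baked in at compile time) and the note about complex inputs, each thread plausibly reads one entry, conjugates it if the type is complex, and writes it to the transposed position. A host-side restatement under those assumptions:

import numpy as np

def transpose_reference(a):
    # One "thread" per input element: read a[r, c], write (conjugated) to at[c, r].
    rows, cols = a.shape
    at = np.empty((cols, rows), a.dtype)
    for idx in range(a.size):          # idx plays the role of the thread index
        r, c = divmod(idx, cols)       # row-major flattening of the input
        val = a[r, c]
        if np.iscomplexobj(a):
            val = np.conj(val)         # complex input -> conjugate transpose
        at[c, r] = val
    return at

b = np.array([[1j, 2j, 3j], [4j, 5j, 6j]], np.complex64)
assert np.all(transpose_reference(b) == b.conj().T)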