def e1z(z_gpu, dev):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.

    Returns
    -------
    e_gpu : GPUArray
        GPUArray containing the exponential integrals of
        the entries of `z_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = e1z(z_gpu, pycuda.autoinit.device)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True

    """

    if z_gpu.dtype == np.complex64:
        use_double = 0
    elif z_gpu.dtype == np.complex128:
        use_double = 1
    else:
        raise ValueError("unsupported type")

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, z_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    e1z_mod = SourceModule(
        e1z_mod_template.substitute(
            use_double=use_double,
            max_threads_per_block=max_threads_per_block,
            max_blocks_per_grid=max_blocks_per_grid
        ),
        cache_dir=cache_dir,
        options=["-I", install_headers],
    )
    e1z_func = e1z_mod.get_function("e1z")

    e_gpu = gpuarray.empty_like(z_gpu)
    e1z_func(z_gpu.gpudata, e_gpu.gpudata,
             np.uint32(z_gpu.size),
             block=block_dim, grid=grid_dim)
    return e_gpu
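# A minimal usage sketch (not part of the original docstring, continuing the
# doctest session above): the complex128 branch in e1z() suggests that double
# precision input is also accepted on devices that support it; whether a given
# device does is an assumption here, not a guarantee.
#
# >>> z64 = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex128)
# >>> z64_gpu = gpuarray.to_gpu(z64)
# >>> e64_gpu = e1z(z64_gpu, pycuda.autoinit.device)
# >>> np.allclose(scipy.special.exp1(z64), e64_gpu.get())   # expected to hold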
def sici(x_gpu, dev):
    """
    Sine/Cosine integral.

    Computes the sine and cosine integral of every element in the
    input matrix.

    Parameters
    ----------
    x_gpu : GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.

    Returns
    -------
    (si_gpu, ci_gpu) : tuple of GPUArrays
        Tuple of GPUArrays containing the sine integrals and cosine
        integrals of the entries of `x_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> x = np.array([[1, 2], [3, 4]], np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> (si_gpu, ci_gpu) = sici(x_gpu, pycuda.autoinit.device)
    >>> (si, ci) = scipy.special.sici(x)
    >>> np.allclose(si, si_gpu.get())
    True
    >>> np.allclose(ci, ci_gpu.get())
    True

    """

    if x_gpu.dtype == np.float32:
        use_double = 0
    elif x_gpu.dtype == np.float64:
        use_double = 1
    else:
        raise ValueError("unsupported type")

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, x_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    sici_mod = SourceModule(
        sici_mod_template.substitute(
            use_double=use_double,
            max_threads_per_block=max_threads_per_block,
            max_blocks_per_grid=max_blocks_per_grid
        ),
        cache_dir=cache_dir,
        options=["-I", install_headers],
    )
    sici_func = sici_mod.get_function("sici_array")

    si_gpu = gpuarray.empty_like(x_gpu)
    ci_gpu = gpuarray.empty_like(x_gpu)
    sici_func(x_gpu.gpudata, si_gpu.gpudata, ci_gpu.gpudata,
              np.uint32(x_gpu.size),
              block=block_dim, grid=grid_dim)
    return (si_gpu, ci_gpu)
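# Illustrative cross-check (a sketch, not from the original source): the values
# returned by sici() should agree with direct quadrature of the defining
# integrals, Si(x) = int_0^x sin(t)/t dt and
# Ci(x) = gamma + ln(x) + int_0^x (cos(t) - 1)/t dt, with gamma the
# Euler-Mascheroni constant. Continuing the doctest session above:
#
# >>> from scipy.integrate import quad
# >>> si_ref = quad(lambda t: np.sin(t)/t, 0, 3.0)[0]
# >>> ci_ref = np.euler_gamma + np.log(3.0) + quad(lambda t: (np.cos(t)-1)/t, 0, 3.0)[0]
# >>> si3_gpu, ci3_gpu = sici(gpuarray.to_gpu(np.array([[3.0]], np.float32)),
# ...                         pycuda.autoinit.device)
# >>> np.allclose([si_ref, ci_ref],
# ...             [si3_gpu.get()[0, 0], ci3_gpu.get()[0, 0]], atol=1e-5)   # expected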
def pinv(a_gpu, dev, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.
    rcond : float
        Singular values smaller than `rcond*max(singular_values)`
        are set to zero.

    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = pinv(a_gpu, pycuda.autoinit.device)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = pinv(b_gpu, pycuda.autoinit.device)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True

    """

    # Check input dtype because the SVD can only be computed in single
    # precision:
    if a_gpu.dtype not in [np.float32, np.complex64]:
        raise ValueError('unsupported type')

    # Compute SVD:
    u_gpu, s_gpu, vh_gpu = svd(a_gpu, 0)
    uh_gpu = transpose(u_gpu, dev)

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, s_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Suppress very small singular values and invert the remaining ones:
    cutoff_invert_s_mod = \
        SourceModule(cutoff_invert_s_mod_template.substitute(
            max_threads_per_block=max_threads_per_block,
            max_blocks_per_grid=max_blocks_per_grid))
    cutoff_invert_s = \
        cutoff_invert_s_mod.get_function('cutoff_invert_s')
    cutoff_gpu = gpuarray.max(s_gpu)*rcond
    cutoff_invert_s(s_gpu.gpudata, cutoff_gpu.gpudata,
                    np.uint32(s_gpu.size),
                    block=block_dim, grid=grid_dim)

    # The singular values must have the same data type as uh_gpu before
    # they can be combined with it:
    if s_gpu.dtype == uh_gpu.dtype:
        s_diag_gpu = diag(s_gpu, dev)
    else:
        s_diag_gpu = diag(s_gpu.astype(uh_gpu.dtype), dev)

    # Finish pinv computation:
    v_gpu = transpose(vh_gpu, dev)
    suh_gpu = dot(s_diag_gpu, uh_gpu)
    return dot(v_gpu, suh_gpu)
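# Usage sketch (an illustration, not from the original source): the
# pseudoinverse can be combined with the module-level dot() used above to solve
# an overdetermined least-squares system on the GPU. Continuing the doctest
# session above with the real-valued matrix `a`:
#
# >>> y = np.asarray(np.random.rand(8, 1), np.float32)
# >>> y_gpu = gpuarray.to_gpu(y)
# >>> x_gpu = dot(a_inv_gpu, y_gpu)
# >>> np.allclose(np.dot(np.linalg.pinv(a), y), x_gpu.get(), atol=1e-4)   # expected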
def diag(v_gpu, dev):
    """
    Construct a diagonal matrix.

    Constructs a matrix in device memory whose diagonal elements
    correspond to the elements in the specified array; all
    non-diagonal elements are set to 0.

    Parameters
    ----------
    v_gpu : pycuda.gpuarray.GPUArray
        Input array of length `n`.
    dev : pycuda.driver.Device
        Device object to be used.

    Returns
    -------
    d_gpu : pycuda.gpuarray.GPUArray
        Diagonal matrix of dimensions `[n, n]`.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> v = np.array([1, 2, 3, 4, 5, 6], np.float32)
    >>> v_gpu = gpuarray.to_gpu(v)
    >>> d_gpu = diag(v_gpu, pycuda.autoinit.device)
    >>> np.all(d_gpu.get() == np.diag(v))
    True
    >>> v = np.array([1j, 2j, 3j, 4j, 5j, 6j], np.complex64)
    >>> v_gpu = gpuarray.to_gpu(v)
    >>> d_gpu = diag(v_gpu, pycuda.autoinit.device)
    >>> np.all(d_gpu.get() == np.diag(v))
    True

    """

    if v_gpu.dtype not in [np.float32, np.float64, np.complex64,
                           np.complex128]:
        raise ValueError('unrecognized type')

    if len(v_gpu.shape) > 1:
        raise ValueError('input array cannot be multidimensional')

    use_double = int(v_gpu.dtype in [np.float64, np.complex128])
    use_complex = int(v_gpu.dtype in [np.complex64, np.complex128])

    # Allocate output matrix:
    d_gpu = gpuarray.empty((v_gpu.size, v_gpu.size), v_gpu.dtype)

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, d_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    diag_mod = \
        SourceModule(diag_mod_template.substitute(use_double=use_double,
                     use_complex=use_complex,
                     max_threads_per_block=max_threads_per_block,
                     max_blocks_per_grid=max_blocks_per_grid,
                     cols=v_gpu.size),
                     cache_dir=cache_dir)

    diag = diag_mod.get_function("diag")
    diag(v_gpu.gpudata, d_gpu.gpudata, np.uint32(d_gpu.size),
         block=block_dim, grid=grid_dim)

    return d_gpu
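# Usage sketch (an assumption about intended use, not from the original
# source): diag() is how pinv() above applies the inverted singular values;
# left-multiplying by a diagonal matrix scales row i of the other operand by
# v[i]. Continuing the doctest session above:
#
# >>> m = np.asarray(np.random.rand(6, 3), np.float32)
# >>> m_gpu = gpuarray.to_gpu(m)
# >>> v = np.arange(1, 7, dtype=np.float32)
# >>> v_gpu = gpuarray.to_gpu(v)
# >>> scaled_gpu = dot(diag(v_gpu, pycuda.autoinit.device), m_gpu)
# >>> np.allclose(v[:, None]*m, scaled_gpu.get(), atol=1e-5)   # expected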
def conj(a_gpu, dev):
    """
    Complex conjugate.

    Compute the complex conjugate of the matrix in device memory.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.

    Notes
    -----
    The input matrix is modified in place.

    This function assumes that the input matrix contains complex
    numbers; undefined behavior may occur for other types.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> conj(a_gpu, pycuda.autoinit.device)
    >>> np.all(a == np.conj(a_gpu.get()))
    True

    """

    # Don't attempt to process non-complex matrix types:
    if a_gpu.dtype in [np.float32, np.float64]:
        return

    if a_gpu.dtype == np.complex64:
        use_double = 0
    elif a_gpu.dtype == np.complex128:
        use_double = 1
    else:
        raise ValueError('unsupported type')

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, a_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    conj_mod = \
        SourceModule(conj_mod_template.substitute(use_double=use_double,
                     max_threads_per_block=max_threads_per_block,
                     max_blocks_per_grid=max_blocks_per_grid),
                     cache_dir=cache_dir)

    conj = conj_mod.get_function("conj")
    conj(a_gpu.gpudata, np.uint32(a_gpu.size),
         block=block_dim, grid=grid_dim)
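# Sketch of a common combination (illustrative, not from the original source):
# transpose() below returns the conjugate (Hermitian) transpose for complex
# input, so a plain, non-conjugating transpose of a complex matrix can be
# obtained by conjugating in place first. Continuing the doctest session above:
#
# >>> c = np.asarray(np.random.rand(3, 5)+1j*np.random.rand(3, 5), np.complex64)
# >>> c_gpu = gpuarray.to_gpu(c)
# >>> conj(c_gpu, pycuda.autoinit.device)       # c_gpu now holds conj(c)
# >>> ct_gpu = transpose(c_gpu, pycuda.autoinit.device)
# >>> np.allclose(c.T, ct_gpu.get())            # plain transpose, no conjugation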
def transpose(a_gpu, dev):
    """
    Matrix transpose.

    Transpose a matrix in device memory and return an object
    representing the transposed matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.

    Returns
    -------
    at_gpu : pycuda.gpuarray.GPUArray
        Transposed matrix of shape `(n, m)`.

    Notes
    -----
    If the specified matrix type is complex, the function will return
    the Hermitian (conjugate transpose) of the input matrix.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> at_gpu = transpose(a_gpu, pycuda.autoinit.device)
    >>> np.all(a.T == at_gpu.get())
    True
    >>> b = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> bt_gpu = transpose(b_gpu, pycuda.autoinit.device)
    >>> np.all(np.conj(b.T) == bt_gpu.get())
    True

    """

    if a_gpu.dtype not in [np.float32, np.float64, np.complex64,
                           np.complex128]:
        raise ValueError('unrecognized type')

    use_double = int(a_gpu.dtype in [np.float64, np.complex128])
    use_complex = int(a_gpu.dtype in [np.complex64, np.complex128])

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, a_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    transpose_mod = \
        SourceModule(transpose_mod_template.substitute(use_double=use_double,
                     use_complex=use_complex,
                     max_threads_per_block=max_threads_per_block,
                     max_blocks_per_grid=max_blocks_per_grid,
                     cols=a_gpu.shape[1],
                     rows=a_gpu.shape[0]),
                     cache_dir=cache_dir)

    transpose = transpose_mod.get_function("transpose")
    at_gpu = gpuarray.empty(a_gpu.shape[::-1], a_gpu.dtype)
    transpose(at_gpu.gpudata, a_gpu.gpudata,
              np.uint32(a_gpu.size),
              block=block_dim, grid=grid_dim)

    return at_gpu
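# Round-trip sketch (illustrative only, not part of the original source):
# applying transpose() twice recovers the original matrix; for complex input
# this also holds because the Hermitian of the Hermitian is the original
# matrix. Continuing the doctest session above:
#
# >>> c = np.asarray(np.random.rand(3, 5)+1j*np.random.rand(3, 5), np.complex64)
# >>> c_gpu = gpuarray.to_gpu(c)
# >>> cc_gpu = transpose(transpose(c_gpu, pycuda.autoinit.device),
# ...                    pycuda.autoinit.device)
# >>> np.allclose(c, cc_gpu.get())   # expected to hold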