def get_by_index(src_gpu, ind):
    """
    Get values in a GPUArray by index.

    Parameters
    ----------
    src_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance from which to extract values.
    ind : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices to extract. Must have an integer dtype.

    Returns
    -------
    res_gpu : pycuda.gpuarray.GPUArray
        GPUArray with length of `ind` and dtype of `src_gpu` containing
        selected values.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import misc
    >>> src = np.random.rand(5).astype(np.float32)
    >>> src_gpu = gpuarray.to_gpu(src)
    >>> ind = gpuarray.to_gpu(np.array([0, 2, 4]))
    >>> res_gpu = misc.get_by_index(src_gpu, ind)
    >>> np.allclose(res_gpu.get(), src[[0, 2, 4]])
    True

    Notes
    -----
    Only supports 1D index arrays.

    May not be efficient for certain index patterns because the memory
    operations cannot be coalesced.
    """

    # Only 1D index arrays with an integer dtype are accepted:
    assert len(np.shape(ind)) == 1
    assert issubclass(ind.dtype.type, numbers.Integral)

    n_ind = len(ind)
    if not isinstance(ind, gpuarray.GPUArray):
        ind = gpuarray.to_gpu(ind)
    dest_gpu = gpuarray.empty(n_ind, dtype=src_gpu.dtype)

    # An empty index array would make the kernel launch fail, so the
    # (empty) result is returned without running anything:
    if n_ind == 0:
        return dest_gpu

    # One kernel is kept per (data dtype, index dtype) pair:
    cache_key = (src_gpu.dtype, ind.dtype)
    try:
        kernel = get_by_index.cache[cache_key]
    except KeyError:
        arguments = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(
            data_ctype=tools.dtype_to_ctype(src_gpu.dtype),
            ind_ctype=tools.dtype_to_ctype(ind.dtype))
        kernel = elementwise.ElementwiseKernel(arguments, "dest[i] = src[ind[i]]")
        get_by_index.cache[cache_key] = kernel

    kernel(dest_gpu, ind, src_gpu, range=slice(0, n_ind, 1))
    return dest_gpu
def get_inds_kernel(inds_ctype, src_ctype):
    """
    Build the elementwise kernel computing `dest[i] = src[src_shift+inds[i]]`.

    Parameters
    ----------
    inds_ctype : str
        C type name used for the `inds` pointer argument.
    src_ctype : str
        C type name used for both the `dest` and `src` pointer arguments.

    Returns
    -------
    func : ElementwiseKernel
        Kernel taking the arguments `(dest, src_shift, inds, src)`.
    """

    arg_template = ("{data_ctype} *dest, int src_shift, " +
                    "{inds_ctype} *inds, {data_ctype} *src")
    arguments = arg_template.format(data_ctype=src_ctype,
                                    inds_ctype=inds_ctype)
    operation = "dest[i] = src[src_shift+inds[i]]"
    return elementwise.ElementwiseKernel(arguments, operation)
def gen_trapz_mult(N, dtype):
    """
    Generate multiplication array for 1D trapezoidal integration.

    Generates an array whose dot product with some array of equal
    length is equivalent to the definite integral of the latter
    computed using trapezoidal integration.

    Parameters
    ----------
    N : int
        Length of array.
    dtype : float type
        Floating point type to use when generating the array.

    Returns
    -------
    result : pycuda.gpuarray.GPUArray
        Generated array: 0.5 in the first and last positions, 1 elsewhere.

    Raises
    ------
    ValueError
        If `dtype` is not one of float32, float64, complex64, complex128.
    """

    supported = [np.float32, np.float64, np.complex64, np.complex128]
    if dtype not in supported:
        raise ValueError('unrecognized type')

    # The index of the last element is baked into the kernel source:
    arguments = "{ctype} *x".format(ctype=tools.dtype_to_ctype(dtype))
    operation = "x[i] = ((i == 0) || (i == {M})) ? 0.5 : 1".format(M=N - 1)
    kernel = elementwise.ElementwiseKernel(arguments, operation)

    x_gpu = gpuarray.empty(N, dtype)
    kernel(x_gpu)
    return x_gpu
def sici(x_gpu):
    """
    Sine/Cosine integral.

    Computes the sine and cosine integral of every element in the
    input matrix.

    Parameters
    ----------
    x_gpu : GPUArray
        Input matrix of shape `(m, n)`. Must be float32 or float64.

    Returns
    -------
    (si_gpu, ci_gpu) : tuple of GPUArrays
        Tuple of GPUarrays containing the sine integrals and cosine
        integrals of the entries of `x_gpu`.

    Raises
    ------
    ValueError
        If the dtype of `x_gpu` is neither float32 nor float64.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> x = np.array([[1, 2], [3, 4]], np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> (si_gpu, ci_gpu) = sici(x_gpu)
    >>> (si, ci) = scipy.special.sici(x)
    >>> np.allclose(si, si_gpu.get())
    True
    >>> np.allclose(ci, ci_gpu.get())
    True
    """

    in_dtype = x_gpu.dtype
    if in_dtype == np.float32:
        args, op = ('float *x, float *si, float *ci',
                    'sicif(x[i], &si[i], &ci[i])')
    elif in_dtype == np.float64:
        args, op = ('double *x, double *si, double *ci',
                    'sici(x[i], &si[i], &ci[i])')
    else:
        raise ValueError('unsupported type')

    # Reuse the kernel already built for this dtype when there is one:
    try:
        kernel = sici.cache[in_dtype]
    except KeyError:
        kernel = elementwise.ElementwiseKernel(
            args, op,
            options=["-I", install_headers],
            preamble='#include "cuSpecialFuncs.h"')
        sici.cache[in_dtype] = kernel

    si_gpu = gpuarray.empty_like(x_gpu)
    ci_gpu = gpuarray.empty_like(x_gpu)
    kernel(x_gpu, si_gpu, ci_gpu)
    return (si_gpu, ci_gpu)
def clipupper_GPU(x_gpu, ub):
    """
    Clip the entries of `x_gpu` from above, in place.

    Every element greater than `ub` is replaced by `ub`. The kernel is
    declared with `float` arguments.

    Parameters
    ----------
    x_gpu : GPU array passed to the kernel as `float *x`
        Array to modify in place.
    ub : float
        Upper bound.
    """

    arguments = "float *x, float ub"
    operation = "x[i] = x[i] > ub ? ub : x[i]"
    kernel = cuelement.ElementwiseKernel(arguments, operation, "clipupper")
    kernel(x_gpu, ub)
def cliplower_GPU(x_gpu, lb):
    """
    Clip the entries of `x_gpu` from below, in place.

    Every element smaller than `lb` is replaced by `lb`. The kernel is
    declared with `float` arguments.

    Parameters
    ----------
    x_gpu : GPU array passed to the kernel as `float *x`
        Array to modify in place.
    lb : float
        Lower bound.
    """

    arguments = "float *x, float lb"
    operation = "x[i] = x[i] < lb ? lb : x[i]"
    kernel = cuelement.ElementwiseKernel(arguments, operation, "cliplower")
    kernel(x_gpu, lb)
def _scale_inplace(a, x_gpu):
    """
    Scale an array by a specified value in-place.

    Every element of `x_gpu` is divided by `a`.

    Parameters
    ----------
    a : scalar
        Divisor; converted to the dtype of `x_gpu` before the kernel call.
    x_gpu : pycuda.gpuarray.GPUArray
        Array to modify in place.
    """

    ctype = tools.dtype_to_ctype(x_gpu.dtype)
    inplace = el.ElementwiseKernel(
        "{ctype} a, {ctype} *x".format(ctype=ctype),
        "x[i] /= a")

    # `np.cast` was removed in NumPy 2.0; build the scalar of the array's
    # dtype through the dtype's scalar type instead.
    inplace(x_gpu.dtype.type(a), x_gpu)
def conj(x_gpu, overwrite=True):
    """
    Complex conjugate.

    Compute the complex conjugate of the array in device memory.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array of shape `(m, n)`.
    overwrite : bool
        If true (default), save the result in the specified array.
        If false, return the result in a newly allocated array.

    Returns
    -------
    xc_gpu : pycuda.gpuarray.GPUArray
        Conjugate of the input array. If `overwrite` is true, the
        returned matrix is the same as the input array.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex64)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = linalg.conj(x_gpu)
    >>> np.all(x == np.conj(y_gpu.get()))
    True
    """

    # Real arrays are their own conjugate, so no kernel is needed. When
    # the caller asked for a new array, hand back a copy rather than an
    # alias of the input so later writes to the result cannot modify it:
    if x_gpu.dtype in [np.float32, np.float64]:
        if overwrite:
            return x_gpu
        return x_gpu.copy()

    try:
        func = conj.cache[x_gpu.dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = el.ElementwiseKernel(
            "{ctype} *x, {ctype} *y".format(ctype=ctype),
            "y[i] = conj(x[i])")
        conj.cache[x_gpu.dtype] = func

    if overwrite:
        func(x_gpu, x_gpu)
        return x_gpu

    y_gpu = gpuarray.empty_like(x_gpu)
    func(x_gpu, y_gpu)
    return y_gpu
def exp1(z_gpu):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`. Must be complex64 or complex128.

    Returns
    -------
    e_gpu : GPUArray
        GPUarrays containing the exponential integrals of
        the entries of `z_gpu`.

    Raises
    ------
    ValueError
        If the dtype of `z_gpu` is neither complex64 nor complex128.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = exp1(z_gpu)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True
    """

    in_dtype = z_gpu.dtype
    if in_dtype == np.complex64:
        args = 'pycuda::complex<float> *z, pycuda::complex<float> *e'
    elif in_dtype == np.complex128:
        args = 'pycuda::complex<double> *z, pycuda::complex<double> *e'
    else:
        raise ValueError('unsupported type')

    # Reuse the kernel already built for this dtype when there is one:
    try:
        kernel = exp1.cache[in_dtype]
    except KeyError:
        kernel = elementwise.ElementwiseKernel(
            args, 'e[i] = exp1(z[i])',
            options=["-I", install_headers],
            preamble='#include "cuSpecialFuncs.h"')
        exp1.cache[in_dtype] = kernel

    e_gpu = gpuarray.empty_like(z_gpu)
    kernel(z_gpu, e_gpu)
    return e_gpu
def eye(N, dtype=np.float32):
    """
    Construct a 2D matrix with ones on the diagonal and zeros elsewhere.

    Constructs a matrix in device memory whose diagonal elements
    are set to 1 and non-diagonal elements are set to 0.

    Parameters
    ----------
    N : int
        Number of rows or columns in the output matrix.
    dtype : type
        Matrix data type.

    Returns
    -------
    e_gpu : pycuda.gpuarray.GPUArray
        Diagonal matrix of dimensions `[N, N]` with diagonal values
        set to 1.

    Raises
    ------
    ValueError
        If `dtype` is not float32, float64, complex64 or complex128, or if
        `N` is not positive.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> N = 5
    >>> e_gpu = linalg.eye(N)
    >>> np.all(e_gpu.get() == np.eye(N))
    True
    >>> e_gpu = linalg.eye(N, np.complex64)
    >>> np.all(e_gpu.get() == np.eye(N, dtype=np.complex64))
    True
    """

    supported = [np.float32, np.float64, np.complex64, np.complex128]
    if dtype not in supported:
        raise ValueError('unrecognized type')
    if N <= 0:
        raise ValueError('N must be greater than 0')

    allocator = misc._global_cublas_allocator
    e_gpu = misc.zeros((N, N), dtype, allocator=allocator)

    arguments = "{ctype} *e".format(ctype=tools.dtype_to_ctype(dtype))
    kernel = el.ElementwiseKernel(arguments, "e[i] = 1")

    # In the flattened matrix the diagonal entries are N+1 elements apart:
    diagonal = slice(0, N*N, N+1)
    kernel(e_gpu, slice=diagonal)
    return e_gpu
def set_inds(self, src, dest, inds, dest_shift=0):
    """
    Set `dest[i+dest_shift] = src[inds[i]] for i in range(len(inds))`

    Parameters
    ----------
    src : GPU array
        Array values are read from.
    dest : GPU array
        Array values are written to; declared in the kernel with the same
        C type as `src`.
    inds : GPU array
        Indices into `src`.
    dest_shift : int
        Offset added to the destination index. It is compiled into the
        kernel source, so each distinct value uses its own kernel.
    """

    assert isinstance(dest_shift, numbers.Integral)

    # The kernel signature depends on the index type AND the data type
    # (and the source on `dest_shift`), so all three must be part of the
    # cache key; otherwise a kernel built for one data type would be
    # reused for another.
    cache_key = (inds.dtype, src.dtype, dest_shift)
    try:
        func = self.set_inds.cache[cache_key]
    except KeyError:
        inds_ctype = dtype_to_ctype(inds.dtype)
        data_ctype = dtype_to_ctype(src.dtype)
        v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src"\
            .format(data_ctype=data_ctype, inds_ctype=inds_ctype)
        func = elementwise.ElementwiseKernel(
            v, "dest[i+%i] = src[inds[i]]" % dest_shift)
        self.set_inds.cache[cache_key] = func
    func(dest, inds, src, range=slice(0, len(inds), 1))
def _get_exp1_kernel(dtype):
    """
    Build the elementwise kernel computing `e[i] = exp1(z[i])`.

    Parameters
    ----------
    dtype : numpy dtype
        Either complex64 or complex128; selects the pointer types of the
        kernel arguments.

    Returns
    -------
    ElementwiseKernel
        Kernel taking the arguments `(z, e)`.

    Raises
    ------
    ValueError
        If `dtype` is neither complex64 nor complex128.
    """

    if dtype == np.complex64:
        arguments = 'pycuda::complex<float> *z, pycuda::complex<float> *e'
    elif dtype == np.complex128:
        arguments = 'pycuda::complex<double> *z, pycuda::complex<double> *e'
    else:
        raise ValueError('unsupported type')

    operation = 'e[i] = exp1(z[i])'
    kernel = elementwise.ElementwiseKernel(
        arguments, operation,
        options=["-I", install_headers],
        preamble='#include "cuSpecialFuncs.h"')
    return kernel
def _scale_inplace(a, x_gpu):
    """
    Scale an array by a specified value in-place.

    Every element of `x_gpu` is divided by `a`.

    Parameters
    ----------
    a : scalar
        Divisor; converted to the dtype of `x_gpu` before the kernel call.
    x_gpu : pycuda.gpuarray.GPUArray
        Array to modify in place.
    """

    # Cache the kernel to avoid invoking the compiler if the array type
    # has already been encountered. The scale factor is a runtime argument
    # of the kernel, not part of its source, so it must not be part of the
    # key: keying on it stored one entry per distinct value and failed
    # outright for unhashable values.
    dtype = x_gpu.dtype
    try:
        func = _scale_inplace.cache[dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(dtype)
        func = el.ElementwiseKernel(
            "{ctype} a, {ctype} *x".format(ctype=ctype),
            "x[i] /= a")
        _scale_inplace.cache[dtype] = func
    func(dtype.type(a), x_gpu)
def _get_sici_kernel(dtype):
    """
    Build the elementwise kernel evaluating sine and cosine integrals.

    Parameters
    ----------
    dtype : numpy dtype
        Either float32 (uses `sicif`) or float64 (uses `sici`).

    Returns
    -------
    ElementwiseKernel
        Kernel taking the arguments `(x, si, ci)`.

    Raises
    ------
    ValueError
        If `dtype` is neither float32 nor float64.
    """

    if dtype == np.float32:
        arguments, operation = ('float *x, float *si, float *ci',
                                'sicif(x[i], &si[i], &ci[i])')
    elif dtype == np.float64:
        arguments, operation = ('double *x, double *si, double *ci',
                                'sici(x[i], &si[i], &ci[i])')
    else:
        raise ValueError('unsupported type')

    kernel = elementwise.ElementwiseKernel(
        arguments, operation,
        options=["-I", install_headers],
        preamble='#include "cuSpecialFuncs.h"')
    return kernel
def init_gpu(dev=0):
    """
    Import the GPU libraries into module globals and initialise the device.

    Binds the globals `gp` (pycuda.gpuarray), `cm` (pycuda.cumath),
    `lg` (skcuda.linalg) and `msc` (skcuda.misc), initialises device
    `dev` and a context on it through `msc`, calls `lg.init()`, builds the
    gather kernel `slf` (`y[i] = x[ind[i]]`) and finally sets `_gpu` to
    True.

    Parameters
    ----------
    dev : int
        Device number handed to `skcuda.misc.init_device`.
    """

    global gp, lg, cm, msc, slf, _gpu

    # Imports are deferred to call time so the module loads without CUDA.
    from pycuda import gpuarray as gp
    from pycuda import elementwise as ew
    from pycuda import cumath as cm
    from skcuda import linalg as lg
    from skcuda import misc as msc

    device = msc.init_device(dev)
    msc.init_context(device)
    lg.init()

    gather_args = "float * y, float * x, unsigned * ind"
    gather_op = "y[i] = x[ind[i]]"
    slf = ew.ElementwiseKernel(gather_args, gather_op)

    _gpu = True
def _fill_zeros_kernel(self, dest, inds):
    """
    Set `dest[inds[i]] = 0 for i in range(len(inds))`

    Parameters
    ----------
    dest : GPU array
        Array whose selected entries are zeroed in place.
    inds : GPU array
        Indices of the entries of `dest` to zero.
    """

    # One kernel is kept per (index dtype, data dtype) pair:
    cache_key = (inds.dtype, dest.dtype)
    try:
        kernel = self._fill_zeros_kernel.cache[cache_key]
    except KeyError:
        arg_template = ("{data_ctype} *dest," +
                        "{inds_ctype} *inds")
        arguments = arg_template.format(
            data_ctype=dtype_to_ctype(dest.dtype),
            inds_ctype=dtype_to_ctype(inds.dtype))
        kernel = elementwise.ElementwiseKernel(arguments, "dest[inds[i]] =0")
        self._fill_zeros_kernel.cache[cache_key] = kernel

    n_inds = len(inds)
    kernel(dest, inds, range=slice(0, n_inds, 1))
def set_by_inds_array(self, inds, data):
    """
    Set mapped data with array by integer indices.

    Parameters
    ----------
    inds : array-like
        Integer indices of data elements to update. Must be 1D.
    data : numpy.ndarray
        Data to assign. Must have the same length as `inds`.

    Raises
    ------
    ValueError
        If `data` is a scalar, `inds` has more than one dimension, `inds`
        does not have an integer dtype, or the lengths differ.
    """

    if np.isscalar(data):
        raise ValueError('data must be array-like')
    if len(np.shape(inds)) > 1:
        raise ValueError('index array must be 1D')

    # Nothing to do for an empty index array (the kernel cannot be
    # launched over an empty range):
    N = len(inds)
    if N == 0:
        return

    if not isinstance(inds, gpuarray.GPUArray):
        inds = gpuarray.to_gpu(inds)
    if not issubclass(inds.dtype.type, numbers.Integral):
        raise ValueError('index array must contain integers')
    if N != len(data):
        raise ValueError('len(inds) = %s != %s = len(data)' % (N, len(data)))

    if not isinstance(data, gpuarray.GPUArray):
        data = gpuarray.to_gpu(data)

    # Allocate data array if it doesn't exist. The check is explicit
    # because the truth value of an array with several elements is
    # ambiguous (and raises for recent array implementations):
    if self.data is None or len(self.data) == 0:
        self.data = gpuarray.empty(N, data.dtype)
    else:
        assert self.data.dtype == data.dtype

    cache_key = (inds.dtype, self.data.dtype)
    try:
        func = self.set_by_inds_array.cache[cache_key]
    except KeyError:
        inds_ctype = tools.dtype_to_ctype(inds.dtype)
        v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(
            data_ctype=self.data_ctype, inds_ctype=inds_ctype)
        func = elementwise.ElementwiseKernel(v, "dest[inds[i]] = src[i]")
        self.set_by_inds_array.cache[cache_key] = func
    func(self.data, inds, data, range=slice(0, N, 1))
def add_inds(self, src, dest, inds, dest_shift=0):
    """
    Add `src[i]` to `dest[inds[i]+dest_shift]` for i in range(len(inds))

    Parameters
    ----------
    src : GPU array
        Values to add.
    dest : GPU array
        Array updated in place; must have the same dtype as `src`.
    inds : GPU array
        Indices into `dest`, one per element processed.
    dest_shift : int
        Offset added to every destination index; passed to the kernel as
        an `int` argument.
    """

    assert src.dtype == dest.dtype

    # One kernel is kept per (index dtype, data dtype) pair:
    cache_key = (inds.dtype, src.dtype)
    try:
        kernel = self.add_inds.cache[cache_key]
    except KeyError:
        arg_template = ("{data_ctype} *dest, int dest_shift," +
                        "{inds_ctype} *inds, {data_ctype} *src")
        arguments = arg_template.format(
            data_ctype=dtype_to_ctype(src.dtype),
            inds_ctype=dtype_to_ctype(inds.dtype))
        kernel = elementwise.ElementwiseKernel(
            arguments,
            "dest[inds[i]+dest_shift] = dest[inds[i]+dest_shift] + src[i]")
        self.add_inds.cache[cache_key] = kernel

    n_inds = len(inds)
    kernel(dest, int(dest_shift), inds, src, range=slice(0, n_inds, 1))
def set_by_inds_from_inds(dest_gpu, ind_dest, src_gpu, ind_src):
    """
    Set values in a GPUArray by index from indexed values in another GPUArray.

    Performs `dest_gpu[ind_dest[i]] = src_gpu[ind_src[i]]` for every `i`.

    Parameters
    ----------
    dest_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance to modify.
    ind_dest : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices in `dest_gpu` to set. Must have an
        integer dtype.
    src_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance from which to set values. Must have the same
        dtype as `dest_gpu`.
    ind_src : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices in `src_gpu` to copy. Must have the
        same dtype and length as `ind_dest`.
    """

    assert len(np.shape(ind_dest)) == 1
    assert len(np.shape(ind_src)) == 1
    assert dest_gpu.dtype == src_gpu.dtype
    assert ind_dest.dtype == ind_src.dtype
    assert issubclass(ind_dest.dtype.type, numbers.Integral)
    assert issubclass(ind_src.dtype.type, numbers.Integral)

    # An empty index array would make the kernel launch fail, so there
    # is nothing to do in that case:
    n_ind = len(ind_src)
    if n_ind == 0:
        return
    assert n_ind == len(ind_dest)

    if not isinstance(ind_dest, gpuarray.GPUArray):
        ind_dest = gpuarray.to_gpu(ind_dest)
    if not isinstance(ind_src, gpuarray.GPUArray):
        ind_src = gpuarray.to_gpu(ind_src)

    # One kernel is kept per (data dtype, index dtype) pair:
    cache_key = (dest_gpu.dtype, ind_dest.dtype)
    try:
        kernel = set_by_inds_from_inds.cache[cache_key]
    except KeyError:
        arg_template = ("{data_ctype} *dest, {ind_ctype} *ind_dest,"
                        "{data_ctype} *src, {ind_ctype} *ind_src")
        arguments = arg_template.format(
            data_ctype=dtype_to_ctype(dest_gpu.dtype),
            ind_ctype=dtype_to_ctype(ind_dest.dtype))
        kernel = elementwise.ElementwiseKernel(
            arguments, "dest[ind_dest[i]] = src[ind_src[i]]")
        set_by_inds_from_inds.cache[cache_key] = kernel

    kernel(dest_gpu, ind_dest, src_gpu, ind_src, range=slice(0, n_ind, 1))
def diff(x_gpu):
    """
    Calculate the discrete difference.

    Calculates the first order difference between the successive
    entries of a vector.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input vector.

    Returns
    -------
    y_gpu : pycuda.gpuarray.GPUArray
        Discrete difference, of length `max(len(x_gpu) - 1, 0)`.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import misc
    >>> x = np.asarray(np.random.rand(5), np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = misc.diff(x_gpu)
    >>> np.allclose(np.diff(x), y_gpu.get())
    True
    """

    n_out = len(x_gpu) - 1

    # With fewer than two samples there are no differences to compute;
    # return an empty result without launching a kernel:
    if n_out <= 0:
        return gpuarray.empty(0, x_gpu.dtype)

    y_gpu = gpuarray.empty(n_out, x_gpu.dtype)
    try:
        func = diff.cache[x_gpu.dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = elementwise.ElementwiseKernel(
            "{ctype} *a, {ctype} *b".format(ctype=ctype),
            "b[i] = a[i+1]-a[i]")
        diff.cache[x_gpu.dtype] = func

    # Without an explicit range the kernel runs over the size of its
    # first vector argument (`x_gpu`), i.e. one index too many: the last
    # index would read a[len(x_gpu)] and write b[len(x_gpu)-1], both out
    # of bounds. Restrict the launch to the output length.
    func(x_gpu, y_gpu, range=slice(0, n_out, 1))
    return y_gpu
def get_by_inds(self, inds):
    """
    Retrieve mapped data specified by integer index.

    Parameters
    ----------
    inds : numpy.ndarray or pycuda.gpuarray.GPUArray
        1D array of integer indices of data elements to return. Must not
        be longer than the mapped data.

    Returns
    -------
    result : numpy.ndarray
        Selected data.

    Raises
    ------
    ValueError
        If the mapper holds no data.
    """

    # Explicit check: the truth value of an array with several elements
    # is ambiguous (and raises for recent array implementations), so
    # `not self.data` cannot be used to detect missing data.
    if self.data is None or len(self.data) == 0:
        raise ValueError('port mapper contains no data')
    assert len(np.shape(inds)) == 1
    assert issubclass(inds.dtype.type, numbers.Integral)

    N = len(inds)
    assert N <= len(self.data)

    # An empty index array cannot be processed by the kernel:
    if N == 0:
        return np.empty(N, dtype=self.data.dtype)

    result = gpuarray.empty(N, dtype=self.data.dtype)
    if not isinstance(inds, gpuarray.GPUArray):
        inds = gpuarray.to_gpu(inds)

    # The kernel signature depends on both the index type and the data
    # type, so both belong in the cache key:
    cache_key = (inds.dtype, self.data.dtype)
    try:
        func = self.get_by_inds.cache[cache_key]
    except KeyError:
        inds_ctype = tools.dtype_to_ctype(inds.dtype)
        v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(
            data_ctype=self.data_ctype, inds_ctype=inds_ctype)
        func = elementwise.ElementwiseKernel(v, "dest[i] = src[inds[i]]")
        self.get_by_inds.cache[cache_key] = func
    func(result, inds, self.data, range=slice(0, N, 1))
    return result.get()
def set_by_inds(self, inds, data):
    """
    Set mapped data by integer indices.

    Parameters
    ----------
    inds : numpy.ndarray or pycuda.gpuarray.GPUArray
        1D array of integer indices of data elements to update.
    data : numpy.ndarray
        Data to assign. Must have the same length as `inds`.
    """

    assert len(np.shape(inds)) == 1
    assert issubclass(inds.dtype.type, numbers.Integral)
    N = len(inds)
    assert N == len(data)

    # Nothing to do for an empty index array (the kernel cannot be
    # launched over an empty range):
    if N == 0:
        return

    if not isinstance(inds, gpuarray.GPUArray):
        inds = gpuarray.to_gpu(inds)
    if not isinstance(data, gpuarray.GPUArray):
        data = gpuarray.to_gpu(data)

    # Allocate data array if it doesn't exist. The check is explicit
    # because the truth value of an array with several elements is
    # ambiguous (and raises for recent array implementations):
    if self.data is None or len(self.data) == 0:
        self.data = gpuarray.empty(N, data.dtype)
    else:
        assert self.data.dtype == data.dtype

    # The kernel signature depends on both the index type and the data
    # type, so both belong in the cache key:
    cache_key = (inds.dtype, self.data.dtype)
    try:
        func = self.set_by_inds.cache[cache_key]
    except KeyError:
        inds_ctype = tools.dtype_to_ctype(inds.dtype)
        v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(
            data_ctype=self.data_ctype, inds_ctype=inds_ctype)
        func = elementwise.ElementwiseKernel(v, "dest[inds[i]] = src[i]")
        self.set_by_inds.cache[cache_key] = func
    func(self.data, inds, data, range=slice(0, N, 1))
self.verbose = 0 # initially self.tau = 1e-5 self.compute_obj = 1 self.compute_both = 0 self.asgui = 0 self.max_func_evals = 10 # might save additional # time, is used in line search self.pbb_gradient_norm = 1e-9 self.beta = 0.0498 self.sigma = 0.298 self.unconstrained = False clip2bound = cuelement.ElementwiseKernel( "float *dx, float *x, float *g", "dx[i] = ((x[i] == 0.f) && (g[i] > 0)) ? 0.f : dx[i]", "clip2bound") class PBB: """ PBB -- Optimizes f(x) s.t., x >= 0 This function solves the following optimization problem min f(x) subject to x >= 0 The implementation follows a 'reverse-communication' interface wherein the function f(x) and its gradient f'(x) are computed via function handles. Usage:
def multiply(x_gpu, y_gpu, overwrite=True):
    """
    Multiply arguments element-wise.

    Parameters
    ----------
    x_gpu, y_gpu : pycuda.gpuarray.GPUArray
        Input arrays to be multiplied. Must have the same shape.
    overwrite : bool
        If true (default), return the result in `y_gpu`.
        If false, return the result in a newly allocated array.

    Returns
    -------
    z_gpu : pycuda.gpuarray.GPUArray
        The element-wise product of the input arrays.

    Raises
    ------
    ValueError
        If the shapes differ or the dtype of `x_gpu` is not float32,
        float64, complex64 or complex128.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.asarray(np.random.rand(4, 4), np.float32)
    >>> y = np.asarray(np.random.rand(4, 4), np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = gpuarray.to_gpu(y)
    >>> z_gpu = linalg.multiply(x_gpu, y_gpu)
    >>> np.allclose(x*y, z_gpu.get())
    True
    """

    alloc = misc._global_cublas_allocator

    if x_gpu.shape != y_gpu.shape:
        raise ValueError('input arrays must have the same shape')

    supported = [np.float32, np.float64, np.complex64, np.complex128]
    if x_gpu.dtype not in supported:
        raise ValueError('unrecognized type')

    x_ctype = tools.dtype_to_ctype(x_gpu.dtype)
    y_ctype = tools.dtype_to_ctype(y_gpu.dtype)

    if overwrite:
        # Accumulate the product directly into `y_gpu`:
        inplace_args = "{x_ctype} *x, {y_ctype} *y".format(
            x_ctype=x_ctype, y_ctype=y_ctype)
        kernel = el.ElementwiseKernel(inplace_args, "y[i] *= x[i]")
        kernel(x_gpu, y_gpu)
        return y_gpu

    # The output takes the promoted type of the two inputs:
    result_type = np.result_type(x_gpu.dtype, y_gpu.dtype)
    z_gpu = gpuarray.empty(x_gpu.shape, result_type, allocator=alloc)
    out_args = "{x_ctype} *x, {y_ctype} *y, {z_type} *z".format(
        x_ctype=x_ctype,
        y_ctype=y_ctype,
        z_type=tools.dtype_to_ctype(result_type))
    kernel = el.ElementwiseKernel(out_args, "z[i] = x[i]*y[i]")
    kernel(x_gpu, y_gpu, z_gpu)
    return z_gpu
def pinv(a_gpu, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    rcond : float
        Singular values smaller than `rcond*max(singular_values)`
        are set to zero.

    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.

    Raises
    ------
    NotImplementedError
        If CULA is not installed.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    If the input matrix is square, the pseudoinverse uses less memory.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = linalg.pinv(a_gpu)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = linalg.pinv(b_gpu)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    # A square matrix is decomposed in place ('o') to save memory:
    n_rows, n_cols = a_gpu.shape[0], a_gpu.shape[1]
    jobvt = 'o' if n_rows == n_cols else 's'
    u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', jobvt)

    # Invert the singular values above the cutoff and zero the others:
    cutoff_gpu = gpuarray.max(s_gpu) * rcond
    arguments = "{ctype} *s, {ctype} *cutoff".format(
        ctype=tools.dtype_to_ctype(s_gpu.dtype))
    cutoff_func = el.ElementwiseKernel(
        arguments,
        "if (s[i] > cutoff[0]) {s[i] = 1/s[i];} else {s[i] = 0;}")
    cutoff_func(s_gpu, cutoff_gpu)

    # Compute the pseudoinverse without allocating a new diagonal matrix:
    scaled_u_gpu = dot_diag(s_gpu, u_gpu, 't')
    return dot(vh_gpu, scaled_u_gpu, 'c', 'c')
def set_by_inds_from_inds(dest_gpu, ind_dest, src_gpu, ind_src):
    """
    Set values in a GPUArray by index from indexed values in another GPUArray.

    Parameters
    ----------
    dest_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance to modify.
    ind_dest : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices in `dest_gpu` to set. Must have an
        integer dtype.
    src_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance from which to set values.
    ind_src : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices in `src_gpu` to copy. Must have an
        integer dtype and be the same length as `ind_dest`.

    Raises
    ------
    ValueError
        If either index array has more than one dimension.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> from nk.tools.gpu import set_by_inds_from_inds
    >>> dest_gpu = gpuarray.to_gpu(np.zeros(5, dtype=np.float32))
    >>> ind_dest = gpuarray.to_gpu(np.array([0, 2, 4]))
    >>> src_gpu = gpuarray.to_gpu(np.arange(5, 10, dtype=np.float32))
    >>> ind_src = gpuarray.to_gpu(np.array([2, 3, 4]))
    >>> set_by_inds_from_inds(dest_gpu, ind_dest, src_gpu, ind_src)
    >>> np.allclose(dest_gpu.get(), np.array([7, 0, 8, 0, 9], dtype=np.float32))
    True
    """

    if len(np.shape(ind_dest)) > 1:
        raise ValueError('destination index array must be 1D')
    if len(np.shape(ind_src)) > 1:
        raise ValueError('source index array must be 1D')
    assert dest_gpu.dtype == src_gpu.dtype
    assert ind_dest.dtype == ind_src.dtype
    assert issubclass(ind_dest.dtype.type, numbers.Integral)
    assert issubclass(ind_src.dtype.type, numbers.Integral)

    # An empty index array would make the kernel launch fail, so there
    # is nothing to do in that case:
    n_ind = len(ind_src)
    if n_ind == 0:
        return
    assert n_ind == len(ind_dest)

    if not isinstance(ind_dest, gpuarray.GPUArray):
        ind_dest = gpuarray.to_gpu(ind_dest)
    if not isinstance(ind_src, gpuarray.GPUArray):
        ind_src = gpuarray.to_gpu(ind_src)

    # One kernel is kept per (data dtype, index dtype) pair:
    cache_key = (dest_gpu.dtype, ind_dest.dtype)
    try:
        kernel = set_by_inds_from_inds.cache[cache_key]
    except KeyError:
        arg_template = ("{data_ctype} *dest, {ind_ctype} *ind_dest,"
                        "{data_ctype} *src, {ind_ctype} *ind_src")
        arguments = arg_template.format(
            data_ctype=dtype_to_ctype(dest_gpu.dtype),
            ind_ctype=dtype_to_ctype(ind_dest.dtype))
        kernel = elementwise.ElementwiseKernel(
            arguments, "dest[ind_dest[i]] = src[ind_src[i]]")
        set_by_inds_from_inds.cache[cache_key] = kernel

    kernel(dest_gpu, ind_dest, src_gpu, ind_src, range=slice(0, n_ind, 1))
def set_by_inds(dest_gpu, ind, src_gpu, ind_which='dest'):
    """
    Set values in a GPUArray by index.

    Parameters
    ----------
    dest_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance to modify.
    ind : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices to set. Must have an integer dtype.
    src_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance from which to set values.
    ind_which : str
        If set to 'dest', set the elements in `dest_gpu` with indices `ind`
        to the successive values in `src_gpu`; the lengths of `ind` and
        `src_gpu` must be equal. If set to 'src', set the
        successive values in `dest_gpu` to the values in `src_gpu` with
        indices `ind`; the lengths of `ind` and `dest_gpu` must be equal.

    Raises
    ------
    ValueError
        If either data argument is a scalar, `ind` has more than one
        dimension or a non-integer dtype, the data dtypes differ, or
        `ind_which` is neither 'dest' nor 'src'.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> from nk.tools.gpu import set_by_inds
    >>> dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
    >>> ind = gpuarray.to_gpu(np.array([0, 2, 4]))
    >>> src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.float32))
    >>> set_by_inds(dest_gpu, ind, src_gpu, 'dest')
    >>> np.allclose(dest_gpu.get(), np.array([1, 1, 1, 3, 1], dtype=np.float32))
    True
    >>> dest_gpu = gpuarray.to_gpu(np.zeros(3, dtype=np.float32))
    >>> ind = gpuarray.to_gpu(np.array([0, 2, 4]))
    >>> src_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
    >>> set_by_inds(dest_gpu, ind, src_gpu, 'src')
    >>> np.allclose(dest_gpu.get(), np.array([0, 2, 4], dtype=np.float32))
    True

    Notes
    -----
    Only supports 1D index arrays.

    May not be efficient for certain index patterns because the memory
    operations cannot be coalesced.
    """

    if np.isscalar(src_gpu) or np.isscalar(dest_gpu):
        raise ValueError('data must be array-like')
    if len(np.shape(ind)) > 1:
        raise ValueError('index array must be 1D')

    # An empty index array would make the kernel launch fail, so there
    # is nothing to do in that case:
    n_ind = len(ind)
    if n_ind == 0:
        return

    if not issubclass(ind.dtype.type, numbers.Integral):
        raise ValueError('index array must contain integers')
    if not dest_gpu.dtype == src_gpu.dtype:
        raise ValueError('dest_gpu.dtype != src_gpu.dtype')

    # The index array must be as long as the array it runs alongside:
    if ind_which == 'dest':
        assert n_ind == len(src_gpu)
    elif ind_which == 'src':
        assert n_ind == len(dest_gpu)
    else:
        raise ValueError('invalid value for `ind_which`')

    if not isinstance(ind, gpuarray.GPUArray):
        ind = gpuarray.to_gpu(ind)

    # One kernel is kept per (data dtype, index dtype, mode) triple:
    cache_key = (dest_gpu.dtype, ind.dtype, ind_which)
    try:
        kernel = set_by_inds.cache[cache_key]
    except KeyError:
        arguments = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(
            data_ctype=dtype_to_ctype(dest_gpu.dtype),
            ind_ctype=dtype_to_ctype(ind.dtype))
        if ind_which == 'dest':
            operation = "dest[ind[i]] = src[i]"
        else:
            operation = "dest[i] = src[ind[i]]"
        kernel = elementwise.ElementwiseKernel(arguments, operation)
        set_by_inds.cache[cache_key] = kernel

    kernel(dest_gpu, ind, src_gpu, range=slice(0, n_ind, 1))
def gen_simps_mult(N, dtype, even='avg'):
    """
    Generate multiplication array for composite Simpson's rule.

    Generates an array whose dot product with some array of equal
    length is equivalent to the definite integral of the latter
    computed using composite Simpson's rule.

    If there are an even number of samples, N, then there are an odd
    number of intervals (N-1), but Simpson's rule requires an even number
    of intervals. The parameter 'even' controls how this is handled.

    Parameters
    ----------
    N : int
        Length of array.
    dtype : float type
        Floating point type to use when generating the array.
    even : str {'avg', 'first', 'last'}, optional
        'avg' : Average two results: 1) use the first N-2 intervals with
                a trapezoidal rule on the last interval and 2) use the last
                N-2 intervals with a trapezoidal rule on the first interval.
        'first' : Use Simpson's rule for the first N-2 intervals with
                  a trapezoidal rule on the last interval.
        'last' : Use Simpson's rule for the last N-2 intervals with a
                 trapezoidal rule on the first interval.

    Returns
    -------
    result : pycuda.gpuarray.GPUArray
        Generated array.

    Raises
    ------
    ValueError
        If `dtype` is not float32, float64, complex64 or complex128, or if
        `N` is even and `even` is not one of the accepted strings.
    """

    supported = [np.float32, np.float64, np.complex64, np.complex128]
    if dtype not in supported:
        raise ValueError('unrecognized type')

    ctype = tools.dtype_to_ctype(dtype)
    x_gpu = gpuarray.zeros(N, dtype)

    if N % 2:
        # Odd number of samples: plain composite Simpson weights
        # 1, 4, 2, 4, ..., 2, 4, 1 (each divided by 3).
        arguments = "{ctype} *x".format(ctype=ctype)
        operation = "x[i] = (i%2 == 0) ? ((i != 0 && i != {M}) ? 2. : 1.) : 4.".format(M=N-1)
        kernel = elementwise.ElementwiseKernel(arguments, operation)
        x_gpu.fill(1.)
        kernel(x_gpu)
        return x_gpu/3.

    # Even number of samples: combine Simpson weights over N-1 samples
    # with a trapezoidal rule on the remaining interval.
    if even not in ['avg', 'last', 'first']:
        raise ValueError("Parameter 'even' must be "
                         "'avg', 'last', or 'first'.")

    basic_simps = gen_simps_mult(N-1, dtype)

    if even in ['avg', 'first']:
        x_gpu[:-1] += basic_simps
        x_gpu[-2:] += 0.5  # trapz on last interval

    if even in ['avg', 'last']:
        x_gpu[1:] += basic_simps
        x_gpu[:2] += 0.5  # trapz on first interval

    if even == 'avg':
        x_gpu /= 2.

    return x_gpu
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray, elementwise

# Host data: the integers 0..1000 and a zero-filled output buffer, both
# unsigned 32-bit.
x = np.arange(0, 1001, dtype=np.uint32)
y = np.zeros(1001, np.uint32)

# Element-wise square. Both arrays hold uint32 values, so both pointer
# arguments are declared `unsigned int*` (the output was previously
# declared `int*`, which did not match the dtype of `y`).
kernel = elementwise.ElementwiseKernel(
    arguments="unsigned int* x, unsigned int* y",
    operation="y[i] = x[i] * x[i]",
)

# Copy the inputs to the device, run the kernel and print the result.
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)
kernel(x_gpu, y_gpu)
print(y_gpu.get())
def _get_scale_kernel(dtype):
    """
    Build the elementwise kernel that divides an array by a scalar in place.

    Parameters
    ----------
    dtype : numpy dtype
        Data type of the array; also used for the `scale` argument.

    Returns
    -------
    ElementwiseKernel
        Kernel taking the arguments `(scale, x)` and performing
        `x[i] /= scale`.
    """

    arguments = "{ctype} scale, {ctype} *x".format(
        ctype=tools.dtype_to_ctype(dtype))
    operation = "x[i] /= scale"
    return el.ElementwiseKernel(arguments, operation)