def calc_x_G(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si,
             Vsh, handle=None):
    D = A[0].shape[1]
    Dm1 = A[0].shape[0]
    q = len(A)

    x = garr.zeros((Dm1, q * D - Dm1), dtype=A[0].dtype)
    x_part = garr.empty_like(x)
    x_subpart = garr.empty_like(A[0])

    if not (C is None and Kp1 is None):
        assert (C is not None) and (Kp1 is not None)
        x_part.fill(0)
        for s in range(q):
            x_subpart = eps_r(rp1, C[s], Ap1, x_subpart, handle)  # ~1st line
            x_subpart += cla.dot(A[s], Kp1, handle=handle)  # ~3rd line
            x_part += cla.dot(cla.dot(x_subpart, r_si, handle=handle),
                              Vsh[s], handle=handle)
        x += cla.dot(lm1_s, x_part, handle=handle)

    if lm2 is not None:
        x_part.fill(0)
        for s in range(q):  # ~2nd line
            x_subpart = eps_l(lm2, Am1, Cm1[s], x_subpart, handle)
            x_part += cla.dot(x_subpart, cla.dot(r_s, Vsh[s], handle=handle),
                              handle=handle)
        x += cla.dot(lm1_si, x_part, handle=handle)

    return x

def gpubarlinedata(xdata, ydata, bins, minval=None, maxval=None):
    """Bin xdata into `bins` equal intervals; return per-bin means of x
    and y plus the standard error of the y mean."""
    if maxval is None:
        maxval = gpumax(xdata)
    if minval is None:
        minval = gpumin(xdata)
    binsize = (maxval - minval) / float(bins)
    inbin = gpuarray.empty_like(xdata)
    select = gpuarray.empty_like(xdata)
    xmeans = []
    ymeans = []
    errors = []
    for i in range(bins):
        lo = minval + binsize * i
        hi = minval + binsize * (i + 1)
        gpubarlinekerna(xdata, lo, hi, inbin)  # mask of samples in [lo, hi)
        N = gpusum(inbin)
        if N > 1:
            gpubarlinekernb(inbin, ydata, select)  # select y values in bin
            my = gpusum(select) / float(N)
            gpubarlinekernb(inbin, xdata, select)  # select x values in bin
            mx = gpusum(select) / float(N)
            gpubarlinekernc(inbin, ydata, my, select)  # squared deviations
            s = sqrt(gpusum(select) / (N * (N - 1)))  # standard error
            xmeans.append(mx)
            ymeans.append(my)
            errors.append(s)
    return (xmeans, ymeans, errors)

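# Usage sketch for gpubarlinedata: a hedged, minimal example assuming a
# live CUDA context and the gpusum/gpumin/gpumax reductions and
# gpubarlinekern* kernels defined elsewhere in this module.
import numpy as np
import pycuda.gpuarray as gpuarray

x = gpuarray.to_gpu(np.random.rand(10000).astype(np.float32))
y = gpuarray.to_gpu(np.random.rand(10000).astype(np.float32))
xmeans, ymeans, errors = gpubarlinedata(x, y, bins=20)
# Each returned triple is (mean x, mean y, standard error of mean y)
# for one bin; bins with fewer than two samples are skipped.
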
def integrate(stepsize=0.01, stores=5, steps=10000, number_of_particles=2 ** 10):
    gpu_r, gpu_v, gpu_mass = create_particles(number_of_particles)
    number_of_particles = np.int32(number_of_particles)

    # Ring buffer of position/velocity arrays so the last few steps stay
    # resident on the device.
    gpu_rs, gpu_vs = [gpu_r], [gpu_v]
    for i in range(stores - 1):
        gpu_rs.append(gpuarray.empty_like(gpu_r))
        gpu_vs.append(gpuarray.empty_like(gpu_v))

    advance = SourceModule(advance_kernel).get_function("advance")
    advance.prepare([np.intp, np.intp, np.intp, np.intp, np.intp, np.int32])

    # Block/grid dimensions must be >= 1, and prepared_call takes the grid
    # first; pointer arguments are passed as .gpudata.
    block_size = (32, 1, 1)
    grid_size = (int(number_of_particles / 32), 1)
    advance.prepared_call(grid_size, block_size,
                          gpu_rs[0].gpudata, gpu_vs[0].gpudata,
                          gpu_mass.gpudata,
                          gpu_rs[1].gpudata, gpu_vs[1].gpudata,
                          number_of_particles)

    old, new = 1, 2
    for i in range(steps):
        r = gpu_rs[old].get_async()
        v = gpu_vs[old].get_async()
        advance.prepared_async_call(grid_size, block_size, None,
                                    gpu_rs[old].gpudata, gpu_vs[old].gpudata,
                                    gpu_mass.gpudata,
                                    gpu_rs[new].gpudata, gpu_vs[new].gpudata,
                                    number_of_particles)
        # Dump raw binary snapshots of the step just retrieved.
        r.tofile("step{:08.2f}_r.dat".format(i * stepsize))
        v.tofile("step{:08.2f}_v.dat".format(i * stepsize))
        old, new = new, (new + 1) % stores

def cufft_conv(x, y):
    """Circular convolution of two equally-shaped arrays via cuFFT."""
    x = x.astype(np.complex64)
    y = y.astype(np.complex64)

    if x.shape != y.shape:
        raise ValueError('x and y must have the same shape')

    plan = fft.Plan(x.shape, np.complex64, np.complex64)
    inverse_plan = fft.Plan(x.shape, np.complex64, np.complex64)

    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)

    x_fft = gpuarray.empty_like(x_gpu)
    y_fft = gpuarray.empty_like(y_gpu)
    out_gpu = gpuarray.empty_like(x_gpu)

    fft.fft(x_gpu, x_fft, plan)
    fft.fft(y_gpu, y_fft, plan)
    # Pointwise product in the frequency domain, written into y_fft...
    linalg.multiply(x_fft, y_fft, overwrite=True)
    # ...then back-transform (scale=True divides by the array size).
    fft.ifft(y_fft, out_gpu, inverse_plan, scale=True)
    conv_out = out_gpu.get()

    x_gpu.gpudata.free()
    y_gpu.gpudata.free()
    x_fft.gpudata.free()
    y_fft.gpudata.free()
    out_gpu.gpudata.free()

    return conv_out

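# Usage sketch for cufft_conv, assuming pycuda.autoinit has created a
# context and scikit-cuda's fft/linalg modules are the ones imported
# above. By the convolution theorem the result should match numpy's
# frequency-domain product (complex64 limits the tolerance).
import numpy as np

x = np.random.rand(64, 64)
y = np.random.rand(64, 64)
out = cufft_conv(x, y)
ref = np.fft.ifft2(np.fft.fft2(x) * np.fft.fft2(y))
assert np.allclose(out, ref, atol=1e-3)
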
def sici(x_gpu):
    """
    Sine/Cosine integral.

    Computes the sine and cosine integral of every element in the
    input matrix.

    Parameters
    ----------
    x_gpu : GPUArray
        Input matrix of shape `(m, n)`.

    Returns
    -------
    (si_gpu, ci_gpu) : tuple of GPUArrays
        Tuple of GPUArrays containing the sine integrals and cosine
        integrals of the entries of `x_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> x = np.array([[1, 2], [3, 4]], np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> (si_gpu, ci_gpu) = sici(x_gpu)
    >>> (si, ci) = scipy.special.sici(x)
    >>> np.allclose(si, si_gpu.get())
    True
    >>> np.allclose(ci, ci_gpu.get())
    True
    """
    if x_gpu.dtype == np.float32:
        args = 'float *x, float *si, float *ci'
        op = 'sicif(x[i], &si[i], &ci[i])'
    elif x_gpu.dtype == np.float64:
        args = 'double *x, double *si, double *ci'
        op = 'sici(x[i], &si[i], &ci[i])'
    else:
        raise ValueError('unsupported type')

    try:
        func = sici.cache[x_gpu.dtype]
    except KeyError:
        func = elementwise.ElementwiseKernel(
            args, op,
            options=["-I", install_headers],
            preamble='#include "cuSpecialFuncs.h"')
        sici.cache[x_gpu.dtype] = func

    si_gpu = gpuarray.empty_like(x_gpu)
    ci_gpu = gpuarray.empty_like(x_gpu)
    func(x_gpu, si_gpu, ci_gpu)

    return (si_gpu, ci_gpu)

# Per-dtype kernel cache used above; must exist before the first call.
sici.cache = {}

def __init__(self, n_units, n_incoming, N, init_sd=1.0, precision=np.float32,
             magic_numbers=False):
    self.n_units = n_units
    self.n_incoming = n_incoming
    self.N = N

    w = np.random.normal(0, init_sd, (self.n_incoming, self.n_units))
    b = np.random.normal(0, init_sd, (1, n_units))
    self.weights = gpuarray.to_gpu(w.copy().astype(precision))
    self.gW = gpuarray.empty_like(self.weights)

    # Prior and ID must be set after creation
    self.prior = -1
    self.ID = -1

    self.biases = gpuarray.to_gpu(b.copy().astype(precision))
    self.gB = gpuarray.empty_like(self.biases)

    # Set up momentum variables for HMC sampler
    self.pW = gpuarray.to_gpu(np.random.normal(0, 1, self.gW.shape))
    self.pB = gpuarray.to_gpu(np.random.normal(0, 1, self.gB.shape))

    # Per-parameter stepsizes
    self.epsW = gpuarray.zeros(self.weights.shape, precision) + 1.0
    self.epsB = gpuarray.zeros(self.biases.shape, precision) + 1.0

    self.precision = precision
    self.outputs = gpuarray.zeros((self.N, self.n_units), precision)
    self.magic_numbers = magic_numbers

    # Define tanh activation on the GPU. Note 2.0f/3.0f: a literal 2/3
    # would be integer division (i.e. zero) in CUDA C.
    if magic_numbers:
        self.tanh = ElementwiseKernel(
            "float *x",
            "x[i] = 1.7159 * tanh((2.0f / 3.0f) * x[i]);",
            "tan_h", preamble="#include <math.h>")
    else:
        self.tanh = ElementwiseKernel(
            "float *x",
            "x[i] = tanh(min(max(-10.0, x[i]), 10.0));",
            "tan_h", preamble="#include <math.h>")

    # Compile kernels
    kernels = SourceModule(open(path + '/kernels.cu', "r").read())
    self.add_bias_kernel = kernels.get_function("add_bias")

    self.rng = curandom.XORWOWRandomNumberGenerator()

    # Initialize posterior weights
    self.posterior_weights = list()
    self.posterior_biases = list()

def make_GPU_gradient(mesh, context):
    '''Prepare to compute gradient on the GPU w.r.t. the given mesh.
    Return gradient function.
    '''
    mx = int(getattr(mesh, 'nx', 1))
    my = int(getattr(mesh, 'ny', 1))
    mz = int(getattr(mesh, 'nz', 1))

    dxInv = np.array(1. / getattr(mesh, 'dx', 1), dtype=np.float64)
    dyInv = np.array(1. / getattr(mesh, 'dy', 1), dtype=np.float64)
    dzInv = np.array(1. / getattr(mesh, 'dz', 1), dtype=np.float64)

    with open(where + 'gradient2.cu') as fdlib:
        source = fdlib.read()
    module = SourceModule(source)

    # Copy mesh dimensions and inverse spacings into constant memory.
    mx_ptr = module.get_global("mx")[0]
    my_ptr = module.get_global("my")[0]
    mz_ptr = module.get_global("mz")[0]
    cuda.memcpy_htod(mx_ptr, np.array(mx, dtype=np.int32))
    cuda.memcpy_htod(my_ptr, np.array(my, dtype=np.int32))
    cuda.memcpy_htod(mz_ptr, np.array(mz, dtype=np.int32))

    dxInv_ptr = module.get_global("dxInv")[0]
    dyInv_ptr = module.get_global("dyInv")[0]
    dzInv_ptr = module.get_global("dzInv")[0]
    cuda.memcpy_htod(dxInv_ptr, dxInv)
    cuda.memcpy_htod(dyInv_ptr, dyInv)
    cuda.memcpy_htod(dzInv_ptr, dzInv)

    deriv_x = module.get_function("gradient_x")
    deriv_y = module.get_function("gradient_y")
    deriv_z = module.get_function("gradient_z")

    block, grid = mesh.get_domain_decomposition(DeviceData().max_threads)

    d_deriv_x = gpuarray.empty(shape=(1, mesh.n_nodes), dtype=np.float64)
    d_deriv_y = gpuarray.empty_like(d_deriv_x)
    d_deriv_z = gpuarray.empty_like(d_deriv_x)

    def _gradient(scalar_values):
        '''Calculate three-dimensional gradient for GPUArray
        scalar_values.
        '''
        deriv_x(scalar_values, d_deriv_x, block=block, grid=grid)
        deriv_y(scalar_values, d_deriv_y, block=block, grid=grid)
        deriv_z(scalar_values, d_deriv_z, block=block, grid=grid)
        context.synchronize()
        return (d_deriv_x, d_deriv_y, d_deriv_z)[:mesh.dimension]
    return _gradient

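# Usage sketch for make_GPU_gradient. `mesh` is a hypothetical object
# exposing nx/ny/nz, dx/dy/dz, n_nodes, dimension and
# get_domain_decomposition(), and gradient2.cu must provide the
# gradient_x/y/z kernels.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

gradient = make_GPU_gradient(mesh, pycuda.autoinit.context)
phi = gpuarray.to_gpu(np.random.rand(1, mesh.n_nodes))
derivs = gradient(phi)  # tuple of one GPUArray per mesh dimension
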
def __init__(self, n_classes, n_incoming, N, init_sd=0.1, precision=np.float32):
    self.type = 'Softmax'
    self.n_classes = n_classes
    self.n_incoming = n_incoming
    self.N = N

    w = np.random.normal(0, init_sd, (self.n_incoming, n_classes))
    b = np.random.normal(0, init_sd, (1, n_classes))
    self.weights = gpuarray.to_gpu(w.copy().astype(precision))
    self.gW = gpuarray.empty_like(self.weights)
    self.biases = gpuarray.to_gpu(b.copy().astype(precision))
    self.gB = gpuarray.empty_like(self.biases)

    # Prior and ID are set later
    self.prior = -1
    self.ID = -1

    # Set up momentum variables for HMC sampler
    self.pW = gpuarray.to_gpu(np.random.normal(0, 1, self.gW.shape))
    self.pB = gpuarray.to_gpu(np.random.normal(0, 1, self.gB.shape))

    # Store stepsizes for each parameter
    self.epsW = gpuarray.zeros(self.weights.shape, precision) + 1.0
    self.epsB = gpuarray.zeros(self.biases.shape, precision) + 1.0

    self.outputs = gpuarray.zeros((self.N, self.n_classes), precision)
    self.precision = precision

    kernels = SourceModule(open(path + '/kernels.cu', "r").read())
    self.softmax_kernel = kernels.get_function("softmax")
    self.add_bias_kernel = kernels.get_function("add_bias")

    self.rng = curandom.XORWOWRandomNumberGenerator()

    # Initialize posterior weights
    self.posterior_weights = list()
    self.posterior_biases = list()

    self.eps_tol = 1e-10

def feed_forward(self, input_data, prediction=False):
    """Propagate forward through the layer.

    **Parameters:**

    input_data : ``GPUArray``
        Input data to perform dropout on.

    prediction : bool, optional
        Whether to run in prediction mode. If true, the data is scaled
        by ``1 - dropout_probability`` instead of applying dropout.

    **Returns:**

    dropout_data : ``GPUArray``
        The data after performing dropout.
    """
    if input_data.shape[1] != self.n_in:
        raise ValueError(
            'Number of outputs from previous layer (%d) '
            'does not match number of inputs to this layer (%d)'
            % (input_data.shape[1], self.n_in))

    if not prediction:
        dropout_input = gpuarray.empty_like(input_data)
        dropout_mask = sample_dropout_mask(
            input_data, self.dropout_probability, target=dropout_input)
        return dropout_input, dropout_mask
    else:
        return (input_data * (1 - self.dropout_probability),)

def test_cublas_bug():
    '''
    The SGEMM call would cause all calls after it to fail for some
    unknown reason. Likely this is caused by swaprows causing memory
    corruption.

    NOTE: this was confirmed by NVIDIA to be a bug within CUDA, and
    should be fixed in CUDA 6.5.
    '''
    from pycuda.driver import Stream
    from skcuda.cublas import cublasSgemm
    from skcuda.misc import _global_cublas_handle as handle

    n = 131
    s = slice(128, n)
    X = gpuarray.to_gpu(np.random.randn(n, 2483).astype(np.float32))
    a = gpuarray.empty((X.shape[1], 3), dtype=np.float32)
    c = gpuarray.empty((a.shape[0], X.shape[1]), dtype=np.float32)
    b = gpuarray.empty_like(X)

    m, n = a.shape[0], b[s].shape[1]
    k = a.shape[1]
    lda = m
    ldb = k
    ldc = m

    #cublasSgemm(handle, 0, 0, m, n, k, 0.0, b.gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    cublasSgemm(handle, 'n', 'n', m, n, k, 1.0,
                b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    #print handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc
    #gpuarray.dot(d, Xoutd[s])
    #op.sgemm(a, b[s], c)

    stream = Stream()
    stream.synchronize()

def exp1(z_gpu):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`.

    Returns
    -------
    e_gpu : GPUArray
        GPUArray containing the exponential integrals of
        the entries of `z_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = exp1(z_gpu)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True
    """
    e_gpu = gpuarray.empty_like(z_gpu)
    func = _get_exp1_kernel(z_gpu.dtype)
    func(z_gpu, e_gpu)
    return e_gpu

def computeIrDensity(self, dS_gpu):
    """
    Compute the impulse response density at the time intervals in dS_gpu.
    """
    K = self.modelParams["proc_id_model", "K"]
    N = self.base.data.N

    gS_gpu = gpuarray.empty_like(dS_gpu)

    # Update GS using the impulse response parameters
    grid_w = int(np.ceil(N / 1024.0))
    self.gpuKernels["computeLogisticNormalGSIndiv"](
        np.int32(K),
        np.int32(self.base.data.N),
        self.gpuPtrs["proc_id_model", "C"].gpudata,
        self.base.dSS["rowIndices"].gpudata,
        self.base.dSS["colPtrs"].gpudata,
        self.gpuPtrs["impulse_model", "g_mu"].gpudata,
        self.gpuPtrs["impulse_model", "g_tau"].gpudata,
        np.float32(self.params["dt_max"]),
        dS_gpu.gpudata,
        gS_gpu.gpudata,
        block=(1024, 1, 1),
        grid=(grid_w, 1))

    return gS_gpu

def _FarnebackUpdateMatrices_gpu(self, R0_gpu, R1_gpu, flow_gpu, M_gpu):
    R1_warped_gpu = gpuarray.empty_like(R1_gpu)

    block = (32, 32, 1)
    grid = (int(divup(flow_gpu.shape[3], block[0])),
            int(divup(flow_gpu.shape[2], block[1])),
            1)

    # Warp each polynomial-coefficient volume of R1 along the current flow.
    for i in range(_NUM_POLY_COEFFICIENTS - 1):
        farneback3d._utils.ndarray_to_float_tex(self._r1_texture, R1_gpu[i])
        self._warp_kernel(flow_gpu,
                          R1_warped_gpu[i],
                          np.int32(flow_gpu.shape[3]),
                          np.int32(flow_gpu.shape[2]),
                          np.int32(flow_gpu.shape[1]),
                          np.float32(1),
                          np.float32(1),
                          np.float32(1),
                          block=block,
                          grid=grid)

    self._update_matrices_kernel(R0_gpu,
                                 R1_warped_gpu,
                                 flow_gpu,
                                 M_gpu,
                                 np.int32(flow_gpu.shape[3]),
                                 np.int32(flow_gpu.shape[2]),
                                 np.int32(flow_gpu.shape[1]),
                                 block=block,
                                 grid=grid)

def e1z(z_gpu, dev):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.

    Returns
    -------
    e_gpu : GPUArray
        GPUArray containing the exponential integrals of
        the entries of `z_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = e1z(z_gpu, pycuda.autoinit.device)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True
    """
    if z_gpu.dtype == np.complex64:
        use_double = 0
    elif z_gpu.dtype == np.complex128:
        use_double = 1
    else:
        raise ValueError("unsupported type")

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, z_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    e1z_mod = SourceModule(
        e1z_mod_template.substitute(
            use_double=use_double,
            max_threads_per_block=max_threads_per_block,
            max_blocks_per_grid=max_blocks_per_grid),
        cache_dir=cache_dir,
        options=["-I", install_headers])
    e1z_func = e1z_mod.get_function("e1z")

    e_gpu = gpuarray.empty_like(z_gpu)
    e1z_func(z_gpu.gpudata, e_gpu.gpudata, np.uint32(z_gpu.size),
             block=block_dim, grid=grid_dim)
    return e_gpu

def worker():
    comm = MPI.Comm.Get_parent()
    size = comm.Get_size()
    rank = comm.Get_rank()
    name = MPI.Get_processor_name()

    import pycuda.driver as drv
    drv.init()

    # Find maximum number of available GPUs:
    max_gpus = drv.Device.count()

    # Use modular arithmetic to avoid assigning a nonexistent GPU:
    n = rank % max_gpus
    dev = drv.Device(n)
    ctx = dev.make_context()
    atexit.register(ctx.pop)

    # Execute a kernel:
    import pycuda.gpuarray as gpuarray
    from pycuda.elementwise import ElementwiseKernel
    kernel = ElementwiseKernel('double *y, double *x, double a',
                               'y[i] = a*x[i]')
    x_gpu = gpuarray.to_gpu(np.random.rand(2))
    y_gpu = gpuarray.empty_like(x_gpu)
    kernel(y_gpu, x_gpu, np.double(2.0))

    print('I am process %d of %d on CPU %s using GPU %s of %s '
          '[x_gpu=%s, y_gpu=%s]'
          % (rank, size, name, n, max_gpus,
             str(x_gpu.get()), str(y_gpu.get())))

    comm.Disconnect()

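# Parent-side sketch for launching the worker above via MPI dynamic
# process management. Assumes this module is saved as 'worker.py' and
# that mpi4py and PyCUDA are available on every node; maxprocs is
# arbitrary here.
import sys
from mpi4py import MPI

comm = MPI.COMM_SELF.Spawn(sys.executable, args=['worker.py'], maxprocs=4)
comm.Disconnect()
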
def buffer_apply(self, input):
    # TODO: buffer apply to a large input may cause a launch timeout,
    # need to buffer in smaller chunks if this is the case
    b = self.filt_b_gpu
    a = self.filt_a_gpu
    zi = self.filt_state
    if not hasattr(self, 'filt_x_gpu') or input.size != self.filt_x_gpu.size:
        self._desiredshape = input.shape
        self._has_run_once = False
        self.filt_x_gpu = gpuarray.to_gpu(input.flatten())
        self.filt_y_gpu = gpuarray.empty_like(self.filt_x_gpu)
    else:
        self.filt_x_gpu.set(input.flatten())
    filt_x_gpu = self.filt_x_gpu
    filt_y_gpu = self.filt_y_gpu
    if self._has_run_once:
        # Kernel arguments are already bound; just relaunch.
        self.gpu_filt_func.launch_grid(*self.grid)
    else:
        self.gpu_filt_func.prepared_call(self.grid,
                                         intp(b.gpudata), intp(a.gpudata),
                                         intp(filt_x_gpu.gpudata),
                                         intp(zi.gpudata),
                                         intp(filt_y_gpu.gpudata),
                                         int32(input.shape[0]))
        self._has_run_once = True
    return reshape(filt_y_gpu.get(pagelocked=self.pagelocked_mem),
                   self._desiredshape)

def feed_forward(self, input_data, prediction=False):
    """Propagate forward through the layer.

    **Parameters:**

    input_data : ``GPUArray``
        Input data to perform dropout on.

    prediction : bool, optional
        Whether to run in prediction mode. If true, the data is scaled
        by ``1 - dropout_probability`` instead of applying dropout.

    **Returns:**

    dropout_data : ``GPUArray``
        The data after performing dropout.
    """
    assert input_data.shape[1] == self.n_in

    if not prediction:
        dropout_input = gpuarray.empty_like(input_data)
        dropout_mask = sample_dropout_mask(
            input_data, self.dropout_probability, target=dropout_input)
        return dropout_input, dropout_mask
    else:
        return (input_data * (1 - self.dropout_probability),)

def nan_to_zeros(x, target=None):
    """Replace NaN entries of `x` with zero, elementwise on the GPU."""
    assert x.flags.c_contiguous
    if target is None:
        target = gpuarray.empty_like(x)
    assert target.flags.c_contiguous
    all_kernels['nan_to_zeros'](x, target)
    return target

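# Usage sketch for nan_to_zeros, assuming the 'nan_to_zeros' elementwise
# kernel is registered in all_kernels and a CUDA context is active.
import numpy as np
import pycuda.gpuarray as gpuarray

x = gpuarray.to_gpu(np.array([1.0, np.nan, 3.0], dtype=np.float32))
y = nan_to_zeros(x)
assert np.allclose(y.get(), [1.0, 0.0, 3.0])
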
def initializeGpuMemory(self):
    K = self.modelParams["proc_id_model", "K"]

    # Sufficient statistics for the parameters of G kernels
    self.gpuPtrs["impulse_model", "nnz_Z"] = \
        gpuarray.empty((K, K), dtype=np.int32)
    self.gpuPtrs["impulse_model", "g_suff_stats"] = \
        gpuarray.empty((K, K), dtype=np.float32)
    self.gpuPtrs["impulse_model", "GS"] = \
        gpuarray.empty_like(self.base.dSS["dS"])

def gaussian_fourierkernel_elemwise(uu, vv, ww, sigma):
    """
    Create Gaussian Fourier filter kernel (elementwise CUDA implementation).
    """
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as cuda
    import pycuda.autoinit
    from pycuda.elementwise import ElementwiseKernel

    # Cast to float32 to match the kernel's float* signature.
    u_gpu = gpuarray.to_gpu(uu.astype(np.float32))
    v_gpu = gpuarray.to_gpu(vv.astype(np.float32))
    w_gpu = gpuarray.to_gpu(ww.astype(np.float32))

    # CUDA C has no '^' power operator ('^' is XOR), so the squares are
    # spelled out as products.
    norm_comb = ElementwiseKernel(
        "float s, float pi, float *u, float *v, float *w, float *z",
        "z[i] = exp(-2.0f * pi * pi * "
        "(u[i] * u[i] + v[i] * v[i] + w[i] * w[i]) * s * s)",
        "normal_combination")

    z_gpu = gpuarray.empty_like(u_gpu)
    norm_comb(np.float32(sigma), np.float32(np.pi),
              u_gpu, v_gpu, w_gpu, z_gpu)
    # The kernel already evaluates the full filter
    # exp(-2 pi^2 sigma^2 (u^2 + v^2 + w^2)), so no further host-side
    # scaling is required.
    gfilter = z_gpu.get()
    return gfilter

def main_no_tex(dtype):
    lc_kernel = get_lin_comb_kernel_no_tex((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            lc_kernel.prepared_call(x._grid, x._block,
                                    a.gpudata, x.gpudata,
                                    b.gpudata, y.gpudata,
                                    z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))

def test():
    gpu_func = getattr(cumath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))
    if complex:
        _dtypes = complex_dtypes
    else:
        _dtypes = dtypes

    for s in sizes:
        for dtype in _dtypes:
            np.random.seed(1)
            A = (np.random.random(s) * (b - a) + a).astype(dtype)
            if complex:
                A += (np.random.random(s) * (b - a) + a) * 1j

            args = gpuarray.to_gpu(A)
            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(A)

            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), (max_err, name, dtype)

            # Same computation, but into a preallocated output array.
            gpu_results2 = gpuarray.empty_like(args)
            gr2 = gpu_func(args, out=gpu_results2)
            assert gpu_results2 is gr2
            gr2 = gr2.get()
            max_err = np.max(np.abs(cpu_results - gr2))
            assert (max_err <= threshold).all(), (max_err, name, dtype)

def e1z(z_gpu):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`.

    Returns
    -------
    e_gpu : GPUArray
        GPUArray containing the exponential integrals of
        the entries of `z_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = e1z(z_gpu)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True
    """
    if z_gpu.dtype == np.complex64:
        use_double = 0
    elif z_gpu.dtype == np.complex128:
        use_double = 1
    else:
        raise ValueError('unsupported type')

    # Get block/grid sizes; the number of threads per block is limited
    # to 256 because the e1z kernel defined above uses too many
    # registers to be invoked with more threads per block:
    dev = get_current_device()
    max_threads_per_block = 256
    block_dim, grid_dim = select_block_grid_sizes(dev, z_gpu.shape,
                                                  max_threads_per_block)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    e1z_mod = SourceModule(
        e1z_mod_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    e1z_func = e1z_mod.get_function("e1z")

    e_gpu = gpuarray.empty_like(z_gpu)
    e1z_func(z_gpu, e_gpu, np.uint32(z_gpu.size),
             block=block_dim, grid=grid_dim)
    return e_gpu

def substract_matrix(a, b, target=None):
    """Elementwise a - b on the GPU (the kernel name keeps the historical
    'substract' spelling)."""
    assert a.shape == b.shape
    if target is None:
        target = gpuarray.empty_like(a)
    all_kernels['substract_matrix'](a, b, target)
    return target

def main(dtype):
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel(
        ((True, dtype, dtype), (True, dtype, dtype)), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block,
                                    x.gpudata, y.gpudata,
                                    z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))

def bind_buffers(self):
    """
    Gets allocated tensors for input and output feature maps.
    Allocates a scratch tensor for argmax indices if the op is max
    pooling, since this is required for bprop. Builds a final list
    of parameters to pass to the kernel.
    """
    I_data = self.I.value.tensor
    O_data = self.O.value.tensor

    # Allocate argmax tensor
    if self.op == "max":
        if self.index not in self.transformer.argmax_tensors:
            argmax = empty_like(self.O.value.tensor)
            self.transformer.argmax_tensors[self.index] = argmax
        else:
            argmax = self.transformer.argmax_tensors[self.index]
        A_data = argmax.gpudata
    else:
        A_data = 0

    kernel_args = self.fprop_kernel
    self.params = [kernel_args[1], kernel_args[2], None,
                   I_data.gpudata, O_data.gpudata, A_data, 1.0, 0.0, 0]
    self.params.extend(kernel_args[3])

    super(PoolFpropKernel, self).bind_buffers()

def main():
    width = 65
    height = 65
    depth = 260

    shift_x = 20
    shift_y = 17

    from pycuda.curandom import rand as curand
    a_gpu = curand((depth, height, width)).astype('complex64')
    a = a_gpu.get()

    b = np.zeros_like(a)
    b_gpu = gpuarray.to_gpu(b)

    circ_shift(a_gpu, b_gpu, shift_x, shift_y)
    b = b_gpu.get()

    assert np.all(b == np.roll(np.roll(a, shift_x, axis=2), shift_y, axis=1))

    t = time.time()
    b_gpu = gpuarray.empty_like(a_gpu)
    for i in np.arange(100):
        circ_shift(a_gpu, b_gpu, shift_x, shift_y)
    print('GPU took %.4f secs' % (time.time() - t))

    t = time.time()
    for i in np.arange(100):
        np.roll(np.roll(a, shift_x, axis=2), shift_y, axis=1)
    print('CPU took %.4f secs' % (time.time() - t))

def mult_matrix(a, b, target=None):
    """Elementwise product of two equally-shaped GPU arrays."""
    assert a.shape == b.shape
    if target is None:
        target = gpuarray.empty_like(a)
    all_kernels["mult_matrix"](a, b, target)
    return target

def run_function(X, Y_expected, func, rtol=1e-6, with_inplace_test=True,
                 **kwargs):
    # CPU, with target argument
    Y = np.empty_like(Y_expected)
    Yhr = func(X, out=Y, **kwargs)
    assert_allclose(Y_expected, Yhr, err_msg="CPU with target", rtol=rtol)
    assert Yhr is Y

    # CPU, no target argument
    Yhr = func(X, **kwargs)
    assert_allclose(Y_expected, Yhr, err_msg="CPU, no target", rtol=rtol)

    if with_inplace_test:
        X2 = X.copy()
        Yhr = func(X2, out=X2, **kwargs)
        assert_allclose(Y_expected, Yhr, err_msg="CPU, inplace target",
                        rtol=rtol)
        assert Yhr is X2

    kwargs = op.to_gpu(kwargs)

    # GPU, with target
    Xd = op.to_gpu(X)
    Yd = gpuarray.empty_like(op.to_gpu(Y_expected))
    Ydr = func(Xd, out=Yd, **kwargs)
    assert_allclose(Y_expected, op.to_cpu(Ydr), err_msg="GPU with target",
                    rtol=rtol)
    assert Ydr is Yd

    # GPU, no target
    Ydr = func(Xd, **kwargs)
    assert_allclose(Y_expected, op.to_cpu(Ydr), err_msg="GPU, no target",
                    rtol=rtol)

    if with_inplace_test:
        Ydr = func(Xd, out=Xd, **kwargs)
        assert_allclose(Y_expected, op.to_cpu(Ydr),
                        err_msg="GPU, inplace target", rtol=rtol)
        assert Ydr is Xd

def product(self, x: gpuarray.GPUArray) -> gpuarray.GPUArray:
    """Multiply sparse matrix by dense vector."""
    # Note: sizing the output with empty_like(x) implicitly assumes a
    # square matrix (m == n); the result of csrmv has length m.
    y = gpuarray.empty_like(x)
    op = cs.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE
    cs.cusparseDcsrmv(self.handle, op, self.m, self.n, self.nnz, 1.0,
                      self.descr, self.csrValA, self.csrRowPtrA,
                      self.csrColIndA, x, 0.0, y)
    return y

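# Usage sketch for product(). `mat` is a hypothetical instance of the
# class above whose CSR buffers encode a 2x2 double-precision identity
# (csrValA=[1,1], csrRowPtrA=[0,1,2], csrColIndA=[0,1]); multiplying by
# it should return the input unchanged.
import numpy as np
import pycuda.gpuarray as gpuarray

x = gpuarray.to_gpu(np.array([3.0, 4.0]))
y = mat.product(x)
assert np.allclose(y.get(), [3.0, 4.0])
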
def computeIrDensity(self, dS_gpu):
    """
    Compute the impulse response density at the time intervals in dS_gpu.
    """
    gS_gpu = gpuarray.empty_like(dS_gpu)
    gS_gpu.fill(self.params["density"])
    return gS_gpu

def __call__(self, input_ary, output_ary=None, allocator=None, stream=None):
    allocator = allocator or input_ary.allocator

    if output_ary is None:
        output_ary = input_ary

    if isinstance(output_ary, (str, six.text_type)) and output_ary == "new":
        output_ary = gpuarray.empty_like(input_ary, allocator=allocator)

    if input_ary.shape != output_ary.shape:
        raise ValueError("input and output must have the same shape")

    if not input_ary.flags.forc:
        raise RuntimeError("ScanKernel cannot "
                           "deal with non-contiguous arrays")

    n, = input_ary.shape

    if not n:
        return output_ary

    unit_size = self.scan_wg_size * self.scan_wg_seq_batches
    dev = driver.Context.get_device()
    max_groups = 3 * dev.get_attribute(
        driver.device_attribute.MULTIPROCESSOR_COUNT)

    from pytools import uniform_interval_splitting
    interval_size, num_groups = uniform_interval_splitting(
        n, unit_size, max_groups)

    block_results = allocator(self.dtype.itemsize * num_groups)
    dummy_results = allocator(self.dtype.itemsize)

    # first level scan of intervals (one interval per block)
    self.scan_intervals_knl.prepared_async_call(
        (num_groups, 1), (self.scan_wg_size, 1, 1), stream,
        input_ary.gpudata, n, interval_size,
        output_ary.gpudata, block_results)

    # second level inclusive scan of per-block results
    self.scan_intervals_knl.prepared_async_call(
        (1, 1), (self.scan_wg_size, 1, 1), stream,
        block_results, num_groups, interval_size,
        block_results, dummy_results)

    # update intervals with result of second level scan
    self.final_update_knl.prepared_async_call(
        (num_groups, 1), (self.update_wg_size, 1, 1), stream,
        output_ary.gpudata, n, interval_size, block_results)

    return output_ary

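# Usage sketch, assuming this __call__ belongs to a PyCUDA-style scan
# kernel class such as pycuda.scan.InclusiveScanKernel.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.scan import InclusiveScanKernel

knl = InclusiveScanKernel(np.int32, "a+b")
a = gpuarray.to_gpu(np.arange(10, dtype=np.int32))
knl(a)  # scans in place when no output array is given
assert (a.get() == np.cumsum(np.arange(10))).all()
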
def compute_bandwidth(self, event_hit, event_time, event_charge,
                      scale_factor=1.0):
    """Use the MC information accumulated by accumulate_moments() to
    estimate the best bandwidth to use when kernel estimating."""
    rho = 1.0

    hitcount = self.hitcount_gpu.get()
    mom0 = np.maximum(hitcount, 1)
    tmom1 = self.tmom1_gpu.get()
    tmom2 = self.tmom2_gpu.get()

    tmean = tmom1 / mom0
    tvar = np.maximum(tmom2 / mom0 - tmean**2, 0.0)  # roundoff can go neg
    trms = tvar**0.5

    if self.time_only:
        d = 1
    else:
        d = 2
    dimensionality_factor = ((4.0 / (d + 2)) /
                             (mom0 / scale_factor))**(-1.0 / (d + 4))
    # Gaussian density at the observed event time, clipped at 1/trms.
    gaussian_density = np.minimum(
        1.0 / trms,
        (1.0 / np.sqrt(2.0 * np.pi)) *
        np.exp(-0.5 * ((event_time - tmean) / trms)**2) / trms)
    time_bandwidths = dimensionality_factor / gaussian_density * rho
    inv_time_bandwidths = np.zeros_like(time_bandwidths)
    inv_time_bandwidths[time_bandwidths > 0] = \
        time_bandwidths[time_bandwidths > 0]**-1

    # precompute inverse to speed up GPU evaluation
    self.inv_time_bandwidths_gpu = ga.to_gpu(
        inv_time_bandwidths.astype(np.float32))

    # Compute charge bandwidths if needed
    if self.time_only:
        self.inv_charge_bandwidths_gpu = ga.empty_like(
            self.inv_time_bandwidths_gpu)
        self.inv_charge_bandwidths_gpu.fill(0.0)
    else:
        qmom1 = self.qmom1_gpu.get()
        qmom2 = self.qmom2_gpu.get()

        qmean = qmom1 / mom0
        qrms = (qmom2 / mom0 - qmean**2)**0.5

        gaussian_density = np.minimum(
            1.0 / qrms,
            (1.0 / np.sqrt(2.0 * np.pi)) *
            np.exp(-0.5 * ((event_charge - qmean) / qrms)**2) / qrms)
        charge_bandwidths = dimensionality_factor / gaussian_density * rho

        # precompute inverse to speed up GPU evaluation
        self.inv_charge_bandwidths_gpu = ga.to_gpu(
            (charge_bandwidths**-1).astype(np.float32))

def __init__(self, bases, pv=None, *, force=False):
    """Create a new density matrix for several qudits.

    Parameters
    ----------
    bases : list of quantumsim.bases.PauliBasis
        Dimensions of qubits in the system.
    pv : array or None
        Must be of size (2**no_qubits, 2**no_qubits). Only the upper
        triangle is relevant. If `pv` is None, create a new density
        matrix with all qubits in the ground state.
    """
    super().__init__(bases, pv, force=force)

    if pv is not None:
        if self.dim_pauli != pv.shape:
            raise ValueError(
                '`bases` Pauli dimensionality should be the same as the '
                'shape of `data` array.\n'
                ' - bases shapes: {}\n - data shape: {}'
                .format(self.dim_pauli, pv.shape))
    else:
        pv = np.zeros(self.dim_pauli, np.float64)
        ground_state_index = [pb.computational_basis_indices[0]
                              for pb in self.bases]
        pv[tuple(ground_state_index)] = 1

    if isinstance(pv, np.ndarray):
        if pv.dtype not in (np.float16, np.float32, np.float64):
            raise ValueError(
                '`pv` must have a floating-point data type, got {}'
                .format(pv.dtype))

        # Looks like there are some issues with ordering, so the line
        # below per se does not work.
        # self._data = ga.to_gpu(pv.astype(np.float64))
        self._work_data = ga.to_gpu(
            pv.reshape(pv.size, order='C').astype(np.float64))
        self._data = ga.empty(pv.shape, dtype=np.float64, order='C')
        self._data.set(self._work_data.reshape(pv.shape))
        self._work_data.gpudata.free()
    elif isinstance(pv, ga.GPUArray):
        if pv.dtype != np.float64:
            raise ValueError(
                '`pv` must have float64 data type, got {}'
                .format(pv.dtype))
        self._data = pv
    else:
        raise ValueError(
            "`pv` must be Numpy array, PyCUDA GPU array or "
            "None, got type `{}`".format(type(pv)))

    self._data.gpudata.size = self._data.nbytes
    self._work_data = ga.empty_like(self._data)
    self._work_data.gpudata.size = self._work_data.nbytes

def adam_var(var: gpuarray.GPUArray, grad: gpuarray.GPUArray, d2,
             out: gpuarray.GPUArray = None):
    """Update the Adam second-moment (variance) estimate with decay d2."""
    adam_var_func = (adam_var_float_ker if var.dtype == np.float32
                     else adam_var_double_ker)
    if out is None:
        out = gpuarray.empty_like(var)
    adam_var_func(out, var, grad, var.dtype.type(d2))
    return out

def adam_mean(mean: gpuarray.GPUArray, grad: gpuarray.GPUArray, d1,
              out: gpuarray.GPUArray = None):
    """Update the Adam first-moment (mean) estimate with decay d1."""
    adam_mean_func = (adam_mean_float_ker if mean.dtype == np.float32
                      else adam_mean_double_ker)
    if out is None:
        out = gpuarray.empty_like(mean)
    adam_mean_func(out, mean, grad, mean.dtype.type(d1))
    return out

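# Usage sketch: one pair of Adam-style moment updates on the GPU,
# assuming the adam_*_ker elementwise kernels defined elsewhere in this
# module implement the usual exponential moving averages; d1 and d2 are
# the first/second-moment decay parameters.
import numpy as np
import pycuda.gpuarray as gpuarray

grad = gpuarray.to_gpu(np.random.randn(100).astype(np.float32))
mean = gpuarray.zeros_like(grad)
var = gpuarray.zeros_like(grad)
mean = adam_mean(mean, grad, d1=0.1)
var = adam_var(var, grad, d2=0.001)
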
def softmax(mat, tmp=None):
    if tmp is None:
        tmp = gpuarray.empty_like(mat)

    # softmax(x) = exp(x - logsumexp(x)); subtracting the log-sum-exp
    # first keeps the exponentials numerically stable.
    L = logsumexp(mat, tmp)
    add_vec_to_mat(mat, L, target=tmp, substract=True)
    exp_func.prepared_async_call(tmp._grid, tmp._block, None,
                                 tmp.gpudata, tmp.gpudata, tmp.mem_size)
    return tmp

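# Usage sketch for softmax, assuming the logsumexp/add_vec_to_mat
# helpers and exp_func kernel defined elsewhere in this module operate
# row-wise; rows of the result should then sum to one.
import numpy as np
import pycuda.gpuarray as gpuarray

logits = gpuarray.to_gpu(np.random.randn(4, 10).astype(np.float32))
probs = softmax(logits).get()
assert np.allclose(probs.sum(axis=1), 1.0, atol=1e-5)
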
def test_reorderrows():
    n = 1270
    X = 5 * np.random.randn(n, 1000).astype(np.float32)
    idx = list(range(X.shape[0]))
    np.random.shuffle(idx)

    Xd = op.to_gpu(X)
    Xoutd = gpuarray.empty_like(Xd)

    op.reorder_rows(Xd, idx, Xoutd)
    assert_allclose(X[idx], Xoutd.get())
    assert_allclose(X[idx], op.reorder_rows(X, idx))

def df_relu(x):
    assert x.flags.c_contiguous
    df = gpuarray.empty_like(x)
    if x.dtype == np.dtype(np.float32):
        df_relu_kernel_float(x, df)
    elif x.dtype == np.dtype(np.float64):
        df_relu_kernel_double(x, df)
    else:
        raise ValueError("Incompatible dtype")
    return df

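# Usage sketch for df_relu: the ReLU derivative is 1 where the input is
# positive and 0 elsewhere (assuming the df_relu_kernel_* elementwise
# kernels defined in this module implement that step function).
import numpy as np
import pycuda.gpuarray as gpuarray

x = gpuarray.to_gpu(np.array([-2.0, 0.5, 3.0], dtype=np.float32))
print(df_relu(x).get())  # expected: [0. 1. 1.]
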
def conj(x_gpu, overwrite=True):
    """
    Complex conjugate.

    Compute the complex conjugate of the array in device memory.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array of shape `(m, n)`.
    overwrite : bool
        If true (default), save the result in the specified array.
        If false, return the result in a newly allocated array.

    Returns
    -------
    xc_gpu : pycuda.gpuarray.GPUArray
        Conjugate of the input array. If `overwrite` is true, the
        returned matrix is the same as the input array.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex64)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = linalg.conj(x_gpu)
    >>> np.all(x == np.conj(y_gpu.get()))
    True
    """
    # Don't attempt to process non-complex matrix types:
    if x_gpu.dtype in [np.float32, np.float64]:
        return x_gpu

    try:
        func = conj.cache[x_gpu.dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = el.ElementwiseKernel(
            "{ctype} *x, {ctype} *y".format(ctype=ctype),
            "y[i] = conj(x[i])")
        conj.cache[x_gpu.dtype] = func

    if overwrite:
        func(x_gpu, x_gpu)
        return x_gpu
    else:
        y_gpu = gpuarray.empty_like(x_gpu)
        func(x_gpu, y_gpu)
        return y_gpu

# Per-dtype kernel cache used above; must exist before the first call.
conj.cache = {}
