def updateGeometry(self):
    # GPU storage
    self._psi = gpuarray.zeros(self.shape, dtype=np.complex64)
    self._phi = gpuarray.zeros(self.shape, dtype=np.uint8)
    self._theta = gpuarray.zeros(self.shape, dtype=np.float32)
    self._rho = gpuarray.zeros(self.shape, dtype=np.float32)
    self._ex = gpuarray.zeros(self.width, dtype=np.complex64)
    self._ey = gpuarray.zeros(self.height, dtype=np.complex64)
    # Geometry
    x = gpuarray.arange(self.width, dtype=np.float32).astype(np.complex64)
    y = gpuarray.arange(self.height, dtype=np.float32).astype(np.complex64)
    alpha = np.cos(np.radians(self.phis)).astype(np.float32)
    x = alpha * (x - self.xs)
    y = y - self.ys
    self._iqx = 1j * self.qprp * x
    self._iqy = 1j * self.qprp * y
    self._iqxz = 1j * self.qpar * x * x
    self._iqyz = 1j * self.qpar * y * y
    self.outeratan2f(y.real, x.real, self._theta)
    self.outerhypot(y.real, x.real, self._rho)
    self._rho = self.qprp * self._rho
    # CPU versions
    self.phi = np.zeros(self.shape, dtype=np.uint8)
    self.iqx = self._iqx.get()
    self.iqy = self._iqy.get()
    self.theta = self._theta.get()
    self.qr = self._rho.get()
    self.sigUpdateGeometry.emit()
def test_take(self):
    idx = gpuarray.arange(0, 10000, 2, dtype=np.uint32)
    for dtype in [np.float32, np.complex64]:
        a = gpuarray.arange(0, 600000, dtype=np.uint32).astype(dtype)
        a_host = a.get()
        result = gpuarray.take(a, idx)
        assert (a_host[idx.get()] == result.get()).all()
def test_fmod(self):
    """tests if the fmod function works"""
    for s in sizes:
        a = gpuarray.arange(s, dtype=np.float32) / 10
        a2 = gpuarray.arange(s, dtype=np.float32) / 45.2 + 0.1
        b = cumath.fmod(a, a2)
        a = a.get()
        a2 = a2.get()
        b = b.get()
        for i in range(s):
            assert math.fmod(a[i], a2[i]) == b[i]
def test_ldexp(self):
    """tests if the ldexp function works"""
    for s in sizes:
        a = gpuarray.arange(s, dtype=np.float32)
        a2 = gpuarray.arange(s, dtype=np.float32) * 1e-3
        b = cumath.ldexp(a, a2)
        a = a.get()
        a2 = a2.get()
        b = b.get()
        for i in range(s):
            assert math.ldexp(a[i], int(a2[i])) == b[i]
def updateGeometry(self):
    shape = (self.w, self.h)
    self._psi = gpuarray.zeros(shape, dtype=np.complex64)
    self._phi = gpuarray.zeros(shape, dtype=np.uint8)
    self.phi = np.zeros(shape, dtype=np.uint8)
    self._ex = gpuarray.zeros(self.w, dtype=np.complex64)
    self._ey = gpuarray.zeros(self.h, dtype=np.complex64)
    qx = gpuarray.arange(self.w, dtype=np.float32).astype(np.complex64)
    qy = gpuarray.arange(self.h, dtype=np.float32).astype(np.complex64)
    qx = self.qpp * (qx - self.rs.x())
    qy = self.alpha * self.qpp * (qy - self.rs.y())
    self.iqx = 1j * qx
    self.iqy = 1j * qy
    self.iqxsq = 1j * qx * qx
    self.iqysq = 1j * qy * qy
def add_ref_image(self, ref_image):
    """Add a reference image to the array of reference images used
    for reconstruction"""
    if not self.initialised:
        self._init(ref_image)
    # Hash the image to check for uniqueness:
    imhash = hash(ref_image.tobytes())
    if imhash in self.ref_image_hashes:
        # Ignore duplicate image
        return
    if self.n_ref_images < self.max_ref_images:
        self.ref_image_hashes.append(imhash)
    else:
        self.ref_image_hashes[self.next_ref_image_index] = imhash
    self.n_ref_images = len(self.ref_image_hashes)
    # Send flattened, double precision reference image to the GPU:
    gpu_ref_image = gpuarray.to_gpu(ref_image.flatten().astype(float))
    # Compute 1D indices for the location in BT
    # where the new reference image will be inserted:
    start_index = self.n_pixels * self.next_ref_image_index
    stop_index = start_index + self.n_pixels
    insertion_indices = gpuarray.arange(start_index, stop_index, 1, dtype=int)
    # Insert the new image into the correct row of gpu_BT.
    skcuda.misc.set_by_index(self.BT_gpu, insertion_indices, gpu_ref_image)
    # Move our index along by one for where the next reference image will go:
    self.next_ref_image_index += 1
    # Wrap around to overwrite oldest images:
    self.next_ref_image_index %= self.max_ref_images
def bptrs(a):
    """Pointer array when input represents a batch of matrices."""
    return gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0],
                           a.strides[0], dtype=cublas.ctypes.c_void_p)
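# A minimal usage sketch for the bptrs helper above, assuming a contiguous
# (batch, rows, cols) gpuarray: each entry of the returned pointer array
# should equal the device address of one matrix in the batch, which is the
# layout that batched cuBLAS routines consume.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

batch = gpuarray.zeros((4, 3, 3), dtype=np.float32)
ptrs = bptrs(batch).get()
for k in range(batch.shape[0]):
    assert int(ptrs[k]) == batch.ptr + k * batch.strides[0]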
def particle_indices_of_slice(self, slice_index):
    '''Return an array of particle indices which are located in the
    slice defined by the given slice_index.
    '''
    return gpuarray.arange(self.lower_bounds[slice_index],
                           self.upper_bounds[slice_index] + 1,
                           dtype=np.int32)
def particles_within_cuts(self):
    '''All particle indices which are situated within the slicing
    region defined by [z_cut_tail, z_cut_head).'''
    particles_within_cuts_ = gpuarray.arange(self.lower_bounds[0],
                                             self.upper_bounds[-1] + 1,
                                             dtype=np.int32)
    return particles_within_cuts_
def test_sum_allocator(self):
    # FIXME
    from pytest import skip
    skip("https://github.com/inducer/pycuda/issues/163")
    # crashes with terminate called after throwing an instance of 'pycuda::error'
    # what(): explicit_context_dependent failed: invalid device context -
    # no currently active context?
    import pycuda.tools
    pool = pycuda.tools.DeviceMemoryPool()
    rng = np.random.randint(low=512, high=1024)
    a = gpuarray.arange(rng, dtype=np.int32)
    b = gpuarray.sum(a)
    c = gpuarray.sum(a, allocator=pool.allocate)
    # Test that we get the correct results
    assert b.get() == rng * (rng - 1) // 2
    assert c.get() == rng * (rng - 1) // 2
    # Test that result arrays were allocated with the appropriate allocator
    assert b.allocator == a.allocator
    assert c.allocator == pool.allocate
def test_arange(self):
    """tests the arange function"""
    a = gpuarray.arange(12)
    res = a.get()
    for i in range(12):
        self.assertTrue(res[i] == i)
def set_buffer(self, buf):
    x, y, z = buf.shape
    self.data = buf
    self.bptrs = gpuarray.arange(buf.ptr, buf.ptr + x * y * z * 4,
                                 y * z * 4,
                                 dtype=cublas.ctypes.c_void_p).gpudata
    self.data_ptr = gpu_ptr(buf)
def test_abs(self):
    """test if the abs function works"""
    a = gpuarray.arange(111)
    a = a * -1
    res = a.get()
    for i in range(111):
        self.assertTrue(res[i] <= 0)
    a = abs(a)
    res = a.get()
    for i in range(111):
        self.assertTrue(res[i] >= 0)
        self.assertTrue(res[i] == i)
    for i in range(100, 200):
        a = gpuarray.arange(500 * i)
        self.assertTrue(a[len(a) - 1] == len(a) - 1)
def test():
    gpu_func = getattr(cumath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))
    for s in sizes:
        for dtype in dtypes:
            args = gpuarray.arange(a, b, (b - a) / s, dtype=np.float32)
            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())
            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), \
                (max_err, name, dtype)
def test_abs(self):
    a = -gpuarray.arange(111, dtype=np.float32)
    res = a.get()
    for i in range(111):
        assert res[i] <= 0
    a = abs(a)
    res = a.get()
    for i in range(111):
        assert abs(res[i]) >= 0
        assert res[i] == i
def test_frexp(self):
    """tests if the frexp function works"""
    for s in sizes:
        a = gpuarray.arange(s, dtype=np.float32) / 10
        significands, exponents = cumath.frexp(a)
        a = a.get()
        significands = significands.get()
        exponents = exponents.get()
        for i in range(s):
            sig_true, ex_true = math.frexp(a[i])
            assert sig_true == significands[i]
            assert ex_true == exponents[i]
def test_modf(self):
    """tests if the modf function works"""
    for s in sizes:
        a = gpuarray.arange(s, dtype=np.float32) / 10
        fracpart, intpart = cumath.modf(a)
        a = a.get()
        intpart = intpart.get()
        fracpart = fracpart.get()
        for i in range(s):
            fracpart_true, intpart_true = math.modf(a[i])
            assert intpart_true == intpart[i]
            assert abs(fracpart_true - fracpart[i]) < 1e-4
def test_sum_allocator(self):
    import pycuda.tools
    pool = pycuda.tools.DeviceMemoryPool()
    rng = np.random.randint(low=512, high=1024)
    a = gpuarray.arange(rng, dtype=np.int32)
    b = gpuarray.sum(a)
    c = gpuarray.sum(a, allocator=pool.allocate)
    # Test that we get the correct results
    assert b.get() == rng * (rng - 1) // 2
    assert c.get() == rng * (rng - 1) // 2
    # Test that result arrays were allocated with the appropriate allocator
    assert b.allocator == a.allocator
    assert c.allocator == pool.allocate
def slice(self, beam, *args, **kwargs):
    '''Return a SliceSet object according to the saved configuration.
    Generate it using the keywords of the
    self.compute_sliceset_kwargs(beam) method.
    Defines interface to create SliceSet instances (factory method).

    Sort beam attributes by slice indices.

    Arguments:
    - statistics=True attaches mean values, standard deviations and
      emittances to the SliceSet for all planes.
    - statistics=['mean_x', 'sigma_dp', 'epsn_z'] only adds the listed
      statistics values (can be used to save time). Valid list entries
      are all statistics functions of Particles.
    '''
    sliceset_kwargs = self.compute_sliceset_kwargs(beam)
    slice_index_of_particle = sliceset_kwargs['slice_index_of_particle']
    sorting_permutation = gpuarray.zeros(beam.macroparticlenumber,
                                         dtype=np.int32)
    # also resorts slice_index_of_particle:
    get_sort_perm_int(slice_index_of_particle, sorting_permutation)
    beam.reorder(sorting_permutation)
    del sorting_permutation
    lower_bounds = gpuarray.empty(self.n_slices, dtype=np.int32)
    upper_bounds = gpuarray.empty(self.n_slices, dtype=np.int32)
    seq = gpuarray.arange(self.n_slices, dtype=np.int32)
    lower_bound_int(slice_index_of_particle, seq, lower_bounds)
    upper_bound_int(slice_index_of_particle, seq, upper_bounds)
    del seq
    sliceset_kwargs['beam_parameters'] = (
        self.extract_beam_parameters(beam))
    sliceset_kwargs['beam_parameters'].update({
        'lower_bounds': lower_bounds,
        'upper_bounds': upper_bounds
    })
    sliceset = MeshSliceSet(**sliceset_kwargs)
    if 'statistics' in kwargs:
        self.add_statistics(sliceset, beam, kwargs['statistics'],
                            lower_bounds, upper_bounds)
    return sliceset
def compute_sliceset_kwargs(self, beam):
    '''Return argument dictionary to create a new SliceSet
    according to the saved configuration for
    uniformly binned SliceSet objects.
    '''
    z_cut_tail, z_cut_head = self.get_long_cuts(beam)
    slice_width = (z_cut_head - z_cut_tail) / float(self.n_slices)
    z_bins = gpuarray.arange(z_cut_tail, z_cut_head + 1e-7 * slice_width,
                             slice_width, dtype=np.float64)
    slice_index_of_particle = self.mesh.get_node_ids(beam.z)
    return dict(z_bins=z_bins,
                slice_index_of_particle=slice_index_of_particle,
                mode=self.mode,
                mesh=self.mesh,
                context=self._context)
def run_tests(timer, scale_factor):
    """PyCUDA port of time_test3.pro"""
    # nofileio = True

    # Initialize linear algebra extensions to PyCUDA
    scikits.cuda.linalg.init()

    # initialize time
    timer.reset()

    #
    # khughitt (2011/04/04): Non-CUDA tests from above will go here...
    #

    #
    # Begin CUDA tests
    #
    siz = int(384 * math.sqrt(scale_factor))

    # a = curandom.rand((siz,siz), dtype=np.int32)
    a = curandom.rand((siz, siz))

    timer.reset()

    # Test 17 - Transpose byte array, TRANSPOSE function
    for i in range(100):
        b = scikits.cuda.linalg.transpose(a, pycuda.autoinit.device)
    timer.log('Transpose %d^2 byte, TRANSPOSE function x 100' % siz)

    n = 2**(17 * scale_factor)
    a = gpuarray.arange(n, dtype=np.float32)
    timer.reset()

    # Test 20 - Forward and inverse FFT
    b = scikits.cuda.fft.fft(a)
    b = scikits.cuda.fft.ifft(b)
    timer.log('%d point forward plus inverse FFT' % n)
ndev = 2
devlist = range(ndev)

# Setup the pycuda side
drv.init()
ctxs = [drv.Device(i).retain_primary_context() for i in devlist]

# Setup the communicator object
nc = NCCLComm(devlist)

# Now create gpuarrays for sending/recv buffers
srcs, dsts, size = [], [], 10

# Create some test arrays
for ctx in ctxs:
    ctx.push()
    srcs.append(gpuarray.arange(100, 200, size, dtype='<f4'))
    dsts.append(gpuarray.zeros((size,), dtype='<f4'))
    ctx.pop()

# Perform the reduction
nc.all_reduce(size, srcs, dsts)
nc.sync()

# Look at the results
for c, i, o in zip(ctxs, srcs, dsts):
    c.push()
    print(i.get())
    print(o.get())
    c.pop()
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath
from pycuda.elementwise import ElementwiseKernel
import pycuda as cuda
import pycuda.autoinit
import numpy as np
from time import time

# is_equal = ElementwiseKernel("unsigned int *x, unsigned int *y, bool *z", "z[i] = x[i] == y[i]", "is_equal")
# is_equal = ElementwiseKernel("bool *z", "z[i] = 0", "equality_checker")
eq_checker = ElementwiseKernel("int *x, int *y, int *z",
                               "z[i] = x[i] == y[i]",
                               "equality_checker")
# modulo = ElementwiseKernel("int *x, int *y, int *z", "z[i] = x[i] % y[i]", "modulo")

eq_gpu = gpuarray.to_gpu(np.empty(num_primes, dtype=np.int32))
a_gpu = gpuarray.arange(5, dtype=np.int32)
b_gpu = gpuarray.arange(1, 6, dtype=np.int32)
eq_checker(a_gpu, b_gpu, eq_gpu)

limit = 100
# limit = int(input('Limit: '))
start_time = time()
with open('primes1.txt') as f:
    primes = np.fromiter(map(int, f.read().strip().split(',')),
                         dtype=np.uint32)
p_gpu = gpuarray.to_gpu(primes)
antiprimes = []
most = 0
for x in gpuarray.arange(2, limit + 1, dtype=np.uint32):
    print(x)
def test_arange(self):
    a = gpuarray.arange(12, dtype=np.float32)
    assert (np.arange(12, dtype=np.float32) == a.get()).all()
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
from pycuda.curandom import rand as curand
import numpy

n = 1000

sieveKernel = ElementwiseKernel(
    "int *a",
    "if(i > 1){for(int j = 2; i * j < n; j++){a[i*j] = 0;}}",
    "sieve")

a_gpu = gpuarray.arange(n, dtype=numpy.int32)
# print(a_gpu)
sieveKernel(a_gpu)
# print("-" * 80)
# print(a_gpu)
# print("-" * 80)

print('[ ', end="")
for i in a_gpu.get():
    if i > 1:
        print(i, end=" ")
print("\b]")
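# Inside an ElementwiseKernel body, `n` is the total element count and is
# supplied automatically by PyCUDA, which is why the sieve source above can
# use it without declaring it as an argument. A minimal host-side cross-check,
# assuming the snippet above has already run and a_gpu holds the sieved array:
import numpy as np

def cpu_sieve(limit):
    marks = np.arange(limit)
    for i in range(2, limit):
        for j in range(2, limit):
            if i * j >= limit:
                break
            marks[i * j] = 0
    return [int(v) for v in marks if v > 1]

assert cpu_sieve(1000) == [int(v) for v in a_gpu.get() if v > 1]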
c = 5*a+6*b
e = time()
print('cpu elapsed time: %f \n' % (e-s))

###################
# 4) map/reduce kernel
print('\n map/reduce kernel\n')
print('--------------------\n')

from pycuda.reduction import ReductionKernel

sz = 7000

# on device
a = gpuarray.arange(sz, dtype=numpy.float32)
b = gpuarray.arange(sz, dtype=numpy.float32)

krnl = ReductionKernel(numpy.float32, neutral="0",
                       reduce_expr="a+2*b+b*b",
                       map_expr="x[i] + y[i]",
                       arguments="float *x, float *y")

# device perf
s = time()
my_dot_prod = krnl(a, b).get()
e = time()
print('kernel time: %f' % (e-s))

# on host
a2 = arange(sz, dtype=numpy.float32)
b2 = arange(sz, dtype=numpy.float32)
                     dtype=dtype)
a = gpuarray.to_gpu(h_array)
print('a:\n{0}\nshape={1}\n'.format(a.get(), a.shape))

stream = drv.Stream()
b = gpuarray.to_gpu_async(h_array, stream=stream)
print('b:\n{0}\nshape={1}\n'.format(b.get(), b.shape))

c = gpuarray.empty((100, 100), dtype=dtype)
print('c:\n{0}\nshape={1}\n'.format(c, c.shape))

d = gpuarray.zeros((100, 100), dtype=dtype)
print('d:\n{0}\nshape={1}\n'.format(d, d.shape))

e = gpuarray.arange(0.0, 100.0, 1.0, dtype=dtype)
print('e:\n{0}\nshape={1}\n'.format(e, e.shape))

f = gpuarray.if_positive(e < 50, e - 100, e + 100)
print('f:\n{0}\nshape={1}\n'.format(f, f.shape))

g = gpuarray.if_positive(e < 50, gpuarray.ones_like(e),
                         gpuarray.zeros_like(e))
print('g:\n{0}\nshape={1}\n'.format(g, g.shape))

h = gpuarray.maximum(e, f)
print('h:\n{0}\nshape={1}\n'.format(h, h.shape))

i = gpuarray.minimum(e, f)
print('i:\n{0}\nshape={1}\n'.format(i, i.shape))

g = gpuarray.sum(a)
def test_take(self):
    idx = gpuarray.arange(0, 200000, 2, dtype=np.uint32)
    a = gpuarray.arange(0, 600000, 3, dtype=np.float32)
    result = gpuarray.take(a, idx)
    assert ((3 * idx).get() == result.get()).all()
time = int(round(22.0 * xres / yres))
# print time
xlen = time / float(size[0])
# print xlen

# initialize out
out = np.zeros(0)

# rgb aliases
r = 0
g = 1
b = 2

for x in range(xres):
    # float32 degrades quality, but float64 is not supported by all GPUs
    t_gpu = gpuarray.arange(x * xlen, x * xlen + xlen, 1. / 44100,
                            dtype=np.float32)
    tone_gpu = gpuarray.zeros(t_gpu.size, dtype=np.float32)
    print("{0}%".format(round(100.0 * x / xres, 2)))
    for y in range(yres):
        p = d[x + xres * y]
        # keep playing with these values
        amplitude = 10**(1 - 5.25 + 4.25 * (p[r] + p[g] + p[b]) / (255 * 3))
        # print amplitude, math.log(amplitude+1)
        # amplitude = math.log(amplitude+1)  # / math.log(255)
        # print x, y, amplitude
        if p[r] > 10 or p[g] > 10 and p[b] > 10:
            tone_gpu += oscillator(t_gpu, amp=amplitude,
                                   # amp=(p[r]+p[g]+p[b]),
                                   freq=yscale * (yres - y))
    tone_gpu = tone_gpu + 1
# @copyright: https://gitee.com/weili_yzzcq/C-and-C-plus-plus/CUDA_CPlusPlus/
# @copyright: https://github.com/2694048168/C-and-C-plus-plus/CUDA_CPlusPlus/
# @function: PyCUDA advanced kernel functions: the reduction kernel.

import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import numpy
from pycuda.reduction import ReductionKernel
import pycuda.autoinit

n = 5

start = drv.Event()
end = drv.Event()
start.record()

d_a = gpuarray.arange(n, dtype=numpy.uint32)
d_b = gpuarray.arange(n, dtype=numpy.uint32)
kernel = ReductionKernel(numpy.uint32, neutral="0", reduce_expr="a+b",
                         map_expr="d_a[i]*d_b[i]",
                         arguments="int *d_a, int *d_b")
d_result = kernel(d_a, d_b).get()

end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3

print("Vector A")
print(d_a)
print("Vector B")
print(d_b)
def gpu_rfftfreq(n, d=1.0, result=None):
    factor = 1 / (d * n)
    result = factor * gpuarray.arange(0, n // 2 + 1,
                                      dtype=bm.precision.real_t).get()
    return result
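# gpu_rfftfreq above mirrors numpy.fft.rfftfreq: it returns the frequencies
# k / (d * n) for k = 0 .. n // 2. A minimal check of the same arithmetic,
# assuming bm.precision.real_t corresponds to np.float64:
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

n, d = 16, 0.5
freqs = (1 / (d * n)) * gpuarray.arange(0, n // 2 + 1, dtype=np.float64).get()
assert np.allclose(freqs, np.fft.rfftfreq(n, d))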
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.reduction import ReductionKernel

vector_length = 400

input_vector_a = gpuarray.arange(vector_length, dtype=numpy.int32)
input_vector_b = gpuarray.arange(vector_length, dtype=numpy.int32)

dot_product = ReductionKernel(numpy.int32, arguments="int *x, int *y",
                              map_expr="x[i]*y[i]", reduce_expr="a+b",
                              neutral="0")
dot_product = dot_product(input_vector_a, input_vector_b).get()

print("INPUT MATRIX A")
print(input_vector_a)
print("INPUT MATRIX B")
print(input_vector_b)
print("RESULT DOT PRODUCT OF A * B")
print(dot_product)
import pycuda.reduction as rd
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy as np

a = gpuarray.arange(400, dtype=np.float32)
b = gpuarray.arange(400, dtype=np.float32)

krnl = rd.ReductionKernel(np.float32, neutral='0',
                          reduce_expr='a+b', map_expr='x[i]*y[i]',
                          arguments='float *x, float *y')
my_dot_prod = krnl(a, b).get()

print(my_dot_prod)
print(np.sum(np.arange(400)**2))
def compute_pi(n):
    h = 1.0 / n
    x = h * (ga.arange(1, n, dtype=np.float32) + 0.5)
    s = ga.sum(4.0 / (1.0 + x**2), dtype=np.float32)
    return s.get() * h
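# compute_pi above evaluates the midpoint rule for the integral of
# 4 / (1 + x^2) over [0, 1], which equals pi. A minimal NumPy reference for
# the same sum (a sketch for comparison only; note that the GPU version
# starts its sample points at index 1 rather than 0):
import numpy as np

def compute_pi_cpu(n):
    h = 1.0 / n
    x = h * (np.arange(n, dtype=np.float64) + 0.5)
    return float(np.sum(4.0 / (1.0 + x**2)) * h)

assert abs(compute_pi_cpu(1_000_000) - np.pi) < 1e-7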
# MapReduce operations in parallel on the GPU

# Packages
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.reduction import ReductionKernel

# Vector length
vector_length = 400

# Vectors A and B
input_vector_a = gpuarray.arange(vector_length, dtype=numpy.int32)
input_vector_b = gpuarray.arange(vector_length, dtype=numpy.int32)

# Parallel reduction operation
dot_product = ReductionKernel(numpy.int32, arguments="int *x, int *y",
                              map_expr="x[i]*y[i]", reduce_expr="a+b",
                              neutral="0")

# Kernel execution
dot_product = dot_product(input_vector_a, input_vector_b).get()

# Print the results
print("Matrix A")
print(input_vector_a)
print("Matrix B")
print(input_vector_b)
        if stats_callback is not None:
            stats_callback(size, self,
                           kernel_rec.kernel.prepared_timed_call(
                               vectors[0]._grid, results[0]._block, *args))
        else:
            kernel_rec.kernel.prepared_async_call(
                vectors[0]._grid, results[0]._block, self.stream, *args)
        return results


if __name__ == "__main__":
    test_dtype = numpy.float32

    import pycuda.autoinit
    from pymbolic import parse
    expr = parse("2*x+3*y+4*z")
    print(expr)

    cexpr = CompiledVectorExpression(expr,
                                     lambda expr: (True, test_dtype),
                                     test_dtype)

    from pymbolic import var
    ctx = {
        var("x"): gpuarray.arange(5, dtype=test_dtype),
        var("y"): gpuarray.arange(5, dtype=test_dtype),
        var("z"): gpuarray.arange(5, dtype=test_dtype),
    }

    print(cexpr(lambda expr: ctx[expr]))
from pycuda.reduction import ReductionKernel
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy

a = gpuarray.arange(400, dtype=numpy.float32)
b = gpuarray.arange(400, dtype=numpy.float32)

dot = ReductionKernel(dtype_out=numpy.float32, neutral="0",
                      reduce_expr="a+b", map_expr="x[i]*y[i]",
                      arguments="const float *x, const float *y")

a_dot_b = dot(a, b).get()
a_dot_b_cpu = numpy.dot(a.get(), b.get())
def arange(self, *args, **kwargs):
    return gpuarray.arange(*args, **kwargs)