def main_no_tex(dtype):
    """Benchmark the texture-free linear-combination kernel.

    For every power-of-two length from 2**10 through 2**25 the kernel is
    launched 20 times on freshly generated random vectors, and one line
    ``<size> <size_exp> <elapsed>`` is printed, where ``elapsed`` is the value
    of ``stop.time_since(start)`` for the whole batch of 20 launches.

    :arg dtype: numpy dtype used for the two scalars and all vectors.
    """
    lc_kernel = get_lin_comb_kernel_no_tex(
        ((True, dtype, dtype), (True, dtype, dtype)), dtype)

    # Imported once instead of on every loop iteration.
    from pycuda.curandom import rand

    for size_exp in range(10, 26):
        size = 1 << size_exp

        # Scalar coefficients live on the device as 0-d arrays.
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for _ in range(20):
            lc_kernel.prepared_call(
                x._grid, x._block,
                a.gpudata, x.gpudata,
                b.gpudata, y.gpudata,
                z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        # A single pre-formatted string prints identically on Python 2 and 3;
        # the former ``print a, b, c`` statement is a SyntaxError on Python 3.
        print("%s %s %s" % (size, size_exp, stop.time_since(start)))
def main_no_tex(dtype):
    """Time 20 launches of the no-texture linear-combination kernel per size.

    Sizes run over the powers of two 2**10 .. 2**25.  For each size the
    vector length, its exponent and ``stop.time_since(start)`` are printed.

    :arg dtype: numpy dtype shared by the scalars and the vectors.
    """
    term_spec = (True, dtype, dtype)
    lc_kernel = get_lin_comb_kernel_no_tex((term_spec, term_spec), dtype)

    from pycuda.curandom import rand

    for exponent in range(10, 26):
        length = 1 << exponent

        coeff_a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        vec_x = rand(length, dtype=dtype)
        coeff_b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        vec_y = rand(length, dtype=dtype)
        result = gpuarray.empty_like(vec_x)

        t_begin, t_end = drv.Event(), drv.Event()
        t_begin.record()

        launches = 0
        while launches < 20:
            lc_kernel.prepared_call(
                vec_x._grid, vec_x._block,
                coeff_a.gpudata, vec_x.gpudata,
                coeff_b.gpudata, vec_y.gpudata,
                result.gpudata, vec_x.mem_size)
            launches += 1

        t_end.record()
        t_end.synchronize()

        print(length, exponent, t_end.time_since(t_begin))
def run(self):
    """Run ``N_ITERS`` diffusion steps, printing the grid after each one.

    Every step launches ``self.diffusion`` on the current grid, swaps the
    input and output grids, and draws a fresh set of random numbers and
    random x/y coordinates for the next step.
    """
    print('Starting grid: \n', self.grid_gpu)

    completed = 0
    while completed < N_ITERS:
        launch_config = dict(
            # grid of n_blocks x n_blocks
            grid=(self.n_blocks, self.n_blocks, 1),
            # block of n_threads x n_threads
            block=(self.n_threads, self.n_threads, 1),
        )
        self.diffusion(
            self.grid_gpu,               # input
            self.new_grid,               # output
            self.randoms,                # random numbers
            self.random_x_coordinates,   # x coordinates
            self.random_y_coordinates,   # y coordinates
            **launch_config
        )

        # The freshly written grid becomes the input of the next step.
        self.grid_gpu, self.new_grid = self.new_grid, self.grid_gpu

        # Redraw the random inputs (same order as before: values, x, y).
        self.randoms = curandom.rand((self.size, self.size))
        x_draw = curandom.rand((self.size, self.size))
        self.random_x_coordinates = (x_draw * self.size).astype(np.int32)
        y_draw = curandom.rand((self.size, self.size))
        self.random_y_coordinates = (y_draw * self.size).astype(np.int32)

        completed += 1
        print('\nGrid after iteration {}: \n{}'.format(completed,
                                                       self.grid_gpu))

    print('\nFinal grid: \n', self.grid_gpu)
def main(dtype):
    """Benchmark the texture-based linear-combination kernel.

    For every power-of-two length from 2**10 through 2**25 the two scalar
    coefficients are bound to the kernel's texture references and the kernel
    is launched, 20 times in a row.  One line ``<size> <size_exp> <elapsed>``
    is printed per size, where ``elapsed`` is ``stop.time_since(start)`` for
    the whole batch.

    :arg dtype: numpy dtype used for the two scalars and all vectors.
    """
    from pycuda.elementwise import get_linear_combination_kernel
    # Imported once instead of on every loop iteration.
    from pycuda.curandom import rand

    lc_kernel, lc_texrefs = get_linear_combination_kernel(
        ((True, dtype, dtype), (True, dtype, dtype)), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for _ in range(20):
            # The coefficients reach the kernel through texture references,
            # so they are (re)bound before every launch.
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(
                x._grid, x._block,
                x.gpudata, y.gpudata, z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        # A single pre-formatted string prints identically on Python 2 and 3;
        # the former ``print a, b, c`` statement is a SyntaxError on Python 3.
        print("%s %s %s" % (size, size_exp, stop.time_since(start)))
def main(dtype):
    """Time 20 launches of the texture-based linear-combination kernel.

    Sizes run over the powers of two 2**10 .. 2**25.  For each size the
    vector length, its exponent and ``stop.time_since(start)`` are printed.

    :arg dtype: numpy dtype shared by the scalars and the vectors.
    """
    from pycuda.elementwise import get_linear_combination_kernel

    term_spec = (True, dtype, dtype)
    lc_kernel, lc_texrefs = get_linear_combination_kernel(
        (term_spec, term_spec), dtype)

    from pycuda.curandom import rand

    for exponent in range(10, 26):
        length = 1 << exponent

        coeff_a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        vec_x = rand(length, dtype=dtype)
        coeff_b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        vec_y = rand(length, dtype=dtype)
        result = gpuarray.empty_like(vec_x)

        t_begin, t_end = drv.Event(), drv.Event()
        t_begin.record()

        launches = 0
        while launches < 20:
            # Scalars are passed via textures: bind both, then launch.
            coeff_a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            coeff_b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(
                vec_x._grid, vec_x._block,
                vec_x.gpudata, vec_y.gpudata, result.gpudata,
                vec_x.mem_size)
            launches += 1

        t_end.record()
        t_end.synchronize()

        print(length, exponent, t_end.time_since(t_begin))
def run(self, size):
    """Time ``self.op`` on two random float64 GPU arrays of shape ``size``.

    Returns whatever ``CUDATimer.elapsed_time()`` reports once the timed
    ``with`` block has been left.
    """
    import numpy as np
    from pycuda import curandom

    operands = []
    for _ in range(2):
        operands.append(curandom.rand(size, dtype=np.float64))
    lhs, rhs = operands

    with CUDATimer() as stopwatch:
        self.op(lhs, rhs)

    return stopwatch.elapsed_time()
def run(self, size):
    """Measure one call of ``self.op`` on random float64 device arrays.

    Two arrays of shape ``size`` are generated on the GPU, the operation is
    executed inside a ``CUDATimer`` context, and the timer's
    ``elapsed_time()`` is returned afterwards.
    """
    import numpy as np
    from pycuda import curandom

    first_operand = curandom.rand(size, dtype=np.float64)
    second_operand = curandom.rand(size, dtype=np.float64)

    stopwatch_ctx = CUDATimer()
    with stopwatch_ctx as stopwatch:
        self.op(first_operand, second_operand)

    elapsed = stopwatch.elapsed_time()
    return elapsed
def swipe():
    """Perform one sweep: two kernel launches with step numbers 0 and 1.

    A single batch of ``nData`` random numbers is drawn and shared by both
    launches.  Before each launch the texture is re-pointed at
    ``spinsInArray_d``; after each launch ``copy2D_dtod`` is invoked.
    """
    uniform_draws = curandom.rand((nData))
    width = np.int32(nWidth)
    height = np.int32(nHeight)

    # presumably the two step numbers select the two sub-lattices -- confirm
    # against the kernel source.
    for half_step in (0, 1):
        tex_spins.set_array(spinsInArray_d)
        isingKernel(
            np.int32(half_step), width, height, beta,
            spinsOut_d, uniform_draws,
            grid=grid2D_ising, block=block2D)
        copy2D_dtod(aligned=True)
def test_gpuarray_to_garray():
    """Print a random 3x3 GPU array and its ``gpuarray_to_garray`` conversion.

    This is a visual smoke test: nothing is asserted, both arrays are just
    written to stdout for manual comparison.
    """
    x = curnd.rand((3, 3), dtype=np.float32)
    x = x + 2
    gx = common.gpu.gpuarray_to_garray(x)

    # Single-argument print(...) calls produce the same output on Python 2
    # and 3; the former print statements are a SyntaxError on Python 3.
    print("x:")
    print(x)
    print("gpuarray_to_garray(x):")
    print(gx)
def run_benchmark():
    """Benchmark ``_transpose`` on square float32 matrices and plot results.

    Matrix edge lengths are the distinct multiples of 16 obtained from
    ``2**p`` for ``p`` in ``[10, 13)`` in steps of ``2**-6``.  Each size gets
    2 warm-up calls followed by 10 timed calls.  Sizes whose bandwidth falls
    below 40e9 are printed, and two plots are written to
    ``transpose-bw.png`` and ``transpose-times.png``.
    """
    from pycuda.curandom import rand

    powers = numpy.arange(10, 13, 2**(-6))
    sizes = [int(size) for size in numpy.unique(2**powers // 16 * 16)]
    bandwidths = []
    times = []

    for size in sizes:
        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2
        for _ in range(warmup):
            _transpose(target, source)

        count = 10

        # Make sure the warm-up work is finished before timing starts.
        cuda.Context.synchronize()
        start.record()

        for _ in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        # time_since() is scaled by 1e-3 to obtain seconds.
        elapsed_seconds = stop.time_since(start) * 1e-3
        # Every transpose reads and writes the matrix once: 2 * nbytes.
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print("Sizes for which bandwidth was low:", slow_sizes)
    print("Ditto, mod 64:", [s % 64 for s in slow_sizes])

    # The import must precede matplotlib.use(): a function-local import makes
    # ``matplotlib`` a local name, so using it first raised UnboundLocalError.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    plt.xlabel('matrix size')
    plt.ylabel('bandwidth')
    plt.semilogx(sizes, bandwidths)
    plt.savefig("transpose-bw.png")
    plt.clf()

    plt.xlabel('matrix size')
    plt.ylabel('time')
    plt.loglog(sizes, times)
    plt.savefig("transpose-times.png")
def run_benchmark():
    """Time ``_transpose`` over a range of matrix sizes and save two plots.

    Edge lengths are the unique multiples of 16 derived from ``2**p`` with
    ``p`` stepping through ``[10, 13)`` by ``2**-6``.  For each size the
    transpose is run twice as warm-up and then 10 times under CUDA event
    timing.  Low-bandwidth sizes (below 40e9) are reported on stdout; the
    bandwidth and time curves go to ``transpose-bw.png`` and
    ``transpose-times.png``.
    """
    from pycuda.curandom import rand

    powers = numpy.arange(10, 13, 2**(-6))
    sizes = [int(size) for size in numpy.unique(2**powers // 16 * 16)]
    bandwidths = []
    times = []

    for size in sizes:
        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2
        for _ in range(warmup):
            _transpose(target, source)

        count = 10

        # Drain the warm-up launches so they are not included in the timing.
        cuda.Context.synchronize()
        start.record()

        for _ in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        # time_since() is scaled by 1e-3 to obtain seconds.
        elapsed_seconds = stop.time_since(start)*1e-3
        # One read plus one write of the matrix per transpose.
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print("Sizes for which bandwidth was low:", slow_sizes)
    print("Ditto, mod 64:", [s % 64 for s in slow_sizes])

    # Import first, then select the backend.  With the statements the other
    # way round, the function-local name ``matplotlib`` was used before being
    # bound and the call raised UnboundLocalError.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    plt.xlabel('matrix size')
    plt.ylabel('bandwidth')
    plt.semilogx(sizes, bandwidths)
    plt.savefig("transpose-bw.png")
    plt.clf()

    plt.xlabel('matrix size')
    plt.ylabel('time')
    plt.loglog(sizes, times)
    plt.savefig("transpose-times.png")
def swipe():
    """Run the Ising kernel twice (step numbers 0 and 1) on shared randoms.

    One batch of ``nData`` random numbers is generated up front.  Each of the
    two launches is preceded by re-binding the texture to ``spinsInArray_d``
    and followed by ``copy2D_dtod(aligned=True)``.
    """
    shared_randoms = curandom.rand((nData))

    def launch(step_index):
        # One launch: refresh the texture, run the kernel, then call the
        # device-to-device copy helper.
        stepNumber = np.int32(step_index)
        tex_spins.set_array(spinsInArray_d)
        isingKernel(stepNumber, np.int32(nWidth), np.int32(nHeight), beta,
                    spinsOut_d, shared_randoms,
                    grid=grid2D_ising, block=block2D)
        copy2D_dtod(aligned=True)

    launch(0)
    launch(1)
def run_benchmark():
    """Benchmark ``_transpose`` on square float32 matrices and plot results.

    Edge lengths are ``2**p`` rounded down to a multiple of 16, for ``p`` in
    ``[10, 13)`` in steps of ``2**-6``.  Each size gets 2 warm-up calls and 10
    timed calls.  The sizes whose bandwidth is below 40e9, and those sizes
    modulo 64, are printed; the bandwidth and time curves are saved to
    ``transpose-bw.png`` and ``transpose-times.png``.
    """
    from pycuda.curandom import rand

    sizes = []
    bandwidths = []
    times = []

    # Distinct names for the exponent and the repeat counters: the inner
    # loops used to overwrite the outer loop variable ``i``.
    for exponent in numpy.arange(10, 13, 2**(-6)):
        size = int(((2**exponent) // 16) * 16)

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2
        for _ in range(warmup):
            _transpose(target, source)

        count = 10

        # Make sure the warm-up work is finished before timing starts.
        cuda.Context.synchronize()
        start.record()

        for _ in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        # time_since() is scaled by 1e-3 to obtain seconds.
        elapsed_seconds = stop.time_since(start) * 1e-3
        # Every transpose reads and writes the matrix once: 2 * nbytes.
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        sizes.append(size)
        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]

    # Single-argument print(...) works identically on Python 2 and 3; the
    # former print statements are a SyntaxError on Python 3.
    print(slow_sizes)
    print([s % 64 for s in slow_sizes])

    from matplotlib.pyplot import semilogx, loglog, savefig, clf
    semilogx(sizes, bandwidths)
    savefig("transpose-bw.png")
    clf()
    loglog(sizes, times)
    savefig("transpose-times.png")
def swipe():
    """Perform one 3D sweep: two kernel launches with step numbers 0 and 1.

    A single batch of ``nData`` random numbers is shared by both launches.
    Before each launch both the texture and the surface reference are pointed
    at ``spinsInArray_d``.  No device-to-device copy is issued here (the
    original carried a disabled ``copy3D_dtod()`` call).
    """
    uniform_draws = curandom.rand((nData))

    for half_step in (0, 1):
        tex_spins.set_array(spinsInArray_d)
        surf_spins.set_array(spinsInArray_d)
        isingKernel(
            np.int32(half_step),
            np.int32(nWidth), np.int32(nHeight), np.int32(nDepth),
            beta, spinsOut_d, uniform_draws,
            plotDataFloat_d, np.float32(upVal), np.float32(downVal),
            grid=grid3D_ising, block=block3D)
def check_transpose():
    """Verify ``transpose`` against numpy's ``.T`` on random square matrices.

    Edge lengths are ``2**p`` rounded down to a multiple of 32, for ``p`` in
    ``[10, 13)`` in steps of 0.125.  Each edge length is printed, and an
    ``AssertionError`` carrying ``(size, error_norm)`` is raised if the GPU
    result differs from the host transpose at all.
    """
    from pycuda.curandom import rand

    for exponent in numpy.arange(10, 13, 0.125):
        edge = int(((2**exponent) // 32) * 32)
        print(edge)

        matrix = rand((edge, edge), dtype=numpy.float32)
        transposed = transpose(matrix)

        # Exact comparison: a transpose only moves values around.
        mismatch_norm = la.norm(matrix.get().T - transposed.get())
        assert mismatch_norm == 0, (edge, mismatch_norm)
def check_transpose():
    """Check ``transpose`` against numpy's ``.T`` on random square matrices.

    Edge lengths are ``2**p`` rounded down to a multiple of 32, for ``p`` in
    ``[10, 13)`` in steps of 0.125.  Each size is printed before it is
    tested.

    :raises AssertionError: with ``(size, err_norm)`` when the GPU result
        differs from the host-side transpose.
    """
    from pycuda.curandom import rand

    for i in numpy.arange(10, 13, 0.125):
        size = int(((2**i) // 32) * 32)
        # print(size) behaves the same on Python 2 and 3; the former
        # ``print size`` statement is a SyntaxError on Python 3.
        print(size)

        source = rand((size, size), dtype=numpy.float32)
        result = transpose(source)

        err = source.get().T - result.get()
        err_norm = la.norm(err)

        # Exact comparison: a transpose only moves values around.
        assert err_norm == 0, (size, err_norm)
def initialisation(self, x_init):
    """Seed the solver state from ``x_init`` and a second, random point.

    A random array ``y`` with the shape of ``x_init`` is generated and
    shifted by ``-(x_init + 0.5)``.  The objective (and gradient) is
    evaluated at both points; the point with the smaller objective becomes
    the current iterate (``x``, ``g``, ``obj``) and the other one becomes the
    previous iterate (``oldx``, ``oldg``, ``oldobj``).
    """
    y = curand.rand(x_init.shape)
    # Carried over from the original: "this is only a fix, remove this line
    # if possible".
    y -= x_init + 0.5

    objective = self.objective
    if self.options.compute_both:
        fx, gx = objective.compute_both(x_init)
        fy, gy = objective.compute_both(y)
        x_init_wins = bool(fx < fy)
    else:
        fx = objective.compute_obj(x_init)
        fy = objective.compute_obj(y)
        x_init_wins = bool(fx < fy)
        # Gradients are requested for the winning point first, as before.
        if x_init_wins:
            gx = objective.compute_grad(x_init)
            gy = objective.compute_grad(y)
        else:
            gy = objective.compute_grad(y)
            gx = objective.compute_grad(x_init)

    from_x_init = (x_init, gx, fx)
    from_random = (y, gy, fy)
    if x_init_wins:
        current, previous = from_x_init, from_random
    else:
        current, previous = from_random, from_x_init

    self.x, self.g, self.obj = current
    self.oldx, self.oldg, self.oldobj = previous
def run_benchmark():
    """Benchmark ``_transpose`` on square float32 matrices.

    Edge lengths are ``2**p`` rounded down to a multiple of 16, for ``p`` in
    ``[10, 13)`` in steps of ``2**-6``.  Each size gets 2 warm-up calls and 10
    timed calls.  The sizes whose bandwidth is below 40e9, and those sizes
    modulo 64, are printed.
    """
    from pycuda.curandom import rand

    sizes = []
    bandwidths = []
    times = []

    # Distinct names for the exponent and the repeat counters: the inner
    # loops used to overwrite the outer loop variable ``i``.
    for exponent in numpy.arange(10, 13, 2**(-6)):
        size = int(((2**exponent) // 16) * 16)

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2
        for _ in range(warmup):
            _transpose(target, source)

        count = 10

        # Make sure the warm-up work is finished before timing starts.
        cuda.Context.synchronize()
        start.record()

        for _ in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        # time_since() is scaled by 1e-3 to obtain seconds.
        elapsed_seconds = stop.time_since(start)*1e-3
        # Every transpose reads and writes the matrix once: 2 * nbytes.
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        sizes.append(size)
        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]

    # Single-argument print(...) works identically on Python 2 and 3; the
    # former print statements are a SyntaxError on Python 3.
    print(slow_sizes)
    print([s % 64 for s in slow_sizes])
def replot():
    """Zoom the plotted region to the current selection and recompute the map.

    The selection (``jMin..jMax`` columns, ``iMin..iMax`` rows) is read from
    ``animation2D`` and used to shrink the global ``xMin/xMax/yMin/yMax``
    window.  A new batch of random numbers is drawn, the mapping kernel is
    run and normalised under CUDA event timing, the window title is updated,
    the selection in ``animation2D`` is reset and ``maskFunc()`` is called.
    """
    global xMin, xMax, yMin, yMax
    global jMin, jMax, iMin, iMax
    global random_d

    jMin, jMax = animation2D.jMin, animation2D.jMax
    iMin, iMax = animation2D.iMin, animation2D.iMax

    # NOTE(review): each upper bound is computed from the already-updated
    # lower bound, so the window shrinks slightly more than the selection
    # fraction applied to the old window would give -- confirm this is
    # intended.
    xMin += (xMax-xMin)*(float(jMin)/nWidth)
    xMax -= (xMax-xMin)*(float(nWidth-jMax)/nWidth)
    yMin += (yMax-yMin)*(float(iMin)/nHeight)
    yMax -= (yMax-yMin)*(float(nHeight-iMax)/nHeight)

    # print(<one string>) gives the same output on Python 2 and 3; the former
    # print statements are a SyntaxError on Python 3.
    print("Reploting: ( {0} , {1} , {2} , {3} )".format(xMin, xMax, yMin, yMax))

    start, end = cuda.Event(), cuda.Event()
    start.record()

    random_d = curandom.rand((nData), dtype=npPrcsn)
    mappingLogisticKernel(
        np.int32(nWidth), np.int32(nHeight),
        npPrcsn(xMin), npPrcsn(xMax), npPrcsn(yMin), npPrcsn(yMax),
        random_d, graphPoints_d,
        grid=mapGrid, block=mapBlock)
    normalize(graphPoints_d)

    end.record()
    end.synchronize()
    # time_till() is scaled by 1e-3 to report seconds.
    print(" Map Calculated in: %f secs\n" % (start.time_till(end)*1e-3))

    animation2D.windowTitle = "ploting [ ( {0} , {1} ), ( {2} , {3} ) ]".format(xMin, xMax, yMin, yMax)

    # Reset the selection to an "empty" state (min above any index, max
    # below any index).
    animation2D.jMin, animation2D.jMax = 10000, -1
    animation2D.iMin, animation2D.iMax = 10000, -1

    maskFunc()
def run_tests(timer, scale_factor):
    """Run the CUDA benchmark tests and log their timings on ``timer``.

    (The original docstring read only "PyCUDA port of" and was left
    unfinished.)

    Two tests are timed:

    * test 17 -- 100 transposes of a random square matrix whose edge length
      is ``int(384 * sqrt(scale_factor))``;
    * test 20 -- a forward FFT followed by an inverse FFT of
      ``2**(17 * scale_factor)`` points.

    :arg timer: object providing ``reset()`` and ``log(message)``.
    :arg scale_factor: problem-size multiplier.
    """
    # Initialize linear algebra extensions to PyCUDA
    scikits.cuda.linalg.init()

    # initialize time
    timer.reset()

    # khughitt (2011/04/04): Non-CUDA tests from above will go here...

    # ---- Begin CUDA tests ----
    edge = int(384 * math.sqrt(scale_factor))
    matrix = curandom.rand((edge, edge))
    timer.reset()

    # Test 17 - Transpose byte array, TRANSPOSE function
    repeat = 0
    while repeat < 100:
        transposed = scikits.cuda.linalg.transpose(matrix,
                                                   pycuda.autoinit.device)
        repeat += 1
    timer.log('Transpose %d^2 byte, TRANSPOSE function x 100' % edge)

    n_points = 2**(17 * scale_factor)
    signal = gpuarray.arange(n_points, dtype=np.float32)
    timer.reset()

    # Test 20 - Forward and inverse FFT
    spectrum = scikits.cuda.fft.fft(signal)
    spectrum = scikits.cuda.fft.ifft(spectrum)
    timer.log('%d point forward plus inverse FFT' % n_points)
def main():
    """Compare GPU and CPU random-number generation speed and print a table.

    For each power-of-two size from 2**10 through 2**24 an array of that many
    float32 random numbers is generated ``count`` times on the GPU
    (``curandom.rand``) and on the CPU (``numpy.random.rand``), with
    ``count`` = 1000 up to 2**20 and 100 above.  Both phases are timed with
    CUDA events.  The resulting table lists size, per-call time and
    size/time for both, plus the GPU-versus-CPU speed-up.
    """
    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25):  # 24
        size = 1 << power
        # print(size) behaves the same on Python 2 and 3; the former
        # ``print size`` statement is a SyntaxError on Python 3.
        print(size)
        sizes.append(size)

        # The unused ``a`` arrays that used to be allocated here were
        # dropped: they were never read, and the host-side one was built as
        # a one-element array holding ``size`` rather than an array of that
        # length.
        count = 100 if power > 20 else 1000

        # ---- GPU: fill an array with random numbers ----
        start = drv.Event()
        end = drv.Event()
        start.record()

        for _ in range(count):
            curandom.rand((size,))

        end.record()
        end.synchronize()

        # time_till() is scaled by 1e-3 to obtain seconds.
        secs = start.time_till(end)*1e-3
        times.append(secs/count)
        flops.append(size)

        # ---- CPU: fill an array with random numbers ----
        start = drv.Event()
        end = drv.Event()
        start.record()

        for _ in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        end.record()
        end.synchronize()

        secs = start.time_till(end)*1e-3
        timesCPU.append(secs/count)
        flopsCPU.append(size)

    # "Pseudo flops": elements generated per second.
    flops = [f/t for f, t in zip(flops, times)]
    flopsCPU = [f/t for f, t in zip(flopsCPU, timesCPU)]

    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU",
                 "Time CPU", "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU):
        tbl.add_row((s, t, f, tCpu, fCpu, f/fCpu))
    print(tbl)
dt = dt1 del gpyfft_plan gbps = d.nbytes * nb * ndim * 2 * 2 / dt / 1024**3 #print("%4d %4dx%4d 2D FFT+iFFT dt=%6.2f ms %7.2f Gbytes/s [gpyfft[clFFT]] [nb=%4d]" % # (nz, n, n, dt / nb * 1000, gbps, nb)) results["gpyfft[clFFT]"].append(gbps) results["gpyfft[clFFT]-dt"].append(dt) if has_pyvkfft_opencl or has_gpyfft: del d gc.collect() # CUDA backends if has_pyvkfft_cuda or has_pyvkfft_cuda: d = curandom.rand(shape=sh, dtype=np.float32).astype(dtype) if has_pyvkfft_cuda: dt = 0 try: app = cuVkFFTApp(d.shape, d.dtype, ndim=ndim) for i in range(nb_repeat): cu_ctx.synchronize() t0 = timeit.default_timer() for i in range(nb): d = app.ifft(d) d = app.fft(d) cu_ctx.synchronize() dt1 = timeit.default_timer() - t0 if dt == 0: dt = dt1
def initialize_kernel(self):
    """Compile the ``diffuse`` kernel and set up the device-side state.

    The CUDA source is specialised with ``self.size`` (grid edge length) and
    ``self.prob`` (diffusion threshold) via ``str.format``.  After this call
    the instance holds the device grids (``grid_gpu``, ``new_grid``), the
    compiled module (``mod``), the kernel handle (``diffusion``) and a first
    batch of random numbers (``randoms``).
    """
    # Braces are doubled because the source goes through str.format(); the
    # two bare ``{}`` placeholders receive the grid size and the probability.
    # Each statement is on its own line so the ``//`` comments end where they
    # should.
    self.kernel_code = """
    // Ignore edge rows and columns
    // Assuming the matrix is large, the effect of this is small
    __global__ void diffuse(float* grid, float* new_grid, float* randoms)
    {{
        unsigned int grid_size = {};
        float prob = {};

        unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;   // column element of index
        unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;   // row element of index
        unsigned int thread_id = y * grid_size + x;               // thread index in array

        unsigned int edge = (x == 0) || (x == grid_size - 1) || (y == 0) || (y == grid_size - 1);

        if (grid[thread_id] == 1) {{
            new_grid[thread_id] = 1;                              // current cell
            if (!edge) {{
                /*
                if (randoms[thread_id - grid_size] < prob) {{
                    new_grid[thread_id - grid_size] = 1;          // above
                }}
                if (randoms[thread_id - grid_size - 1] < prob) {{
                    new_grid[thread_id - grid_size - 1] = 1;      // above and left
                }}
                if (randoms[thread_id - grid_size + 1] < prob) {{
                    new_grid[thread_id - grid_size + 1] = 1;      // above and right
                }}
                if (randoms[thread_id + grid_size] < prob) {{
                    new_grid[thread_id + grid_size] = 1;          // below
                }}
                */
                if (randoms[thread_id + grid_size - 1] < prob) {{
                    new_grid[thread_id + grid_size - 1] = 1;      // below and left
                }}
                if (randoms[thread_id + grid_size + 1] < prob) {{
                    new_grid[thread_id + grid_size + 1] = 1;      // below and right
                }}
                if (randoms[thread_id - 1] < prob) {{
                    new_grid[thread_id - 1] = 1;                  // left
                }}
                if (randoms[thread_id + 1] < prob) {{
                    new_grid[thread_id + 1] = 1;                  // right
                }}
            }}
        }}
    }}
    """

    # Transfer CPU memory to GPU memory
    self.grid_gpu = gpuarray.to_gpu(self.grid)
    # The kernel only ever writes 1s into the output grid, so every cell it
    # does not touch keeps its initial content.  The buffer therefore has to
    # start out zeroed; ``gpuarray.empty`` left those cells uninitialised.
    self.new_grid = gpuarray.zeros((self.size, self.size), np.float32)

    self.kernel = self.kernel_code.format(self.size, self.prob)

    # Compile kernel code
    self.mod = SourceModule(self.kernel)

    # Get kernel function from compiled module
    self.diffusion = self.mod.get_function('diffuse')

    # random numbers indicating probability of diffusion to a given cell
    self.randoms = curandom.rand((self.size, self.size))
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import time
import pycuda.gpuarray as gpuarray
import pycuda.curandom as curandom

# Monte Carlo estimate of pi: the fraction of uniform random points in the
# unit square that fall inside the quarter circle approaches pi/4.

# NOTE(review): 1204 looks like a typo for 1024; left unchanged because it
# only sets the number of samples.
n = 16 * 1024 * 1204

# ---- GPU part ----
U1 = curandom.rand(n)
U2 = curandom.rand(n)

# The former ``counter = gpuarray.zeros(n, dtype='f')`` was removed: the
# array was overwritten below without ever being read.
start_time = time.time()
counter = gpuarray.sum((U1 * U1 + U2 * U2) <= 1.0)

# Pre-formatted strings reproduce the old print-statement output (including
# the double space) on both Python 2 and 3; the print statements themselves
# are a SyntaxError on Python 3.
print("PI_gpu =  %s" % (4.0 * counter / n,))
print("Time elapsed GPUArrays:  %s s" % (time.time() - start_time,))

# Sequential part
U1 = numpy.random.rand(n).astype('f')
U2 = numpy.random.rand(n).astype('f')

start_time = time.time()
counter_cpu = numpy.sum((numpy.power(U1, 2) + numpy.power(U2, 2)) <= 1.0)
print("PI_cpu =  %s" % (4.0 * counter_cpu / n,))
print("Time elapsed CPU:  %s s" % (time.time() - start_time,))
def initialize_kernel(self):
    """Compile the ``diffuse`` kernel and set up the device-side state.

    The CUDA source is specialised with ``self.size`` (grid edge length) and
    ``self.prob`` (diffusion threshold) via ``str.format``.  After this call
    the instance holds the device grids (``grid_gpu``, ``new_grid``), the
    compiled module (``mod``), the kernel handle (``diffusion``), a batch of
    random numbers (``randoms``) and two int32 arrays of random coordinates
    (``random_x_coordinates``, ``random_y_coordinates``) obtained by scaling
    random numbers with ``self.size``.
    """
    # Braces are doubled because the source goes through str.format(); the
    # two bare ``{}`` placeholders receive the grid size and the probability.
    # Each statement is on its own line so the ``#include`` directives and
    # ``//`` comments end where they should.
    self.kernel_code = """
    #include <stdlib.h>
    #include <math.h>

    // Ignore edge rows and columns
    __global__ void diffuse(float* grid, float* new_grid, float* randoms, int* x_coords, int* y_coords)
    {{
        unsigned int grid_size = {};
        float prob = {};

        unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;   // column element of index
        unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;   // row element of index
        unsigned int thread_id = y * grid_size + x;               // thread index in array

        if (grid[thread_id] == 1) {{
            new_grid[thread_id] = 1;                              // current cell
            if (randoms[thread_id] < prob) {{
                // row and col before distance decay
                unsigned int random_x = x_coords[thread_id];
                unsigned int random_y = y_coords[thread_id];
                float diff = prob - randoms[thread_id];

                // distance decay occuring in x and y directions
                // amount of decay dictated by random coordinate, diffusion threshold, and random value
                float decay_x = floor(abs(((float)random_x - x) / prob * diff));
                float decay_y = floor(abs(((float)random_y - y) / prob * diff));

                // apply decay in appropriate direction
                unsigned int spread_x = random_x;
                if (random_x > x) {{
                    spread_x -= decay_x;
                }}
                else if (random_x < x) {{
                    spread_x += decay_x;
                }}

                // apply decay in appropriate direction
                unsigned int spread_y = random_y;
                if (random_y > y) {{
                    spread_y -= decay_y;
                }}
                else if (random_y < y) {{
                    spread_y += decay_y;
                }}

                /*
                printf("Initial y: %u\\t"
                       "Inintial x: %u\\t"
                       "Random y: %u\\t"
                       "Random x: %u\\t"
                       "Y decay: %f\\t"
                       "Decay x: %f\\t"
                       "New y: %u\\t"
                       "New x: %u\\n",
                       y, x, random_y, random_x, decay_y, decay_x, spread_y, spread_x);
                */

                unsigned int spread_index = spread_y * grid_size + spread_x;
                new_grid[spread_index] = 1;
            }}
        }}
    }}
    """

    # Transfer CPU memory to GPU memory
    self.grid_gpu = gpuarray.to_gpu(self.grid)
    # The kernel only ever writes 1s into the output grid, so every cell it
    # does not touch keeps its initial content.  The buffer therefore has to
    # start out zeroed; ``gpuarray.empty`` left those cells uninitialised.
    self.new_grid = gpuarray.zeros((self.size, self.size), np.float32)

    self.kernel = self.kernel_code.format(self.size, self.prob)

    # Compile kernel code
    self.mod = SourceModule(self.kernel)

    # Get kernel function from compiled module
    self.diffusion = self.mod.get_function('diffuse')

    # random numbers indicating probability of diffusion to a given cell
    self.randoms = curandom.rand((self.size, self.size))

    # Random target coordinates: uniform draws scaled by the grid size and
    # truncated to int32.
    self.random_x_coordinates = ((curandom.rand(
        (self.size, self.size))) * self.size).astype(np.int32)
    self.random_y_coordinates = ((curandom.rand(
        (self.size, self.size))) * self.size).astype(np.int32)
def main():
    """Compare GPU and CPU random-number generation speed and print a table.

    For each power-of-two size from 2**10 through 2**24 an array of that many
    float32 random numbers is generated ``count`` times on the GPU
    (``curandom.rand``) and on the CPU (``numpy.random.rand``), with
    ``count`` = 1000 up to 2**20 and 100 above.  Both phases are timed with
    CUDA events.  The resulting table lists size, per-call time and
    size/time for both, plus the GPU-versus-CPU speed-up.
    """
    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25):  # 24
        size = 1 << power
        # print(size) behaves the same on Python 2 and 3; the former
        # ``print size`` statement is a SyntaxError on Python 3.
        print(size)
        sizes.append(size)

        # The unused ``a`` arrays that used to be allocated here were
        # dropped: they were never read, and the host-side one was built as
        # a one-element array holding ``size`` rather than an array of that
        # length.
        if power > 20:
            count = 100
        else:
            count = 1000

        # ---- GPU: fill an array with random numbers ----
        start = drv.Event()
        end = drv.Event()
        start.record()

        for _ in range(count):
            curandom.rand((size, ))

        end.record()
        end.synchronize()

        # time_till() is scaled by 1e-3 to obtain seconds.
        secs = start.time_till(end) * 1e-3
        times.append(secs / count)
        flops.append(size)

        # ---- CPU: fill an array with random numbers ----
        start = drv.Event()
        end = drv.Event()
        start.record()

        for _ in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        end.record()
        end.synchronize()

        secs = start.time_till(end) * 1e-3
        timesCPU.append(secs / count)
        flopsCPU.append(size)

    # "Pseudo flops": elements generated per second.
    flops = [f / t for f, t in zip(flops, times)]
    flopsCPU = [f / t for f, t in zip(flopsCPU, timesCPU)]

    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU",
                 "Time CPU", "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU):
        tbl.add_row((s, t, f, tCpu, fCpu, f / fCpu))
    print(tbl)
cudaCodeStringRaw = cudaCodeString = (cudaCodeStringRaw %{"HEIGHT":mapBlock[0], "B_HEIGHT":block2D[1], "B_WIDTH":block2D[0] }).replace("cudaP", precision) cudaCode = SourceModule(cudaCodeString) mappingLogisticKernel = cudaCode.get_function('mappingLogistic_kernel') maskKernel = cudaCode.get_function('mask_kernel') plotKernel = cudaCode.get_function('plot_kernel') ######################################################################## from pycuda.elementwise import ElementwiseKernel ######################################################################## linearDouble = ElementwiseKernel(arguments="cudaP a, cudaP b, cudaP *input, cudaP *output".replace( 'cudaP', precision), operation = "output[i] = a*input[i] + b ") #Initialize all gpu data print "Initializing Data" initialMemory = getFreeMemory( show=True ) random_d = curandom.rand((nData), dtype=npPrcsn) graphPoints_d= gpuarray.to_gpu( np.zeros([nData], dtype=npPrcsn) ) #For plotting maskPoints_h = np.ones(nData).astype(np.int32) maskPoints_d = gpuarray.to_gpu( maskPoints_h ) plotData_d = gpuarray.to_gpu( np.zeros([nData], dtype=npPrcsn) ) finalMemory = getFreeMemory( show=False ) print " Total Global Memory Used: {0} Mbytes".format(float(initialMemory-finalMemory)/1e6) def replot(): global xMin, xMax, yMin, yMax global jMin, jMax, iMin, iMax global random_d jMin, jMax = animation2D.jMin, animation2D.jMax iMin, iMax = animation2D.iMin, animation2D.iMax xMin += (xMax-xMin)*(float(jMin)/nWidth)
def initialize_kernel(self):
    """Compile both diffusion kernels and set up the device-side state.

    The CUDA source defines ``local_diffuse`` and ``non_local_diffuse`` and
    is specialised through ``str.format`` with, in order: ``self.size`` and
    ``self.p_local`` for the first kernel, ``self.size`` and
    ``self.p_non_local`` for the second.  After this call the instance holds
    the device grids (``grid_gpu``, ``new_grid``), the compiled module
    (``mod``), both kernel handles (``local_diffusion``,
    ``non_local_diffusion``), a batch of random numbers (``randoms``) and two
    int32 arrays of random coordinates.
    """
    # Braces are doubled because the source goes through str.format(); the
    # four bare ``{}`` placeholders are filled positionally.  Each statement
    # is on its own line so the ``//`` comments end where they should.
    self.kernel_code = """
    // Ignore edge rows and columns
    __global__ void local_diffuse(float* grid, float* new_grid, float* randoms)
    {{
        unsigned int grid_size = {};
        float prob = {};

        unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;   // column element of index
        unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;   // row element of index
        unsigned int thread_id = y * grid_size + x;               // thread index in array

        unsigned int edge = (x == 0) || (x == grid_size - 1) || (y == 0) || (y == grid_size - 1);

        if (grid[thread_id] == 1) {{
            new_grid[thread_id] = 1;                              // current cell
            if (!edge) {{
                if (randoms[thread_id - grid_size] < prob) {{
                    new_grid[thread_id - grid_size] = 1;          // above
                }}
                if (randoms[thread_id - grid_size - 1] < prob) {{
                    new_grid[thread_id - grid_size - 1] = 1;      // above and left
                }}
                if (randoms[thread_id - grid_size + 1] < prob) {{
                    new_grid[thread_id - grid_size + 1] = 1;      // above and right
                }}
                if (randoms[thread_id + grid_size] < prob) {{
                    new_grid[thread_id + grid_size] = 1;          // below
                }}
                if (randoms[thread_id + grid_size - 1] < prob) {{
                    new_grid[thread_id + grid_size - 1] = 1;      // below and left
                }}
                if (randoms[thread_id + grid_size + 1] < prob) {{
                    new_grid[thread_id + grid_size + 1] = 1;      // below and right
                }}
                if (randoms[thread_id - 1] < prob) {{
                    new_grid[thread_id - 1] = 1;                  // left
                }}
                if (randoms[thread_id + 1] < prob) {{
                    new_grid[thread_id + 1] = 1;                  // right
                }}
            }}
        }}
    }}

    // Ignore edge rows and columns
    __global__ void non_local_diffuse(float* grid, float* new_grid, float* randoms, int* x_coords, int* y_coords)
    {{
        unsigned int grid_size = {};
        float prob = {};

        unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;   // column element of index
        unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;   // row element of index
        unsigned int thread_id = y * grid_size + x;               // thread index in array

        if (grid[thread_id] == 1) {{
            new_grid[thread_id] = 1;                              // current cell
            if (randoms[thread_id] < prob) {{
                unsigned int spread_index = y_coords[thread_id] * grid_size + x_coords[thread_id];
                new_grid[spread_index] = 1;
            }}
        }}
    }}
    """

    # Design notes carried over from the original:
    #   Below this will be in split
    #   Split transfers data to GPU memory
    #   Random not part of split
    #   grid_a = initialize_grid(MATRIX_SIZE, BLOCK_SIZE, P_LOCAL, P_NON_LOCAL)
    #   grid_b = empty grid
    #   grid_a, grid_b <
    #   local_diffuse(grid_a, grid_b)

    # Transfer CPU memory to GPU memory
    self.grid_gpu = gpuarray.to_gpu(self.grid)
    # Both kernels only ever write 1s into the output grid, so every cell
    # they do not touch keeps its initial content.  The buffer therefore has
    # to start out zeroed; ``gpuarray.empty`` left those cells uninitialised.
    self.new_grid = gpuarray.zeros((self.size, self.size), np.float32)

    self.kernel = self.kernel_code.format(self.size, self.p_local,
                                          self.size, self.p_non_local)

    # Compile kernel code
    self.mod = SourceModule(self.kernel)
    self.local_diffusion = self.mod.get_function('local_diffuse')
    self.non_local_diffusion = self.mod.get_function('non_local_diffuse')

    self.randoms = curandom.rand((self.size, self.size))

    # Random target coordinates: uniform draws scaled by the grid size and
    # truncated to int32.
    self.random_x_coordinates = ((curandom.rand(
        (self.size, self.size))) * self.size).astype(np.int32)
    self.random_y_coordinates = ((curandom.rand(
        (self.size, self.size))) * self.size).astype(np.int32)
'''
@Project: deep-learning-with-keras-notebooks
@Package
@author: ly
@date Date: 2020-01-02 17:21
@Description: Draw uniform random numbers on the GPU, then show the raw
              samples and their 100-bin histogram.
@URL:
@version: V1.0
'''
import pycuda.autoinit
import pycuda.curandom as curandom

# Namespaced import instead of ``from matplotlib.pyplot import *``, so the
# plotting names cannot silently shadow names in this module.
import matplotlib.pyplot as plt

size = 5000
# Generate on the device, then copy to the host for plotting.
a = curandom.rand((size, )).get()

plt.subplot(211)
plt.plot(a)
plt.grid(True)
plt.ylabel('plot - gpu')

plt.subplot(212)
plt.hist(a, 100)
plt.grid(True)
plt.ylabel('Histogram - gpu')

plt.show()
# simple module to show the plotting of random data

import pycuda.autoinit
import pycuda.curandom as curandom

# Namespaced pyplot import instead of ``from matplotlib.pylab import *``:
# the star import dumps plotting names into this module where they can
# silently shadow other names.
import matplotlib.pyplot as plt

size = 1000
# Generate on the device, then copy to the host for plotting.
a = curandom.rand((size,)).get()

plt.subplot(211)
plt.plot(a)
plt.grid(True)
plt.ylabel("plot - gpu")

plt.subplot(212)
plt.hist(a, 100)
plt.grid(True)
plt.ylabel("histogram - gpu")

# and save it
plt.savefig("plot-random-data")
# Gradient check if (self.options.use_tolg): nr = cua.max(cua.fabs(self.grad)).get() if (nr < self.options.tolg): self.term_reason = '|| grad ||_inf < opt.tolg' return # No condition met, so return false self.term_reason = 0 if __name__ == '__main__': case = 2 if case == 1: A = curand.rand((10000, 1000)) xt = curand.rand((1000, 1)) b =, xt) x_init = cua.empty_like(xt) x_init.fill(0.1) # Set up objective objective = MVM_Objective(A, b) # Default optimization options opt = Solopt() pbb = PBB(objective, x_init, opt) elif case == 2:
def random_normal(loc=0.0, scale=1.0, size=None):
    """Return normally distributed random numbers generated on the GPU.

    Two arrays of uniform float64 random numbers are combined with the
    Box-Muller transform, then scaled by ``scale`` and shifted by ``loc``.

    :arg loc: value added to every sample (mean of the distribution).
    :arg scale: factor applied to every sample (standard deviation).
    :arg size: shape passed straight to ``curandom.rand``.
        NOTE(review): the default ``None`` is forwarded unchanged -- confirm
        that callers always pass a shape.
    :returns: a ``CUDAArray`` wrapping the samples.
    """
    # rand() draws from [0, 1).  Box-Muller takes log(u1), so a draw of
    # exactly 0 would yield an infinite sample; 1 - u is uniform on (0, 1]
    # and keeps the logarithm finite.
    u1 = 1.0 - curandom.rand(size, dtype=numpy.float64)
    u2 = curandom.rand(size, dtype=numpy.float64)

    z1 = cumath.sqrt(-2.*cumath.log(u1))*cumath.cos(2.*numpy.pi*u2)
    return CUDAArray(scale*z1+loc)
if temp > 0.1: temp -= 0.1 beta = np.float32(1. / temp) animation2D.windowTitle = "Ising Model 2D spins={0}x{1} T={2:.1f}".format( nHeight, nWidth, float(temp)) ######################################################################## ######################################################################## #Initialize all gpu data print "\nInitializing Data" initialMemory = getFreeMemory(show=True) #Set initial random distribution spins_h = (2 * np.random.random_integers(0, 1, [nHeight, nWidth]) - 1).astype( np.int32) spinsOut_d = gpuarray.to_gpu(spins_h) randomNumbers_d = curandom.rand((nData)) #For texture version spinsInArray_d, copy2D_dtod = gpuArray2DtocudaArray(spinsOut_d) #For shared version finalMemory = getFreeMemory(show=False) print " Total Global Memory Used: {0} Mbytes\n".format( float(initialMemory - finalMemory) / 1e6) ######################################################################## ######################################################################## #configure animation2D functions and plotData animation2D.stepFunc = stepFunction animation2D.specialKeys = specialKeyboardFunc animation2D.plotData_d = spinsOut_d animation2D.maxVar = np.float32(2) animation2D.minVar = np.float32(-20)
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import pycuda.curandom as curandom

# Demo of the pycuda.curandom helpers: each section prints the generated
# array followed by its dtype (and, where shown, its Python type).

# Convenience function: uniform random numbers of a given shape.
a = curandom.rand((5, 3))
print('a:\n{0}\n{1}\n{2}\n'.format(a, a.dtype, type(a)))

# Seed helpers.
b = curandom.seed_getter_uniform(5)
print('seed_getter_uniform:\n{0}\n{1}\n'.format(b, b.dtype))

c = curandom.seed_getter_unique(5)
print('seed_getter_unique:\n{0}\n{1}\n'.format(c, c.dtype))

# Explicit generator object.  The closing parenthesis of this call was
# missing, which made the whole script a SyntaxError.
generator = curandom.XORWOWRandomNumberGenerator(
    curandom.seed_getter_unique, 1000)

# Fill an existing array in place ...
d = gpuarray.empty((5, 3), dtype='float32')
generator.fill_uniform(d)
print('d:\n{0}\n{1}\n{2}\n'.format(d, d.dtype, type(d)))

# ... or let the generator allocate the result.
e = generator.gen_uniform((5, 3), dtype='float32')
print('e:\n{0}\n{1}\n{2}\n'.format(e, e.dtype, type(e)))
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import time
import pycuda.gpuarray as gpuarray
import pycuda.curandom as curandom

# Monte Carlo estimate of pi: the fraction of uniform random points in the
# unit square that fall inside the quarter circle approaches pi/4.

# NOTE(review): 1204 looks like a typo for 1024; left unchanged because it
# only sets the number of samples.
n = 16*1024*1204

# ---- GPU part ----
U1 = curandom.rand(n)
U2 = curandom.rand(n)

# The former ``counter = gpuarray.zeros(n, dtype='f')`` was removed: the
# array was overwritten below without ever being read.
start_time = time.time()
counter = gpuarray.sum((U1*U1 + U2*U2) <= 1.0)

# Pre-formatted strings reproduce the old print-statement output (including
# the double space) on both Python 2 and 3; the print statements themselves
# are a SyntaxError on Python 3.
print("PI_gpu =  %s" % (4.0*counter/n,))
print("Time elapsed GPUArrays:  %s s" % (time.time() - start_time,))

# Sequential part
U1 = numpy.random.rand(n).astype('f')
U2 = numpy.random.rand(n).astype('f')

start_time = time.time()
counter_cpu = numpy.sum((numpy.power(U1, 2) + numpy.power(U2, 2)) <= 1.0)
print("PI_cpu =  %s" % (4.0*counter_cpu/n,))
if key== animation2D.GLUT_KEY_UP: temp += 0.1 if key== animation2D.GLUT_KEY_DOWN: if temp > 0.1: temp -= 0.1 beta = np.float32(1./temp) animation2D.windowTitle = "Ising Model 2D spins={0}x{1} T={2:.1f}".format(nHeight, nWidth, float(temp)) ######################################################################## ######################################################################## #Initialize all gpu data print "\nInitializing Data" initialMemory = getFreeMemory( show=True ) #Set initial random distribution spins_h = (2*np.random.random_integers(0,1,[nHeight, nWidth]) - 1 ).astype(np.int32) spinsOut_d = gpuarray.to_gpu( spins_h ) randomNumbers_d = curandom.rand((nData)) #For texture version spinsInArray_d, copy2D_dtod = gpuArray2DtocudaArray( spinsOut_d ) #For shared version finalMemory = getFreeMemory( show=False ) print " Total Global Memory Used: {0} Mbytes\n".format(float(initialMemory-finalMemory)/1e6) ######################################################################## ######################################################################## #configure animation2D functions and plotData animation2D.stepFunc = stepFunction animation2D.specialKeys = specialKeyboardFunc animation2D.plotData_d = spinsOut_d animation2D.maxVar = np.float32(2) animation2D.minVar = np.float32(-20)
import pycuda.autoinit import pycuda.curandom as curandom import matplotlib.pyplot as plt import numpy as np N = 1000 # --- Generating random data directly on the gpu d_a = curandom.rand(N, dtype = np.float32, stream = None) plt.plot(d_a.get()) plt.xlabel('Realization index') plt.ylabel('Random numbers') save_file = 0 if save_file: plt.savefig('test') else:
# Gradient check if (self.options.use_tolg): nr = cua.max(cua.fabs(self.grad)).get(); if (nr < self.options.tolg): self.term_reason = '|| grad ||_inf < opt.tolg' return # No condition met, so return false self.term_reason = 0; if __name__ == '__main__': case = 2 if case == 1: A = curand.rand((10000,1000)) xt = curand.rand((1000,1)) b =, xt) x_init = cua.empty_like(xt) x_init.fill(0.1) # Set up objective objective = MVM_Objective(A,b) # Default optimization options opt = Solopt() pbb = PBB(objective, x_init, opt); elif case == 2: