def perform_precomputation(self): # Precompute aa, bb1, bb2 (required for kernel) # compute l Nv = self.vm.Nv() Nrho = self.vm.Nrho() M = self.vm.M() L = self.vm.L() qz = self.vm.qz() qw = self.vm.qw() sz = self.vm.sz() sw = self.vm.sw() vsize = self.vm.vsize() szpre = self._szpre swpre = self._swpre check(self.cfg.dtype == np.float64, "Need to extend for single precision") # precision control dint = np.int32 dfloat = np.float64 dcplx = np.complex128 if self.cfg.dtype == np.float32: dfloat = np.float32 dcplx = np.complex64 l0 = np.concatenate( (np.arange(0, Nv / 2, dtype=dint), np.arange(-Nv / 2, 0, dtype=dint))) l = np.zeros((3, vsize), dtype=dint) for idv in range(vsize): I = int(idv / (Nv * Nv)) J = int((idv % (Nv * Nv)) / Nv) K = int((idv % (Nv * Nv)) % Nv) l[0, idv] = l0[I] l[1, idv] = l0[J] l[2, idv] = l0[K] d_lx = gpuarray.to_gpu(np.ascontiguousarray(l[0, :])) d_ly = gpuarray.to_gpu(np.ascontiguousarray(l[1, :])) d_lz = gpuarray.to_gpu(np.ascontiguousarray(l[2, :])) # transfer sphere points to gpu d_sz_x = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 0])) d_sz_y = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 1])) d_sz_z = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 2])) # define complex to complex plan rank = 3 n = np.array([Nv, Nv, Nv], dtype=np.int32) #planD2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_D2Z) self.planZ2Z_MNrho = cufftPlanMany(rank, n.ctypes.data, None, 1, vsize, None, 1, vsize, CUFFT_Z2Z, M * Nrho) self.planZ2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_Z2Z) dfltargs = dict( dtype=self.cfg.dtypename, Nrho=Nrho, M=M, vsize=vsize, sw=sw, prefac=self._prefactor, cases=self._cases, masses=self.vm.masses(), qw=qw, qz=qz, L=L, sz=sz, gamma=self._gamma, eta=self._eta, Mpre=self._Mpre, szpre=szpre, swpre=swpre #, Ne=self._Ne ) src = DottedTemplateLookup('dgfs1D.bi.kernels.scattering', dfltargs).get_template( self.scattering_model).render() # Compile the source code and retrieve the kernel print("\nCompiling scattering kernels, this may take some time ...") module = compiler.SourceModule(src) self.block = (256, 1, 1) self.grid = get_grid_for_block(self.block, vsize) print("Starting precomputation, this may take some time ...") start, end = cuda.Event(), cuda.Event() cuda.Context.synchronize() start.record() start.synchronize() self.d_aa = gpuarray.empty(Nrho * M * vsize, dtype=dfloat) precompute_aa = get_kernel(module, "precompute_a", 'PPPP') precompute_aa.prepared_call(self.grid, self.block, d_lx.ptr, d_ly.ptr, d_lz.ptr, self.d_aa.ptr) self.d_bb1 = {} self.d_bb2 = {} precompute_bb = {} for cp, cq in self._cases: cpcq = str(cp) + str(cq) self.d_bb1[cpcq] = gpuarray.empty(Nrho * M * vsize, dtype=dcplx) self.d_bb2[cpcq] = gpuarray.zeros(vsize, dtype=dcplx) precompute_bb[cpcq] = module.get_function("precompute_bc_" + cpcq) precompute_bb[cpcq].prepare('IIdddPPPPPPPP') precompute_bb[cpcq].set_cache_config(cuda.func_cache.PREFER_L1) for p in range(Nrho): fac = np.pi / L * qz[p] fac_b = swpre * pow(qz[p], self._gamma[cpcq] + 2) fac_c = qw[p] * sw * fac_b for q in range(M): precompute_bb[cpcq].prepared_call( self.grid, self.block, dint(p), dint(q), dfloat(fac), dfloat(fac_b), dfloat(fac_c), d_lx.ptr, d_ly.ptr, d_lz.ptr, d_sz_x.ptr, d_sz_y.ptr, d_sz_z.ptr, self.d_bb1[cpcq].ptr, self.d_bb2[cpcq].ptr) end.record() end.synchronize() secs = start.time_till(end) * 1e-3 print("Finished precomputation in: %fs" % (secs)) # transform scalar to complex self.r2zKern = module.get_function("r2z") self.r2zKern.prepare('IIIPP') self.r2zKern.set_cache_config(cuda.func_cache.PREFER_L1) # Prepare the cosSinMul kernel for 
execution self.cosSinMultKern = {} #self.computeQGKern = {} self.outKern = {} for cp, cq in self._cases: idx = str(cp) + str(cq) self.cosSinMultKern[idx] = module.get_function("cosSinMul_" + idx) self.cosSinMultKern[idx].prepare('PPPPP') self.cosSinMultKern[idx].set_cache_config( cuda.func_cache.PREFER_L1) #self.computeQGKern[idx] = module.get_function("computeQG_"+idx) #self.computeQGKern[idx].prepare('PPP') #self.computeQGKern[idx].set_cache_config( # cuda.func_cache.PREFER_L1) self.outKern[idx] = module.get_function("output_" + idx) self.outKern[idx].prepare('IIIIPPPP') self.outKern[idx].set_cache_config(cuda.func_cache.PREFER_L1) # prepare the computeQG kernel self.computeQGKern = module.get_function("computeQG") self.computeQGKern.prepare('PPP') self.computeQGKern.set_cache_config(cuda.func_cache.PREFER_L1) # Prepare the prodKern kernel for execution self.prodKern = module.get_function("prod") self.prodKern.prepare('PPP') self.prodKern.set_cache_config(cuda.func_cache.PREFER_L1) # Prepare the ax kernel for execution self.ax2Kern = module.get_function("ax2") self.ax2Kern.prepare('PPP') self.ax2Kern.set_cache_config(cuda.func_cache.PREFER_L1) # define scratch spaces self.d_FTf = gpuarray.empty(vsize, dtype=dcplx) self.d_FTg = gpuarray.empty(vsize, dtype=dcplx) self.d_f1C = gpuarray.empty_like(self.d_FTf) self.d_f2C = gpuarray.empty_like(self.d_FTf) self.d_QG = gpuarray.empty_like(self.d_FTf) self.d_t1 = gpuarray.empty(M * Nrho * vsize, dtype=dcplx) self.d_t2 = gpuarray.empty_like(self.d_t1) self.d_t3 = gpuarray.empty_like(self.d_t1)
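# A minimal sketch of the event-timing idiom used by perform_precomputation
# above, assuming only a context from pycuda.autoinit; the gpuarray work is a
# placeholder for the real precompute_a / precompute_bc kernel launches.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

start, end = cuda.Event(), cuda.Event()
cuda.Context.synchronize()          # do not time previously queued work

start.record()
d_a = gpuarray.to_gpu(np.random.rand(1 << 20).astype(np.float64))
d_b = 2.0 * d_a                     # placeholder for the real kernels
end.record()
end.synchronize()

secs = start.time_till(end) * 1e-3  # time_till() returns milliseconds
print("Finished precomputation sketch in: %fs" % secs)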
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
from skcuda.fft import fft, Plan
import pycuda.driver as cuda
import skcuda.cublas as cublas
import skcuda.misc

s = cuda.Event()
e = cuda.Event()
s.record()

nStreams = 8
stream = [cuda.Stream() for i in range(nStreams)]
N = 8192
print(skcuda.misc.get_current_device())

# one host chunk per stream (integer division keeps the size an int)
x = [np.asarray(np.random.rand(N // nStreams), np.float32) for i in range(nStreams)]
#x_pin = cuda.register_host_memory(x)
#xf = np.fft.fft(x)

# asynchronous uploads, one R2C output buffer and one plan per stream
x_gpu = [gpuarray.to_gpu_async(x[i], stream=stream[i]) for i in range(nStreams)]
xf_gpu = [gpuarray.empty((N // nStreams) // 2 + 1, np.complex64) for i in range(nStreams)]
plan = [Plan(x[0].shape, np.float32, np.complex64, stream=stream[i])
        for i in range(nStreams)]
print(skcuda.misc.get_current_device())

for i in range(nStreams):
    fft(x_gpu[i], xf_gpu[i], plan[i])
print(skcuda.misc.get_current_device())

x_pin = [xf_gpu[i].get_async(stream=stream[i]) for i in range(nStreams)]
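# The script above records the start event `s` but never the end event `e`;
# a hedged sketch of how that timing could be closed, assuming the lists
# `stream` and `xf_gpu` and the events `s`/`e` defined above:
for st in stream:
    st.synchronize()                # wait for the async FFTs and downloads
e.record()
e.synchronize()
print("multi-stream FFT pipeline: %.3f ms" % s.time_till(e))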
# reference: https://medium.com/@CIulius/five-different-ways-to-sum-vectors-in-pycuda-3f2d9409b139
import numpy as np

# --- PyCUDA initialization
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit

########
# MAIN #
########
start = cuda.Event()
end = cuda.Event()

N = 100000

# --- Create random vectors on the CPU
h_a = np.random.randn(1, N)
h_b = np.random.randn(1, N)

# --- Set CPU arrays as single precision
h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)
h_c = np.empty_like(h_a)

d_a = gpuarray.to_gpu(h_a)
d_b = gpuarray.to_gpu(h_b)

start.record()
d_c = (d_a + d_b)
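# The snippet stops right after enqueueing the gpuarray addition; a hedged
# sketch of how the timing is typically closed with the same `start`/`end`
# events, plus an optional check against the CPU sum:
end.record()
end.synchronize()
print("GPU vector sum took %.3f ms" % start.time_till(end))

h_c = d_c.get()                     # download the result
assert np.allclose(h_c, h_a + h_b)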
def main_cg(): from optparse import OptionParser parser = OptionParser(usage="%prog [options] MATRIX-MARKET-FILE") parser.add_option( "-s", "--is-symmetric", action="store_true", help="Specify that the input matrix is already symmetric") options, args = parser.parse_args() from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool dev_pool = DeviceMemoryPool() pagelocked_pool = PageLockedMemoryPool() from scipy.io import mmread csr_mat = mmread(args[0]).tocsr().astype(numpy.float32) inv_mat_diag = 1 / csr_mat.diagonal() print "building..." from pycuda.sparse.packeted import PacketedSpMV spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype) rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype) from pycuda.sparse.operator import DiagonalPreconditioner if True: precon = DiagonalPreconditioner( spmv.permute( gpuarray.to_gpu(inv_mat_diag, allocator=dev_pool.allocate))) else: precon = None from pycuda.sparse.cg import solve_pkt_with_cg print "start solve" for i in range(4): start = drv.Event() stop = drv.Event() start.record() rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate) res_gpu, it_count, res_count = \ solve_pkt_with_cg(spmv, rhs_gpu, precon, tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5, pagelocked_allocator=pagelocked_pool.allocate) res = res_gpu.get() stop.record() stop.synchronize() elapsed = stop.time_since(start) * 1e-3 est_flops = (csr_mat.nnz * 2 * (it_count + res_count) + csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count) if precon is not None: est_flops += csr_mat.shape[0] * it_count print "residual norm: %g" % (la.norm(csr_mat * res - rhs) / la.norm(rhs)) print( "size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, " "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count, it_count / elapsed, est_flops / elapsed / 1e9)) # TODO: mixed precision # TODO: benchmark pagelocked_pool.stop_holding() dev_pool.stop_holding()
        if(con_prob[i] < min) {
            min = con_prob[i];
            predict = i;
        }
    }
    if (predict != (int)test_labels[idx]) { error[idx] += 1; }
}
""")

discrete_predict = mod.get_function("discrete_predict")
discrete_likelihood = mod.get_function("discrete_likelihood")
gaussian_predict = mod.get_function("gaussian_predict")
gaussian_likelihood_mean = mod.get_function("gaussian_likelihood_mean")
gaussian_likelihood_var = mod.get_function("gaussian_likelihood_var")

end = drv.Event()

parser = ArgumentParser()
parser.add_argument("--mode", type=int, default=0, help="discrete:0, continuous:1")
args = parser.parse_args()


def readmnist(mnist_dir, mode='training'):
    if mode == 'training':
        image_dir = os.path.join(mnist_dir, 'train-images-idx3-ubyte')
        label_dir = os.path.join(mnist_dir, 'train-labels-idx1-ubyte')
    elif mode == 'testing':
        image_dir = os.path.join(mnist_dir, 't10k-images-idx3-ubyte')
def main(): import pycuda.gpuarray as gpuarray sizes = [] times_gpu = [] flops_gpu = [] flops_cpu = [] times_cpu = [] from pycuda.tools import bitlog2 max_power = bitlog2(drv.mem_get_info()[0]) - 2 # they're floats, i.e. 4 bytes each for power in range(10, max_power): size = 1 << power print(size) sizes.append(size) a = gpuarray.zeros((size, ), dtype=numpy.float32) b = gpuarray.zeros((size, ), dtype=numpy.float32) b.fill(1) if power > 20: count = 100 else: count = 1000 # gpu ----------------------------------------------------------------- start = drv.Event() end = drv.Event() start.record() for i in range(count): a + b end.record() end.synchronize() secs = start.time_till(end) * 1e-3 times_gpu.append(secs / count) flops_gpu.append(size) del a del b # cpu ----------------------------------------------------------------- a_cpu = numpy.random.randn(size).astype(numpy.float32) b_cpu = numpy.random.randn(size).astype(numpy.float32) # start timer from time import time start = time() for i in range(count): a_cpu + b_cpu secs = time() - start times_cpu.append(secs / count) flops_cpu.append(size) # calculate pseudo flops flops_gpu = [f / t for f, t in zip(flops_gpu, times_gpu)] flops_cpu = [f / t for f, t in zip(flops_cpu, times_cpu)] from pytools import Table tbl = Table() tbl.add_row(( "Size", "Time GPU", "Size/Time GPU", "Time CPU", "Size/Time CPU", "GPU vs CPU speedup", )) for s, t, f, t_cpu, f_cpu in zip(sizes, times_gpu, flops_gpu, times_cpu, flops_cpu): tbl.add_row((s, t, f, t_cpu, f_cpu, f / f_cpu)) print(tbl)
stdTGPU2 = stdUnderMask(tempGPU, maskGPU, np.float32(meanVal), p=p) linearAdd(tempGPU, maskGPU, np.float64(meanTGPU.get()), np.float64(1. / stdTGPU.get())) tgpu = tempGPU.get() print(tvol.mean(), tvol.max(), tvol.min(), tvol.std()) print(tgpu.mean(), tgpu.max(), tgpu.min(), tgpu.std()) print() print(stdVal, meanVal, stdTGPU, meanTGPU, stdTGPU2) import sys sys.exit() #mask = paste_in_center(mask, np.zeros_like(volume)) start = driver.Event() end = driver.Event() start.record() #scores, angles, plan = template_matching_gpu(volume, temp, mask, wedgeV, np.fft.fftshift(stdV).astype(np.float32), [[0,0,0],]*num_angles, return_cpu=True) end.record() end.synchronize() print('exec time (s):', start.time_till(end) * 1e-3, 7000 * start.time_till(end) * 1e-3 / num_angles / 60) map = ((plan.templatePadded.get())) print(map.max(), map.min(), map.mean()) mrcfile.new('templatePadded.mrc', map.astype(np.float32), overwrite=True)
def convolution_example(context): #create input data image_width = 1024 image_height = 1024 filter_width = 17 filter_height = 17 input_width = image_width + 2*(filter_width//2) input_height = image_height + 2*(filter_height//2) input_image = numpy.random.randn(input_width, input_height).astype(numpy.float32) filter = numpy.random.randn(filter_width, filter_height).astype(numpy.float32) output_image = numpy.zeros((image_width, image_height), dtype=numpy.float32) #move data to the GPU args = [output_image, input_image, filter] gpu_args = [] for arg in args: gpu_args.append(drv.mem_alloc(arg.nbytes)) drv.memcpy_htod(gpu_args[-1], arg) #read kernel into string with open('convolution.cu', 'r') as f: kernel_string = f.read() #get compute capability for compiling CUDA kernels devprops = { str(k): v for (k, v) in context.get_device().get_attributes().items() } cc = str(devprops['COMPUTE_CAPABILITY_MAJOR']) + str(devprops['COMPUTE_CAPABILITY_MINOR']) #compile the kernels module = SourceModule(kernel_string, arch='compute_' + cc, code='sm_' + cc, cache_dir=False, no_extern_c=True) convolution = module.get_function("convolution_kernel") convolution_naive = module.get_function("convolution_kernel_naive") #setup thread block sizes threads = (32, 16, 1) grid = (int(numpy.ceil(image_width/float(threads[0]))), int(numpy.ceil(image_height/float(threads[1]))), 1) #compute reference using naive kernel reference = numpy.zeros_like(output_image) start = drv.Event() end = drv.Event() context.synchronize() start.record() convolution_naive(*gpu_args, block=threads, grid=grid, stream=None, shared=0) end.record() context.synchronize() print("convolution_kernel_naive took", end.time_since(start), "ms.") drv.memcpy_dtoh(reference, gpu_args[0]) drv.memcpy_htod(gpu_args[0], output_image) #launch the kernel context.synchronize() start.record() convolution(*gpu_args, block=threads, grid=grid, stream=None, shared=0) end.record() context.synchronize() print("convolution_kernel took", end.time_since(start), "ms.") #copy output data back from GPU drv.memcpy_dtoh(output_image, gpu_args[0]) #compare output with reference correct = numpy.allclose(output_image, reference, atol=1e-6) if not correct: print("TEST FAILED!") else: print("TEST PASSED!")
hog_output = np.zeros((height, width)).ravel().astype(np.float32) # Number of threads per block n_TPB_x = int(16) n_TPB_y = int(16) # Number of thread blocks n_blocks_x = int(np.ceil(width / n_TPB_x)) n_blocks_y = int(np.ceil(height / n_TPB_y)) ### # Patch-based HoG kernel ### # Start the timer gpu_start_time = cu.Event() gpu_end_time = cu.Event() gpu_start_time.record() # Create device variables input_im_device = cu.mem_alloc(t_image.nbytes) cu.memcpy_htod(input_im_device, t_image) x_gradient_device = cu.mem_alloc(x_gradient.nbytes) cu.memcpy_htod(x_gradient_device, x_gradient) y_gradient_device = cu.mem_alloc(y_gradient.nbytes) cu.memcpy_htod(y_gradient_device, y_gradient) gradient_mag_device = cu.mem_alloc(gradient_mag.nbytes) cu.memcpy_htod(gradient_mag_device, gradient_mag)
def project_Kt(XKt, LorY, surfSrc, surfTar, Kt_diag, self, param, ind0, timing, kernel): if param.GPU == 1: tic = cuda.Event() toc = cuda.Event() else: tic = Event() toc = Event() REAL = param.REAL Ns = len(surfSrc.triangle) Nt = len(surfTar.triangle) L = numpy.sqrt(2 * surfSrc.Area) # Representative length tic.record() K = param.K w = getWeights(K) X_Kt = numpy.zeros(Ns * K) X_Ktc = numpy.zeros(Ns * K) NsK = numpy.arange(Ns * K) X_Kt[:] = XKt[NsK / K] * w[NsK % K] * surfSrc.Area[NsK / K] X_Ktc[:] = XKt[NsK / K] toc.record() toc.synchronize() timing.time_mass += tic.time_till(toc) * 1e-3 tic.record() C = 0 X_aux = numpy.zeros(Ns * K) getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_Kt, X_aux, X_aux, X_aux, ind0, param.P, param.NCRIT) toc.record() toc.synchronize() timing.time_P2M += tic.time_till(toc) * 1e-3 tic.record() for C in reversed(range(1, len(surfSrc.tree))): PC = surfSrc.tree[C].parent upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK, ind0.index, ind0.combII, ind0.combJJ, ind0.combKK, ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small, ind0.index_ptr) toc.record() toc.synchronize() timing.time_M2M += tic.time_till(toc) * 1e-3 tic.record() X_Kt = X_Kt[surfSrc.sortSource] X_Ktc = X_Ktc[surfSrc.sortSource] toc.record() toc.synchronize() timing.time_sort += tic.time_till(toc) * 1e-3 param.Nround = len(surfTar.twig) * param.NCRIT Ktx_aux = numpy.zeros(param.Nround) Kty_aux = numpy.zeros(param.Nround) Ktz_aux = numpy.zeros(param.Nround) AI_int = 0 ### CPU code if param.GPU == 0: if surfTar.offsetMlt[self, len(surfTar.twig)] > 0: Ktx_aux, Kty_aux, Ktz_aux = M2PKt_sort(surfSrc, surfTar, Ktx_aux, Kty_aux, Ktz_aux, self, ind0.index_large, param, LorY, timing) Ktx_aux, Kty_aux, Ktz_aux = P2PKt_sort(surfSrc, surfTar, X_Kt, X_Ktc, Ktx_aux, Kty_aux, Ktz_aux, self, LorY, w, param, timing) ### GPU code elif param.GPU == 1: Ktx_gpu = cuda.to_device(Ktx_aux.astype(REAL)) Kty_gpu = cuda.to_device(Kty_aux.astype(REAL)) Ktz_gpu = cuda.to_device(Ktz_aux.astype(REAL)) if surfTar.offsetMlt[self, len(surfTar.twig)] > 0: Ktx_gpu, Kty_gpu, Ktz_gpu = M2PKt_gpu(surfSrc, surfTar, Ktx_gpu, Kty_gpu, Ktz_gpu, self, ind0, param, LorY, timing, kernel) Ktx_gpu, Kty_gpu, Ktz_gpu = P2PKt_gpu(surfSrc, surfTar, X_Kt, X_Ktc, Ktx_gpu, Kty_gpu, Ktz_gpu, self, LorY, w, param, timing, kernel) tic.record() Ktx_aux = cuda.from_device(Ktx_gpu, len(Ktx_aux), dtype=REAL) Kty_aux = cuda.from_device(Kty_gpu, len(Kty_aux), dtype=REAL) Ktz_aux = cuda.from_device(Ktz_gpu, len(Ktz_aux), dtype=REAL) toc.record() toc.synchronize() timing.time_trans += tic.time_till(toc) * 1e-3 tic.record() Kt_lyr = Ktx_aux[surfTar.unsort]*surfTar.normal[:,0] \ + Kty_aux[surfTar.unsort]*surfTar.normal[:,1] \ + Ktz_aux[surfTar.unsort]*surfTar.normal[:,2] if abs(Kt_diag) > 1e-12: # if same surface Kt_lyr += Kt_diag * XKt toc.record() toc.synchronize() timing.time_sort += tic.time_till(toc) * 1e-3 return Kt_lyr
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE, self, param, ind0, timing, kernel): if param.GPU == 1: tic = cuda.Event() toc = cuda.Event() else: tic = Event() toc = Event() REAL = param.REAL Ns = len(surfSrc.triangle) Nt = len(surfTar.triangle) L = numpy.sqrt(2 * surfSrc.Area) # Representative length tic.record() K = param.K w = getWeights(K) X_V = numpy.zeros(Ns * K) X_Kx = numpy.zeros(Ns * K) X_Ky = numpy.zeros(Ns * K) X_Kz = numpy.zeros(Ns * K) X_Kc = numpy.zeros(Ns * K) X_Vc = numpy.zeros(Ns * K) NsK = numpy.arange(Ns * K) X_V[:] = XV[NsK / K] * w[NsK % K] * surfSrc.Area[NsK / K] X_Kx[:] = XK[NsK / K] * w[NsK % K] * surfSrc.Area[NsK / K] * surfSrc.normal[NsK / K, 0] X_Ky[:] = XK[NsK / K] * w[NsK % K] * surfSrc.Area[NsK / K] * surfSrc.normal[NsK / K, 1] X_Kz[:] = XK[NsK / K] * w[NsK % K] * surfSrc.Area[NsK / K] * surfSrc.normal[NsK / K, 2] X_Kc[:] = XK[NsK / K] X_Vc[:] = XV[NsK / K] toc.record() toc.synchronize() timing.time_mass += tic.time_till(toc) * 1e-3 tic.record() C = 0 getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_V, X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT) toc.record() toc.synchronize() timing.time_P2M += tic.time_till(toc) * 1e-3 tic.record() for C in reversed(range(1, len(surfSrc.tree))): PC = surfSrc.tree[C].parent upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK, ind0.index, ind0.combII, ind0.combJJ, ind0.combKK, ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small, ind0.index_ptr) toc.record() toc.synchronize() timing.time_M2M += tic.time_till(toc) * 1e-3 tic.record() X_V = X_V[surfSrc.sortSource] X_Kx = X_Kx[surfSrc.sortSource] X_Ky = X_Ky[surfSrc.sortSource] X_Kz = X_Kz[surfSrc.sortSource] X_Kc = X_Kc[surfSrc.sortSource] X_Vc = X_Vc[surfSrc.sortSource] toc.record() toc.synchronize() timing.time_sort += tic.time_till(toc) * 1e-3 param.Nround = len(surfTar.twig) * param.NCRIT K_aux = numpy.zeros(param.Nround) V_aux = numpy.zeros(param.Nround) AI_int = 0 ### CPU code if param.GPU == 0: K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self, ind0.index_large, param, LorY, timing) K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc, X_Vc, K_aux, V_aux, self, LorY, K_diag, V_diag, IorE, L, w, param, timing) ### GPU code elif param.GPU == 1: K_gpu = cuda.to_device(K_aux.astype(REAL)) V_gpu = cuda.to_device(V_aux.astype(REAL)) if surfTar.offsetMlt[self, len(surfTar.twig)] > 0: K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self, ind0, param, LorY, timing, kernel) K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc, X_Vc, K_gpu, V_gpu, self, LorY, K_diag, IorE, L, w, param, timing, kernel) tic.record() K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL) V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL) toc.record() toc.synchronize() timing.time_trans += tic.time_till(toc) * 1e-3 tic.record() K_lyr = K_aux[surfTar.unsort] V_lyr = V_aux[surfTar.unsort] toc.record() toc.synchronize() timing.time_sort += tic.time_till(toc) * 1e-3 return K_lyr, V_lyr
def estimate_diameter_gpu(skel, data, lat_data, azth_data, n_scan_angles, max_iters=150, do_reshape=True): """Computes 3D diameter at every point of a skeleton of data using GPU. Estimates 3D diameter at every point of the skeleton `skel` extracted from the binary data `data` with help of orientation information provided by `lat_data` and `azth_data` arrays. The diameter is evaluated with a ray casting approach `cast_ray` adapted for a 3D case. Parameters ---------- skel : 3D array Indicates the skeleton of the binary data. data : 3D array Indicates the 3D binary data. lat_data : 3D array Indicates the 3D array containing latitude / elevation angle at every point of the skeleton in radians. azth_data : 3D array Indicates the 3D array containing azimuth angle at every point of the skeleton in radians. n_scan_angles : int Indicates the number of scanning angles on a range [0, 360] degrees. max_iters : int Indicates the maximum length of a ray in each direction. do_reshape : boolean Specifies if the output array should be reshaped immediately after estimation. Returns ------- out : dict The dictionary of the 3D array of estimated diameter and the execution time. """ if not cuda_available: print 'The pycuda package is not found. The diameter estimation cannot be done.' return None program, diameter3d = _diameter_kernel() Z, Y, X = np.int32(skel.nonzero()) depth, height, width = np.uint32(skel.shape) scan_angl_arr = np.deg2rad( np.float32(np.linspace(0, 360, num=n_scan_angles, endpoint=False))) radius_arr = np.zeros_like(Z, dtype=np.float32) lat_data_1d = lat_data[skel.nonzero()] azth_data_1d = azth_data[skel.nonzero()] gpu_X = gpuarray.to_gpu(X) gpu_Y = gpuarray.to_gpu(Y) gpu_Z = gpuarray.to_gpu(Z) gpu_lat_data_1d = gpuarray.to_gpu(lat_data_1d) gpu_azth_data_1d = gpuarray.to_gpu(azth_data_1d) gpu_radius_arr = gpuarray.to_gpu(radius_arr) gpu_scan_angl_arr = gpuarray.to_gpu(scan_angl_arr) gpu_rad_tex = program.get_texref('tex_data') gpu_data = numpy3d_to_array(data) gpu_rad_tex.set_array(gpu_data) n_scan_angles = np.uint32(n_scan_angles) n_points = np.uint32(len(Z)) max_iters = np.uint32(max_iters) norm_factor = np.float32(1. / n_scan_angles) block = (16, 16, 1) n_blocks = np.ceil(float(n_points) / (block[0] * block[1])) g_cols = 2 g_rows = np.int(np.ceil(n_blocks / g_cols)) grid = (g_rows, g_cols, 1) start = cuda.Event() end = cuda.Event() start.record() # start timing diameter3d(width, height, depth, n_points, norm_factor, max_iters, n_scan_angles, gpu_X, gpu_Y, gpu_Z, gpu_scan_angl_arr, gpu_azth_data_1d, gpu_lat_data_1d, gpu_radius_arr, block=block, grid=grid) end.record() # end timing end.synchronize() dm_time = start.time_till(end) * 1e-3 print "Diameter estimation time: %fs" % (dm_time) radius_arr = gpu_radius_arr.get() if do_reshape: radius_arr = np.reshape(radius_arr, data.shape, order='C') out = {'diameter': radius_arr * 2., 'time': dm_time} return out
onembed.ctypes.data, 1, 39 * BENG_CHANNELS_ + 1, cufft.CUFFT_R2C, 1) # Turn trimmed spectrum into 2048 timeseries n = array([32 * 2 * BENG_CHANNELS_], int32) inembed = array([39 * BENG_CHANNELS_ + 1], int32) onembed = array([32 * 2 * BENG_CHANNELS_], int32) plan_interp_B = cufft.cufftPlanMany(1, n.ctypes.data, inembed.ctypes.data, 1, 39 * BENG_CHANNELS_ + 1, onembed.ctypes.data, 1, 32 * 2 * BENG_CHANNELS_, cufft.CUFFT_C2R, 1) # event timers tic = cuda.Event() toc = cuda.Event() # allocate device memory # reader gpu_vdif_buf = cuda.mem_alloc(cpu_vdif_buf.nbytes) gpu_beng_data_0 = cuda.mem_alloc(8 * BENG_CHANNELS_ * BENG_SNAPSHOTS * BENG_BUFFER_IN_COUNTS) gpu_beng_data_1 = cuda.mem_alloc(8 * BENG_CHANNELS_ * BENG_SNAPSHOTS * BENG_BUFFER_IN_COUNTS) gpu_fid = cuda.mem_alloc(4 * VDIF_PER_BENG * BENG_BUFFER_IN_COUNTS) gpu_cid = cuda.mem_alloc(4 * VDIF_PER_BENG * BENG_BUFFER_IN_COUNTS) gpu_bcount = cuda.mem_alloc(4 * VDIF_PER_BENG * BENG_BUFFER_IN_COUNTS) gpu_beng_frame_completion = cuda.mem_alloc(4 * BENG_BUFFER_IN_COUNTS) # reorder/resample # buffers
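# A hedged sketch (assumed usage, not shown in the fragment above) of how a
# batched C2R plan such as plan_interp_B is typically executed and released
# with scikit-cuda's low-level wrappers; gpu_spec and gpu_series are
# hypothetical device buffers for the trimmed spectrum and the resampled
# timeseries.
tic.record()
cufft.cufftExecC2R(plan_interp_B, int(gpu_spec), int(gpu_series))
toc.record()
toc.synchronize()
print("C2R interpolation FFT: %.3f ms" % tic.time_till(toc))
cufft.cufftDestroy(plan_interp_B)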
def vis_gpu(antpos, freq, eq2tops, crd_eq, I_sky, bm_cube, nthreads=NTHREADS, max_memory=MAX_MEMORY, real_dtype=np.float32, complex_dtype=np.complex64, verbose=False): # ensure shapes nant = antpos.shape[0] assert (antpos.shape == (nant, 3)) npix = crd_eq.shape[1] assert (crd_eq.shape == (3, npix)) assert (I_sky.shape == (npix, )) beam_px = bm_cube.shape[1] assert (bm_cube.shape == (nant, beam_px, beam_px)) ntimes = eq2tops.shape[0] assert (eq2tops.shape == (ntimes, 3, 3)) # ensure data types antpos = antpos.astype(real_dtype) eq2tops = eq2tops.astype(real_dtype) crd_eq = crd_eq.astype(real_dtype) Isqrt = np.sqrt(I_sky).astype(real_dtype) bm_cube = bm_cube.astype(real_dtype) # XXX complex? chunk = max(min(npix, MIN_CHUNK), 2**int(ceil(np.log2(float(nant * npix) / max_memory / 2)))) npixc = npix / chunk # blocks of threads are mapped to (pixels,ants,freqs) block = (max(1, nthreads / nant), min(nthreads, nant), 1) grid = (int(ceil(npixc / float(block[0]))), int(ceil(nant / float(block[1])))) gpu_code = GPU_TEMPLATE % { 'NANT': nant, 'NPIX': npixc, 'BEAM_PX': beam_px, 'BLOCK_PX': block[0], } gpu_module = compiler.SourceModule(gpu_code) bm_interp = gpu_module.get_function("InterpolateBeam") meas_eq = gpu_module.get_function("MeasEq") bm_texref = gpu_module.get_texref("bm_tex") import pycuda.autoinit h = cublasCreate() # handle for managing cublas # define GPU buffers and transfer initial values bm_texref.set_array( numpy3d_to_array(bm_cube) ) # never changes, transpose happens in copy so cuda bm_tex is (BEAM_PX,BEAM_PX,NANT) antpos_gpu = gpuarray.to_gpu( antpos) # never changes, set to -2*pi*antpos/c Isqrt_gpu = gpuarray.empty(shape=(npixc, ), dtype=real_dtype) A_gpu = gpuarray.empty(shape=(nant, npixc), dtype=real_dtype) # will be set on GPU by bm_interp crd_eq_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype) eq2top_gpu = gpuarray.empty(shape=(3, 3), dtype=real_dtype) # sent from CPU each time crdtop_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype) # will be set on GPU tau_gpu = gpuarray.empty(shape=(nant, npixc), dtype=real_dtype) # will be set on GPU v_gpu = gpuarray.empty(shape=(nant, npixc), dtype=complex_dtype) # will be set on GPU vis_gpus = [ gpuarray.empty(shape=(nant, nant), dtype=complex_dtype) for i in xrange(chunk) ] # output CPU buffers for downloading answers vis_cpus = [ np.empty(shape=(nant, nant), dtype=complex_dtype) for i in xrange(chunk) ] streams = [driver.Stream() for i in xrange(chunk)] event_order = ('start', 'upload', 'eq2top', 'tau', 'interpolate', 'meas_eq', 'vis', 'end') vis = np.empty((ntimes, nant, nant), dtype=complex_dtype) for t in xrange(ntimes): if verbose: print '%d/%d' % (t + 1, ntimes) eq2top_gpu.set( eq2tops[t]) # defines sky orientation for this time step events = [{e: driver.Event() for e in event_order} for i in xrange(chunk)] for c in xrange(chunk + 2): cc = c - 1 ccc = c - 2 if 0 <= ccc < chunk: stream = streams[ccc] vis_gpus[ccc].get_async(ary=vis_cpus[ccc], stream=stream) events[ccc]['end'].record(stream) if 0 <= cc < chunk: stream = streams[cc] cublasSetStream(h, stream.handle) ## compute crdtop = dot(eq2top,crd_eq) # cublas arrays are in Fortran order, so P=M*N is actually # peformed as P.T = N.T * M.T cublasSgemm(h, 'n', 'n', npixc, 3, 3, 1., crd_eq_gpu.gpudata, npixc, eq2top_gpu.gpudata, 3, 0., crdtop_gpu.gpudata, npixc) events[cc]['eq2top'].record(stream) ## compute tau = dot(antpos,crdtop) cublasSgemm(h, 'n', 'n', npixc, nant, 3, 1., crdtop_gpu.gpudata, npixc, antpos_gpu.gpudata, 3, 0., tau_gpu.gpudata, npixc) 
events[cc]['tau'].record(stream) ## interpolate bm_tex at specified topocentric coords, store interpolation in A ## threads are parallelized across pixel axis bm_interp(crdtop_gpu, A_gpu, grid=grid, block=block, stream=stream) events[cc]['interpolate'].record(stream) # compute v = A * I * exp(1j*tau*freq) meas_eq(A_gpu, Isqrt_gpu, tau_gpu, real_dtype(freq), v_gpu, grid=grid, block=block, stream=stream) events[cc]['meas_eq'].record(stream) # compute vis = dot(v, v.T) # transpose below incurs about 20% overhead cublasCgemm(h, 'c', 'n', nant, nant, npixc, 1., v_gpu.gpudata, npixc, v_gpu.gpudata, npixc, 0., vis_gpus[cc].gpudata, nant) events[cc]['vis'].record(stream) if c < chunk: stream = streams[c] events[c]['start'].record(stream) crd_eq_gpu.set_async(crd_eq[:, c * npixc:(c + 1) * npixc], stream=stream) Isqrt_gpu.set_async(Isqrt[c * npixc:(c + 1) * npixc], stream=stream) events[c]['upload'].record(stream) events[chunk - 1]['end'].synchronize() vis[t] = sum(vis_cpus) if verbose: for c in xrange(chunk): print '%d:%d START->END:' % ( c, chunk), events[c]['start'].time_till( events[c]['end']) * 1e-3 #for i,e in enumerate(event_order[:-1]): # print c, e,'->',event_order[i+1], ':', events[c][e].time_till(events[c][event_order[i+1]]) * 1e-3 print 'TOTAL:', events[0]['start'].time_till( events[chunk - 1]['end']) * 1e-3 # teardown GPU configuration cublasDestroy(h) return vis
def run_simulation(self): # setup data#{{{ data = { 'weights': self.weights, 'lengths': self.lengths, 'params': self.params.T } base_shape = self.n_work_items, for name, shape in dict( tavg0=(self.exposures, self.args.n_regions,), tavg1=(self.exposures, self.args.n_regions,), state=(self.buf_len, self.states * self.args.n_regions), ).items(): # memory error exception for compute device try: data[name] = np.zeros(shape + base_shape, 'f') except MemoryError as e: self.logger.error('%s.\n\t Please check the parameter dimensions %d x %d, they are to large ' 'for this compute device', e, self.args.n_sweep_arg0, self.args.n_sweep_arg1) exit(1) gpu_data = self.make_gpu_data(data)#{{{ # setup CUDA stuff#{{{ step_fn = self.make_kernel( source_file=self.args.filename, warp_size=32, # block_dim_x=self.args.n_sweep_arg0, # ext_options=preproccesor_defines, # caching=args.caching, args=self.args, lineinfo=self.args.lineinfo, nh=self.buf_len, )#}}} # setup simulation#{{{ tic = time.time() n_streams = 32 streams = [drv.Stream() for i in range(n_streams)] events = [drv.Event() for i in range(n_streams)] tavg_unpinned = [] try: tavg = drv.pagelocked_zeros((n_streams,) + data['tavg0'].shape, dtype=np.float32) except drv.MemoryError as e: self.logger.error( '%s.\n\t Please check the parameter dimensions, %d parameters are too large for this GPU', e, self.params.size) exit(1) # determine optimal grid recursively def dog(fgd): maxgd, mingd = max(fgd), min(fgd) maxpos = fgd.index(max(fgd)) if (maxgd - 1) * mingd * bx * by >= nwi: fgd[maxpos] = fgd[maxpos] - 1 dog(fgd) else: return fgd # n_sweep_arg0 scales griddim.x, n_sweep_arg1 scales griddim.y # form an optimal grid recursively bx, by = self.args.blockszx, self.args.blockszy nwi = self.n_work_items rootnwi = int(np.ceil(np.sqrt(nwi))) gridx = int(np.ceil(rootnwi / bx)) gridy = int(np.ceil(rootnwi / by)) final_block_dim = bx, by, 1 fgd = [gridx, gridy] dog(fgd) final_grid_dim = fgd[0], fgd[1] assert gridx * gridy * bx * by >= nwi self.logger.info('history shape %r', gpu_data['state'].shape) self.logger.info('gpu_data %s', gpu_data['tavg0'].shape) self.logger.info('on device mem: %.3f MiB' % (self.nbytes(data) / 1024 / 1024, )) self.logger.info('final block dim %r', final_block_dim) self.logger.info('final grid dim %r', final_grid_dim) # run simulation#{{{ nstep = self.args.n_time self.gpu_mem_info() if self.args.verbose else None try: for i in tqdm.trange(nstep, file=sys.stdout): try: event = events[i % n_streams] stream = streams[i % n_streams] if i > 0: stream.wait_for_event(events[(i - 1) % n_streams]) step_fn(np.uintc(i * self.n_inner_steps), np.uintc(self.args.n_regions), np.uintc(self.buf_len), np.uintc(self.n_inner_steps), np.uintc(self.n_work_items), np.float32(self.dt), gpu_data['weights'], gpu_data['lengths'], gpu_data['params'], gpu_data['state'], gpu_data['tavg%d' % (i%2,)], block=final_block_dim, grid=final_grid_dim) event.record(streams[i % n_streams]) except drv.LaunchError as e: self.logger.error('%s', e) exit(1) tavgk = 'tavg%d' % ((i + 1) % 2,) # async wrt. other streams & host, but not this stream. 
if i >= n_streams: stream.synchronize() tavg_unpinned.append(tavg[i % n_streams].copy()) drv.memcpy_dtoh_async(tavg[i % n_streams], gpu_data[tavgk].ptr, stream=stream) # recover uncopied data from pinned buffer if nstep > n_streams: for i in range(nstep % n_streams, n_streams): stream.synchronize() tavg_unpinned.append(tavg[i].copy()) for i in range(nstep % n_streams): stream.synchronize() tavg_unpinned.append(tavg[i].copy()) except drv.LogicError as e: self.logger.error('%s. Check the number of states of the model or ' 'GPU block shape settings blockdim.x/y %r, griddim %r.', e, final_block_dim, final_grid_dim) exit(1) except drv.RuntimeError as e: self.logger.error('%s', e) exit(1) # self.logger.info('kernel finish..') # release pinned memory tavg = np.array(tavg_unpinned) # also release gpu_data self.release_gpumem(gpu_data) self.logger.info('kernel finished') return tavg
def chyqmom27( moments: np.ndarray, size: int): mem_d_size_in_byte = np.ones(size).astype(np.float32).nbytes sizeof_float = np.int32(np.dtype(np.float32).itemsize) size = np.int32(size) BlockSize = (256, 1, 1) GridSize = (size +BlockSize[0] - 1) /BlockSize[0]; GridSize = (int(GridSize), 1, 1) # compile kernel HYQ = SourceModule(HYQMOM) CHY27 = SourceModule(CHYQMOM27) hyqmom3 = HYQ.get_function('hyqmom3') c_kernel = CHY27.get_function('chyqmom27_cmoments') chyqmom27_rho_yf = CHY27.get_function('chyqmom27_rho_yf') chyqmom27_zf = CHY27.get_function('chyqmom27_zf') chyqmom27_mu = CHY27.get_function('chyqmom27_mu') float_value_set = CHY27.get_function('float_value_set') float_array_set = CHY27.get_function('float_array_set') chyqmom27_set_m = CHY27.get_function('chyqmom27_set_m') print_device = CHY27.get_function('print_device') chyqmom27_wout = CHY27.get_function('chyqmom27_wout') chyqmom27_xout = CHY27.get_function('chyqmom27_xout') chyqmom27_yout = CHY27.get_function('chyqmom27_yout') chyqmom27_zout = CHY27.get_function('chyqmom27_zout') w = cuda.aligned_zeros((27, int(size)), dtype=np.float32) x = cuda.aligned_zeros((27, int(size)), dtype=np.float32) y = cuda.aligned_zeros((27, int(size)), dtype=np.float32) z = cuda.aligned_zeros((27, int(size)), dtype=np.float32) # Allocate memory moments_device = cuda.mem_alloc(int(sizeof_float * size * 16)) c_moments = cuda.mem_alloc(int(sizeof_float * size * 12)) m = cuda.mem_alloc(int(sizeof_float * size * 10)) float_value_set(m, np.float32(1), size, np.int32(0), block=BlockSize, grid=GridSize) float_value_set(m, np.float32(0), size, size, block=BlockSize, grid=GridSize) w1 = cuda.mem_alloc(int(sizeof_float * size * 3)) x1 = cuda.mem_alloc(int(sizeof_float * size * 3)) w2 = cuda.mem_alloc(int(sizeof_float * size * 9)) x2 = cuda.mem_alloc(int(sizeof_float * size * 9)) y2 = cuda.mem_alloc(int(sizeof_float * size * 9)) rho = cuda.mem_alloc(int(sizeof_float * size * 9)) yf = cuda.mem_alloc(int(sizeof_float * size * 3)) yp = cuda.mem_alloc(int(sizeof_float * size * 9)) zf = cuda.mem_alloc(int(sizeof_float * size * 3)) w3 = cuda.mem_alloc(int(sizeof_float * size * 3)) x3 = cuda.mem_alloc(int(sizeof_float * size * 3)) mu = cuda.mem_alloc(int(sizeof_float * size * 3)) w_dev = cuda.mem_alloc(int(sizeof_float * size * 27)) x_dev = cuda.mem_alloc(int(sizeof_float * size * 27)) y_dev = cuda.mem_alloc(int(sizeof_float * size * 27)) z_dev = cuda.mem_alloc(int(sizeof_float * size * 27)) cuda.memcpy_htod(moments_device, moments) # Is this faster? time_before = cuda.Event() time_after = cuda.Event() time_before.record() c_kernel(moments_device, c_moments, size, block=BlockSize, grid=GridSize) float_array_set(m, c_moments, size, np.int32(2) * size, np.int32(0), block=BlockSize, grid=GridSize) float_array_set(m, c_moments, size, np.int32(3) * size, np.int32(6) * size, block=BlockSize, grid=GridSize) float_array_set(m, c_moments, size, np.int32(4) * size, np.int32(9) * size, block=BlockSize, grid=GridSize) # print("What is m1?") # print_device(m, np.int32(5), block=BlockSize, grid=GridSize) hyqmom3(m, x1, w1, size, block=BlockSize, grid=GridSize) # Is this faster? 
chyqmom27_set_m(m, c_moments, size, block=BlockSize, grid=GridSize) # this_context.synchronize() # print_device(m, np.int32(10), block=BlockSize, grid=GridSize) # this_context.synchronize() # print("Entering CHYQMOM9") chyqmom9(m, size, w2, x2, y2) # this_context.synchronize() # print("What is w2?") # print_device(w2, np.int32(10), block=BlockSize, grid=GridSize) chyqmom27_rho_yf(c_moments, y2, w2, rho, yf, yp, size, block=BlockSize, grid=GridSize) chyqmom27_zf(c_moments, x1, zf, size, block=BlockSize, grid=GridSize) chyqmom27_mu(c_moments, rho, zf, mu, size, block=BlockSize, grid=GridSize) float_array_set(m, mu, size, np.int32(2) * size, np.int32(0), block=BlockSize, grid=GridSize) float_array_set(m, mu, size, np.int32(3) * size, np.int32(1) * size, block=BlockSize, grid=GridSize) float_array_set(m, mu, size, np.int32(4) * size, np.int32(2) * size, block=BlockSize, grid=GridSize) hyqmom3(m, x3, w3, size, block=BlockSize, grid=GridSize) chyqmom27_wout(moments_device, w1, rho, w3, w_dev, size, block=BlockSize, grid=GridSize) chyqmom27_xout(moments_device, x1, x_dev, size, block=BlockSize, grid=GridSize) chyqmom27_yout(moments_device, yf, yp, y_dev, size, block=BlockSize, grid=GridSize) chyqmom27_zout(moments_device, zf, x3, z_dev, block=BlockSize, grid=GridSize) time_after.record() time_after.synchronize() elapsed_time = time_after.time_since(time_before) cuda.memcpy_dtoh(w, w_dev) cuda.memcpy_dtoh(x, x_dev) cuda.memcpy_dtoh(y, y_dev) cuda.memcpy_dtoh(z, z_dev) # this_context.synchronize() # print("Entering rho") # print_device(rho, np.int32(9*2), block=BlockSize, grid=GridSize) # this_context.synchronize() # print("Entering mu") # print_device(mu, np.int32(3*2), block=BlockSize, grid=GridSize) # this_context.synchronize() # print("Entering w1") # print_device(w1, np.int32(3*2), block=BlockSize, grid=GridSize) # this_context.synchronize() # print("Entering rho") # print_device(rho, np.int32(9*2), block=BlockSize, grid=GridSize) # this_context.synchronize() # print("Entering w3") # print_device(w3, np.int32(3*2), block=BlockSize, grid=GridSize) # this_context.synchronize() # print("Final w_dev") # print_device(w_dev, np.int32(27*1), block=BlockSize, grid=GridSize) moments_device.free() c_moments.free() m.free() w1.free() x1.free() w2.free() x2.free() y2.free() rho.free() yf.free() yp.free() zf.free() w3.free() x3.free() mu.free() return elapsed_time, w_dev, x_dev, y_dev, z_dev
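# A hedged usage sketch for chyqmom27 above: the 16-moments-per-element input
# layout is inferred from the size*16 allocation, and random values are used
# only to illustrate the call signature (they need not be valid moment sets).
import numpy as np

n = 1024
moments = np.random.rand(16, n).astype(np.float32)
elapsed_ms, w_dev, x_dev, y_dev, z_dev = chyqmom27(moments, n)
print("chyqmom27 took %.3f ms for %d elements" % (elapsed_ms, n))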
prev_layer = None for layer in layers: layer.fprop() if layer.weights is not None: mean = layer.get_activation_mean() scale = .5 #if prev_layer is None else prev_layer.reduction_factor() print("Scale weights: %.3f (%.3f) %s" % (scale / mean, scale, layer)) layer.weights *= scale / mean layer.fprop() prev_layer = layer ng.bench = layer_bench start = drv.Event() end = drv.Event() fprop_time = 0 bprop_time = 0 fprop_flops = 0 bprop_flops = 0 # We throw away the first run as it includes pycuda kernel loading times. # So add 1 to our loop count. for loop in range(loops + 1): start.record() flops = 0 #fprop
def sync_only(): sync_gpu(driver.Event())
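# sync_gpu is defined elsewhere; a plausible sketch (an assumption, not the
# project's actual helper) that blocks the host on an event recorded in the
# default stream:
def sync_gpu(event):
    event.record()
    event.synchronize()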
def solve_gpu(currentmodelrun, modelend, G): """Solving using FDTD method on GPU. Implemented using Nvidia CUDA. Args: currentmodelrun (int): Current model run number. modelend (int): Number of last model to run. G (class): Grid class instance - holds essential parameters describing the model. Returns: tsolve (float): Time taken to execute solving """ import pycuda.driver as drv from pycuda.compiler import SourceModule drv.init() # Suppress nvcc warnings on Windows if sys.platform == 'win32': compiler_opts = ['-w'] else: compiler_opts = None # Create device handle and context on specifc GPU device (and make it current context) dev = drv.Device(G.gpu.deviceID) ctx = dev.make_context() # Electric and magnetic field updates - prepare kernels, and get kernel functions if Material.maxpoles > 0: kernels_fields = SourceModule(kernels_template_fields.substitute( REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=G.Tx.shape[1], NY_T=G.Tx.shape[2], NZ_T=G.Tx.shape[3]), options=compiler_opts) else: # Set to one any substitutions for dispersive materials kernels_fields = SourceModule(kernels_template_fields.substitute( REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=1, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=1, NY_T=1, NZ_T=1), options=compiler_opts) update_e_gpu = kernels_fields.get_function("update_e") update_h_gpu = kernels_fields.get_function("update_h") # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0] if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem: raise GeneralError( 'Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU' .format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name)) else: drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU if Material.maxpoles > 0: # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values). 
update_e_dispersive_A_gpu = kernels_fields.get_function( "update_e_dispersive_A") update_e_dispersive_B_gpu = kernels_fields.get_function( "update_e_dispersive_B") G.gpu_initialise_dispersive_arrays() # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU G.gpu_set_blocks_per_grid() G.gpu_initialise_arrays() # PML updates if G.pmls: # Prepare kernels kernels_pml = SourceModule(kernels_template_pml.substitute( REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_R=G.pmls[0].ERA.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels updatecoeffsE = kernels_pml.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_pml.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Set block per grid, initialise arrays on GPU, and get kernel functions for pml in G.pmls: pml.gpu_set_blocks_per_grid(G) pml.gpu_initialise_arrays() pml.gpu_get_update_funcs(kernels_pml) # Receivers if G.rxs: # Initialise arrays on GPU rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G) # Prepare kernel and get kernel function kernel_store_outputs = SourceModule( kernel_template_store_outputs.substitute(REAL=cudafloattype, NY_RXCOORDS=3, NX_RXS=6, NY_RXS=G.iterations, NZ_RXS=len(G.rxs), NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]), options=compiler_opts) store_outputs_gpu = kernel_store_outputs.get_function("store_outputs") # Sources - initialise arrays on GPU, prepare kernel and get kernel functions if G.voltagesources + G.hertziandipoles + G.magneticdipoles: kernels_sources = SourceModule(kernels_template_sources.substitute( REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_SRCINFO=4, NY_SRCWAVES=G.iterations, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) if G.hertziandipoles: srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays( G.hertziandipoles, G) update_hertzian_dipole_gpu = kernels_sources.get_function( "update_hertzian_dipole") if G.magneticdipoles: srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays( G.magneticdipoles, G) update_magnetic_dipole_gpu = kernels_sources.get_function( "update_magnetic_dipole") if G.voltagesources: srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays( G.voltagesources, G) update_voltage_source_gpu = kernels_sources.get_function( "update_voltage_source") # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions if G.snapshots: # Initialise arrays on GPU snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array( G) # Prepare kernel and get kernel function 
kernel_store_snapshot = SourceModule( kernel_template_store_snapshot.substitute(REAL=cudafloattype, NX_SNAPS=Snapshot.nx_max, NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]), options=compiler_opts) store_snapshot_gpu = kernel_store_snapshot.get_function( "store_snapshot") # Iteration loop timer iterstart = drv.Event() iterend = drv.Event() iterstart.record() for iteration in tqdm(range(G.iterations), desc='Running simulation, model ' + str(currentmodelrun) + '/' + str(modelend), ncols=get_terminal_width() - 1, file=sys.stdout, disable=not G.progressbars): # Get GPU memory usage on final iteration if iteration == G.iterations - 1: memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0] # Store field component values for every receiver if G.rxs: store_outputs_gpu(np.int32(len(G.rxs)), np.int32(iteration), rxcoords_gpu.gpudata, rxs_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.rxs)), 1, 1)) # Store any snapshots for i, snap in enumerate(G.snapshots): if snap.time == iteration + 1: store_snapshot_gpu(np.int32(i), np.int32(snap.xs), np.int32(snap.xf), np.int32(snap.ys), np.int32(snap.yf), np.int32(snap.zs), np.int32(snap.zf), np.int32(snap.dx), np.int32(snap.dy), np.int32(snap.dz), G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata, snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata, block=Snapshot.tpb, grid=Snapshot.bpg) if G.snapsgpu2cpu: gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap) # Update magnetic field components update_h_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update magnetic field components with the PML correction for pml in G.pmls: pml.gpu_update_magnetic(G) # Update magnetic field components for magetic dipole sources if G.magneticdipoles: update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_magnetic_gpu.gpudata, srcinfo2_magnetic_gpu.gpudata, srcwaves_magnetic_gpu.gpudata, G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.magneticdipoles)), 1, 1)) # Update electric field components # If all materials are non-dispersive do standard update if Material.maxpoles == 0: update_e_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # If there are any dispersive materials do 1st part of dispersive update # (it is split into two parts as it requires present and updated electric field values). 
else: update_e_dispersive_A_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update electric field components with the PML correction for pml in G.pmls: pml.gpu_update_electric(G) # Update electric field components for voltage sources if G.voltagesources: update_voltage_source_gpu(np.int32(len(G.voltagesources)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_voltage_gpu.gpudata, srcinfo2_voltage_gpu.gpudata, srcwaves_voltage_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.voltagesources)), 1, 1)) # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last) if G.hertziandipoles: update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_hertzian_gpu.gpudata, srcinfo2_hertzian_gpu.gpudata, srcwaves_hertzian_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.hertziandipoles)), 1, 1)) # If there are any dispersive materials do 2nd part of dispersive update (it is split into two parts as it requires present and updated electric field values). Therefore it can only be completely updated after the electric field has been updated by the PML and source updates. if Material.maxpoles > 0: update_e_dispersive_B_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Copy output from receivers array back to correct receiver objects if G.rxs: gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G) # Copy data from any snapshots back to correct snapshot objects if G.snapshots and not G.snapsgpu2cpu: for i, snap in enumerate(G.snapshots): gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap) iterend.record() iterend.synchronize() tsolve = iterstart.time_till(iterend) * 1e-3 # Remove context from top of stack and delete ctx.pop() del ctx return tsolve, memsolve
def vector_add_example(context): #create input data n = numpy.int32(5e7) a = numpy.random.randn(n).astype(numpy.float32) b = numpy.random.randn(n).astype(numpy.float32) c = numpy.zeros_like(b) #measure CPU time start = timer() d = a + b end = timer() print("a+b took", (end - start) * 1000.0, "ms") #move data to the GPU args = [c, a, b] gpu_args = [] for arg in args: gpu_args.append(drv.mem_alloc(arg.nbytes)) drv.memcpy_htod(gpu_args[-1], arg) gpu_args.append(n) #read kernel into string with open('vector_add.cu', 'r') as f: kernel_string = f.read() #get compute capability for compiling CUDA kernels devprops = { str(k): v for (k, v) in context.get_device().get_attributes().items() } cc = str(devprops['COMPUTE_CAPABILITY_MAJOR']) + str( devprops['COMPUTE_CAPABILITY_MINOR']) #compile the kernel vector_add = SourceModule(kernel_string, arch='compute_' + cc, code='sm_' + cc, cache_dir=False, no_extern_c=True, options=['-Xcompiler=--std=c++11' ]).get_function("vec_add_kernel") #launch the kernel threads = (1024, 1, 1) grid = (int(numpy.ceil(n / float(threads[0]))), 1, 1) context.synchronize() start = drv.Event() end = drv.Event() start.record() vector_add(*gpu_args, block=threads, grid=grid, stream=None, shared=0) end.record() context.synchronize() print("vec_add_kernel took", end.time_since(start), "ms.") #copy output data back from GPU drv.memcpy_dtoh(c, gpu_args[0]) #compare output with reference correct = numpy.allclose(c, a + b, atol=1e-6) if not correct: print("TEST FAILED!") print(c) print(a + b) else: print("TEST PASSED!")
def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
    """
    Enqueue CUDA memory transfer and kernel execution operations
    on a CUDA stream. CPU and GPU arrays are sliced given the
    dimension:slice mapping specified in cpu_slice_map and gpu_slice_map.

    Returns a tuple containing a CUDA event recorded at the end of this
    sequence of operations, a pinned memory array that will hold the result
    of the chi-squared operation, the model visibilities array, the pool
    references, the pool lock, and copies of the CPU and GPU slice maps.
    """
    tl = self.thread_local
    i, subslvr = tl.subslvr_gen.next()

    # A dictionary of references to memory pool allocated objects,
    # ensuring that said objects remain allocated until
    # after compute has been performed. Returned from
    # this function, this object should be discarded when
    # reading the result of the enqueued operations.
    pool_refs = defaultdict(list)

    # Cache keyed by array names and contained indices.
    # This is used to avoid unnecessary CPU to GPU copies
    # by caching the last index of the CPU array.
    dirty = tl.dirty[i]

    # Guard pool allocations with a coarse-grained mutex
    with subslvr.pool_lock:
        # Now, iterate over our source chunks, enqueuing
        # memory transfers and CUDA kernels
        for src_cpu_slice_map, src_gpu_slice_map in self._gen_source_slices():
            # Update our maps with source slice information
            cpu_slice_map.update(src_cpu_slice_map)
            gpu_slice_map.update(src_gpu_slice_map)

            # Configure dimension extents and global size on the sub-solver
            for name, slice_ in cpu_slice_map.iteritems():
                subslvr.update_dimension(
                    name=name,
                    global_size=self.dimension(name).global_size,
                    lower_extent=slice_.start,
                    upper_extent=slice_.stop)

            # Enqueue E Beam
            kernel = subslvr.rime_e_beam
            new_refs = self._enqueue_array(
                subslvr, cpu_slice_map, gpu_slice_map,
                direction=ASYNC_HTOD, dirty=dirty,
                classifiers=[Classifier.E_BEAM_INPUT])
            cdata_ref = self._enqueue_const_data_htod(
                subslvr, kernel.rime_const_data[0])
            _update_refs(pool_refs, new_refs)
            _update_refs(pool_refs, {'cdata_ebeam': [cdata_ref]})
            kernel.execute(subslvr, subslvr.stream)

            # Enqueue B Sqrt
            kernel = subslvr.rime_b_sqrt
            new_refs = self._enqueue_array(
                subslvr, cpu_slice_map, gpu_slice_map,
                direction=ASYNC_HTOD, dirty=dirty,
                classifiers=[Classifier.B_SQRT_INPUT])
            cdata_ref = self._enqueue_const_data_htod(
                subslvr, kernel.rime_const_data[0])
            _update_refs(pool_refs, new_refs)
            _update_refs(pool_refs, {'cdata_bsqrt': [cdata_ref]})
            kernel.execute(subslvr, subslvr.stream)

            # Enqueue EKB Sqrt
            kernel = subslvr.rime_ekb_sqrt
            new_refs = self._enqueue_array(
                subslvr, cpu_slice_map, gpu_slice_map,
                direction=ASYNC_HTOD, dirty=dirty,
                classifiers=[Classifier.EKB_SQRT_INPUT])
            cdata_ref = self._enqueue_const_data_htod(
                subslvr, kernel.rime_const_data[0])
            _update_refs(pool_refs, new_refs)
            _update_refs(pool_refs, {'cdata_ekb': [cdata_ref]})
            kernel.execute(subslvr, subslvr.stream)

            # Enqueue Sum Coherencies
            kernel = subslvr.rime_sum
            new_refs = self._enqueue_array(
                subslvr, cpu_slice_map, gpu_slice_map,
                direction=ASYNC_HTOD, dirty=dirty,
                classifiers=[Classifier.COHERENCIES_INPUT])
            cdata_ref = self._enqueue_const_data_htod(
                subslvr, kernel.rime_const_data[0])
            _update_refs(pool_refs, new_refs)
            _update_refs(pool_refs, {'cdata_coherencies': [cdata_ref]})
            kernel.execute(subslvr, subslvr.stream)

        # Enqueue chi-squared term reduction and return the
        # GPU array allocated to it
        X2_gpu_ary = subslvr.rime_reduce.execute(subslvr, subslvr.stream)

        # Get pinned memory to hold the chi-squared result
        sub_X2 = subslvr.pinned_mem_pool.allocate(shape=X2_gpu_ary.shape,
                                                  dtype=X2_gpu_ary.dtype)

        # Enqueue chi-squared copy off the GPU onto the CPU
        X2_gpu_ary.get_async(ary=sub_X2, stream=subslvr.stream)

        # Enqueue transfer of simulator output (model visibilities) to the CPU
        sim_output_refs = self._enqueue_array(
            subslvr, cpu_slice_map, gpu_slice_map,
            direction=ASYNC_DTOH, dirty={},
            classifiers=[Classifier.SIMULATOR_OUTPUT])

        # Should only be model visibilities
        assert len(sim_output_refs) == 1, (
            'Expected one array (model visibilities), '
            'received {l} instead.'.format(l=len(sim_output_refs)))

        model_vis = sim_output_refs['model_vis'][0]

        # Create and record an event directly after the chi-squared copy.
        # We'll synchronise on this event in our synchronisation executor.
        sync_event = cuda.Event(cuda.event_flags.DISABLE_TIMING |
                                cuda.event_flags.BLOCKING_SYNC)
        sync_event.record(subslvr.stream)

        # Retain references to CPU pinned and GPU device memory
        # until the above enqueued operations have been performed.
        pool_refs['X2_gpu'].append(X2_gpu_ary)
        pool_refs['X2_cpu'].append(sub_X2)
        pool_refs['model_vis_output'].append(model_vis)

    return (sync_event, sub_X2, model_vis, pool_refs,
            subslvr.pool_lock, cpu_slice_map.copy(), gpu_slice_map.copy())
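# _update_refs is called above but not defined in this excerpt. A minimal
# sketch of what such a helper plausibly does (an assumption, not the
# project's actual implementation): merge a {name: [objects]} mapping into
# the pool_refs defaultdict so the referenced allocations stay alive until
# the enqueued work has completed.
def _update_refs(pool_refs, new_refs):
    for name, refs in new_refs.items():
        pool_refs[name].extend(refs)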
def radiation_loop(self, quant, write, rt_plot, Vmod):
    """ loops over the relevant kernels iteratively until the equilibrium TP-profile is reached """

    condition = True
    quant.iter_value = np.int32(0)
    quant.p_lay = quant.dev_p_lay.get()

    # measures the runtime of a specified number of iterations
    start_loop = cuda.Event()
    end_loop = cuda.Event()
    start_total = cuda.Event()
    end_total = cuda.Event()

    # uncomment for time testing purposes
    # start_test = cuda.Event()
    # end_test = cuda.Event()

    start_total.record()

    while condition:

        if quant.iter_value % 10 == 0:
            start_loop.record()

        self.interpolate_temperatures(quant)
        self.interpolate_planck(quant)

        if quant.iter_value % 10 == 0:
            if Vmod.V_iter_nr == 0:
                self.interpolate_opacities_and_scattering_cross_sections(quant)
                self.interpolate_meanmolmass(quant)
            elif Vmod.V_iter_nr > 0:
                Vmod.interpolate_molecular_and_mixed_opac(quant)
                Vmod.combine_to_mixed_opacities(quant)
            self.interpolate_kappa(quant)
            self.calculate_c_p(quant)
            self.normalize_cloud_scattering(quant)

        self.calculate_transmission(quant)

        self.calculate_delta_z(quant)
        quant.delta_z_lay = quant.dev_delta_z_lay.get()
        hsfunc.calculate_height_z(quant)
        quant.dev_z_lay = gpuarray.to_gpu(quant.z_lay)

        self.calculate_direct_beamflux(quant)
        self.populate_spectral_flux(quant)
        self.integrate_flux(quant)

        # uncomment for time testing purposes
        # start_test.record()
        # end_test.record()
        # end_test.synchronize()
        # time_test = start_test.time_till(end_test)
        # print("\nTime for test [s]: {:g}".format(time_test * 1e-3))

        if quant.singlewalk == 0:

            abortsum = 0
            quant.marked_red = np.zeros(quant.nlayer)

            if quant.iter_value % 100 == 0:
                print("\nWe are running \"" + quant.name + "\" at iteration step nr. : " + str(quant.iter_value))

            if quant.iter_value >= quant.foreplay:

                # radiative temperature progression
                self.rad_temp_iteration(quant)

                quant.abort = quant.dev_abort.get()

                for i in range(quant.nlayer):
                    abortsum += quant.abort[i]
                    if quant.abort[i] == 0:
                        quant.marked_red[i] = 1

                if quant.iter_value % 10 == 0:
                    print("Layers converged: " + str(abortsum) + " out of " + str(quant.nlayer) + ".")

            # checks whether to continue the loop
            condition = abortsum < quant.nlayer

            quant.iter_value += 1
            quant.iter_value = np.int32(quant.iter_value)

            if quant.iter_value % quant.n_plot == 0 or not condition:
                write.write_restart_file(quant)
                if quant.realtime_plot == 1:
                    rt_plot.plot_tp(quant)

            # records the time needed for 10 loops
            if (quant.iter_value - 1) % 10 == 9:
                end_loop.record()
                end_loop.synchronize()
                time_loop = start_loop.time_till(end_loop)
                print("\nTime for the last 10 loops [s]: {:.2f}".format(time_loop * 1e-3))

            # time restriction for the run. It aborts automatically after the
            # following number of iteration steps and prevents a hung job.
            if quant.iter_value > 2e4:
                write.write_abort_file(quant)
                print("\nRun exceeds the maximum allowed number of iteration steps. Aborting...")
                raise SystemExit()

        elif quant.singlewalk == 1:
            condition = False

    end_total.record()
    end_total.synchronize()
    time_total = start_total.time_till(end_total)
    print("\nTime for radiative iteration [s]: {:.2f}".format(time_total * 1e-3))
    print("Total number of iterative steps: " + str(quant.iter_value))
M1, N1, _ = image.shape
M2 = int(2 * M1)
N2 = int(2 * N1)
result = np.zeros((M2, N2), dtype=np.uint32)

block = (16, 16, 1)
grid = (int(np.ceil(M2 / block[0])), int(np.ceil(N2 / block[1])))

# build the kernel
mod = compiler.SourceModule(open("kernel.cu", "r").read())
bilinear_interpolation_kernel = mod.get_function("interpolate")

x_out = np.array([i for i in range(M2)] * N2)
y_out = np.array([i for i in range(N2)] * M2)

start = driver.Event()
stop = driver.Event()

# prepare the texture
print("Computing on the GPU...")
start.record()
prep_image = prepare_image(image)
tex = mod.get_texref("tex")
tex.set_filter_mode(driver.filter_mode.LINEAR)
tex.set_address_mode(0, driver.address_mode.CLAMP)
tex.set_address_mode(1, driver.address_mode.CLAMP)
driver.matrix_to_texref(prep_image, tex, order="C")

bilinear_interpolation_kernel(driver.Out(result), driver.In(x_out),
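# The snippet above compiles "kernel.cu" (not shown here) and binds the
# texture reference "tex" with hardware LINEAR filtering. A minimal sketch of
# what such an "interpolate" kernel could look like; the signature is an
# assumption, and where the original reads output coordinates from the x_out
# and y_out arrays, this sketch derives them from the thread indices instead:
kernel_cu_sketch = r"""
texture<float, 2, cudaReadModeElementType> tex;

/* Each thread writes one pixel of the 2x-upscaled image; the texture unit's
   LINEAR filter mode performs the actual bilinear interpolation. */
__global__ void interpolate(unsigned int *result, int M2, int N2)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < M2 && col < N2) {
        float v = tex2D(tex, col * 0.5f + 0.5f, row * 0.5f + 0.5f);
        result[row * N2 + col] = (unsigned int)v;
    }
}
"""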
def convection_loop(self, quant, write, rt_plot):
    """ loops alternately through the radiative and convective schemes """

    # kappa is required for the conv. instability check
    self.interpolate_kappa(quant)

    quant.T_lay = quant.dev_T_lay.get()
    quant.p_lay = quant.dev_p_lay.get()
    quant.p_int = quant.dev_p_int.get()
    quant.kappa_lay = quant.dev_kappa_lay.get()
    if quant.iso == 0:
        quant.kappa_int = quant.dev_kappa_int.get()

    hsfunc.conv_check(quant)
    hsfunc.mark_convective_layers(quant)

    # only starts the loop if convective adjustment is switched on
    if quant.singlewalk == 0 and quant.convection == 1:

        condition = sum(quant.conv_unstable) > 0

        start_total = cuda.Event()
        end_total = cuda.Event()
        start_total.record()

        quant.iter_value = np.int32(0)

        if condition:
            # measures time
            start_loop = cuda.Event()
            end_loop = cuda.Event()
            print("\nConvectively unstable layers found. Starting convective adjustment.")
        else:
            print("All layers convectively stable. No convective adjustment necessary.\n")

        # quantities required on the host for the first convective adjustment
        quant.F_net = quant.dev_F_net.get()
        quant.F_up_tot = quant.dev_F_up_tot.get()

        while condition:

            if quant.iter_value % 10 == 0:
                start_loop.record()

            if quant.iter_value % 100 == 0:
                print("\nWe are running \"" + quant.name + "\" at iteration step nr. : " + str(quant.iter_value))

            # start with the convective adjustment and then recalculate the rad. fluxes,
            # go back to conv. adjustment, then rad. fluxes, etc.
            self.interpolate_temperatures(quant)
            self.interpolate_meanmolmass(quant)
            self.interpolate_kappa(quant)
            self.calculate_c_p(quant)

            quant.kappa_lay = quant.dev_kappa_lay.get()
            quant.kappa_int = quant.dev_kappa_int.get()
            quant.c_p_lay = quant.dev_c_p_lay.get()
            quant.T_lay = quant.dev_T_lay.get()

            hsfunc.convective_adjustment(quant)

            quant.dev_T_lay = gpuarray.to_gpu(quant.T_lay)

            self.interpolate_temperatures(quant)
            self.interpolate_opacities_and_scattering_cross_sections(quant)
            self.interpolate_meanmolmass(quant)
            self.interpolate_kappa(quant)
            self.calculate_c_p(quant)
            self.normalize_cloud_scattering(quant)
            self.interpolate_planck(quant)
            self.calculate_transmission(quant)

            if quant.iter_value % 10 == 0:
                self.calculate_delta_z(quant)
                quant.delta_z_lay = quant.dev_delta_z_lay.get()
                hsfunc.calculate_height_z(quant)
                quant.dev_z_lay = gpuarray.to_gpu(quant.z_lay)

            self.calculate_direct_beamflux(quant)
            self.populate_spectral_flux(quant)
            self.integrate_flux(quant)

            # copy back fluxes to determine convergence
            quant.F_net = quant.dev_F_net.get()
            quant.F_down_tot = quant.dev_F_down_tot.get()
            quant.F_up_tot = quant.dev_F_up_tot.get()
            quant.F_net_diff = quant.dev_F_net_diff.get()
            quant.kappa_lay = quant.dev_kappa_lay.get()
            quant.kappa_int = quant.dev_kappa_int.get()

            # mark layers where convection is dominant // used by realtime net flux plotting
            quant.T_lay = quant.dev_T_lay.get()

            # mark convection zone, used by realtime plotting
            hsfunc.mark_convective_layers(quant)

            # hsfunc.radiative_adjustment(quant)

            # if quant.iter_value % 50 == 0:
            #     print("kick")
            #     hsfunc.random_adjustment(quant)
            #     quant.dev_T_lay = gpuarray.to_gpu(quant.T_lay)

            # checks whether to continue the loop
            condition = not(hsfunc.check_for_global_local_equilibrium(quant)) or quant.iter_value < 100

            # the following only if loop continues
            if condition:

                # realtime plotting every 10th step
                if quant.iter_value % quant.n_plot == 0 or not condition:
                    write.write_restart_file(quant)
                    if quant.realtime_plot == 1:
                        rt_plot.plot_convective_feedback(quant)

                # kernel that advances the temperature in a radiative way
                self.conv_temp_iteration(quant)

                # records the time needed for 10 loops
                if quant.iter_value % 10 == 9:
                    end_loop.record()
                    end_loop.synchronize()
                    time_loop = start_loop.time_till(end_loop)
                    print("\nTime for the last 10 (total: {:d}) loops [s]: {:.2f}".format(quant.iter_value, time_loop * 1e-3))

                quant.iter_value += 1
                quant.iter_value = np.int32(quant.iter_value)

                # time restriction for the run. It aborts automatically after the
                # following number of iteration steps and thus prevents a hung job.
                if quant.iter_value > 2e4:
                    write.write_abort_file(quant)
                    print("\nRun exceeds the maximum allowed number of iteration steps. Aborting...")
                    raise SystemExit()

        end_total.record()
        end_total.synchronize()
        time_total = start_total.time_till(end_total)
        print("\nTime for rad.-conv. iteration [s]: {:.2f}".format(time_total * 1e-3))
        print("Total number of iterative steps: " + str(quant.iter_value))
def __init__(self, device=0, iterations=7, compiler_options=None, observers=None):
    """instantiate CudaFunctions object used for interacting with the CUDA device

    Instantiating this object will inspect and store certain device properties at
    runtime, which are used during compilation and/or execution of kernels by the
    kernel tuner. It also maintains a reference to the most recently compiled
    source module for copying data to constant memory before kernel launch.

    :param device: Number of CUDA device to use for this context
    :type device: int

    :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
    :type iterations: int
    """
    self.allocations = []
    self.texrefs = []
    if not pycuda_available and isinstance(drv, PyCudaPlaceHolder):  # "and" part allows mocking
        raise ImportError(
            "Error: pycuda not installed, please install e.g. using 'pip install pycuda'.")
    drv.init()
    self.context = drv.Device(device).make_context()

    # inspect device properties
    devprops = {
        str(k): v
        for (k, v) in self.context.get_device().get_attributes().items()
    }
    self.max_threads = devprops['MAX_THREADS_PER_BLOCK']
    cc = str(devprops.get('COMPUTE_CAPABILITY_MAJOR', '0')) + str(
        devprops.get('COMPUTE_CAPABILITY_MINOR', '0'))
    if cc == "00":
        cc = self.context.get_device().compute_capability()
    self.cc = str(cc[0]) + str(cc[1])
    self.iterations = iterations
    self.current_module = None
    self.func = None
    self.compiler_options = compiler_options or []

    # select PyCUDA source module
    if int(self.cc) >= 35:
        self.source_mod = DynamicSourceModule
    else:
        self.source_mod = SourceModule
    if not self.source_mod:
        raise ImportError(
            "Error: pycuda not correctly installed, please ensure that pycuda "
            "is installed against the same CUDA installation you are using right now")

    # create a stream and events
    self.stream = drv.Stream()
    self.start = drv.Event()
    self.end = drv.Event()

    # default dynamically allocated shared memory size, can be overwritten using smem_args
    self.smem_size = 0

    # setup observers
    self.observers = observers or []
    self.observers.append(CudaRuntimeObserver(self))
    for obs in self.observers:
        obs.register_device(self)

    # collect environment information
    env = dict()
    env["device_name"] = self.context.get_device().name()
    env["cuda_version"] = ".".join([str(i) for i in drv.get_version()])
    env["compute_capability"] = self.cc
    env["iterations"] = self.iterations
    env["compiler_options"] = compiler_options
    env["device_properties"] = devprops
    self.env = env
    self.name = env["device_name"]
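# A minimal usage sketch for the constructor above (the class is named
# CudaFunctions per its docstring); the device index and the attributes
# printed here are illustrative assumptions, not part of the original:
dev = CudaFunctions(device=0, iterations=7)
print(dev.name, "compute capability:", dev.cc,
      "max threads per block:", dev.max_threads)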
# Get image data
height, width = np.int32(original_image.shape)
print("Processing %d x %d image" % (width, height))
size = width * height

# Initialize the image region as empty
im_region = np.int8(np.zeros([height, width]))

# On the host, define the kernel parameters
blocksize = (128, 4, 1)  # 128,8  The number of threads per block (x,y,z)
gridx = int(np.ceil(width / (1.0 * blocksize[0])))
gridy = int(np.ceil(height / (1.0 * blocksize[1])))
gridsize = (gridx, gridy)  # The number of thread blocks (x,y)

# Initialize the GPU event trackers for timing
start_gpu_time = cu.Event()
end_gpu_time = cu.Event()
gpu_transfer_time = 0.0
gpu_comp_time = 0.0
cpu_compute_time = 0.0

# Allocate memory
image = np.float32(np.array(original_image))
region = np.int8(np.zeros([height, width]))
queue = np.int32(np.linspace(0, size - 1, size))
nextFront = np.int32(np.zeros([height, width]))
scan = np.int32(np.zeros([height, width]))
qLen = np.int32(queue.size)

# Allocate device memory and copy host to device
start_gpu_time.record()
d_image = gpu.to_gpu(image.reshape(-1))
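# The excerpt above ends mid-way through its host-to-device copies. A minimal
# sketch of how the event-based transfer timing started above is typically
# closed out; the remaining device array names are assumptions, not the
# original code:
d_region = gpu.to_gpu(region.reshape(-1))
d_queue = gpu.to_gpu(queue)
end_gpu_time.record()
end_gpu_time.synchronize()
# Event.time_till() returns milliseconds; accumulate in seconds
gpu_transfer_time += start_gpu_time.time_till(end_gpu_time) * 1e-3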
def main():
    parser = argparse.ArgumentParser(description='BERT Inference Benchmark')
    parser.add_argument("-e", "--engine", help='Path to BERT TensorRT engine')
    parser.add_argument(
        '-b', '--batch-size', default=[], action="append", type=int,
        help='Batch size(s) to benchmark. Can be specified multiple times for '
        'more than one batch size. This script assumes that the engine has been '
        'built with one optimization profile for each batch size, and that these '
        'profiles are in order of increasing batch size.')
    parser.add_argument('-s', '--sequence-length', default=128,
                        help='Sequence length of the BERT model', type=int)
    parser.add_argument(
        '-i', '--iterations', default=200,
        help='Number of iterations to run when benchmarking each batch size.',
        type=int)
    parser.add_argument(
        '-w', '--warm-up-runs', default=10,
        help='Number of iterations to run prior to benchmarking.', type=int)
    parser.add_argument('-r', '--random-seed', required=False, default=12345,
                        help='Random seed.', type=int)
    args, _ = parser.parse_known_args()
    args.batch_size = args.batch_size or [1]

    # Import necessary plugins for BERT TensorRT
    ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)

    with open(args.engine, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime, \
            runtime.deserialize_cuda_engine(f.read()) as engine, \
            engine.create_execution_context() as context:

        # Allocate buffers large enough to store the largest batch size
        max_input_shape = (args.sequence_length * max(args.batch_size), )
        max_output_shape = (args.sequence_length * max(args.batch_size), 2, 1, 1)
        buffers = [
            DeviceBuffer(max_input_shape),
            DeviceBuffer(max_input_shape),
            DeviceBuffer((max(args.batch_size) + 1, )),
            DeviceBuffer((args.sequence_length, )),
            DeviceBuffer(max_output_shape)
        ]

        # Prepare random input
        pseudo_vocab_size = 30522
        pseudo_type_vocab_size = 2
        np.random.seed(args.random_seed)
        test_word_ids = np.random.randint(
            0, pseudo_vocab_size,
            (args.sequence_length * max(args.batch_size)), dtype=np.int32)
        test_segment_ids = np.random.randint(
            0, pseudo_type_vocab_size,
            (args.sequence_length * max(args.batch_size)), dtype=np.int32)
        test_cu_seq_lens = np.arange(
            0, args.sequence_length * max(args.batch_size) + 1,
            args.sequence_length, dtype=np.int32)

        # Copy input h2d
        cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel())
        cuda.memcpy_htod(buffers[1].buf, test_segment_ids.ravel())
        cuda.memcpy_htod(buffers[2].buf, test_cu_seq_lens.ravel())

        bench_times = {}

        for idx, batch_size in enumerate(sorted(args.batch_size)):
            context.active_optimization_profile = 0

            # Each profile has unique bindings
            bindings = [buf.binding() for buf in buffers]

            shapes = {
                "input_ids": (args.sequence_length * batch_size, ),
                "segment_ids": (args.sequence_length * batch_size, ),
                "cu_seqlens": (batch_size + 1, ),
                "max_seqlen": (args.sequence_length, ),
            }

            for binding, shape in shapes.items():
                context.set_binding_shape(engine[binding], shape)
            assert context.all_binding_shapes_specified

            # Inference
            total_time = 0
            start = cuda.Event()
            end = cuda.Event()
            stream = cuda.Stream()

            # Warmup
            for _ in range(args.warm_up_runs):
                context.execute_async_v2(bindings=bindings,
                                         stream_handle=stream.handle)
                stream.synchronize()

            # Timing loop
            times = []
            for _ in range(args.iterations):
                start.record(stream)
                context.execute_async_v2(bindings=bindings,
                                         stream_handle=stream.handle)
                end.record(stream)
                stream.synchronize()
                times.append(end.time_since(start))

            # Compute average time, 95th percentile time and 99th percentile time.
            bench_times[batch_size] = times

        [b.free() for b in buffers]

        for batch_size, times in bench_times.items():
            total_time = sum(times)
            avg_time = total_time / float(len(times))
            times.sort()
            percentile95 = times[int(len(times) * 0.95)]
            percentile99 = times[int(len(times) * 0.99)]
            print(
                "Running {:} iterations with Batch Size: {:}\n\tTotal Time: {:} ms "
                "\tAverage Time: {:} ms\t95th Percentile Time: {:} ms\t99th Percentile Time: {:}"
                .format(args.iterations, batch_size, total_time, avg_time,
                        percentile95, percentile99))
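# DeviceBuffer is used above but not defined in this excerpt. A minimal
# sketch of what such a helper could look like with PyCUDA (an assumption,
# not the benchmark's actual class); the default dtype is also an assumption:
import numpy as np
import pycuda.driver as cuda


class DeviceBuffer(object):
    def __init__(self, shape, dtype=np.int32):
        # raw device allocation sized for the given shape and dtype
        self.nbytes = int(np.prod(shape)) * np.dtype(dtype).itemsize
        self.buf = cuda.mem_alloc(self.nbytes)

    def binding(self):
        # TensorRT bindings are plain device pointers (integers)
        return int(self.buf)

    def free(self):
        self.buf.free()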
elif rank == 2:
    mpi_direction = 'f'
else:
    mpi_direction = 'fb'

if rank == 0:
    # prepare for plot
    from matplotlib.pyplot import *
    ion()
    imsh = imshow(np.ones((3*nx, ny), 'f').T, cmap=cm.hot, origin='lower', vmin=0, vmax=0.005)
    colorbar()

    # measure kernel execution time
    from datetime import datetime
    t1 = datetime.now()
    flop = 3*(nx*ny*nz*30)*tgap
    flops = np.zeros(tmax/tgap+1)
    start, stop = cuda.Event(), cuda.Event()
    start.record()

# main loop
for tn in xrange(1, tmax+1):
    fdtd.update_h()
    fdtd.mpi_exchange_boundary_h(mpi_direction, comm)

    fdtd.update_e()
    fdtd.mpi_exchange_boundary_e(mpi_direction, comm)

    if rank == 1:
        fdtd.update_src(tn)

    if tn % tgap == 0 and rank == 0:
        stop.record()
        stop.synchronize()
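        # A plausible continuation of the per-tgap timing branch above (an
        # assumption, not the original source): convert the elapsed interval
        # into GFLOPS using the flop count prepared before the loop, then
        # restart the interval timer for the next tgap steps.
        flops[tn/tgap] = flop / (start.time_till(stop) * 1e-3) / 1e9
        print('[%d] %1.2f GFLOPS' % (tn, flops[tn/tgap]))
        start.record()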
update_h.prepare("PPPPPP", block=(tpb,1,1))
update_e.prepare("PPPPPPPPP", block=(tpb,1,1))
update_src.prepare("fP", block=(nz,1,1))

'''
# prepare for plot
from matplotlib.pyplot import *
ion()
imsh = imshow(np.ones((nx,ny),'f').T, cmap=cm.hot, origin='lower', vmin=0, vmax=0.005)
colorbar()
'''

# measure kernel execution time
#from datetime import datetime
#t1 = datetime.now()
start = cuda.Event()
stop = cuda.Event()
start.record()

# main loop
for tn in xrange(1, 1000+1):
    update_h.prepared_call(
        (bpg,1),
        ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu)

    update_e.prepared_call(
        (bpg/2,1),
        ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu,
        cex_gpu, cey_gpu, cez_gpu)

    update_src.prepared_call((1,1), np.float32(tn), ez_gpu)

'''
import pycuda.driver as drv
from pycuda import gpuarray
import libcudnn, ctypes
import numpy as np

# Create a cuDNN context
cudnn_context = libcudnn.cudnnCreate()

# Set some options and tensor dimensions
tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
data_type = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
convolution_fwd_pref = libcudnn.cudnnConvolutionFwdPreference[
    'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST']

start, end = (drv.Event(), drv.Event())


def start_bench():
    start.record()


def end_bench(op):
    end.record()
    end.synchronize()
    msecs = end.time_since(start)
    print("%s %7.3f msecs" % (op, msecs))


n_input = 64
filters_in = 128
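# A minimal usage sketch of the two timing helpers above; the GPU work timed
# here (a gpuarray elementwise multiply) is an illustrative stand-in for the
# cuDNN convolution calls this benchmark presumably wraps, and the array
# shape is an assumption:
x = gpuarray.to_gpu(np.random.randn(n_input, filters_in).astype(np.float32))
start_bench()
y = x * 2.0
end_bench("elementwise multiply")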