def constructMaxwellian(self, U, M, Ut):
    nupts, ldim, _ = M.traits
    nvars, neles = M.ioshape[1:]

    grid = get_grid_for_block(self.block, U.ioshape[0])
    self.momentNormKern.prepared_call(grid, self.block,
                                      U.ioshape[0], U, Ut)

    grid = get_grid_for_block(self.block, nupts * nvars * neles)
    self.cmaxwellianKern.prepared_call(grid, self.block,
                                       nupts, ldim, nvars, neles,
                                       self.d_cvx.ptr, self.d_cvy.ptr,
                                       self.d_cvz.ptr, M, Ut)
def swap_axes(self, fin, fout):
    nupts, ldim, _ = fin.traits
    nvars, neles = fout.ioshape[1:]

    grid_swap = get_grid_for_block(self.block, nupts * nvars * neles)
    self.swapKern.prepared_call(grid_swap, self.block,
                                nupts, ldim, nvars, neles, fin, fout)
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Check that A is reasonably sparse
    if np.count_nonzero(a.get()) > self.max_nnz:
        raise NotSuitableError('Matrix too dense for GiMMiK')

    # Generate
    src = generate_mm(a.get(), dtype=a.dtype, platform='cuda',
                      alpha=alpha, beta=beta)

    # Build
    fun = self._build_kernel('gimmik_mm', src, 'iPiPi')

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, b.ncol)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            fun.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                    b.ncol, b, b.leaddim, out, out.leaddim)

    return MulKernel()
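# NumPy reference for the operation performed by the generated GiMMiK kernel.
# A minimal sketch for validation only; mul_reference is a hypothetical helper,
# not part of the backend API. GiMMiK emits a fully unrolled kernel for the
# constant, sparse operator matrix a, computing out = alpha*(a @ b) + beta*out.
def mul_reference(a, b, out, alpha=1.0, beta=0.0):
    # Host-side equivalent of the device kernel built above
    out[:] = alpha*(a @ b) + beta*out
    return out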
def copy_to_reg(self, *arr, subdims=None):
    nrow, ldimr, dtype = arr[1].traits
    _, ldimf, _ = arr[0].traits
    ncolar, ncolb = arr[1].ioshape[1:]
    ncolaf, _ = arr[0].ioshape[1:]

    # Render the kernel template
    src = self.backend.lookup.get_template('copy_dgfs').render(
        subdims=subdims or range(ncolar), ncola0=ncolaf, ncola1=ncolar)

    # Build the kernel
    kern = self._build_kernel('copy_dgfs', src,
                              [np.int32]*4 + [np.intp]*2 + [np.int32]*4)

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, ncolb, nrow)

    class CopyToDGFSKernel(ComputeKernel):
        def run(self, queue, *consts):
            args = list(arr) + list(consts)
            kern.prepared_call(grid, block, nrow, ncolb, ldimf, ldimr, *args)

    return CopyToDGFSKernel()
def axnpby_dgfs_full(self, *arr, subdims=None):
    if any(arr[0].traits != x.traits for x in arr[1:]):
        raise ValueError('Incompatible matrix types')

    nv = len(arr)
    nrow, ldim, dtype = arr[0].traits
    ncola, ncolb = arr[0].ioshape[1:]
    size = nrow * ldim

    # Render the kernel template
    src = self.backend.lookup.get_template('axnpby_dgfs_full').render(nv=nv)

    # Build the kernel
    kern = self._build_kernel('axnpby_dgfs_full', src,
                              [np.int32]*2 + [np.intp]*nv + [dtype]*nv)

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, size)

    class AxnpbyDGFSFullKernel(ComputeKernel):
        def run(self, queue, *consts):
            args = list(arr) + list(consts)

            # Changed from prepared_async_call (queue.cuda_stream_comp)
            kern.prepared_call(grid, block, size, *args)

    return AxnpbyDGFSFullKernel()
def updateMomentARS(self, dt, *args):
    # The size of args should be 4*q + 1 for an ARS scheme
    q = (len(args) - 1) // 4
    assert len(args) == 4*q + 1, "Inconsistency in number of parameters"

    lda = np.int(args[-1].ioshape[0])

    grid = get_grid_for_block(self.block, lda)
    self.updateMomKernsARS[q - 1].prepared_call(grid, self.block,
                                                self._prefactor, lda, dt,
                                                *args)
def updateDistARS(self, dt, *args):
    # The size of args should be 6*q + 2 for an ARS scheme
    q = (len(args) - 2) // 6
    assert len(args) == 6*q + 2, "Inconsistency in number of parameters"

    nupts, ldim, _ = args[-1].traits
    nvars, neles = args[-1].ioshape[1:]

    grid = get_grid_for_block(self.block, nupts * nvars * neles)
    self.updateDistKernsARS[q - 1].prepared_call(grid, self.block,
                                                 self._prefactor, nupts,
                                                 ldim, nvars, neles, dt,
                                                 *args)
def updateDistBDF(self, dt, *args):
    # The size of args should be 4*q + 5 for a BDF scheme
    q = (len(args) - 5) // 4
    assert len(args) == 4*q + 5, "Inconsistency in number of parameters"

    nupts, ldim, _ = args[1].traits
    nvars, neles = args[1].ioshape[1:]

    grid = get_grid_for_block(self.block, nupts * nvars * neles)
    self.updateDistKernsBDF[q - 1].prepared_call(grid, self.block,
                                                 self._prefactor, nupts,
                                                 ldim, nvars, neles, dt,
                                                 *args)
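# Illustrative check of the argument layouts assumed by the update kernels
# above (a hypothetical helper, not used by the solver): the scheme order q is
# recovered purely from the argument count, e.g. 4*q + 5 arrays for a q-step
# BDF distribution update, so 13 arguments imply q = 2.
def _infer_order(nargs, per_stage, fixed):
    q, rem = divmod(nargs - fixed, per_stage)
    assert rem == 0 and q >= 1, "Inconsistency in number of parameters"
    return q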
def pack(self, mv):
    # An exchange view is simply a regular view plus an exchange matrix
    m, v = mv.xchgmat, mv.view

    # Render the kernel template
    src = self.backend.lookup.get_template('pack').render()

    # Build
    kern = self._build_kernel('pack_view', src, 'iiiPPPP')

    # Compute the grid and thread-block size
    block = (128, 1, 1)
    grid = get_grid_for_block(block, v.n)

    # If MPI is CUDA aware then we just need to pack the buffer
    if self.backend.mpitype == 'cuda-aware':
        class PackXchgViewKernel(ComputeKernel):
            def run(self, queue):
                scomp = queue.cuda_stream_comp

                # Pack
                kern.prepared_async_call(grid, block, scomp,
                                         v.n, v.nvrow, v.nvcol,
                                         v.basedata, v.mapping,
                                         v.rstrides or 0, m)
    # Otherwise, we need to both pack the buffer and copy it back
    else:
        # Create a CUDA event
        event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

        class PackXchgViewKernel(ComputeKernel):
            def run(self, queue):
                scomp = queue.cuda_stream_comp
                scopy = queue.cuda_stream_copy

                # Pack
                kern.prepared_async_call(grid, block, scomp,
                                         v.n, v.nvrow, v.nvcol,
                                         v.basedata, v.mapping,
                                         v.rstrides or 0, m)

                # Copy the packed buffer to the host
                event.record(scomp)
                scopy.wait_for_event(event)
                cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)

    return PackXchgViewKernel()
def errest(self, x, y, z, *, norm):
    if x.traits != y.traits or y.traits != z.traits:
        raise ValueError('Incompatible matrix types')

    nrow, ldim, dtype = x.traits
    ncola, ncolb = x.ioshape[1:]

    # Reduction block dimensions
    block = (128, 1, 1)

    # Determine the grid size
    grid = get_grid_for_block(block, ncolb)

    # Empty result buffer on host with shape (nvars, nblocks)
    err_host = cuda.pagelocked_empty((ncola, grid[0]), dtype, 'C')

    # Device memory allocation
    err_dev = cuda.mem_alloc(err_host.nbytes)

    # Get the kernel template
    src = self.backend.lookup.get_template('errest').render(
        norm=norm, ncola=ncola, sharesz=block[0])

    # Build the reduction kernel
    rkern = self._build_kernel('errest', src,
                               [np.int32]*3 + [np.intp]*4 + [dtype]*2)

    # Norm type
    reducer = np.max if norm == 'uniform' else np.sum

    class ErrestKernel(ComputeKernel):
        @property
        def retval(self):
            return reducer(err_host, axis=1)

        def run(self, queue, atol, rtol):
            rkern.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                      nrow, ncolb, ldim, err_dev, x, y, z,
                                      atol, rtol)
            cuda.memcpy_dtoh_async(err_host, err_dev,
                                   queue.cuda_stream_comp)

    return ErrestKernel()
def perform_precomputation(self):
    # Precompute aa, bb1, bb2 (required for kernel)

    # Compute l
    Nv = self.vm.Nv()
    Nrho = self.vm.Nrho()
    M = self.vm.M()
    L = self.vm.L()
    qz = self.vm.qz()
    qw = self.vm.qw()
    sz = self.vm.sz()
    sw = self.vm.sw()
    vsize = self.vm.vsize()
    szpre = self._szpre
    swpre = self._swpre

    # Precision control
    dint = np.int32
    dfloat = np.float64
    dcplx = np.complex128

    l0 = np.concatenate((np.arange(0, Nv/2, dtype=dint),
                         np.arange(-Nv/2, 0, dtype=dint)))
    #l = l0[np.mgrid[0:Nv, 0:Nv, 0:Nv]]
    #l = l.reshape((3, vsize)).astype(dtype_int)
    l = np.zeros((3, vsize), dtype=dint)
    for idv in range(vsize):
        I = int(idv/(Nv*Nv))
        J = int((idv % (Nv*Nv))/Nv)
        K = int((idv % (Nv*Nv)) % Nv)
        l[0, idv] = l0[I]
        l[1, idv] = l0[J]
        l[2, idv] = l0[K]

    d_lx = gpuarray.to_gpu(np.ascontiguousarray(l[0, :]))
    d_ly = gpuarray.to_gpu(np.ascontiguousarray(l[1, :]))
    d_lz = gpuarray.to_gpu(np.ascontiguousarray(l[2, :]))

    # Transfer sphere points to the GPU
    d_sz_x = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 0]))
    d_sz_y = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 1]))
    d_sz_z = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 2]))

    # Define complex-to-complex plans
    rank = 3
    n = np.array([Nv, Nv, Nv], dtype=np.int32)
    #planD2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_D2Z)
    self.planZ2Z_MNrho = cufftPlanMany(rank, n.ctypes.data, None, 1, vsize,
                                       None, 1, vsize, CUFFT_Z2Z, M*Nrho)
    self.planZ2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_Z2Z)

    dfltargs = dict(
        Nrho=Nrho, M=M, vsize=vsize, sw=sw, prefac=self._prefactor,
        soasz=self.backend.soasz, cases=self._cases,
        masses=self.vm.masses(), qw=qw, qz=qz, L=L, sz=sz,
        gamma=self._gamma, eta=self._eta, Mpre=self._Mpre,
        szpre=szpre, swpre=swpre)
    src = DottedTemplateLookup(
        'frfs.solvers.dgfsbi.kernels.scattering', dfltargs
    ).get_template(self.scattering_model).render()

    # Compile the source code and retrieve the kernels
    print("\nCompiling scattering kernels, this may take some time ...")
    module = compiler.SourceModule(src)

    self.block = (256, 1, 1)
    self.grid = get_grid_for_block(self.block, vsize)

    print("Starting precomputation, this may take some time ...")
    start, end = cuda.Event(), cuda.Event()
    cuda.Context.synchronize()
    start.record()
    start.synchronize()

    self.d_aa = gpuarray.empty(Nrho*M*vsize, dtype=dfloat)
    precompute_aa = module.get_function("precompute_a")
    precompute_aa.prepare('PPPP')
    precompute_aa.set_cache_config(cuda.func_cache.PREFER_L1)
    precompute_aa.prepared_call(self.grid, self.block,
                                d_lx.ptr, d_ly.ptr, d_lz.ptr, self.d_aa.ptr)

    self.d_bb1 = {}
    self.d_bb2 = {}
    precompute_bb = {}
    for cp, cq in self._cases:
        cpcq = str(cp) + str(cq)

        self.d_bb1[cpcq] = gpuarray.empty(Nrho*M*vsize, dtype=dcplx)
        self.d_bb2[cpcq] = gpuarray.zeros(vsize, dtype=dcplx)

        precompute_bb[cpcq] = module.get_function("precompute_bc_" + cpcq)
        precompute_bb[cpcq].prepare('IIdddPPPPPPPP')
        precompute_bb[cpcq].set_cache_config(cuda.func_cache.PREFER_L1)

        for p in range(Nrho):
            fac = np.pi/L*qz[p]
            fac_b = swpre*pow(qz[p], self._gamma[cpcq] + 2)
            fac_c = qw[p]*sw*fac_b

            for q in range(M):
                precompute_bb[cpcq].prepared_call(
                    self.grid, self.block,
                    dint(p), dint(q),
                    dfloat(fac), dfloat(fac_b), dfloat(fac_c),
                    d_lx.ptr, d_ly.ptr, d_lz.ptr,
                    d_sz_x.ptr, d_sz_y.ptr, d_sz_z.ptr,
                    self.d_bb1[cpcq].ptr, self.d_bb2[cpcq].ptr)

    end.record()
    end.synchronize()
    secs = start.time_till(end)*1e-3
    print("Finished precomputation in: %fs" % secs)

    # Transform scalar to complex
    self.r2zKern = module.get_function("r2z")
    self.r2zKern.prepare('IIIIIIPP')
    self.r2zKern.set_cache_config(cuda.func_cache.PREFER_L1)

    # Prepare the cosSinMul kernel for execution
    self.cosSinMultKern = {}
    #self.computeQGKern = {}
    self.outKern = {}
    for cp, cq in self._cases:
        idx = str(cp) + str(cq)

        self.cosSinMultKern[idx] = module.get_function("cosSinMul_" + idx)
        self.cosSinMultKern[idx].prepare('PPPPP')
        self.cosSinMultKern[idx].set_cache_config(cuda.func_cache.PREFER_L1)

        #self.computeQGKern[idx] = module.get_function("computeQG_" + idx)
        #self.computeQGKern[idx].prepare('PPP')
        #self.computeQGKern[idx].set_cache_config(cuda.func_cache.PREFER_L1)

        self.outKern[idx] = module.get_function("output_" + idx)
        self.outKern[idx].prepare('IIIIIIPPPP')
        self.outKern[idx].set_cache_config(cuda.func_cache.PREFER_L1)

    # Prepare the computeQG kernel
    self.computeQGKern = module.get_function("computeQG")
    self.computeQGKern.prepare('PPP')
    self.computeQGKern.set_cache_config(cuda.func_cache.PREFER_L1)

    # Prepare the prod kernel for execution
    self.prodKern = module.get_function("prod")
    self.prodKern.prepare('PPP')
    self.prodKern.set_cache_config(cuda.func_cache.PREFER_L1)

    # Prepare the ax2 kernel for execution
    self.ax2Kern = module.get_function("ax2")
    self.ax2Kern.prepare('PPP')
    self.ax2Kern.set_cache_config(cuda.func_cache.PREFER_L1)

    # Define scratch spaces
    self.d_FTf = gpuarray.empty(vsize, dtype=dcplx)
    self.d_FTg = gpuarray.empty(vsize, dtype=dcplx)
    self.d_f1C = gpuarray.empty_like(self.d_FTf)
    self.d_f2C = gpuarray.empty_like(self.d_FTf)
    self.d_QG = gpuarray.empty_like(self.d_FTf)
    self.d_t1 = gpuarray.empty(M*Nrho*vsize, dtype=dcplx)
    self.d_t2 = gpuarray.empty_like(self.d_t1)
    self.d_t3 = gpuarray.empty_like(self.d_t1)
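# The explicit index loop in perform_precomputation fills l with the 3D FFT
# wavenumbers of a flattened Nv x Nv x Nv velocity grid. Below is a minimal
# NumPy sketch of the same construction (illustrative only; _fft_wavenumbers
# is a hypothetical helper and assumes an even Nv with C-ordered flattening).
def _fft_wavenumbers(Nv):
    import numpy as np

    # [0, 1, ..., Nv/2 - 1, -Nv/2, ..., -1], identical to l0 above
    l0 = np.rint(np.fft.fftfreq(Nv, d=1.0/Nv)).astype(np.int32)

    # (I, J, K) for a flat index idv follow from C-order unravelling,
    # i.e. I = idv//(Nv*Nv), J = (idv//Nv) % Nv, K = idv % Nv
    I, J, K = np.unravel_index(np.arange(Nv**3), (Nv, Nv, Nv))
    return np.stack((l0[I], l0[J], l0[K]))  # shape (3, Nv**3), matches l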