def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    initcondcls = subclass_where(DGFSInitConditionStd, model='maxwellian')
    bc = initcondcls(cfg, self._vm, cfgsect, wall=False)
    f0 = bc.get_init_vals().reshape(self._vm.vsize(), 1)
    self._d_bnd_f0 = gpuarray.to_gpu(f0)

    # template
    dfltargs = dict(dtype=cfg.dtypename, vsize=self._vm.vsize(),
                    cw=self._vm.cw(), nl=nl, x=xsol)
    kernsrc = DottedTemplateLookup('dgfs1D.std.kernels.bcs',
                                   dfltargs).get_template(self.type).render()
    kernmod = compiler.SourceModule(kernsrc)

    # block size
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, self._vm.vsize())

    # for extracting right face values
    applyBCFunc = get_kernel(kernmod, "applyBC", [np.intp]*4 + [cfg.dtype])
    self._applyBCKern = lambda ul, ur, t: applyBCFunc.prepared_call(
        grid_Nv, block, ul.ptr, ur.ptr, self._vm.d_cvx().ptr,
        self._d_bnd_f0.ptr, t)

def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    initcondcls = subclass_where(DGFSInitConditionStd, model='maxwellian')
    bc = initcondcls(cfg, self._vm, cfgsect, wall=True)
    f0 = bc.get_init_vals().reshape(self._vm.vsize(), 1)
    self._d_bnd_f0 = gpuarray.to_gpu(f0)
    unondim = bc.unondim()

    # storage
    self._bc_vals_num = gpuarray.empty(self._vm.vsize(),
                                       self._d_bnd_f0.dtype)
    self._bc_vals_den = gpuarray.empty_like(self._bc_vals_num)

    # template
    dfltargs = dict(dtype=cfg.dtypename, vsize=self._vm.vsize(),
                    cw=self._vm.cw(), ux=unondim[0, 0], nl=nl, x=xsol)
    kernsrc = DottedTemplateLookup('dgfs1D.std.kernels.bcs',
                                   dfltargs).get_template(self.type).render()
    kernmod = compiler.SourceModule(kernsrc)

    # block size
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, self._vm.vsize())

    # for applying the boundary condition at the face
    # (applyBC reads self._wall_nden, so updateBC must run first)
    applyBCFunc = get_kernel(kernmod, "applyBC",
                             [np.intp]*5 + [unondim.dtype])
    self._applyBCKern = lambda ul, ur, t: applyBCFunc.prepared_call(
        grid_Nv, block, ul.ptr, ur.ptr, self._vm.d_cvx().ptr,
        self._d_bnd_f0.ptr, self._wall_nden.ptr, t)

    # for updating the wall number density from the left face values
    updateBCFunc = get_kernel(kernmod, "updateBC",
                              [np.intp]*5 + [unondim.dtype])
    def updateBC(ul, t):
        updateBCFunc.prepared_call(grid_Nv, block, ul.ptr,
                                   self._vm.d_cvx().ptr,
                                   self._d_bnd_f0.ptr,
                                   self._bc_vals_num.ptr,
                                   self._bc_vals_den.ptr, t)
        self._wall_nden = -(gpuarray.sum(self._bc_vals_num)
                            / gpuarray.sum(self._bc_vals_den))
    self._updateBCKern = updateBC

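# Illustrative only: a NumPy sketch of the reduction the diffuse-wall
# updateBC kernel performs on the device. The names (cvx, f, f0, nl) are
# assumptions for exposition, not the kernel's actual arguments; uniform
# quadrature weights cancel in the ratio and are omitted.
def _sketch_wall_nden(cvx, f, f0, nl):
    import numpy as np

    # numerator: normal mass flux of molecules impinging on the wall
    num = np.sum(np.where(nl*cvx < 0, nl*cvx*f, 0.0))

    # denominator: flux re-emitted by a unit-density wall Maxwellian f0
    den = np.sum(np.where(nl*cvx > 0, nl*cvx*f0, 0.0))

    # wall number density chosen so that the net mass flux vanishes
    return -num/den
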
def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    initcondcls = subclass_where(DGFSInitConditionBi, model='maxwellian')
    bc = initcondcls(cfg, self._vm, cfgsect, wall=False)
    f0 = bc.get_init_vals()
    self._d_bnd_f0 = [gpuarray.to_gpu(f.ravel()) for f in f0]
    unondim = bc.unondim()

    # template
    dfltargs = dict(dtype=cfg.dtypename, vsize=self._vm.vsize(),
                    cw=self._vm.cw(), ux=unondim[0, 0], nl=nl, x=xsol)
    kernsrc = DottedTemplateLookup('dgfs1D.bi.kernels.bcs',
                                   dfltargs).get_template(self.type).render()
    kernmod = compiler.SourceModule(kernsrc)

    # block size
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, self._vm.vsize())

    # for applying the boundary condition (one closure per species)
    def make_applyBC(p, applyBCFunc):
        def applyBC(ul, ur, t):
            applyBCFunc.prepared_call(grid_Nv, block, ul.ptr, ur.ptr,
                                      self._vm.d_cvx().ptr,
                                      self._d_bnd_f0[p].ptr, t)
        return applyBC

    applyBCFunc = get_kernel(kernmod, "applyBC",
                             [np.intp]*4 + [unondim.dtype])
    for p in range(vm.nspcs()):
        self._applyBCKern[p] = make_applyBC(p, applyBCFunc)

def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    dfltargs = dict(dtype=cfg.dtypename, vsize=self._vm.vsize(),
                    cw=self._vm.cw(), nl=nl, x=xsol)
    kernsrc = DottedTemplateLookup('dgfs1D.std.kernels.bcs',
                                   dfltargs).get_template(self.type).render()
    kernmod = compiler.SourceModule(kernsrc)

    # block size
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, self._vm.vsize())

    # copy the left face values to the right
    applyBCFunc = get_kernel(kernmod, "applyBC", 'PP')
    self._applyBCKern = lambda ul, ur, t: applyBCFunc.prepared_call(
        grid_Nv, block, ul.ptr, ur.ptr)

def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    # construct the inlet Maxwellian directly from the config values
    self.vm = self._vm
    rho, ux, T = map(lambda v: cfg.lookupfloat(cfgsect, v),
                     ('rho', 'ux', 'T'))
    f0 = self.maxwellian(rho, ux, 0, 0, T)
    f0 = f0.reshape(self._vm.vsize(), 1)
    self._d_bnd_f0 = gpuarray.to_gpu(f0)

    # template
    dfltargs = dict(dtype=cfg.dtypename, vsize=self._vm.vsize(),
                    cw=self._vm.cw(), nl=nl, x=xsol, u=ux)
    kernsrc = DottedTemplateLookup(
        'dgfs1D.std.kernels.bcs',
        dfltargs).get_template('dgfs-inlet-nondim').render()
    kernmod = compiler.SourceModule(kernsrc)

    # block size
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, self._vm.vsize())

    # for extracting right face values
    applyBCFunc = get_kernel(kernmod, "applyBC", [np.intp]*4 + [cfg.dtype])
    self._applyBCKern = lambda ul, ur, t: applyBCFunc.prepared_call(
        grid_Nv, block, ul.ptr, ur.ptr, self._vm.d_cvx().ptr,
        self._d_bnd_f0.ptr, t)

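# Illustrative only: a minimal sketch of the inlet Maxwellian, assuming the
# usual nondimensionalization (velocities scaled by u0, temperature by T0);
# the class's own `maxwellian` helper is the authoritative implementation.
def _sketch_maxwellian(rho, ux, uy, uz, T, cv):
    import numpy as np

    # cv: (3, vsize) array of velocity-mesh abscissae
    csq = (cv[0] - ux)**2 + (cv[1] - uy)**2 + (cv[2] - uz)**2
    return rho/(np.pi*T)**1.5 * np.exp(-csq/T)
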
def main(Ne=None, dt=None):
    # who am I in this world? (Bulleh Shah, 18th century sufi poet)
    comm, rank, root = get_comm_rank_root()

    # read the inputs (from people)
    cfg, args = initialize()
    if Ne is not None:
        cfg._cp.set('mesh', 'Ne', str(int(Ne)))
    if dt is not None:
        cfg._cp.set('time-integrator', 'dt', str(float(dt)))
    mesh = Mesh(cfg)

    # define 1D mesh (construct a 1D world view)
    xmesh = mesh.xmesh

    # number of elements (how refined perspectives do we want/have?)
    Ne = mesh.Ne

    # define the basis (what is the basis for those perspectives?)
    bsKind = cfg.lookup('basis', 'kind')
    #assert bsKind == 'nodal-sem-gll', "Only one supported as of now"
    basiscls = subclass_where(Basis, basis_kind=bsKind)
    basis = basiscls(cfg)

    # number of local degrees of freedom (depth/granularity of perspectives)
    K = basis.K

    # number of solution points (how far can I interpolate my learning)
    Nq = basis.Nq

    # left/right face maps
    Nqf = basis.Nqf  # number of points used in reconstruction at faces
    mapL, mapR = np.arange(Ne+1) + (Nqf-1)*Ne - 1, np.arange(Ne+1)
    mapL[0], mapR[-1] = 0, Ne*Nqf - 1
    Nf = len(mapL)

    # the zeros
    z = basis.z

    # jacobian of the mapping from D^{st}=[-1,1] to D
    jac, invjac = mesh.jac, mesh.invjac

    # load the velocity mesh
    vm = DGFSVelocityMeshStd(cfg)
    Nv = vm.vsize()

    # load the scattering model
    smn = cfg.lookup('scattering-model', 'type')
    scatteringcls = subclass_where(DGFSScatteringModelAstd,
                                   scattering_model=smn)
    sm = scatteringcls(cfg, vm, Ne=Ne)

    # initial time, time step, final time
    ti, dt, tf = cfg.lookupfloats('time-integrator',
                                  ('tstart', 'dt', 'tend'))
    nsteps = np.ceil((tf - ti)/dt)
    dt = (tf - ti)/nsteps

    # compute the location of the solution points
    xsol = np.array([0.5*(xmesh[j] + xmesh[j+1]) + jac[j]*z
                     for j in range(Ne)]).T
    xcoeff = np.einsum("kq,qe->ke", basis.fwdTransMat, xsol)

    # determine the grid/block
    NeNv = Ne*Nv
    KNeNv = K*Ne*Nv
    NqNeNv = Nq*Ne*Nv
    NqfNeNv = Nqf*Ne*Nv
    NfNv = Nf*Nv
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, Nv)
    grid_NeNv = get_grid_for_block(block, Ne*Nv)
    grid_KNeNv = get_grid_for_block(block, K*Ne*Nv)

    # operator generator for matrix operations
    matOpGen = lambda v: lambda arg0, arg1: v.prepared_call(
        grid_NeNv, block, NeNv, arg0.ptr, NeNv, arg1.ptr, NeNv)

    # forward trans, backward, backward (at faces), derivative kernels
    (fwdTrans_Op, bwdTrans_Op, bwdTransFace_Op, deriv_Op, invMass_Op,
     computeCellAvg_Op, extractDrLin_Op) = map(
        matOpGen, (basis.fwdTransOp, basis.bwdTransOp, basis.bwdTransFaceOp,
                   basis.derivOp, basis.invMassOp, basis.computeCellAvgKern,
                   basis.extractDrLinKern)
    )

    # U, V operator kernels
    trans_U_Op = tuple(map(matOpGen, basis.uTransOps))
    trans_V_Op = tuple(map(matOpGen, basis.vTransOps))

    # prepare the kernel for extracting face/interface values
    dfltargs = dict(K=K, Ne=Ne, Nq=Nq, vsize=Nv, dtype=cfg.dtypename,
                    mapL=mapL, mapR=mapR, offsetL=0, offsetR=len(mapR)-1,
                    invjac=invjac, gRD=basis.gRD, gLD=basis.gLD, xsol=xsol)
    kernsrc = DottedTemplateLookup('dgfs1D.std.kernels',
                                   dfltargs).get_template('std').render()
    kernmod = compiler.SourceModule(kernsrc)

    dfltargs.update(nalph=sm.nalph, Dr=basis.derivMat)
    kernlimssrc = DottedTemplateLookup(
        'dgfs1D.astd.kernels', dfltargs).get_template('limiters').render()
    kernlimsmod = compiler.SourceModule(kernlimssrc)

    # prepare operators for execution (see std.mako for description)
    (extLeft_Op, extRight_Op, transferBC_L_Op, transferBC_R_Op,
     insertBC_L_Op, insertBC_R_Op) = map(
        lambda v: lambda *args: get_kernel(kernmod, v, 'PP').prepared_call(
            grid_Nv, block, *list(map(lambda c: c.ptr, args))),
        ("extract_left", "extract_right", "transfer_bc_left",
         "transfer_bc_right", "insert_bc_left", "insert_bc_right"))

    # the boundary conditions (by default all boundaries are processor bnds)
    bcl_type, bcr_type = 'dgfs-periodic', 'dgfs-periodic'

    # the mesh is decomposed in linear fashion, so rank 0 gets left boundary
    if rank == 0:
        bcl_type = cfg.lookup('soln-bcs-xlo', 'type')

    # and the last rank comm.size-1 gets the right boundary
    if rank == comm.size - 1:
        bcr_type = cfg.lookup('soln-bcs-xhi', 'type')

    # prepare kernels for left boundary
    bcl_cls = subclass_where(DGFSBCStd, type=bcl_type)
    bcl = bcl_cls(xmesh[0], -1., vm, cfg, 'soln-bcs-xlo')
    updateBC_L_Op = bcl.updateBCKern
    applyBC_L_Op = bcl.applyBCKern

    # prepare kernels for right boundary
    bcr_cls = subclass_where(DGFSBCStd, type=bcr_type)
    bcr = bcr_cls(xmesh[-1], 1., vm, cfg, 'soln-bcs-xhi')
    updateBC_R_Op = bcr.updateBCKern
    applyBC_R_Op = bcr.applyBCKern

    #if bcl_type == 'dgfs-cyclic' or bcr_type == 'dgfs-cyclic':
    #    assert bcl_type == bcr_type

    # flux kernel
    flux = get_kernel(kernmod, "flux", 'PPPPP')
    flux_Op = lambda d_uL, d_uR, d_jL, d_jR: flux.prepared_call(
        grid_Nv, block, d_uL.ptr, d_uR.ptr, vm.d_cvx().ptr,
        d_jL.ptr, d_jR.ptr)

    # multiply the derivative by the advection velocity
    mulbyadv = get_kernel(kernmod, "mul_by_adv", 'PP')
    mulbyadv_Op = lambda d_ux: mulbyadv.prepared_call(
        grid_KNeNv, block, vm.d_cvx().ptr, d_ux.ptr)

    # multiply the coefficient by the inverse jacobian
    mulbyinvjac = get_kernel(kernmod, "mul_by_invjac", 'P')
    mulbyinvjac_Op = lambda d_ux: mulbyinvjac.prepared_call(
        grid_Nv, block, d_ux.ptr)

    # \alpha AX + \beta Y kernel (for operations on coefficients)
    axnpbyCoeff = get_axnpby_kerns(2, range(K), NeNv, cfg.dtype)
    axnpbyCoeff_Op = lambda a0, x0, a1, x1: axnpbyCoeff.prepared_call(
        grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # total flux kernel (sums up surface and volume terms)
    totalFlux = get_kernel(kernmod, "totalFlux", 'PPPP')
    totalFlux_Op = lambda d_ux, d_jL, d_jR: totalFlux.prepared_call(
        grid_Nv, block, d_ux.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # linear limiter
    limitLin = get_kernel(kernlimsmod, "limitLin", 'PPPP')
    limitLin_Op = lambda d_u, d_ulx, d_uavg, d_ulim: \
        limitLin.prepared_call(grid_Nv, block, d_u.ptr, d_ulx.ptr,
                               d_uavg.ptr, d_ulim.ptr)

    # allocations on gpu
    d_usol = gpuarray.empty(NqNeNv, dtype=cfg.dtype)
    d_usolF = gpuarray.empty(NqfNeNv, dtype=cfg.dtype)
    d_uL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_uR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_bcL = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcR = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcT = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_ux = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_f = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_g = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_ucoeff = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_ucoeffPrev = gpuarray.empty_like(d_ucoeff)

    # check if this is a new run
    if hasattr(args, 'process_run'):
        usol = np.empty((Nq, Ne, Nv), dtype=cfg.dtype)  # temporary storage

        # load the initial condition model
        icn = cfg.lookup('soln-ics', 'type')
        initcondcls = subclass_where(DGFSInitConditionStd, model=icn)
        ic = initcondcls(cfg, vm, 'soln-ics')
        ic.apply_init_vals(usol, Nq, Ne, xsol, mesh=mesh, basis=basis, sm=sm)

        # transfer the information to the gpu
        d_usol.set(usol.ravel())

        # forward transform to coefficient space
        fwdTrans_Op(d_usol, d_ucoeff)

    # check if we are restarting
    if hasattr(args, 'process_restart'):
        import h5py as h5py
        check(len(args.dist[0]) == comm.size,
              "No. of distributions != nranks")
        with h5py.File(args.dist[0][rank].name, 'r') as h5f:
            dst = h5f['coeff']
            ti = dst.attrs['time']
            d_ucoeff.set(dst[:])
            check(dst.attrs['K'] == K, "Inconsistent distribution K")
            check(dst.attrs['Ne'] == Ne, "Inconsistent distribution Ne")
            check(dst.attrs['Nv'] == Nv, "Inconsistent distribution Nv")

        # backward transform to solution space
        bwdTrans_Op(d_ucoeff, d_usol)

    # prepare the post-processing handlers
    # for computing moments
    moments = DGFSMomWriterStd(ti, basis.interpMat, xcoeff, d_ucoeff, vm,
                               cfg, 'dgfsmomwriter')

    # for computing residual
    residual = DGFSResidualStd(cfg, 'dgfsresidual')

    # for writing distribution function
    distribution = DGFSDistributionStd(ti, (K, Ne, Nv), cfg,
                                       'dgfsdistwriter')

    # actual algorithm

    # initialize
    axnpbyCoeff_Op(0., d_ucoeffPrev, 1., d_ucoeff)
    sigModes = basis.sigModes

    # define the neighbours
    from mpi4py import MPI
    down_nbr, up_nbr = comm.rank - 1, comm.rank + 1
    if up_nbr >= comm.size:
        up_nbr = MPI.PROC_NULL
    if down_nbr < 0:
        down_nbr = MPI.PROC_NULL

    # define the explicit part
    def explicit(time, d_ucoeff_in, d_ucoeff_out):
        # reconstruct solution at faces
        bwdTransFace_Op(d_ucoeff_in, d_usolF)

        # Step:1 extract the solution at faces
        extLeft_Op(d_usolF, d_uL)
        extRight_Op(d_usolF, d_uR)

        # transfer boundary information into the send buffers
        transferBC_L_Op(d_uL, d_bcL)  # transfer the left ghost BC info
        transferBC_R_Op(d_uR, d_bcR)  # transfer the right ghost BC info

        # this can be adjusted in case of RDMA enabled MPI support
        #h_bcL, h_bcR = map(lambda v: v.gpudata.as_buffer(v.nbytes),
        #                   (d_bcL, d_bcR))

        # send information
        req1 = comm.isend(d_bcR, dest=up_nbr)    # to upstream neighbour
        req2 = comm.isend(d_bcL, dest=down_nbr)  # to downstream neighbour

        # receive information (a recv from MPI.PROC_NULL yields None)
        h_bcL = comm.recv(source=down_nbr)  # from downstream neighbour
        h_bcR = comm.recv(source=up_nbr)    # from upstream neighbour
        MPI.Request.Waitall([req1, req2])

        # set information at left, right boundary
        if h_bcL is not None:
            d_bcL.set(h_bcL)
        else:
            transferBC_L_Op(d_uL, d_bcL)

        if h_bcR is not None:
            d_bcR.set(h_bcR)
        else:
            transferBC_R_Op(d_uR, d_bcR)

        # the physical-periodic boundary condition
        if comm.size == 1 and bcr_type == 'dgfs-cyclic':
            copy(d_bcT, d_bcL)
            copy(d_bcL, d_bcR)
            copy(d_bcR, d_bcT)
        else:
            # at left, receive from right-most communicator; and vice-versa
            req1 = req2 = MPI.REQUEST_NULL
            if bcl_type == 'dgfs-cyclic':
                req1 = comm.isend(d_bcL, dest=comm.size - 1)
            if bcr_type == 'dgfs-cyclic':
                req2 = comm.isend(d_bcR, dest=0)
            if bcr_type == 'dgfs-cyclic':
                h_bcR = comm.recv(source=0)
            if bcl_type == 'dgfs-cyclic':
                h_bcL = comm.recv(source=comm.size - 1)
            MPI.Request.Waitall([req1, req2])
            if bcl_type == 'dgfs-cyclic':
                d_bcL.set(h_bcL)
            elif bcr_type == 'dgfs-cyclic':
                d_bcR.set(h_bcR)

        # at left boundary
        updateBC_L_Op(d_bcL, time)        # now update boundary info
        applyBC_L_Op(d_bcL, d_bcT, time)  # apply boundary condition
        insertBC_L_Op(d_bcT, d_uL)        # insert info to global face-flux

        # at right boundary
        updateBC_R_Op(d_bcR, time)        # now update boundary info
        applyBC_R_Op(d_bcR, d_bcT, time)  # apply boundary condition
        insertBC_R_Op(d_bcT, d_uR)        # insert info to global face-flux

        # Step:2 compute the flux and jumps (all operations in single call)
        #   fL, fR = cvx*uL, cvx*uR
        #   fupw = 0.5*(fL + fR) + 0.5*np.abs(cvx)*(uL - uR)
        #   jL = fupw - fL  (jump at left boundary)
        #   jR = fupw - fR  (jump at right boundary)
        flux_Op(d_uL, d_uR, d_jL, d_jR)

        # Step:3 evaluate the derivative
        #   ux = -cvx*np.einsum("ml,em->el", Sx, ucoeff)
        deriv_Op(d_ucoeff_in, d_ux)
        mulbyadv_Op(d_ux)

        # compute the continuous flux for each element in strong form
        totalFlux_Op(d_ux, d_jL, d_jR)

        # multiply by the inverse jacobian; now we have f* = d_ux
        mulbyinvjac_Op(d_ux)

        # project back to coefficient space
        invMass_Op(d_ux, d_ucoeff_out)

    d_uavg, d_ulx = map(gpuarray.empty_like, [d_ucoeff]*2)
    def limit(d_ucoeff_in, d_ucoeff_out):
        assert comm.size == 1, "Not implemented"
        #assert basis.basis_kind == 'nodal-sem-gll', "Not implemented"

        # extract the cell average
        computeCellAvg_Op(d_ucoeff_in, d_uavg)

        # extract gradient of the linear polynomial
        extractDrLin_Op(d_ucoeff_in, d_ulx)
        mulbyinvjac_Op(d_ulx)

        # limit functions in all cells
        limitLin_Op(d_ucoeff_in, d_ulx, d_uavg, d_ucoeff_out)

    # define a time-integrator (we use Euler scheme: good enough for steady)
    odestype = cfg.lookup('time-integrator', 'scheme')
    odescls = subclass_where(DGFSIntegratorAstd, intg_kind=odestype)
    limitOn = cfg.lookupordefault('time-integrator', 'limiter', 0)

    # finally start everything
    time = ti       # initialize time in case of restart
    nacptsteps = 0  # number of elapsed steps in the current run

    # initialize ode: this performs pre-integration for multi-step schemes
    odes = odescls(explicit, sm, (K, Ne, Nv), cfg.dtype,
                   t=time, dt=dt, f0=d_ucoeff)

    # start timer
    start = timer()

    while time < tf:
        # march in time
        odes.integrate(time, dt, nacptsteps, d_ucoeff)
        if limitOn:
            limit(d_ucoeff, d_ucoeff)

        # increment time
        time += dt
        nacptsteps += 1

        # final step: post processing routines
        residual(time, nacptsteps, d_ucoeff, d_ucoeffPrev)
        moments(dt, time, d_ucoeff)
        distribution(dt, time, d_ucoeff)

        # copy the solution for the next time step
        cuda.memcpy_dtod(d_ucoeffPrev.ptr, d_ucoeff.ptr, d_ucoeff.nbytes)

    # print elapsed time
    end = timer()
    elapsed = np.array([end - start])
    if rank != root:
        comm.Reduce(elapsed, None, op=get_mpi('sum'), root=root)
    else:
        comm.Reduce(get_mpi('in_place'), elapsed, op=get_mpi('sum'),
                    root=root)
        avgtime = elapsed[0]/comm.size
        print("Nsteps", nacptsteps, ", elapsed time", avgtime, "s")

    return d_ucoeff, mesh, vm, basis

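# Usage sketch (hypothetical driver, not part of the module): since `main`
# accepts optional Ne/dt overrides and returns the final coefficients, a
# simple refinement study can reuse it directly, e.g.
#
#   for k in (4, 8, 16):
#       d_ucoeff, mesh, vm, basis = main(Ne=k, dt=1e-3/k)
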
def load_modules(self):
    """Load modules (this must be called internally)"""

    # number of stages
    nbdf = [1, 2, 3, 4, 5, 6]
    nars = [1, 2, 3, 4]

    # extract the template
    dfltargs = dict(vsize=self.vm.vsize(), nalph=self.nalph,
                    omega=self._omega, Pr=self._Pr,
                    dtype=self.cfg.dtypename, nbdf=nbdf, nars=nars)
    src = DottedTemplateLookup(
        'dgfs1D.astd.kernels.scattering',
        dfltargs).get_template(self.scattering_model).render()

    # compile the source code and retrieve the kernel
    module = compiler.SourceModule(src)

    # data type name prefix
    dtn = self.cfg.dtypename[0]

    # construct maxwellian given (rho, rho*ux, rho*uy, rho*uz, E)
    self.cmaxwellianKern = get_kernel(module, "cmaxwellian", 'iPPPPP')

    # construct the collision operator
    self.collideKern = get_kernel(module, "collide", dtn+'iPPPP')
    self.collideNuKern = get_kernel(module, "collide_nu", dtn+'iPPPP')

    # update the moment
    self.updateMomKernsBDF = tuple(map(
        lambda q: get_kernel(module, "updateMom{0}_BDF".format(q),
                             dtn+'i'+dtn+(dtn+'P')*(2*q+1)+dtn),
        nbdf
    ))
    self.updateMomKernsARS = tuple(map(
        lambda q: get_kernel(module, "updateMom{0}_ARS".format(q),
                             dtn+'i'+dtn+dtn*(2*q)+'P'*(2*q+1)),
        nars
    ))
    self.updateMomKernsLM = tuple(map(
        lambda q: get_kernel(module, "updateMom{0}_LM".format(q),
                             dtn+'i'+dtn+dtn*(3*q+1)+'P'*(3*q+1)),
        nbdf
    ))

    # update the distribution
    self.updateDistKernsBDF = tuple(map(
        lambda q: get_kernel(module, "updateDistribution{0}_BDF".format(q),
                             dtn+'i'+dtn+(dtn+'P')*(2*q+2)+'P'),
        nbdf
    ))
    self.updateDistNuKernsBDF = tuple(map(
        lambda q: get_kernel(module,
                             "updateDistributionNu{0}_BDF".format(q),
                             dtn+'i'+dtn+(dtn+'P')*(2*q+2)+'P'),
        nbdf
    ))
    self.updateDistKernsARS = tuple(map(
        lambda q: get_kernel(module, "updateDistribution{0}_ARS".format(q),
                             dtn+'i'+dtn+dtn*(2*q)+'P'*(4*q+2)),
        nars
    ))
    self.updateDistWeightKernsSSPL = tuple(map(
        lambda q: get_kernel(module,
                             "updateDistributionWeight{0}_SSPL".format(q),
                             dtn+'i'+dtn+dtn*(2*q)+'P'*(4*q+3)),
        nars
    ))
    self.updateDistKernsLM = tuple(map(
        lambda q: get_kernel(module, "updateDistribution{0}_LM".format(q),
                             dtn+'i'+dtn+dtn*(3*q+1)+'P'*(4*q+3)),
        nbdf
    ))

    self.module = module

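# For reference: with double precision, dtn == 'd', so the q = 1 BDF
# moment-update kernel above is prepared with
#   'd' + 'i' + 'd' + ('d' + 'P')*3 + 'd'  ==  'diddPdPdPd'
# i.e. a pycuda prepared-call type string alternating scalar coefficients
# ('d') and device pointers ('P').
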
def perform_precomputation(self):
    # precompute aa, bb1, bb2 (required for kernel)

    # compute l
    Nv = self.vm.Nv()
    Nrho = self.vm.Nrho()
    M = self.vm.M()
    L = self.vm.L()
    qz = self.vm.qz()
    qw = self.vm.qw()
    sz = self.vm.sz()
    sw = self.vm.sw()
    vsize = self.vm.vsize()
    szpre = self._szpre
    swpre = self._swpre

    check(self.cfg.dtype == np.float64,
          "Need to extend for single precision")

    # precision control
    dint = np.int32
    dfloat = np.float64
    dcplx = np.complex128
    if self.cfg.dtype == np.float32:
        dfloat = np.float32
        dcplx = np.complex64

    l0 = np.concatenate((np.arange(0, Nv/2, dtype=dint),
                         np.arange(-Nv/2, 0, dtype=dint)))
    l = np.zeros((3, vsize), dtype=dint)
    for idv in range(vsize):
        I = int(idv/(Nv*Nv))
        J = int((idv % (Nv*Nv))/Nv)
        K = int((idv % (Nv*Nv)) % Nv)
        l[0, idv] = l0[I]
        l[1, idv] = l0[J]
        l[2, idv] = l0[K]
    d_lx = gpuarray.to_gpu(np.ascontiguousarray(l[0, :]))
    d_ly = gpuarray.to_gpu(np.ascontiguousarray(l[1, :]))
    d_lz = gpuarray.to_gpu(np.ascontiguousarray(l[2, :]))

    # transfer sphere points to gpu
    d_sz_x = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 0]))
    d_sz_y = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 1]))
    d_sz_z = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 2]))

    # define complex-to-complex plans
    rank = 3
    n = np.array([Nv, Nv, Nv], dtype=np.int32)
    self.planZ2Z_MNrho = cufftPlanMany(rank, n.ctypes.data, None, 1, vsize,
                                       None, 1, vsize, CUFFT_Z2Z, M*Nrho)
    self.planZ2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_Z2Z)

    dfltargs = dict(dtype=self.cfg.dtypename, Nrho=Nrho, M=M,
                    vsize=vsize, sw=sw, prefac=self._prefactor,
                    cases=self._cases, masses=self.vm.masses(),
                    qw=qw, qz=qz, L=L, sz=sz, gamma=self._gamma,
                    eta=self._eta, Mpre=self._Mpre, szpre=szpre,
                    swpre=swpre)
    src = DottedTemplateLookup(
        'dgfs1D.bi.kernels.scattering',
        dfltargs).get_template(self.scattering_model).render()

    # compile the source code and retrieve the kernel
    print("\nCompiling scattering kernels, this may take some time ...")
    module = compiler.SourceModule(src)

    self.block = (256, 1, 1)
    self.grid = get_grid_for_block(self.block, vsize)

    print("Starting precomputation, this may take some time ...")
    start, end = cuda.Event(), cuda.Event()
    cuda.Context.synchronize()
    start.record()
    start.synchronize()

    self.d_aa = gpuarray.empty(Nrho*M*vsize, dtype=dfloat)
    precompute_aa = get_kernel(module, "precompute_a", 'PPPP')
    precompute_aa.prepared_call(self.grid, self.block, d_lx.ptr, d_ly.ptr,
                                d_lz.ptr, self.d_aa.ptr)

    self.d_bb1 = {}
    self.d_bb2 = {}
    precompute_bb = {}
    for cp, cq in self._cases:
        cpcq = str(cp) + str(cq)
        self.d_bb1[cpcq] = gpuarray.empty(Nrho*M*vsize, dtype=dcplx)
        self.d_bb2[cpcq] = gpuarray.zeros(vsize, dtype=dcplx)
        precompute_bb[cpcq] = module.get_function("precompute_bc_" + cpcq)
        precompute_bb[cpcq].prepare('IIdddPPPPPPPP')
        precompute_bb[cpcq].set_cache_config(cuda.func_cache.PREFER_L1)

        for p in range(Nrho):
            fac = np.pi/L*qz[p]
            fac_b = swpre*pow(qz[p], self._gamma[cpcq] + 2)
            fac_c = qw[p]*sw*fac_b
            for q in range(M):
                precompute_bb[cpcq].prepared_call(
                    self.grid, self.block, dint(p), dint(q), dfloat(fac),
                    dfloat(fac_b), dfloat(fac_c), d_lx.ptr, d_ly.ptr,
                    d_lz.ptr, d_sz_x.ptr, d_sz_y.ptr, d_sz_z.ptr,
                    self.d_bb1[cpcq].ptr, self.d_bb2[cpcq].ptr)

    end.record()
    end.synchronize()
    secs = start.time_till(end)*1e-3
    print("Finished precomputation in: %fs" % secs)

    # transform scalar to complex
    self.r2zKern = module.get_function("r2z")
    self.r2zKern.prepare('IIIPP')
    self.r2zKern.set_cache_config(cuda.func_cache.PREFER_L1)

    # prepare the cosSinMul and output kernels for execution
    self.cosSinMultKern = {}
    self.outKern = {}
    for cp, cq in self._cases:
        idx = str(cp) + str(cq)
        self.cosSinMultKern[idx] = module.get_function("cosSinMul_" + idx)
        self.cosSinMultKern[idx].prepare('PPPPP')
        self.cosSinMultKern[idx].set_cache_config(
            cuda.func_cache.PREFER_L1)

        self.outKern[idx] = module.get_function("output_" + idx)
        self.outKern[idx].prepare('IIIIPPPP')
        self.outKern[idx].set_cache_config(cuda.func_cache.PREFER_L1)

    # prepare the computeQG kernel
    self.computeQGKern = module.get_function("computeQG")
    self.computeQGKern.prepare('PPP')
    self.computeQGKern.set_cache_config(cuda.func_cache.PREFER_L1)

    # prepare the prodKern kernel for execution
    self.prodKern = module.get_function("prod")
    self.prodKern.prepare('PPP')
    self.prodKern.set_cache_config(cuda.func_cache.PREFER_L1)

    # prepare the ax kernel for execution
    self.ax2Kern = module.get_function("ax2")
    self.ax2Kern.prepare('PPP')
    self.ax2Kern.set_cache_config(cuda.func_cache.PREFER_L1)

    # define scratch spaces
    self.d_FTf = gpuarray.empty(vsize, dtype=dcplx)
    self.d_FTg = gpuarray.empty(vsize, dtype=dcplx)
    self.d_f1C = gpuarray.empty_like(self.d_FTf)
    self.d_f2C = gpuarray.empty_like(self.d_FTf)
    self.d_QG = gpuarray.empty_like(self.d_FTf)
    self.d_t1 = gpuarray.empty(M*Nrho*vsize, dtype=dcplx)
    self.d_t2 = gpuarray.empty_like(self.d_t1)
    self.d_t3 = gpuarray.empty_like(self.d_t1)

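# Illustrative only: a quick host-side estimate of the precomputation
# storage allocated above (d_aa plus d_bb1/d_bb2 per collision case),
# assuming double precision as enforced by the check at the top.
def _sketch_precomp_bytes(Nv, Nrho, M, ncases):
    vsize = Nv**3
    aa = Nrho*M*vsize*8            # d_aa: float64
    bb1 = ncases*Nrho*M*vsize*16   # d_bb1: complex128 per case
    bb2 = ncases*vsize*16          # d_bb2: complex128 per case
    return aa + bb1 + bb2
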
def main():
    # who am I in this world? (Bulleh Shah, 18th century sufi poet)
    comm, rank, root = get_comm_rank_root()

    # read the inputs (from people)
    cfg, args = initialize()
    mesh = Mesh(cfg)

    # define 1D mesh (construct a 1D world view)
    xmesh = mesh.xmesh

    # number of elements (how refined perspectives do we want/have?)
    Ne = mesh.Ne

    # define the basis (what is the basis for those perspectives?)
    bsKind = cfg.lookup('basis', 'kind')
    basiscls = subclass_where(Basis, basis_kind=bsKind)
    basis = basiscls(cfg)

    # number of local degrees of freedom (depth/granularity of perspectives)
    K = basis.K

    # number of solution points (how far can I interpolate my learning)
    Nq = basis.Nq

    # left/right face maps
    Nqf = basis.Nqf  # number of points used in reconstruction at faces
    mapL, mapR = np.arange(Ne+1) + (Nqf-1)*Ne - 1, np.arange(Ne+1)
    mapL[0], mapR[-1] = 0, Ne*Nqf - 1
    Nf = len(mapL)

    # the zeros
    z = basis.z

    # jacobian of the mapping from D^{st}=[-1,1] to D
    jac, invjac = mesh.jac, mesh.invjac

    # load the velocity mesh
    vm = DGFSVelocityMeshBi(cfg)
    Nv = vm.vsize()

    # load the scattering model
    smn = cfg.lookup('scattering-model', 'type')
    scatteringcls = subclass_where(DGFSScatteringModelBi,
                                   scattering_model=smn)
    sm = scatteringcls(cfg, vm, Ne=Ne)

    # initial time, time step, final time
    ti, dt, tf = cfg.lookupfloats('time-integrator',
                                  ('tstart', 'dt', 'tend'))
    nsteps = np.ceil((tf - ti)/dt)
    dt = (tf - ti)/nsteps

    # compute the location of the solution points
    xsol = np.array([0.5*(xmesh[j] + xmesh[j+1]) + jac[j]*z
                     for j in range(Ne)]).T
    xcoeff = np.einsum("kq,qe->ke", basis.fwdTransMat, xsol)

    # determine the grid/block
    NeNv = Ne*Nv
    KNeNv = K*Ne*Nv
    NqNeNv = Nq*Ne*Nv
    NqfNeNv = Nqf*Ne*Nv
    NfNv = Nf*Nv
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, Nv)
    grid_NeNv = get_grid_for_block(block, Ne*Nv)
    grid_KNeNv = get_grid_for_block(block, K*Ne*Nv)

    # operator generator for matrix operations
    matOpGen = lambda v: lambda arg0, arg1: v.prepared_call(
        grid_NeNv, block, NeNv, arg0.ptr, NeNv, arg1.ptr, NeNv)

    # forward trans, backward, backward (at faces), derivative kernels
    fwdTrans_Op, bwdTrans_Op, bwdTransFace_Op, deriv_Op, invMass_Op = map(
        matOpGen, (basis.fwdTransOp, basis.bwdTransOp, basis.bwdTransFaceOp,
                   basis.derivOp, basis.invMassOp)
    )

    # U, V operator kernels
    trans_U_Op = tuple(map(matOpGen, basis.uTransOps))
    trans_V_Op = tuple(map(matOpGen, basis.vTransOps))

    # prepare the kernel for extracting face/interface values
    dfltargs = dict(K=K, Ne=Ne, Nq=Nq, vsize=Nv, dtype=cfg.dtypename,
                    mapL=mapL, mapR=mapR, offsetL=0, offsetR=len(mapR)-1,
                    invjac=invjac, gRD=basis.gRD, gLD=basis.gLD)
    kernsrc = DottedTemplateLookup('dgfs1D.bi.kernels',
                                   dfltargs).get_template('bi').render()
    kernmod = compiler.SourceModule(kernsrc)

    # prepare operators for execution (see bi.mako for description)
    (extLeft_Op, extRight_Op, transferBC_L_Op, transferBC_R_Op,
     insertBC_L_Op, insertBC_R_Op) = map(
        lambda v: lambda *args: get_kernel(kernmod, v, 'PP').prepared_call(
            grid_Nv, block, *list(map(lambda c: c.ptr, args))),
        ("extract_left", "extract_right", "transfer_bc_left",
         "transfer_bc_right", "insert_bc_left", "insert_bc_right"))

    # the boundary conditions (by default all boundaries are processor bnds)
    bcl_type, bcr_type = 'dgfs-periodic', 'dgfs-periodic'

    # the mesh is decomposed in linear fashion, so rank 0 gets left boundary
    if rank == 0:
        bcl_type = cfg.lookup('soln-bcs-xlo', 'type')

    # and the last rank comm.size-1 gets the right boundary
    if rank == comm.size - 1:
        bcr_type = cfg.lookup('soln-bcs-xhi', 'type')

    # prepare kernels for left boundary
    bcl_cls = subclass_where(DGFSBCBi, type=bcl_type)
    bcl = bcl_cls(xmesh[0], -1., vm, cfg, 'soln-bcs-xlo')
    updateBC_L_Op = bcl.updateBCKern
    applyBC_L_Op = bcl.applyBCKern

    # prepare kernels for right boundary
    bcr_cls = subclass_where(DGFSBCBi, type=bcr_type)
    bcr = bcr_cls(xmesh[-1], 1., vm, cfg, 'soln-bcs-xhi')
    updateBC_R_Op = bcr.updateBCKern
    applyBC_R_Op = bcr.applyBCKern

    # flux kernel
    flux = get_kernel(kernmod, "flux", 'PPPPP')
    flux_Op = lambda d_uL, d_uR, d_jL, d_jR: flux.prepared_call(
        grid_Nv, block, d_uL.ptr, d_uR.ptr, vm.d_cvx().ptr,
        d_jL.ptr, d_jR.ptr)

    # multiply the derivative by the advection velocity
    mulbyadv = get_kernel(kernmod, "mul_by_adv", 'PP')
    mulbyadv_Op = lambda d_ux: mulbyadv.prepared_call(
        grid_KNeNv, block, vm.d_cvx().ptr, d_ux.ptr)

    # multiply the coefficient by the inverse jacobian
    mulbyinvjac = get_kernel(kernmod, "mul_by_invjac", 'P')
    mulbyinvjac_Op = lambda d_ux: mulbyinvjac.prepared_call(
        grid_Nv, block, d_ux.ptr)

    # \alpha AX + \beta Y kernel (for operations on coefficients)
    axnpbyCoeff = get_axnpby_kerns(2, range(K), NeNv, cfg.dtype)
    axnpbyCoeff_Op = lambda a0, x0, a1, x1: axnpbyCoeff.prepared_call(
        grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # \alpha AX + \beta Y kernel (for operations on physical solutions)
    axnpbySol = get_axnpby_kerns(2, range(Nq), NeNv, cfg.dtype)
    axnpbySol_Op = lambda a0, x0, a1, x1: axnpbySol.prepared_call(
        grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # total flux kernel (sums up surface and volume terms)
    totalFlux = get_kernel(kernmod, "totalFlux", 'PPPP')
    totalFlux_Op = lambda d_ux, d_jL, d_jR: totalFlux.prepared_call(
        grid_Nv, block, d_ux.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # allocations on gpu
    d_usol = gpuarray.empty(NqNeNv, dtype=cfg.dtype)
    d_usolF = gpuarray.empty(NqfNeNv, dtype=cfg.dtype)
    d_uL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_uR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_bcL = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcR = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcT = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_ux = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_f = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_g = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_ucoeffs = [gpuarray.empty_like(d_ux) for p in range(vm.nspcs())]
    d_ucoeffPrevs = [gpuarray.empty_like(d_ux) for p in range(vm.nspcs())]

    # check if this is a new run
    if hasattr(args, 'process_run'):
        usol = np.empty((Nq, Ne, Nv), dtype=cfg.dtype)  # temporary storage

        # load the initial condition model
        icn = cfg.lookup('soln-ics', 'type')
        initcondcls = subclass_where(DGFSInitConditionBi, model=icn)
        ic = initcondcls(cfg, vm, 'soln-ics')
        for p in range(vm.nspcs()):
            ic.apply_init_vals(p, usol, Nq, Ne, xsol)

            # transfer the information to the gpu
            d_usol.set(usol.ravel())

            # forward transform to coefficient space
            fwdTrans_Op(d_usol, d_ucoeffs[p])

    # check if we are restarting
    if hasattr(args, 'process_restart'):
        import h5py as h5py
        check(len(args.dist[0]) == comm.size,
              "No. of distributions != nranks")
        with h5py.File(args.dist[0][rank].name, 'r') as h5f:
            for p, d_ucoeff in enumerate(d_ucoeffs):
                dst = h5f['coeff' + str(p)]
                ti = dst.attrs['time']
                d_ucoeff.set(dst[:])
                check(dst.attrs['K'] == K, "Inconsistent distribution K")
                check(dst.attrs['Ne'] == Ne, "Inconsistent distribution Ne")
                check(dst.attrs['Nv'] == Nv, "Inconsistent distribution Nv")

        # backward transform to solution space
        #bwdTrans_Op(d_ucoeff, d_usol)

    # prepare the post-processing handlers
    # for computing moments
    moments = DGFSMomWriterBi(ti, basis.interpMat, xcoeff, d_ucoeffs, vm,
                              cfg, 'dgfsmomwriter')

    # for computing residual
    residual = DGFSResidualBi(cfg, 'dgfsresidual')

    # for writing distribution function
    distribution = DGFSDistributionBi(ti, (K, Ne, Nv), cfg,
                                      'dgfsdistwriter')

    # actual algorithm

    # initialize
    for p in range(vm.nspcs()):
        axnpbyCoeff_Op(0., d_ucoeffPrevs[p], 1., d_ucoeffs[p])
    sigModes = basis.sigModes

    # define the neighbours
    from mpi4py import MPI
    down_nbr, up_nbr = comm.rank - 1, comm.rank + 1
    if up_nbr >= comm.size:
        up_nbr = MPI.PROC_NULL
    if down_nbr < 0:
        down_nbr = MPI.PROC_NULL

    # define the ode rhs
    def rhs(p, time, d_ucoeffs_in, d_ucoeff_out):
        # reconstruct solution at faces
        bwdTransFace_Op(d_ucoeffs_in[p], d_usolF)

        # Step:1 extract the solution at faces
        extLeft_Op(d_usolF, d_uL)
        extRight_Op(d_usolF, d_uR)

        # transfer boundary information into the send buffers
        transferBC_L_Op(d_uL, d_bcL)  # transfer the left ghost BC info
        transferBC_R_Op(d_uR, d_bcR)  # transfer the right ghost BC info

        # this can be adjusted in case of RDMA enabled MPI support
        h_bcL, h_bcR = d_bcL.get(), d_bcR.get()
        #h_bcL, h_bcR = map(lambda v: v.gpudata.as_buffer(v.nbytes),
        #                   (d_bcL, d_bcR))

        # send information
        req1 = comm.isend(d_bcR, dest=up_nbr)    # to upstream neighbour
        req2 = comm.isend(d_bcL, dest=down_nbr)  # to downstream neighbour

        # receive information (a recv from MPI.PROC_NULL yields None)
        h_bcL = comm.recv(source=down_nbr)  # from downstream neighbour
        h_bcR = comm.recv(source=up_nbr)    # from upstream neighbour
        MPI.Request.Waitall([req1, req2])

        # set information at left boundary
        if h_bcL is not None:
            d_bcL.set(h_bcL)
        else:
            transferBC_L_Op(d_uL, d_bcL)  # transfer the ghost BC info

        # set information at right boundary
        if h_bcR is not None:
            d_bcR.set(h_bcR)
        else:
            transferBC_R_Op(d_uR, d_bcR)  # transfer the ghost BC info

        # at left boundary
        updateBC_L_Op[p](d_bcL, time)        # now update boundary info
        applyBC_L_Op[p](d_bcL, d_bcT, time)  # apply boundary condition
        insertBC_L_Op(d_bcT, d_uL)           # insert info to face-flux

        # at right boundary
        updateBC_R_Op[p](d_bcR, time)        # now update boundary info
        applyBC_R_Op[p](d_bcR, d_bcT, time)  # apply boundary condition
        insertBC_R_Op(d_bcT, d_uR)           # insert info to face-flux

        # Step:2 compute the flux and jumps (all operations in single call)
        #   fL, fR = cvx*uL, cvx*uR
        #   fupw = 0.5*(fL + fR) + 0.5*np.abs(cvx)*(uL - uR)
        #   jL = fupw - fL  (jump at left boundary)
        #   jR = fupw - fR  (jump at right boundary)
        flux_Op(d_uL, d_uR, d_jL, d_jR)

        # Step:3 evaluate the derivative
        #   ux = -cvx*np.einsum("ml,em->el", Sx, ucoeff)
        deriv_Op(d_ucoeffs_in[p], d_ux)
        mulbyadv_Op(d_ux)

        # compute the continuous flux for each element in strong form
        totalFlux_Op(d_ux, d_jL, d_jR)

        # multiply by the inverse jacobian
        mulbyinvjac_Op(d_ux)

        # Step:4 add collision kernel contribution
        #   ux += Q(\sum U^{m}_{ar} ucoeff_{aej}, \sum V^{m}_{ra} ucoeff_{aej})
        cases = [str(p) + str(q) for q in range(vm.nspcs())]
        for m in range(K):
            trans_U_Op[m](d_ucoeffs_in[p], d_f)
            for q in range(vm.nspcs()):
                trans_V_Op[m](d_ucoeffs_in[q], d_g)
                for r, e in it.product(sigModes[m], range(Ne)):
                    sm.fs(cases[q], d_f, d_g, d_ux, e, r, m)

        #for q in range(vm.nspcs()):
        #    for r, e in it.product(range(K), range(Ne)):
        #        sm.fs(cases[q], d_ucoeffs_in[p], d_ucoeffs_in[q],
        #              d_ux, e, r, r)

        # Step:5 multiply by inverse mass matrix
        invMass_Op(d_ux, d_ucoeff_out)

    # define a time-integrator
    odestype = cfg.lookup('time-integrator', 'scheme')
    odescls = subclass_where(DGFSIntegratorBi, intg_kind=odestype)
    odes = odescls(rhs, (K, Ne, Nv), cfg.dtype, vm.nspcs())

    # finally start everything
    time = ti       # initialize time in case of restart
    nacptsteps = 0  # number of elapsed steps in the current run

    # start timer
    start = timer()

    while time < tf:
        # march in time
        odes.integrate(time, dt, d_ucoeffs)

        # increment time
        time += dt
        nacptsteps += 1

        # final step: post processing routines
        residual(time, nacptsteps, d_ucoeffPrevs, d_ucoeffs)
        moments(dt, time, d_ucoeffs)
        distribution(dt, time, d_ucoeffs)

        # copy the solution for the next time step
        for p in range(vm.nspcs()):
            cuda.memcpy_dtod(d_ucoeffPrevs[p].ptr, d_ucoeffs[p].ptr,
                             d_ucoeffs[p].nbytes)

    # print elapsed time (the reduce is collective, so all ranks call it)
    end = timer()
    elapsed = np.array([end - start])
    if rank != root:
        comm.Reduce(elapsed, None, op=get_mpi('sum'), root=root)
    else:
        comm.Reduce(get_mpi('in_place'), elapsed, op=get_mpi('sum'),
                    root=root)
        avgtime = elapsed[0]/comm.size
        print("Nsteps", nacptsteps, ", elapsed time", avgtime, "s")

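# Illustrative only: the neighbour exchange inside `rhs` (and `explicit` in
# the single-species solver) follows the standard mpi4py pattern in which
# communication with MPI.PROC_NULL completes immediately and a recv from
# PROC_NULL yields None, letting boundary ranks fall back to their own face
# data. A stripped-down, self-contained sketch:
def _sketch_halo_exchange(comm, send_left, send_right):
    from mpi4py import MPI

    down = comm.rank - 1 if comm.rank > 0 else MPI.PROC_NULL
    up = comm.rank + 1 if comm.rank < comm.size - 1 else MPI.PROC_NULL

    # post the sends, then blocking-receive from both sides
    req1 = comm.isend(send_right, dest=up)
    req2 = comm.isend(send_left, dest=down)
    recv_left = comm.recv(source=down)   # None on the leftmost rank
    recv_right = comm.recv(source=up)    # None on the rightmost rank
    MPI.Request.Waitall([req1, req2])

    return recv_left, recv_right
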
def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    # nondimensionalize the user-specified expressions
    ux = cfg.lookupexpr(cfgsect, 'ux')
    uy = cfg.lookupexpr(cfgsect, 'uy')
    uz = cfg.lookupexpr(cfgsect, 'uz')
    T = cfg.lookupexpr(cfgsect, 'T')
    ux = '((' + ux + ')/' + str(self._vm.u0()) + ')'
    uy = '((' + uy + ')/' + str(self._vm.u0()) + ')'
    uz = '((' + uz + ')/' + str(self._vm.u0()) + ')'
    T = '((' + T + ')/' + str(self._vm.T0()) + ')'

    # mass ratios
    mr = vm.masses()

    # storage
    self._bc_vals_num = [gpuarray.empty(self._vm.vsize(), cfg.dtype)
                         for p in range(vm.nspcs())]
    self._bc_vals_den = [gpuarray.empty(self._vm.vsize(), cfg.dtype)
                         for p in range(vm.nspcs())]
    self._wall_nden = [gpuarray.empty(1, dtype=cfg.dtype)
                       for p in range(vm.nspcs())]
    self._mr = [gpuarray.to_gpu(np.array(mr[p], dtype=cfg.dtype))
                for p in range(vm.nspcs())]

    dfltargs = dict(dtype=cfg.dtypename, vsize=self._vm.vsize(),
                    cw=self._vm.cw(), nl=nl, x=xsol,
                    ux=ux, uy=uy, uz=uz, T=T)
    kernsrc = DottedTemplateLookup('dgfs1D.bi.kernels.bcs',
                                   dfltargs).get_template(self.type).render()
    kernmod = compiler.SourceModule(kernsrc)

    # block size
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, self._vm.vsize())

    # for applying the boundary condition (one closure per species)
    def make_applyBC(p, applyBCFunc):
        def applyBC(ul, ur, t):
            applyBCFunc.prepared_call(grid_Nv, block, ul.ptr, ur.ptr,
                                      self._vm.d_cvx().ptr,
                                      self._vm.d_cvy().ptr,
                                      self._vm.d_cvz().ptr,
                                      self._mr[p].ptr,
                                      self._wall_nden[p].ptr, t)
        return applyBC

    applyBCFunc = get_kernel(kernmod, "applyBC",
                             [np.intp]*7 + [cfg.dtype])
    for p in range(vm.nspcs()):
        self._applyBCKern[p] = make_applyBC(p, applyBCFunc)

    # for updating the wall number density of each species
    def make_updateBC(p, updateBCFunc):
        def updateBC(ul, t):
            updateBCFunc.prepared_call(grid_Nv, block, ul.ptr,
                                       self._vm.d_cvx().ptr,
                                       self._vm.d_cvy().ptr,
                                       self._vm.d_cvz().ptr,
                                       self._mr[p].ptr,
                                       self._bc_vals_num[p].ptr,
                                       self._bc_vals_den[p].ptr, t)
            self._wall_nden[p] = -(gpuarray.sum(self._bc_vals_num[p])
                                   / gpuarray.sum(self._bc_vals_den[p]))
        return updateBC

    updateBCFunc = get_kernel(kernmod, "updateBC",
                              [np.intp]*7 + [cfg.dtype])
    for p in range(vm.nspcs()):
        self._updateBCKern[p] = make_updateBC(p, updateBCFunc)

def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    # nondimensionalize the user-specified expressions
    ux = cfg.lookupexpr(cfgsect, 'ux')
    uy = cfg.lookupexpr(cfgsect, 'uy')
    uz = cfg.lookupexpr(cfgsect, 'uz')
    T = cfg.lookupexpr(cfgsect, 'T')
    ux = '((' + ux + ')/' + str(self._vm.u0()) + ')'
    uy = '((' + uy + ')/' + str(self._vm.u0()) + ')'
    uz = '((' + uz + ')/' + str(self._vm.u0()) + ')'
    T = '((' + T + ')/' + str(self._vm.T0()) + ')'

    # storage
    self._bc_vals_num = gpuarray.empty(self._vm.vsize(), cfg.dtype)
    self._bc_vals_den = gpuarray.empty_like(self._bc_vals_num)

    dfltargs = dict(dtype=cfg.dtypename, vsize=self._vm.vsize(),
                    cw=self._vm.cw(), nl=nl, x=xsol,
                    ux=ux, uy=uy, uz=uz, T=T)
    kernsrc = DottedTemplateLookup('dgfs1D.std.kernels.bcs',
                                   dfltargs).get_template(self.type).render()
    kernmod = compiler.SourceModule(kernsrc)

    # block size
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, self._vm.vsize())

    # for applying the boundary condition at the face
    # (applyBC reads self._wall_nden, so updateBC must run first)
    applyBCFunc = get_kernel(kernmod, "applyBC", [np.intp]*6 + [cfg.dtype])
    self._applyBCKern = lambda ul, ur, t: applyBCFunc.prepared_call(
        grid_Nv, block, ul.ptr, ur.ptr, self._vm.d_cvx().ptr,
        self._vm.d_cvy().ptr, self._vm.d_cvz().ptr,
        self._wall_nden.ptr, t)

    # for updating the wall number density from the left face values
    updateBCFunc = get_kernel(kernmod, "updateBC", [np.intp]*6 + [cfg.dtype])
    def updateBC(ul, t):
        updateBCFunc.prepared_call(grid_Nv, block, ul.ptr,
                                   self._vm.d_cvx().ptr,
                                   self._vm.d_cvy().ptr,
                                   self._vm.d_cvz().ptr,
                                   self._bc_vals_num.ptr,
                                   self._bc_vals_den.ptr, t)
        self._wall_nden = -(gpuarray.sum(self._bc_vals_num)
                            / gpuarray.sum(self._bc_vals_den))
    self._updateBCKern = updateBC

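# For reference: the lookups above wrap each user expression in a
# nondimensionalizing division before it reaches the Mako template. With a
# hypothetical config value ux = '0.5*sin(t)' and u0() returning 337.2, the
# template receives the string '((0.5*sin(t))/337.2)'.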