def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    """Build the boundary state and compile the ``applyBC`` kernel.

    All arguments are forwarded unchanged to the parent constructor.
    ``xsol`` and ``nl`` are additionally handed to the kernel template
    (as ``x`` and ``nl``), and ``cfg``/``cfgsect`` are used to construct
    the 'maxwellian' initial-condition object (with ``wall=False``) whose
    values become the boundary distribution stored on the device.
    """
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    # boundary distribution: maxwellian values as a (vsize, 1) device array
    maxwellian_cls = subclass_where(DGFSInitConditionStd, model='maxwellian')
    bnd_state = maxwellian_cls(cfg, self._vm, cfgsect, wall=False)
    host_f0 = bnd_state.get_init_vals().reshape(self._vm.vsize(), 1)
    self._d_bnd_f0 = gpuarray.to_gpu(host_f0)

    # render and compile the kernel source selected by this class' type
    tmpl_args = dict(
        dtype=cfg.dtypename,
        vsize=self._vm.vsize(),
        cw=self._vm.cw(),
        nl=nl,
        x=xsol,
    )
    tmpl_lookup = DottedTemplateLookup('dgfs1D.std.kernels.bcs', tmpl_args)
    kern_source = tmpl_lookup.get_template(self.type).render()
    kern_module = compiler.SourceModule(kern_source)

    # launch configuration sized by the number of velocity points
    threads = (128, 1, 1)
    blocks = get_grid_for_block(threads, self._vm.vsize())

    # applyBC takes four pointers followed by one scalar (the time)
    apply_kern = get_kernel(kern_module, "applyBC",
                            [np.intp] * 4 + [cfg.dtype])

    def apply_bc(ul, ur, t):
        # d_cvx() and the boundary buffer are looked up at call time
        return apply_kern.prepared_call(
            blocks, threads,
            ul.ptr, ur.ptr,
            self._vm.d_cvx().ptr, self._d_bnd_f0.ptr,
            t)

    self._applyBCKern = apply_bc
def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    """Build the wall boundary state and compile its two kernels.

    All arguments are forwarded unchanged to the parent constructor.
    A 'maxwellian' initial-condition object is created with ``wall=True``;
    its values become the boundary distribution on the device and its
    ``unondim()[0, 0]`` entry is passed to the kernel template as ``ux``.

    Two callables are installed:

    * ``self._updateBCKern(ul, t)`` launches ``updateBC`` and then sets
      ``self._wall_nden`` to minus the ratio of the summed numerator and
      denominator buffers.
    * ``self._applyBCKern(ul, ur, t)`` launches ``applyBC`` and reads
      ``self._wall_nden``, which only exists once the update callable has
      run at least once.
    """
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    # boundary distribution: maxwellian values as a (vsize, 1) device array
    maxwellian_cls = subclass_where(DGFSInitConditionStd, model='maxwellian')
    wall_state = maxwellian_cls(cfg, self._vm, cfgsect, wall=True)
    host_f0 = wall_state.get_init_vals().reshape(self._vm.vsize(), 1)
    self._d_bnd_f0 = gpuarray.to_gpu(host_f0)
    unondim = wall_state.unondim()

    # per-velocity-point scratch buffers filled by the updateBC kernel
    self._bc_vals_num = gpuarray.empty(self._vm.vsize(),
                                       self._d_bnd_f0.dtype)
    self._bc_vals_den = gpuarray.empty_like(self._bc_vals_num)

    # render and compile the kernel source selected by this class' type
    tmpl_args = dict(
        dtype=cfg.dtypename,
        vsize=self._vm.vsize(),
        cw=self._vm.cw(),
        ux=unondim[0, 0],
        nl=nl,
        x=xsol,
    )
    tmpl_lookup = DottedTemplateLookup('dgfs1D.std.kernels.bcs', tmpl_args)
    kern_source = tmpl_lookup.get_template(self.type).render()
    kern_module = compiler.SourceModule(kern_source)

    # launch configuration sized by the number of velocity points
    threads = (128, 1, 1)
    blocks = get_grid_for_block(threads, self._vm.vsize())

    # both kernels take five pointers followed by one scalar (the time)
    kern_signature = [np.intp] * 5 + [unondim.dtype]
    apply_kern = get_kernel(kern_module, "applyBC", kern_signature)
    update_kern = get_kernel(kern_module, "updateBC", kern_signature)

    def apply_bc(ul, ur, t):
        # _wall_nden is resolved at call time (set by update_bc below)
        return apply_kern.prepared_call(
            blocks, threads,
            ul.ptr, ur.ptr,
            self._vm.d_cvx().ptr, self._d_bnd_f0.ptr,
            self._wall_nden.ptr,
            t)

    def update_bc(ul, t):
        update_kern.prepared_call(
            blocks, threads,
            ul.ptr,
            self._vm.d_cvx().ptr, self._d_bnd_f0.ptr,
            self._bc_vals_num.ptr, self._bc_vals_den.ptr,
            t)
        numerator = gpuarray.sum(self._bc_vals_num)
        denominator = gpuarray.sum(self._bc_vals_den)
        self._wall_nden = -(numerator / denominator)

    self._applyBCKern = apply_bc
    self._updateBCKern = update_bc
def __init__(self, xsol, nl, vm, cfg, cfgsect, **kwargs):
    """Build per-species boundary states and ``applyBC`` callables.

    All arguments are forwarded unchanged to the parent constructor.
    A 'maxwellian' multi-species initial-condition object is created with
    ``wall=False``; each entry of its initial values is flattened and
    uploaded to the device. One ``applyBC`` wrapper per species
    (``vm.nspcs()`` of them) is stored into ``self._applyBCKern[p]``,
    which is assumed to be an indexable container prepared by the parent
    class.
    """
    super().__init__(xsol, nl, vm, cfg, cfgsect, **kwargs)

    # one flattened boundary distribution per entry returned by the model
    maxwellian_cls = subclass_where(DGFSInitConditionBi, model='maxwellian')
    bnd_state = maxwellian_cls(cfg, self._vm, cfgsect, wall=False)
    species_f0 = bnd_state.get_init_vals()
    self._d_bnd_f0 = [gpuarray.to_gpu(vals.ravel()) for vals in species_f0]
    unondim = bnd_state.unondim()

    # render and compile the kernel source selected by this class' type
    tmpl_args = dict(
        dtype=cfg.dtypename,
        vsize=self._vm.vsize(),
        cw=self._vm.cw(),
        ux=unondim[0, 0],
        nl=nl,
        x=xsol,
    )
    tmpl_lookup = DottedTemplateLookup('dgfs1D.bi.kernels.bcs', tmpl_args)
    kern_source = tmpl_lookup.get_template(self.type).render()
    kern_module = compiler.SourceModule(kern_source)

    # launch configuration sized by the number of velocity points
    threads = (128, 1, 1)
    blocks = get_grid_for_block(threads, self._vm.vsize())

    # applyBC takes four pointers followed by one scalar (the time)
    apply_kern = get_kernel(kern_module, "applyBC",
                            [np.intp] * 4 + [unondim.dtype])

    def bind_species(species):
        # factory so every wrapper captures its own species index
        def apply_bc(ul, ur, t):
            apply_kern.prepared_call(
                blocks, threads,
                ul.ptr, ur.ptr,
                self._vm.d_cvx().ptr, self._d_bnd_f0[species].ptr,
                t)
        return apply_bc

    for species in range(vm.nspcs()):
        self._applyBCKern[species] = bind_species(species)
def main(Ne=None, dt=None):
    """Set up and run the single-species solver, returning the final state.

    The configuration is read through ``initialize()``; from it the mesh,
    basis, velocity mesh, scattering model, boundary conditions and GPU
    kernels are created, and the solution coefficients are marched from the
    start time to ``tend`` with the configured time integrator.

    Parameters
    ----------
    Ne : optional
        If given, overrides the ``Ne`` entry of the ``mesh`` config section
        (converted with ``int``) before the mesh is built.
    dt : optional
        If given, overrides the ``dt`` entry of the ``time-integrator``
        config section (converted with ``float``). The step actually used
        is then adjusted so an integer number of steps spans the interval.

    Returns
    -------
    tuple
        ``(d_ucoeff, mesh, vm, basis)``: the device array of solution
        coefficients, the mesh, the velocity mesh and the basis.
    """
    # who am I in this world? (Bulleh Shah, 18th century sufi poet)
    comm, rank, root = get_comm_rank_root()

    # read the inputs (from people)
    cfg, args = initialize()

    # optional overrides of the configured mesh size / time step
    if Ne is not None:
        cfg._cp.set('mesh', 'Ne', str(int(Ne)))
    if dt is not None:
        cfg._cp.set('time-integrator', 'dt', str(float(dt)))

    mesh = Mesh(cfg)

    # define 1D mesh (construct a 1D world view)
    xmesh = mesh.xmesh

    # number of elements (how refined perspectives do we want/have?)
    Ne = mesh.Ne

    # define the basis (what is the basis for those perspectives?)
    bsKind = cfg.lookup('basis', 'kind')
    basiscls = subclass_where(Basis, basis_kind=bsKind)
    basis = basiscls(cfg)

    # number of local degrees of freedom (depth/granularity of perspectives)
    K = basis.K

    # number of solution points (how far can I interpolate my learning)
    Nq = basis.Nq

    # left/right face maps
    Nqf = basis.Nqf  # number of points used in reconstruction at faces
    mapL, mapR = np.arange(Ne + 1) + (Nqf - 1) * Ne - 1, np.arange(Ne + 1)
    mapL[0], mapR[-1] = 0, Ne * Nqf - 1
    Nf = len(mapL)

    # the zeros
    z = basis.z

    # jacobian of the mapping from D^{st}=[-1,1] to D
    jac, invjac = mesh.jac, mesh.invjac

    # load the velocity mesh
    vm = DGFSVelocityMeshStd(cfg)
    Nv = vm.vsize()

    # load the scattering model
    smn = cfg.lookup('scattering-model', 'type')
    scatteringcls = subclass_where(DGFSScatteringModelAstd,
                                   scattering_model=smn)
    sm = scatteringcls(cfg, vm, Ne=Ne)

    # initial time, time step, final time
    ti, dt, tf = cfg.lookupfloats('time-integrator',
                                  ('tstart', 'dt', 'tend'))
    # shrink dt so that a whole number of steps covers [ti, tf]
    nsteps = np.ceil((tf - ti) / dt)
    dt = (tf - ti) / nsteps

    # Compute the location of the solution points
    xsol = np.array(
        [0.5 * (xmesh[j] + xmesh[j + 1]) + jac[j] * z for j in range(Ne)]).T
    xcoeff = np.einsum("kq,qe->ke", basis.fwdTransMat, xsol)

    # Determine the grid/block
    NeNv = Ne * Nv
    KNeNv = K * Ne * Nv
    NqNeNv = Nq * Ne * Nv
    NqfNeNv = Nqf * Ne * Nv
    NfNv = Nf * Nv
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, Nv)
    grid_NeNv = get_grid_for_block(block, Ne * Nv)
    grid_KNeNv = get_grid_for_block(block, K * Ne * Nv)

    # operator generator for matrix operations
    matOpGen = lambda v: lambda arg0, arg1: v.prepared_call(
        grid_NeNv, block, NeNv, arg0.ptr, NeNv, arg1.ptr, NeNv)

    # forward trans, backward, backward (at faces), derivative kernels
    fwdTrans_Op, bwdTrans_Op, bwdTransFace_Op, deriv_Op, invMass_Op, \
        computeCellAvg_Op, extractDrLin_Op = map(
            matOpGen,
            (basis.fwdTransOp, basis.bwdTransOp, basis.bwdTransFaceOp,
             basis.derivOp, basis.invMassOp, basis.computeCellAvgKern,
             basis.extractDrLinKern)
        )

    # U, V operator kernels
    trans_U_Op = tuple(map(matOpGen, basis.uTransOps))
    trans_V_Op = tuple(map(matOpGen, basis.vTransOps))

    # prepare the kernel for extracting face/interface values
    dfltargs = dict(K=K, Ne=Ne, Nq=Nq, vsize=Nv, dtype=cfg.dtypename,
                    mapL=mapL, mapR=mapR, offsetL=0,
                    offsetR=len(mapR) - 1, invjac=invjac,
                    gRD=basis.gRD, gLD=basis.gLD, xsol=xsol)
    kernsrc = DottedTemplateLookup('dgfs1D.std.kernels',
                                   dfltargs).get_template('std').render()
    kernmod = compiler.SourceModule(kernsrc)

    dfltargs.update(nalph=sm.nalph, Dr=basis.derivMat)
    kernlimssrc = DottedTemplateLookup(
        'dgfs1D.astd.kernels', dfltargs).get_template('limiters').render()
    kernlimsmod = compiler.SourceModule(kernlimssrc)

    # prepare operators for execution (see std.mako for description)
    (extLeft_Op, extRight_Op, transferBC_L_Op, transferBC_R_Op,
     insertBC_L_Op, insertBC_R_Op) = map(
        lambda v: lambda *args: get_kernel(kernmod, v, 'PP').prepared_call(
            grid_Nv, block, *list(map(lambda c: c.ptr, args))),
        ("extract_left", "extract_right", "transfer_bc_left",
         "transfer_bc_right", "insert_bc_left", "insert_bc_right"))

    # The boundary conditions (by default all boundaries are processor bnds)
    bcl_type, bcr_type = 'dgfs-periodic', 'dgfs-periodic'

    # the mesh is decomposed in linear fashion, so rank 0 gets left boundary
    if rank == 0:
        bcl_type = cfg.lookup('soln-bcs-xlo', 'type')

    # and the last rank comm.size-1 gets the right boundary
    if rank == comm.size - 1:
        bcr_type = cfg.lookup('soln-bcs-xhi', 'type')

    # prepare kernels for left boundary
    bcl_cls = subclass_where(DGFSBCStd, type=bcl_type)
    bcl = bcl_cls(xmesh[0], -1., vm, cfg, 'soln-bcs-xlo')
    updateBC_L_Op = bcl.updateBCKern
    applyBC_L_Op = bcl.applyBCKern

    # prepare kernels for right boundary
    bcr_cls = subclass_where(DGFSBCStd, type=bcr_type)
    bcr = bcr_cls(xmesh[-1], 1., vm, cfg, 'soln-bcs-xhi')
    updateBC_R_Op = bcr.updateBCKern
    applyBC_R_Op = bcr.applyBCKern

    # flux kernel
    flux = get_kernel(kernmod, "flux", 'PPPPP')
    flux_Op = lambda d_uL, d_uR, d_jL, d_jR: flux.prepared_call(
        grid_Nv, block,
        d_uL.ptr, d_uR.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # multiply the derivative by the advection velocity
    mulbyadv = get_kernel(kernmod, "mul_by_adv", 'PP')
    mulbyadv_Op = lambda d_ux: mulbyadv.prepared_call(
        grid_KNeNv, block, vm.d_cvx().ptr, d_ux.ptr)

    # multiply the coefficient by the inverse jacobian
    mulbyinvjac = get_kernel(kernmod, "mul_by_invjac", 'P')
    mulbyinvjac_Op = lambda d_ux: mulbyinvjac.prepared_call(
        grid_Nv, block, d_ux.ptr)

    # \alpha AX + \beta Y kernel (for operations on coefficients)
    axnpbyCoeff = get_axnpby_kerns(2, range(K), NeNv, cfg.dtype)
    axnpbyCoeff_Op = lambda a0, x0, a1, x1: axnpbyCoeff.prepared_call(
        grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # total flux kernel (sums up surface and volume terms)
    totalFlux = get_kernel(kernmod, "totalFlux", 'PPPP')
    totalFlux_Op = lambda d_ux, d_jL, d_jR: totalFlux.prepared_call(
        grid_Nv, block, d_ux.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # linear limiter
    limitLin = get_kernel(kernlimsmod, "limitLin", 'PPPP')
    limitLin_Op = lambda d_u, d_ulx, d_uavg, d_ulim: \
        limitLin.prepared_call(grid_Nv, block,
                               d_u.ptr, d_ulx.ptr, d_uavg.ptr, d_ulim.ptr)

    # allocations on gpu
    d_usol = gpuarray.empty(NqNeNv, dtype=cfg.dtype)
    d_usolF = gpuarray.empty(NqfNeNv, dtype=cfg.dtype)
    d_uL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_uR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_bcL = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcR = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcT = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_ux = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_f = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_g = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_ucoeff = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_ucoeffPrev = gpuarray.empty_like(d_ucoeff)

    # check if this is a new run
    if hasattr(args, 'process_run'):
        usol = np.empty((Nq, Ne, Nv), dtype=cfg.dtype)  # temporary storage

        # load the initial condition model
        icn = cfg.lookup('soln-ics', 'type')
        initcondcls = subclass_where(DGFSInitConditionStd, model=icn)
        ic = initcondcls(cfg, vm, 'soln-ics')
        ic.apply_init_vals(usol, Nq, Ne, xsol, mesh=mesh, basis=basis,
                           sm=sm)

        # transfer the information to the gpu
        d_usol.set(usol.ravel())

        # forward transform to coefficient space
        fwdTrans_Op(d_usol, d_ucoeff)

    # check if we are restarting
    if hasattr(args, 'process_restart'):
        import h5py
        check(len(args.dist[0]) == comm.size,
              "No. of distributions != nranks")
        with h5py.File(args.dist[0][rank].name, 'r') as h5f:
            dst = h5f['coeff']
            ti = dst.attrs['time']
            d_ucoeff.set(dst[:])
            check(dst.attrs['K'] == K, "Inconsistent distribution K")
            check(dst.attrs['Ne'] == Ne, "Inconsistent distribution Ne")
            check(dst.attrs['Nv'] == Nv, "Inconsistent distribution N")

        # backward transform to solution space
        bwdTrans_Op(d_ucoeff, d_usol)

    # prepare the post-processing handlers
    # For computing moments
    moments = DGFSMomWriterStd(ti, basis.interpMat, xcoeff, d_ucoeff, vm,
                               cfg, 'dgfsmomwriter')

    # For computing residual
    residual = DGFSResidualStd(cfg, 'dgfsresidual')

    # For writing distribution function
    distribution = DGFSDistributionStd(ti, (K, Ne, Nv), cfg,
                                       'dgfsdistwriter')

    # Actual algorithm

    # initialize
    axnpbyCoeff_Op(0., d_ucoeffPrev, 1., d_ucoeff)
    sigModes = basis.sigModes

    # define the neighbours
    from mpi4py import MPI
    down_nbr, up_nbr = comm.rank - 1, comm.rank + 1
    if up_nbr >= comm.size:
        up_nbr = MPI.PROC_NULL
    if down_nbr < 0:
        down_nbr = MPI.PROC_NULL

    def host_copy(d_buf, peer):
        # The lowercase (pickle-based) MPI calls need host data, and the
        # receiver feeds the message to GPUArray.set(); skip the
        # device-to-host copy when there is no peer to send to.
        return None if peer == MPI.PROC_NULL else d_buf.get()

    # define the explicit part
    def explicit(time, d_ucoeff_in, d_ucoeff_out):
        # reconstruct solution at faces
        bwdTransFace_Op(d_ucoeff_in, d_usolF)

        # Step:1 extract the solution at faces
        extLeft_Op(d_usolF, d_uL)
        extRight_Op(d_usolF, d_uR)

        # transfer boundary information into the send buffers
        transferBC_L_Op(d_uL, d_bcL)  # Transfer the left ghost BC info
        transferBC_R_Op(d_uR, d_bcR)  # Transfer the right ghost BC info

        # send information (host copies; this can be adjusted in case of
        # RDMA enabled MPI support)
        req1 = comm.isend(host_copy(d_bcR, up_nbr),
                          dest=up_nbr)  # to upstream neighbour
        req2 = comm.isend(host_copy(d_bcL, down_nbr),
                          dest=down_nbr)  # to downstream neighbour

        # receive information (None when the neighbour is PROC_NULL)
        h_bcL = comm.recv(source=down_nbr)  # from downstream neighbour
        h_bcR = comm.recv(source=up_nbr)  # from upstream neighbour
        MPI.Request.Waitall([req1, req2])

        # set information at left, right boundary; compare against None
        # because the truth value of a multi-element array is ambiguous
        if h_bcL is not None:
            d_bcL.set(h_bcL)
        else:
            transferBC_L_Op(d_uL, d_bcL)

        if h_bcR is not None:
            d_bcR.set(h_bcR)
        else:
            transferBC_R_Op(d_uR, d_bcR)

        # The physical-periodic boundary condition
        if comm.size == 1 and bcr_type == 'dgfs-cyclic':
            # single rank: swap the two boundary buffers through d_bcT
            copy(d_bcT, d_bcL)
            copy(d_bcL, d_bcR)
            copy(d_bcR, d_bcT)
        else:
            # At left, receive from right-most communicator; and vice-versa
            req1 = req2 = MPI.REQUEST_NULL
            if bcl_type == 'dgfs-cyclic':
                req1 = comm.isend(d_bcL.get(), dest=comm.size - 1)
            if bcr_type == 'dgfs-cyclic':
                req2 = comm.isend(d_bcR.get(), dest=0)

            if bcr_type == 'dgfs-cyclic':
                h_bcR = comm.recv(source=0)
            if bcl_type == 'dgfs-cyclic':
                h_bcL = comm.recv(source=comm.size - 1)
            MPI.Request.Waitall([req1, req2])

            if bcl_type == 'dgfs-cyclic':
                d_bcL.set(h_bcL)
            elif bcr_type == 'dgfs-cyclic':
                d_bcR.set(h_bcR)

        # At left boundary
        updateBC_L_Op(d_bcL, time)  # now update boundary info
        applyBC_L_Op(d_bcL, d_bcT, time)  # apply boundary condition
        insertBC_L_Op(d_bcT, d_uL)  # insert info to global face-flux

        # At right boundary
        updateBC_R_Op(d_bcR, time)  # now update boundary info
        applyBC_R_Op(d_bcR, d_bcT, time)  # apply boundary condition
        insertBC_R_Op(d_bcT, d_uR)  # insert info to global face-flux

        # Step:2 Compute the flux and jumps (all operations in single call)
        # fL, fR = cvx*uL, cvx*uR
        # fupw = 0.5*(fL + fR) + 0.5*np.abs(cvx)*(uL - uR)
        # jL = fupw - fL  # Compute the jump at left boundary
        # jR = fupw - fR  # Compute the jump at right boundary
        flux_Op(d_uL, d_uR, d_jL, d_jR)

        # Step:3 evaluate the derivative
        # ux = -cvx*np.einsum("ml,em->el", Sx, ucoeff)
        deriv_Op(d_ucoeff_in, d_ux)
        mulbyadv_Op(d_ux)

        # Compute the continuous flux for each element in strong form
        totalFlux_Op(d_ux, d_jL, d_jR)

        # multiply by the inverse jacobian
        # Now we have f* = d_ux
        mulbyinvjac_Op(d_ux)

        # project back to coefficient space
        invMass_Op(d_ux, d_ucoeff_out)

    # scratch arrays used by the limiter
    d_uavg, d_ulx = map(gpuarray.empty_like, [d_ucoeff] * 2)

    def limit(d_ucoeff_in, d_ucoeff_out):
        assert comm.size == 1, "Not implemented"

        # Extract the cell average
        computeCellAvg_Op(d_ucoeff_in, d_uavg)

        # extract gradient of the linear polynomial
        extractDrLin_Op(d_ucoeff_in, d_ulx)
        mulbyinvjac_Op(d_ulx)

        # limit functions in all cells
        limitLin_Op(d_ucoeff_in, d_ulx, d_uavg, d_ucoeff_out)

    # define a time-integrator (we use Euler scheme: good enough for steady)
    odestype = cfg.lookup('time-integrator', 'scheme')
    odescls = subclass_where(DGFSIntegratorAstd, intg_kind=odestype)
    limitOn = cfg.lookupordefault('time-integrator', 'limiter', 0)

    # Finally start everything
    time = ti  # initialize time in case of restart
    nacptsteps = 0  # number of elapsed steps in the current run

    # initialize ode: this performs pre-integration for multi-step schemes
    odes = odescls(explicit, sm, (K, Ne, Nv), cfg.dtype,
                   t=time, dt=dt, f0=d_ucoeff)

    # start timer
    start = timer()

    while (time < tf):
        # March in time
        odes.integrate(time, dt, nacptsteps, d_ucoeff)

        if limitOn:
            limit(d_ucoeff, d_ucoeff)

        # increment time
        time += dt
        nacptsteps += 1

        # Final step: post processing routines
        residual(time, nacptsteps, d_ucoeff, d_ucoeffPrev)
        moments(dt, time, d_ucoeff)
        distribution(dt, time, d_ucoeff)

        # copy the solution for the next time step
        cuda.memcpy_dtod(d_ucoeffPrev.ptr, d_ucoeff.ptr, d_ucoeff.nbytes)

    # print elapsed time (summed onto root, then averaged over ranks)
    end = timer()
    elapsed = np.array([end - start])
    if rank != root:
        comm.Reduce(elapsed, None, op=get_mpi('sum'), root=root)
    else:
        comm.Reduce(get_mpi('in_place'), elapsed, op=get_mpi('sum'),
                    root=root)
        avgtime = elapsed[0] / comm.size
        print("Nsteps", nacptsteps, ", elapsed time", avgtime, "s")

    return d_ucoeff, mesh, vm, basis
def main():
    """Set up and run the multi-species solver.

    The configuration is read through ``initialize()``; from it the mesh,
    basis, velocity mesh, scattering model, boundary conditions and GPU
    kernels are created. One coefficient array per species
    (``vm.nspcs()`` of them) is then marched from the start time to
    ``tend`` with the configured time integrator, calling the residual,
    moment and distribution handlers after every step.

    Nothing is returned; the root rank prints the step count and the
    rank-averaged elapsed time.
    """
    # who am I in this world? (Bulleh Shah, 18th century sufi poet)
    comm, rank, root = get_comm_rank_root()

    # read the inputs (from people)
    cfg, args = initialize()

    mesh = Mesh(cfg)

    # define 1D mesh (construct a 1D world view)
    xmesh = mesh.xmesh

    # number of elements (how refined perspectives do we want/have?)
    Ne = mesh.Ne

    # define the basis (what is the basis for those perspectives?)
    bsKind = cfg.lookup('basis', 'kind')
    basiscls = subclass_where(Basis, basis_kind=bsKind)
    basis = basiscls(cfg)

    # number of local degrees of freedom (depth/granularity of perspectives)
    K = basis.K

    # number of solution points (how far can I interpolate my learning)
    Nq = basis.Nq

    # left/right face maps
    Nqf = basis.Nqf  # number of points used in reconstruction at faces
    mapL, mapR = np.arange(Ne+1)+(Nqf-1)*Ne-1, np.arange(Ne+1)
    mapL[0], mapR[-1] = 0, Ne*Nqf-1
    Nf = len(mapL)

    # the zeros
    z = basis.z

    # jacobian of the mapping from D^{st}=[-1,1] to D
    jac, invjac = mesh.jac, mesh.invjac

    # load the velocity mesh
    vm = DGFSVelocityMeshBi(cfg)
    Nv = vm.vsize()

    # load the scattering model
    smn = cfg.lookup('scattering-model', 'type')
    scatteringcls = subclass_where(DGFSScatteringModelBi,
                                   scattering_model=smn)
    sm = scatteringcls(cfg, vm, Ne=Ne)

    # initial time, time step, final time
    ti, dt, tf = cfg.lookupfloats('time-integrator',
                                  ('tstart', 'dt', 'tend'))
    # shrink dt so that a whole number of steps covers [ti, tf]
    nsteps = np.ceil((tf - ti)/dt)
    dt = (tf - ti)/nsteps

    # Compute the location of the solution points
    xsol = np.array([0.5*(xmesh[j]+xmesh[j+1])+jac[j]*z
                     for j in range(Ne)]).T
    xcoeff = np.einsum("kq,qe->ke", basis.fwdTransMat, xsol)

    # Determine the grid/block
    NeNv = Ne*Nv
    KNeNv = K*Ne*Nv
    NqNeNv = Nq*Ne*Nv
    NqfNeNv = Nqf*Ne*Nv
    NfNv = Nf*Nv
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, Nv)
    grid_NeNv = get_grid_for_block(block, Ne*Nv)
    grid_KNeNv = get_grid_for_block(block, K*Ne*Nv)

    # operator generator for matrix operations
    matOpGen = lambda v: lambda arg0, arg1: v.prepared_call(
        grid_NeNv, block, NeNv, arg0.ptr, NeNv, arg1.ptr, NeNv)

    # forward trans, backward, backward (at faces), derivative kernels
    fwdTrans_Op, bwdTrans_Op, bwdTransFace_Op, deriv_Op, invMass_Op = map(
        matOpGen,
        (basis.fwdTransOp, basis.bwdTransOp, basis.bwdTransFaceOp,
         basis.derivOp, basis.invMassOp)
    )

    # U, V operator kernels
    trans_U_Op = tuple(map(matOpGen, basis.uTransOps))
    trans_V_Op = tuple(map(matOpGen, basis.vTransOps))

    # prepare the kernel for extracting face/interface values
    dfltargs = dict(
        K=K, Ne=Ne, Nq=Nq, vsize=Nv, dtype=cfg.dtypename,
        mapL=mapL, mapR=mapR, offsetL=0, offsetR=len(mapR)-1,
        invjac=invjac, gRD=basis.gRD, gLD=basis.gLD)
    kernsrc = DottedTemplateLookup('dgfs1D.bi.kernels',
                                   dfltargs).get_template('bi').render()
    kernmod = compiler.SourceModule(kernsrc)

    # prepare operators for execution (see bi.mako for description)
    (extLeft_Op, extRight_Op, transferBC_L_Op, transferBC_R_Op,
     insertBC_L_Op, insertBC_R_Op) = map(
        lambda v: lambda *args: get_kernel(kernmod, v, 'PP').prepared_call(
            grid_Nv, block, *list(map(lambda c: c.ptr, args))
        ),
        ("extract_left", "extract_right", "transfer_bc_left",
         "transfer_bc_right", "insert_bc_left", "insert_bc_right")
    )

    # The boundary conditions (by default all boundaries are processor bnds)
    bcl_type, bcr_type = 'dgfs-periodic', 'dgfs-periodic'

    # the mesh is decomposed in linear fashion, so rank 0 gets left boundary
    if rank == 0:
        bcl_type = cfg.lookup('soln-bcs-xlo', 'type')

    # and the last rank comm.size-1 gets the right boundary
    if rank == comm.size-1:
        bcr_type = cfg.lookup('soln-bcs-xhi', 'type')

    # prepare kernels for left boundary
    bcl_cls = subclass_where(DGFSBCBi, type=bcl_type)
    bcl = bcl_cls(xmesh[0], -1., vm, cfg, 'soln-bcs-xlo')
    updateBC_L_Op = bcl.updateBCKern
    applyBC_L_Op = bcl.applyBCKern

    # prepare kernels for right boundary
    bcr_cls = subclass_where(DGFSBCBi, type=bcr_type)
    bcr = bcr_cls(xmesh[-1], 1., vm, cfg, 'soln-bcs-xhi')
    updateBC_R_Op = bcr.updateBCKern
    applyBC_R_Op = bcr.applyBCKern

    # flux kernel
    flux = get_kernel(kernmod, "flux", 'PPPPP')
    flux_Op = lambda d_uL, d_uR, d_jL, d_jR: flux.prepared_call(
        grid_Nv, block,
        d_uL.ptr, d_uR.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # multiply the derivative by the advection velocity
    mulbyadv = get_kernel(kernmod, "mul_by_adv", 'PP')
    mulbyadv_Op = lambda d_ux: mulbyadv.prepared_call(
        grid_KNeNv, block, vm.d_cvx().ptr, d_ux.ptr)

    # multiply the coefficient by the inverse jacobian
    mulbyinvjac = get_kernel(kernmod, "mul_by_invjac", 'P')
    mulbyinvjac_Op = lambda d_ux: mulbyinvjac.prepared_call(
        grid_Nv, block, d_ux.ptr)

    # \alpha AX + \beta Y kernel (for operations on coefficients)
    axnpbyCoeff = get_axnpby_kerns(2, range(K), NeNv, cfg.dtype)
    axnpbyCoeff_Op = lambda a0, x0, a1, x1: axnpbyCoeff.prepared_call(
        grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # \alpha AX + \beta Y kernel (for operations on physical solutions)
    axnpbySol = get_axnpby_kerns(2, range(Nq), NeNv, cfg.dtype)
    axnpbySol_Op = lambda a0, x0, a1, x1: axnpbySol.prepared_call(
        grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # total flux kernel (sums up surface and volume terms)
    totalFlux = get_kernel(kernmod, "totalFlux", 'PPPP')
    totalFlux_Op = lambda d_ux, d_jL, d_jR: totalFlux.prepared_call(
        grid_Nv, block, d_ux.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # allocations on gpu
    d_usol = gpuarray.empty(NqNeNv, dtype=cfg.dtype)
    d_usolF = gpuarray.empty(NqfNeNv, dtype=cfg.dtype)
    d_uL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_uR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_bcL = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcR = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcT = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_ux = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_f = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_g = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_ucoeffs = [gpuarray.empty_like(d_ux) for p in range(vm.nspcs())]
    d_ucoeffPrevs = [gpuarray.empty_like(d_ux) for p in range(vm.nspcs())]

    # check if this is a new run
    if hasattr(args, 'process_run'):
        usol = np.empty((Nq, Ne, Nv), dtype=cfg.dtype)  # temporary storage

        # load the initial condition model
        icn = cfg.lookup('soln-ics', 'type')
        initcondcls = subclass_where(DGFSInitConditionBi, model=icn)
        ic = initcondcls(cfg, vm, 'soln-ics')

        for p in range(vm.nspcs()):
            ic.apply_init_vals(p, usol, Nq, Ne, xsol)

            # transfer the information to the gpu
            d_usol.set(usol.ravel())

            # forward transform to coefficient space
            fwdTrans_Op(d_usol, d_ucoeffs[p])

    # check if we are restarting
    if hasattr(args, 'process_restart'):
        import h5py
        check(len(args.dist[0]) == comm.size,
              "No. of distributions != nranks")
        with h5py.File(args.dist[0][rank].name, 'r') as h5f:
            for p, d_ucoeff in enumerate(d_ucoeffs):
                dst = h5f['coeff'+str(p)]
                ti = dst.attrs['time']
                d_ucoeff.set(dst[:])
                check(dst.attrs['K'] == K, "Inconsistent distribution K")
                check(dst.attrs['Ne'] == Ne, "Inconsistent distribution Ne")
                check(dst.attrs['Nv'] == Nv, "Inconsistent distribution N")

    # prepare the post-processing handlers
    # For computing moments
    moments = DGFSMomWriterBi(ti, basis.interpMat, xcoeff, d_ucoeffs, vm,
                              cfg, 'dgfsmomwriter')

    # For computing residual
    residual = DGFSResidualBi(cfg, 'dgfsresidual')

    # For writing distribution function
    distribution = DGFSDistributionBi(ti, (K, Ne, Nv), cfg,
                                      'dgfsdistwriter')

    # Actual algorithm

    # initialize
    for p in range(vm.nspcs()):
        axnpbyCoeff_Op(0., d_ucoeffPrevs[p], 1., d_ucoeffs[p])
    sigModes = basis.sigModes

    # define the neighbours
    from mpi4py import MPI
    down_nbr, up_nbr = comm.rank - 1, comm.rank + 1
    if up_nbr >= comm.size:
        up_nbr = MPI.PROC_NULL
    if down_nbr < 0:
        down_nbr = MPI.PROC_NULL

    # define the ode rhs
    def rhs(p, time, d_ucoeffs_in, d_ucoeff_out):
        # reconstruct solution at faces
        bwdTransFace_Op(d_ucoeffs_in[p], d_usolF)

        # Step:1 extract the solution at faces
        extLeft_Op(d_usolF, d_uL)
        extRight_Op(d_usolF, d_uR)

        # transfer boundary information into the send buffers
        transferBC_L_Op(d_uL, d_bcL)  # Transfer the left ghost BC info
        transferBC_R_Op(d_uR, d_bcR)  # Transfer the right ghost BC info

        # Stage the buffers on the host: the lowercase (pickle-based) MPI
        # calls need host data and the receiver feeds the message to
        # GPUArray.set(). This can be adjusted in case of RDMA enabled MPI
        # support.
        s_bcL, s_bcR = d_bcL.get(), d_bcR.get()

        # send information
        req1 = comm.isend(s_bcR, dest=up_nbr)  # to upstream neighbour
        req2 = comm.isend(s_bcL, dest=down_nbr)  # to downstream neighbour

        # receive information (None when the neighbour is PROC_NULL)
        h_bcL = comm.recv(source=down_nbr)  # from downstream neighbour
        h_bcR = comm.recv(source=up_nbr)  # from upstream neighbour
        MPI.Request.Waitall([req1, req2])

        # set information at left boundary; compare against None because
        # the truth value of a multi-element array is ambiguous
        if h_bcL is not None:
            d_bcL.set(h_bcL)
        else:
            transferBC_L_Op(d_uL, d_bcL)  # Transfer the ghost BC info

        # set information at right boundary
        if h_bcR is not None:
            d_bcR.set(h_bcR)
        else:
            transferBC_R_Op(d_uR, d_bcR)  # Transfer the ghost BC info

        # At left boundary
        updateBC_L_Op[p](d_bcL, time)  # now update boundary info
        applyBC_L_Op[p](d_bcL, d_bcT, time)  # apply boundary condition
        insertBC_L_Op(d_bcT, d_uL)  # insert info to global face-flux

        # At right boundary
        updateBC_R_Op[p](d_bcR, time)  # now update boundary info
        applyBC_R_Op[p](d_bcR, d_bcT, time)  # apply boundary condition
        insertBC_R_Op(d_bcT, d_uR)  # insert info to global face-flux

        # Step:2 Compute the flux and jumps (all operations in single call)
        # fL, fR = cvx*uL, cvx*uR
        # fupw = 0.5*(fL + fR) + 0.5*np.abs(cvx)*(uL - uR)
        # jL = fupw - fL  # Compute the jump at left boundary
        # jR = fupw - fR  # Compute the jump at right boundary
        flux_Op(d_uL, d_uR, d_jL, d_jR)

        # Step:3 evaluate the derivative
        # ux = -cvx*np.einsum("ml,em->el", Sx, ucoeff)
        deriv_Op(d_ucoeffs_in[p], d_ux)
        mulbyadv_Op(d_ux)

        # Compute the continuous flux for each element in strong form
        totalFlux_Op(d_ux, d_jL, d_jR)

        # multiply by the inverse jacobian
        mulbyinvjac_Op(d_ux)

        # Step:4 Add collision kernel contribution
        # ux += Q(\sum U^{m}_{ar} ucoeff_{aej}, \sum V^{m}_{ra} ucoeff_{aej})
        cases = [str(p)+str(q) for q in range(vm.nspcs())]
        for m in range(K):
            trans_U_Op[m](d_ucoeffs_in[p], d_f)
            for q in range(vm.nspcs()):
                trans_V_Op[m](d_ucoeffs_in[q], d_g)
                for r, e in it.product(sigModes[m], range(Ne)):
                    sm.fs(cases[q], d_f, d_g, d_ux, e, r, m)

        # Step:5 Multiply by inverse mass matrix
        invMass_Op(d_ux, d_ucoeff_out)

    # define a time-integrator
    odestype = cfg.lookup('time-integrator', 'scheme')
    odescls = subclass_where(DGFSIntegratorBi, intg_kind=odestype)
    odes = odescls(rhs, (K, Ne, Nv), cfg.dtype, vm.nspcs())

    # Finally start everything
    time = ti  # initialize time in case of restart
    nacptsteps = 0  # number of elapsed steps in the current run

    # start timer
    start = timer()

    while time < tf:
        # March in time
        odes.integrate(time, dt, d_ucoeffs)

        # increment time
        time += dt
        nacptsteps += 1

        # Final step: post processing routines
        residual(time, nacptsteps, d_ucoeffPrevs, d_ucoeffs)
        moments(dt, time, d_ucoeffs)
        distribution(dt, time, d_ucoeffs)

        # copy the solution for the next time step
        for p in range(vm.nspcs()):
            cuda.memcpy_dtod(d_ucoeffPrevs[p].ptr, d_ucoeffs[p].ptr,
                             d_ucoeffs[p].nbytes)

    # print elapsed time
    end = timer()
    elapsed = np.array([end - start])

    # Allreduce is a collective: every rank must take part, otherwise the
    # root waits forever when running on more than one rank.
    comm.Allreduce(get_mpi('in_place'), elapsed, op=get_mpi('sum'))
    if rank == root:
        avgtime = elapsed[0]/comm.size
        print("Nsteps", nacptsteps, ", elapsed time", avgtime, "s")