Ejemplos de get_comm_rank_root en Python, ejemplos de dgfs1D.nputil.get_comm_rank_root en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: moments.py Proyecto: jaisw7/dgfs1D_gpu

    def __call__(self, dt, tcurr, coeffs):
        if abs(self.tout_next - tcurr) > 0.5 * dt:
            return

        # compute the moments
        self._compute_moments_1D(coeffs)

        # Write out the file
        fname = self.basename.format(t=tcurr)
        solnfname = os.path.join(self.basedir, fname)
        sol = self._bulksoln.swapaxes(0, 1).reshape(self.leaddim, -1)

        # get the entire information
        comm, rank, root = get_comm_rank_root()
        sol = comm.gather(sol, root=root)
        if rank == root:
            sol = np.vstack(sol)
            np.savetxt(solnfname,
                       np.hstack((self.xsol, sol)),
                       "%.5e",
                       header="t={0} \nx {1}".format(tcurr, self.fields),
                       comments="#")

        # Compute the next output time
        self.tout_next = tcurr + self.dt_out

Ejemplo n.º 2

0

Mostrar archivo

Archivo: moments.py Proyecto: jaisw7/dgfs1D_gpu

    def __init__(self,
                 tcurr,
                 im,
                 xcoeff,
                 coeffs,
                 vm,
                 cfg,
                 cfgsect,
                 suffix=None,
                 extn='.txt'):

        self.vm = vm

        # Construct the solution writer
        self.basedir = cfg.lookuppath(cfgsect, 'basedir', '.', abs=True)
        self.basename = cfg.lookup(cfgsect, 'basename')
        if not self.basename.endswith(extn):
            self.basename += extn

        # these variables are computed
        privarmap = ['rho', 'U:x', 'U:y', 'T', 'Q:x', 'p', 'nden']
        lv = len(privarmap)
        newvar = []
        for p in range(vm.nspcs()):
            newvar.extend(privarmap)
            for ivar in range(-1, -lv - 1, -1):
                newvar[ivar] += ':' + str(p)
        privarmap = newvar

        Ns = len(privarmap)
        self.fields = ", ".join(privarmap)

        # Output time step and next output time
        self.dt_out = cfg.lookupfloat(cfgsect, 'dt-out')
        self.tout_next = tcurr

        # get info
        _, Ne = xcoeff.shape

        # define the interpolation operator
        K, Nqr = im.shape
        self.im = im

        # size of the output
        self.leaddim = Nqr * Ne

        self.xsol = np.einsum('mr,me->re', self.im, xcoeff)
        self.xsol = self.xsol.T.reshape((-1, 1)) * vm.H0()

        # get the entire information
        comm, rank, root = get_comm_rank_root()
        self.xsol = comm.gather(self.xsol, root=root)
        if rank == root: self.xsol = np.vstack(self.xsol)

        # allocate variables
        self._bulksoln = np.empty((Nqr, Ne, Ns))
        self._bulksolntot = np.empty((Nqr, Ne, 4))

        # helps us to compute moments from restart file
        self(1e-10, tcurr, coeffs)

Ejemplo n.º 3

0

Mostrar archivo

Archivo: distributionslice.py Proyecto: jaisw7/dgfs1D_gpu

    def __call__(self, dt, tcurr, coeff):
        if abs(self.tout_next - tcurr) > 0.5 * dt:
            return

        # Write out the file
        fname = self.basename.format(t=tcurr)
        solnfname = os.path.join(self.basedir, fname)

        # get the entire information
        comm, rank, root = get_comm_rank_root()
        if rank == self._thisRank:
            soln = coeff.get().reshape((self.K, self.Ne, self.vm.vsize()))
            soln = np.einsum('mr,mej->rej', self.im, soln)

            soln = soln[self.Nqr_out, self.Ne_out, :]
            soln = soln.reshape((self.Nv, self.Nv, self.Nv))

            if self.Nvx_out == -1:
                soln = soln[:, self.Nvy_out, self.Nvz_out]
            elif self.Nvy_out == -1:
                soln = soln[self.Nvx_out, :, self.Nvz_out]
            elif self.Nvz_out == -1:
                soln = soln[self.Nvx_out, self.Nvy_out, :]

            np.savetxt(solnfname,
                       np.vstack((self._vslice, soln)).T,
                       comments="#")

        # Compute the next output time
        self.tout_next = tcurr + self.dt_out

Ejemplo n.º 4

0

Mostrar archivo

    def __init__(self, tcurr, shape, cfg, cfgsect, extn='.h5'):

        self.K, self.Ne, self.Nv = shape

        # Construct the solution writer
        self.basedir = cfg.lookuppath(cfgsect, 'basedir', '.', abs=True)
        self.basename = cfg.lookup(cfgsect, 'basename')
        if not self.basename.endswith(extn):
            self.basename += extn

        _, rank, _ = get_comm_rank_root()
        self.basename += "_rank=" + str(rank)

        # Output time step and next output time
        self.dt_out = cfg.lookupfloat(cfgsect, 'dt-out')
        self.tout_next = tcurr + self.dt_out

Ejemplo n.º 5

0

Mostrar archivo

Archivo: residual.py Proyecto: jaisw7/dgfs1D_gpu

    def __call__(self, tcurr, nsteps, solprev, solcurr):
        if ((self.nsteps > 0
             and ((nsteps % self.nsteps == 0) or nsteps == 1))):
            #    or (self.dt_out>0 and abs(tcurr % self.dt_out) < 1e-8)):

            comm, rank, root = get_comm_rank_root()
            diff = solcurr - solprev
            res = np.array([
                gpuarray.dot(diff, diff).get(),
                gpuarray.dot(solprev, solprev).get()
            ])

            if rank != root:
                comm.Reduce(res, None, op=get_mpi('sum'), root=root)
            else:
                comm.Reduce(get_mpi('in_place'),
                            res,
                            op=get_mpi('sum'),
                            root=root)
                print("residual at t = ", tcurr, np.sqrt(res[0] / res[1]))

Ejemplo n.º 6

0

Mostrar archivo

    def __init__(self, cfg):
        self.cfg = cfg

        mt = cfg.lookupordefault(msect, 'type', 'classic')
        assert mt in self._meshTypes, "Valid mesh:" + str(
            self._meshTypes.keys())

        self._meshType = self._meshTypes[mt]

        # define 1D mesh (We just need a sorted point distribution)
        xmesh = self._meshType(self)

        # non-dimensionalize the domain
        self._H0 = cfg.lookupfloat(msect, 'H0')
        xmesh /= self._H0

        # number of elements
        self._Ne = len(xmesh) - 1

        # if this is an mpi process with more than one rank
        comm, rank, _ = get_comm_rank_root()
        nranks = comm.size
        perProcNe = self._Ne // nranks
        assert perProcNe * nranks == self._Ne, "Non-uniform number of elements!"
        sidx, eidx = rank * perProcNe, (rank + 1) * perProcNe + 1
        xmesh = xmesh[sidx:eidx]
        self._Ne = perProcNe
        print("Elements per proc", self._Ne)

        # length of the domain
        self._xlo, self._xhi = np.min(xmesh), np.max(xmesh)

        # size of each element
        h = np.diff(xmesh).reshape(-1, 1)
        assert np.min(h) > 1e-8, "Too small element size"

        # jacobian of the mapping from D^{st}=[-1,1] to D
        self._jac, self._invjac = h / 2., 2. / h

        # the primitive mesh points
        self._xmesh = xmesh

Ejemplo n.º 7

0

Mostrar archivo

Archivo: moments.py Proyecto: jaisw7/dgfs1D_gpu

    def __call__(self, dt, tcurr, coeff):
        if abs(self.tout_next - tcurr) > 0.5 * dt:
            return

        # compute the moments
        if self.cfg.dim == 3:
            self._compute_moments_nD(coeff)
        else:
            self._compute_moments_1D(coeff)

        # Write out the file
        fname = self.basename.format(t=tcurr)
        solnfname = os.path.join(self.basedir, fname)
        sol = self._bulksoln.swapaxes(0, 1).reshape(self.leaddim, -1)

        # get the entire information
        comm, rank, root = get_comm_rank_root()
        sol = comm.gather(sol, root=root)
        if rank == root:
            sol = np.vstack(sol)
            np.savetxt(solnfname,
                       np.hstack((self.xsol, sol)),
                       header="t={0} \n {1}".format(tcurr, self.fields),
                       comments="#")

        # Compute the next output time
        self.tout_next = tcurr + self.dt_out

        if 1 == 0 and self.cfg.dim == 2:
            x, y = self.xsol[:, 0], self.xsol[:, 1]
            rho = sol[:, 0]
            import matplotlib.pyplot as plt
            Nq, Ne, _ = self._bulksoln.shape
            Nq, Ne = map(int, (Nq**0.5, Ne**0.5))
            x, y, rho = map(
                lambda v: v.reshape(Ne, Ne, Nq, Nq).swapaxes(1, 2).reshape(
                    Ne * Nq, Ne * Nq), (x, y, rho))
            plt.contourf(x, y, rho)
            plt.savefig('plot_%s.pdf' % (fname, ))

Ejemplo n.º 8

0

Mostrar archivo

Archivo: astd.py Proyecto: jaisw7/dgfs1D_gpu

def main(Ne=None, dt=None):
    # who am I in this world? (Bulleh Shah, 18th century sufi poet)
    comm, rank, root = get_comm_rank_root()

    # read the inputs (from people)
    cfg, args = initialize()
    if Ne is not None: cfg._cp.set('mesh', 'Ne', str(int(Ne)))
    if dt is not None: cfg._cp.set('time-integrator', 'dt', str(float(dt)))
    mesh = Mesh(cfg)

    # define 1D mesh (construct a 1D world view)
    xmesh = mesh.xmesh

    # number of elements (how refined perspectives do we want/have?)
    Ne = mesh.Ne

    # define the basis (what is the basis for those perspectives?)
    bsKind = cfg.lookup('basis', 'kind')
    #assert bsKind == 'nodal-sem-gll', "Only one supported as of now"
    basiscls = subclass_where(Basis, basis_kind=bsKind)
    basis = basiscls(cfg)

    # number of local degrees of freedom (depth/granualirity of perspectives)
    K = basis.K

    # number of solution points (how far can I interpolate my learning)
    Nq = basis.Nq

    # left/right face maps
    Nqf = basis.Nqf  # number of points used in reconstruction at faces
    mapL, mapR = np.arange(Ne + 1) + (Nqf - 1) * Ne - 1, np.arange(Ne + 1)
    mapL[0], mapR[-1] = 0, Ne * Nqf - 1
    Nf = len(mapL)

    # the zeros
    z = basis.z

    # jacobian of the mapping from D^{st}=[-1,1] to D
    jac, invjac = mesh.jac, mesh.invjac

    # load the velocity mesh
    vm = DGFSVelocityMeshStd(cfg)
    Nv = vm.vsize()

    # load the scattering model
    smn = cfg.lookup('scattering-model', 'type')
    scatteringcls = subclass_where(DGFSScatteringModelAstd,
                                   scattering_model=smn)
    sm = scatteringcls(cfg, vm, Ne=Ne)

    # initial time, time step, final time
    ti, dt, tf = cfg.lookupfloats('time-integrator', ('tstart', 'dt', 'tend'))
    nsteps = np.ceil((tf - ti) / dt)
    dt = (tf - ti) / nsteps

    # Compute the location of the solution points
    xsol = np.array(
        [0.5 * (xmesh[j] + xmesh[j + 1]) + jac[j] * z for j in range(Ne)]).T
    xcoeff = np.einsum("kq,qe->ke", basis.fwdTransMat, xsol)

    # Determine the grid/block
    NeNv = Ne * Nv
    KNeNv = K * Ne * Nv
    NqNeNv = Nq * Ne * Nv
    NqfNeNv = Nqf * Ne * Nv
    NfNv = Nf * Nv
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, Nv)
    grid_NeNv = get_grid_for_block(block, Ne * Nv)
    grid_KNeNv = get_grid_for_block(block, K * Ne * Nv)

    # operator generator for matrix operations
    matOpGen = lambda v: lambda arg0, arg1: v.prepared_call(
        grid_NeNv, block, NeNv, arg0.ptr, NeNv, arg1.ptr, NeNv)

    # forward trans, backward, backward (at faces), derivative kernels
    fwdTrans_Op, bwdTrans_Op, bwdTransFace_Op, deriv_Op, invMass_Op, \
        computeCellAvg_Op, extractDrLin_Op = map(
        matOpGen, (basis.fwdTransOp, basis.bwdTransOp,
            basis.bwdTransFaceOp, basis.derivOp, basis.invMassOp,
            basis.computeCellAvgKern, basis.extractDrLinKern)
    )

    # U, V operator kernels
    trans_U_Op = tuple(map(matOpGen, basis.uTransOps))
    trans_V_Op = tuple(map(matOpGen, basis.vTransOps))

    # prepare the kernel for extracting face/interface values
    dfltargs = dict(K=K,
                    Ne=Ne,
                    Nq=Nq,
                    vsize=Nv,
                    dtype=cfg.dtypename,
                    mapL=mapL,
                    mapR=mapR,
                    offsetL=0,
                    offsetR=len(mapR) - 1,
                    invjac=invjac,
                    gRD=basis.gRD,
                    gLD=basis.gLD,
                    xsol=xsol)
    kernsrc = DottedTemplateLookup('dgfs1D.std.kernels',
                                   dfltargs).get_template('std').render()
    kernmod = compiler.SourceModule(kernsrc)

    dfltargs.update(nalph=sm.nalph, Dr=basis.derivMat)
    kernlimssrc = DottedTemplateLookup(
        'dgfs1D.astd.kernels', dfltargs).get_template('limiters').render()
    kernlimsmod = compiler.SourceModule(kernlimssrc)

    # prepare operators for execution (see std.mako for description)
    (extLeft_Op, extRight_Op, transferBC_L_Op, transferBC_R_Op, insertBC_L_Op,
     insertBC_R_Op) = map(
         lambda v: lambda *args: get_kernel(kernmod, v, 'PP').prepared_call(
             grid_Nv, block, *list(map(lambda c: c.ptr, args))),
         ("extract_left", "extract_right", "transfer_bc_left",
          "transfer_bc_right", "insert_bc_left", "insert_bc_right"))

    # The boundary conditions (by default all boundaries are processor bnds)
    bcl_type, bcr_type = 'dgfs-periodic', 'dgfs-periodic'

    # the mesh is decomposed in linear fashion, so rank 0 gets left boundary
    if rank == 0: bcl_type = cfg.lookup('soln-bcs-xlo', 'type')

    # and the last rank comm.size-1 gets the right boundary
    if rank == comm.size - 1: bcr_type = cfg.lookup('soln-bcs-xhi', 'type')

    # prepare kernels for left boundary
    bcl_cls = subclass_where(DGFSBCStd, type=bcl_type)
    bcl = bcl_cls(xmesh[0], -1., vm, cfg, 'soln-bcs-xlo')
    updateBC_L_Op = bcl.updateBCKern
    applyBC_L_Op = bcl.applyBCKern

    # prepare kernels for right boundary
    bcr_cls = subclass_where(DGFSBCStd, type=bcr_type)
    bcr = bcr_cls(xmesh[-1], 1., vm, cfg, 'soln-bcs-xhi')
    updateBC_R_Op = bcr.updateBCKern
    applyBC_R_Op = bcr.applyBCKern

    #if bcl_type == 'dgfs-cyclic' or bcr_type == 'dgfs-cyclic':
    #    assert(bcl_type==bcr_type);

    # flux kernel
    flux = get_kernel(kernmod, "flux", 'PPPPP')
    flux_Op = lambda d_uL, d_uR, d_jL, d_jR: flux.prepared_call(
        grid_Nv, block, d_uL.ptr, d_uR.ptr,
        vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # multiply the derivative by the advection velocity
    mulbyadv = get_kernel(kernmod, "mul_by_adv", 'PP')
    mulbyadv_Op = lambda d_ux: mulbyadv.prepared_call(grid_KNeNv, block,
                                                      vm.d_cvx().ptr, d_ux.ptr)

    # multiply the coefficient by the inverse jacobian
    mulbyinvjac = get_kernel(kernmod, "mul_by_invjac", 'P')
    mulbyinvjac_Op = lambda d_ux: mulbyinvjac.prepared_call(
        grid_Nv, block, d_ux.ptr)

    # \alpha AX + \beta Y kernel (for operations on coefficients)
    axnpbyCoeff = get_axnpby_kerns(2, range(K), NeNv, cfg.dtype)
    axnpbyCoeff_Op = lambda a0, x0, a1, x1: axnpbyCoeff.prepared_call(
        grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # total flux kernel (sums up surface and volume terms)
    totalFlux = get_kernel(kernmod, "totalFlux", 'PPPP')
    totalFlux_Op = lambda d_ux, d_jL, d_jR: totalFlux.prepared_call(
        grid_Nv, block, d_ux.ptr,
        vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # linear limiter
    limitLin = get_kernel(kernlimsmod, "limitLin", 'PPPP')
    limitLin_Op = lambda d_u, d_ulx, d_uavg, d_ulim: \
        limitLin.prepared_call(grid_Nv, block, d_u.ptr, d_ulx.ptr,
            d_uavg.ptr, d_ulim.ptr)

    # allocations on gpu
    d_usol = gpuarray.empty(NqNeNv, dtype=cfg.dtype)
    d_usolF = gpuarray.empty(NqfNeNv, dtype=cfg.dtype)
    d_uL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_uR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jL = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_jR = gpuarray.empty(NfNv, dtype=cfg.dtype)
    d_bcL = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcR = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_bcT = gpuarray.empty(Nv, dtype=cfg.dtype)
    d_ux = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_f = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_g = gpuarray.empty(KNeNv, dtype=cfg.dtype)

    d_ucoeff = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_ucoeffPrev = gpuarray.empty_like(d_ucoeff)

    # check if this is a new run
    if hasattr(args, 'process_run'):
        usol = np.empty((Nq, Ne, Nv), dtype=cfg.dtype)  # temporary storage

        # load the initial condition model
        icn = cfg.lookup('soln-ics', 'type')
        initcondcls = subclass_where(DGFSInitConditionStd, model=icn)
        ic = initcondcls(cfg, vm, 'soln-ics')
        ic.apply_init_vals(usol, Nq, Ne, xsol, mesh=mesh, basis=basis, sm=sm)

        # transfer the information to the gpu
        d_usol.set(usol.ravel())

        # forward transform to coefficient space
        fwdTrans_Op(d_usol, d_ucoeff)

    # check if we are restarting
    if hasattr(args, 'process_restart'):
        import h5py as h5py
        check(len(args.dist[0]) == comm.size, "No. of distributions != nranks")
        with h5py.File(args.dist[0][rank].name, 'r') as h5f:
            dst = h5f['coeff']
            ti = dst.attrs['time']
            d_ucoeff.set(dst[:])
            check(dst.attrs['K'] == K, "Inconsistent distribution K")
            check(dst.attrs['Ne'] == Ne, "Inconsistent distribution Ne")
            check(dst.attrs['Nv'] == Nv, "Inconsistent distribution N")

        # backward transform to solution space
        bwdTrans_Op(d_ucoeff, d_usol)

    # prepare the post-processing handlers
    # For computing moments
    moments = DGFSMomWriterStd(ti, basis.interpMat, xcoeff, d_ucoeff, vm, cfg,
                               'dgfsmomwriter')

    # For computing residual
    residual = DGFSResidualStd(cfg, 'dgfsresidual')

    # For writing distribution function
    distribution = DGFSDistributionStd(ti, (K, Ne, Nv), cfg, 'dgfsdistwriter')

    # Actual algorithm

    # initialize
    axnpbyCoeff_Op(0., d_ucoeffPrev, 1., d_ucoeff)
    sigModes = basis.sigModes

    # define the neighbours
    from mpi4py import MPI
    down_nbr, up_nbr = comm.rank - 1, comm.rank + 1
    if up_nbr >= comm.size: up_nbr = MPI.PROC_NULL
    if down_nbr < 0: down_nbr = MPI.PROC_NULL

    # define the explicit part
    def explicit(time, d_ucoeff_in, d_ucoeff_out):

        # reconstruct solution at faces
        bwdTransFace_Op(d_ucoeff_in, d_usolF)

        # Step:1 extract the solution at faces
        extLeft_Op(d_usolF, d_uL)
        extRight_Op(d_usolF, d_uR)

        # transfer left boundary information in send buffer
        transferBC_L_Op(d_uL, d_bcL)  # Transfer the left ghost BC info
        transferBC_R_Op(d_uR, d_bcR)  # Transfer the right ghost BC info

        # this can be adjusted in case of RDMA enabled MPI support
        #h_bcL, h_bcR = d_bcL.get(), d_bcR.get()
        #h_bcL, h_bcR = map(lambda v: v.gpudata.as_buffer(v.nbytes),
        #               (d_bcL, d_bcR))

        # send information
        req1 = comm.isend(d_bcR, dest=up_nbr)  # to upstream neighbour
        req2 = comm.isend(d_bcL, dest=down_nbr)  # to downstream neighbour

        # recieve information
        h_bcL = comm.recv(source=down_nbr)  # from downstream neighbour
        h_bcR = comm.recv(source=up_nbr)  # from upstream neighbour
        MPI.Request.Waitall([req1, req2])

        # set information at left, right boundary
        if h_bcL: d_bcL.set(h_bcL)
        else: transferBC_L_Op(d_uL, d_bcL)

        if h_bcR: d_bcR.set(h_bcR)
        else: transferBC_R_Op(d_uR, d_bcR)

        # The physical-periodic boundary condition
        if comm.size == 1 and bcr_type == 'dgfs-cyclic':
            copy(d_bcT, d_bcL)
            copy(d_bcL, d_bcR)
            copy(d_bcR, d_bcT)
        else:
            # At left, receive from right-most communicator; and vice-versa
            req1 = req2 = MPI.REQUEST_NULL
            if bcl_type == 'dgfs-cyclic':
                req1 = comm.isend(d_bcL, dest=comm.size - 1)
            if bcr_type == 'dgfs-cyclic': req2 = comm.isend(d_bcR, dest=0)
            if bcr_type == 'dgfs-cyclic': h_bcR = comm.recv(source=0)
            if bcl_type == 'dgfs-cyclic':
                h_bcL = comm.recv(source=comm.size - 1)
            MPI.Request.Waitall([req1, req2])
            if bcl_type == 'dgfs-cyclic': d_bcL.set(h_bcL)
            elif bcr_type == 'dgfs-cyclic': d_bcR.set(h_bcR)

        # At left boundary
        #transferBC_L_Op(d_uL, d_bcL)       # Transfer the ghost BC info
        updateBC_L_Op(d_bcL, time)  # now update boundary info
        applyBC_L_Op(d_bcL, d_bcT, time)  # apply boundary condition
        insertBC_L_Op(d_bcT, d_uL)  # insert info to global face-flux

        # At right boundary
        #transferBC_R_Op(d_uR, d_bcL)       # Transfer the ghost BC info
        updateBC_R_Op(d_bcR, time)  # now update boundary info
        applyBC_R_Op(d_bcR, d_bcT, time)  # apply boundary condition
        insertBC_R_Op(d_bcT, d_uR)  # insert info to global face-flux

        # Step:2 Compute the flux and jumps (all operations in single call)
        #fL, fR = cvx*uL, cvx*uR
        #fupw = 0.5*(fL + fR) + 0.5*np.abs(cvx)*(uL - uR)
        #jL = fupw - fL  # Compute the jump at left boundary
        #jR = fupw - fR  # Compute the jump at right boundary
        flux_Op(d_uL, d_uR, d_jL, d_jR)

        # Step:3 evaluate the derivative
        # ux = -cvx*np.einsum("ml,em->el", Sx, ucoeff)
        deriv_Op(d_ucoeff_in, d_ux)
        mulbyadv_Op(d_ux)

        # Compute the continuous flux for each element in strong form
        totalFlux_Op(d_ux, d_jL, d_jR)

        # multiply by the inverse jacobian
        # Now we have f* = d_ux
        mulbyinvjac_Op(d_ux)

        # project back to coefficient space
        invMass_Op(d_ux, d_ucoeff_out)

    d_uavg, d_ulx = map(gpuarray.empty_like, [d_ucoeff] * 2)

    def limit(d_ucoeff_in, d_ucoeff_out):
        assert comm.size == 1, "Not implemented"
        #assert basis.basis_kind == 'nodal-sem-gll', "Not implemented"

        # Extract the cell average
        computeCellAvg_Op(d_ucoeff_in, d_uavg)

        # extract gradient of the linear polynomial
        extractDrLin_Op(d_ucoeff_in, d_ulx)
        mulbyinvjac_Op(d_ulx)

        # limit functions in all cells
        limitLin_Op(d_ucoeff_in, d_ulx, d_uavg, d_ucoeff_out)

    # define a time-integrator (we use Euler scheme: good enough for steady)
    odestype = cfg.lookup('time-integrator', 'scheme')
    odescls = subclass_where(DGFSIntegratorAstd, intg_kind=odestype)
    limitOn = cfg.lookupordefault('time-integrator', 'limiter', 0)

    # Finally start everything
    time = ti  # initialize time in case of restart
    nacptsteps = 0  # number of elasped steps in the current run

    # initialize ode: this performs pre-integration for multi-step schemes
    odes = odescls(explicit,
                   sm, (K, Ne, Nv),
                   cfg.dtype,
                   t=time,
                   dt=dt,
                   f0=d_ucoeff)

    # start timer
    start = timer()
    while (time < tf):

        # March in time
        odes.integrate(time, dt, nacptsteps, d_ucoeff)
        if limitOn: limit(d_ucoeff, d_ucoeff)

        # increment time
        time += dt
        nacptsteps += 1

        # Final step: post processing routines
        residual(time, nacptsteps, d_ucoeff, d_ucoeffPrev)
        moments(dt, time, d_ucoeff)
        distribution(dt, time, d_ucoeff)

        # copy the solution for the next time step
        cuda.memcpy_dtod(d_ucoeffPrev.ptr, d_ucoeff.ptr, d_ucoeff.nbytes)

    # print elasped time
    end = timer()
    elapsed = np.array([end - start])
    if rank != root: comm.Reduce(elapsed, None, op=get_mpi('sum'), root=root)
    else:
        comm.Reduce(get_mpi('in_place'), elapsed, op=get_mpi('sum'), root=root)
        avgtime = elapsed[0] / comm.size
        print("Nsteps", nacptsteps, ", elapsed time", avgtime, "s")

    return d_ucoeff, mesh, vm, basis

Ejemplo n.º 9

0

Mostrar archivo

Archivo: moments.py Proyecto: jaisw7/dgfs1D_gpu

    def __init__(self,
                 tcurr,
                 im,
                 xcoeff,
                 coeff,
                 vm,
                 cfg,
                 cfgsect,
                 suffix=None,
                 extn='.txt'):

        self.vm = vm
        self.cfg = cfg

        # Construct the solution writer
        self.basedir = cfg.lookuppath(cfgsect, 'basedir', '.', abs=True)
        self.basename = cfg.lookup(cfgsect, 'basename')
        if not self.basename.endswith(extn):
            self.basename += extn

        # these variables are computed
        privarmap = [
            'rho', 'U:x', 'U:y', 'T', 'Q:x', 'Q:y', 'P:xx', 'P:yy', 'P:xy', 'p'
        ]
        if cfg.dim == 3:
            privarmap = [
                'rho', 'U:x', 'U:y', 'U:z', 'T', 'Q:x', 'Q:y', 'Q:z', 'P:xx',
                'P:xy', 'P:xz', 'P:yy', 'P:yz', 'P:zz', 'p'
            ]
        Ns = len(privarmap)
        privarmap = ["x" + str(v) for v in range(cfg.dim)] + privarmap
        self.fields = ", ".join(privarmap)

        # Output time step and next output time
        self.dt_out = cfg.lookupfloat(cfgsect, 'dt-out')
        self.tout_next = tcurr

        # get info
        #_, Ne = xcoeff.shape
        Ne = xcoeff.shape[1]

        # define the interpolation operator
        K, Nqr = im.shape
        self.im = im

        # size of the output
        self.leaddim = Nqr * Ne

        #self.xsol = np.einsum('mr,mej->rej', self.im, xcoeff)
        self.xsol = (np.matmul(self.im.T, xcoeff.reshape(K, -1)).reshape(
            Nqr, Ne, -1)) * vm.H0()
        #self.xsol = self.xsol.reshape((-1,1))*vm.H0()
        #print(self.xsol[:,0,:], self.im)
        #exit()

        # get the entire information
        comm, rank, root = get_comm_rank_root()
        self.xsol = self.xsol.swapaxes(0, 1).reshape(self.leaddim, -1)
        self.xsol = comm.gather(self.xsol, root=root)
        if rank == root: self.xsol = np.vstack(self.xsol)

        # allocate variables
        self._bulksoln = np.empty((Nqr, Ne, Ns))

        # helps us to compute moments from restart file
        self(1e-10, tcurr, coeff)

Ejemplo n.º 10

0

Mostrar archivo

Archivo: distributionslice.py Proyecto: jaisw7/dgfs1D_gpu

    def __init__(self, tcurr, shape, im, vm, cfg, cfgsect, extn='.txt'):

        self.K, self.Ne, _ = shape
        self.vm = vm
        self.Nv = self.vm.Nv()

        # Construct the solution writer
        self.basedir = cfg.lookuppath(cfgsect, 'basedir', '.', abs=True)
        self.basename = cfg.lookup(cfgsect, 'basename')
        if not self.basename.endswith(extn):
            self.basename += extn

        # Output time step and next output time
        self.dt_out = cfg.lookupfloat(cfgsect, 'dt-out')
        self.tout_next = tcurr + self.dt_out

        # define the interpolation operator
        _, self.Nqr = im.shape
        self.im = im

        # Find total Ne
        comm, rank, _ = get_comm_rank_root()
        nranks = comm.size
        self.Ne_t = self.Ne * nranks

        # l on the current velocity mesh
        self.l = self.get_cl(self.Nv)

        # l on the reconstructed mesh
        self.Nvr = cfg.lookupordefault(cfgsect, 'Nvr', self.Nv)
        self.lr = self.get_cl(self.Nvr)

        # The spatial location where the slice is to be obtained
        self.Ne_out = cfg.lookupint(cfgsect, 'Ne-out')
        check(self.Ne_out < self.Ne_t, "Ne_out should be within [0, Ne)")

        self.Nqr_out = cfg.lookupint(cfgsect, 'Nqr-out')
        check(self.Nqr_out < self.Nqr, "Ne_out should be within [0, Nqr)")

        self.Nvx_out = cfg.lookupordefault(cfgsect, 'Nvx-out', -1)
        self.Nvy_out = cfg.lookupordefault(cfgsect, 'Nvy-out', -1)
        self.Nvz_out = cfg.lookupordefault(cfgsect, 'Nvz-out', -1)

        if self.Nvx_out == -1 and self.Nvy_out == -1 and self.Nvz_out == -1:
            raise ValueError(
                '''2 of the three variables (Nvx-out, Nvy-out, Nvz-out)
                should be provided''')

        if self.Nvx_out != -1 and self.Nvy_out != -1 and self.Nvz_out != -1:
            raise ValueError(
                '''Exactly 2 of the three variables (Nvx-out, Nvy-out, Nvz-out)
                should be provided''')

        # On which processor does this spatial location lie?
        self._thisRank = self.Ne_out // self.Ne

        # allocate variables
        if self.Nvx_out == -1:
            check(self.Nvy_out < self.Nv and self.Nvy_out >= 0,
                  "Nvy-out should be between [0, Ne)")
            check(self.Nvz_out < self.Nv and self.Nvz_out >= 0,
                  "Nvz-out should be between [0, Ne)")
            self._vslice = self.vm.cv()[0].reshape(
                (self.Nv, self.Nv, self.Nv))[:, self.Nvy_out, self.Nvz_out]

        elif self.Nvy_out == -1:
            check(self.Nvx_out < self.Ne_t and self.Nvx_out >= 0,
                  "Nvx-out should be between [0, Ne)")
            check(self.Nvz_out < self.Ne_t and self.Nvz_out >= 0,
                  "Nvz-out should be between [0, Ne)")
            self._vslice = self.vm.cv()[1].reshape(
                (self.Nv, self.Nv, self.Nv))[self.Nvx_out, :, self.Nvz_out]

        elif self.Nvz_out == -1:
            check(self.Nvx_out < self.Ne_t and self.Nvx_out >= 0,
                  "Nvx-out should be between [0, Ne)")
            check(self.Nvy_out < self.Ne_t and self.Nvy_out >= 0,
                  "Nvy-out should be between [0, Ne)")
            self._vslice = self.vm.cv()[2].reshape(
                (self.Nv, self.Nv, self.Nv))[self.Nvx_out, self.Nvy_out, :]

Ejemplo n.º 11

0

Mostrar archivo

def main():
    # who am I in this world? (Bulleh Shah, 18th century sufi poet)
    comm, rank, root = get_comm_rank_root()

    # read the inputs (from people)
    cfg, args = initialize()
    mesh = Mesh(cfg)

    # define 1D mesh (construct a 1D world view)
    xmesh = mesh.xmesh

    # number of elements (how refined perspectives do we want/have?)
    Ne = mesh.Ne

    # define the basis (what is the basis for those perspectives?)
    bsKind = cfg.lookup('basis', 'kind')
    basiscls = subclass_where(Basis, basis_kind=bsKind)
    basis = basiscls(cfg)

    # number of local degrees of freedom (depth/granualirity of perspectives)
    K = basis.K

    # number of solution points (how far can I interpolate my learning)
    Nq = basis.Nq

    # left/right face maps
    Nqf = basis.Nqf  # number of points used in reconstruction at faces
    mapL, mapR = np.arange(Ne+1)+(Nqf-1)*Ne-1, np.arange(Ne+1)
    mapL[0], mapR[-1] = 0, Ne*Nqf-1
    Nf = len(mapL)

    # the zeros
    z = basis.z
    
    # jacobian of the mapping from D^{st}=[-1,1] to D
    jac, invjac = mesh.jac, mesh.invjac

    # load the velocity mesh
    vm = DGFSVelocityMeshBi(cfg)
    Nv = vm.vsize()

    # load the scattering model
    smn = cfg.lookup('scattering-model', 'type')
    scatteringcls = subclass_where(DGFSScatteringModelBi, 
        scattering_model=smn)
    sm = scatteringcls(cfg, vm, Ne=Ne)

    # initial time, time step, final time
    ti, dt, tf = cfg.lookupfloats('time-integrator', ('tstart', 'dt', 'tend'))
    nsteps = np.ceil((tf - ti)/dt)
    dt = (tf - ti)/nsteps

    # Compute the location of the solution points 
    xsol = np.array([0.5*(xmesh[j]+xmesh[j+1])+jac[j]*z for j in range(Ne)]).T
    xcoeff = np.einsum("kq,qe->ke", basis.fwdTransMat, xsol)

    # Determine the grid/block
    NeNv = Ne*Nv
    KNeNv = K*Ne*Nv
    NqNeNv = Nq*Ne*Nv
    NqfNeNv = Nqf*Ne*Nv
    NfNv = Nf*Nv
    block = (128, 1, 1)
    grid_Nv = get_grid_for_block(block, Nv)
    grid_NeNv = get_grid_for_block(block, Ne*Nv)
    grid_KNeNv = get_grid_for_block(block, K*Ne*Nv)

    # operator generator for matrix operations
    matOpGen = lambda v: lambda arg0, arg1: v.prepared_call(
                grid_NeNv, block, NeNv, arg0.ptr, NeNv, arg1.ptr, NeNv)
    
    # forward trans, backward, backward (at faces), derivative kernels
    fwdTrans_Op, bwdTrans_Op, bwdTransFace_Op, deriv_Op, invMass_Op = map(
        matOpGen, (basis.fwdTransOp, basis.bwdTransOp, 
            basis.bwdTransFaceOp, basis.derivOp, basis.invMassOp)
    )

    # U, V operator kernels
    trans_U_Op = tuple(map(matOpGen, basis.uTransOps))
    trans_V_Op = tuple(map(matOpGen, basis.vTransOps))

    # prepare the kernel for extracting face/interface values
    dfltargs = dict(
        K=K, Ne=Ne, Nq=Nq, vsize=Nv, dtype=cfg.dtypename,
        mapL=mapL, mapR=mapR, offsetL=0, offsetR=len(mapR)-1,
        invjac=invjac, gRD=basis.gRD, gLD=basis.gLD)
    kernsrc = DottedTemplateLookup('dgfs1D.bi.kernels', 
                                    dfltargs).get_template('bi').render()
    kernmod = compiler.SourceModule(kernsrc)

    # prepare operators for execution (see bi.mako for description)
    (extLeft_Op, extRight_Op, transferBC_L_Op, transferBC_R_Op, 
        insertBC_L_Op, insertBC_R_Op) = map(lambda v: 
        lambda *args: get_kernel(kernmod, v, 'PP').prepared_call(
            grid_Nv, block, *list(map(lambda c: c.ptr, args))
        ), ("extract_left", "extract_right", "transfer_bc_left", 
            "transfer_bc_right", "insert_bc_left", "insert_bc_right")
    )

    # The boundary conditions (by default all boundaries are processor bnds)
    bcl_type, bcr_type = 'dgfs-periodic', 'dgfs-periodic'

    # the mesh is decomposed in linear fashion, so rank 0 gets left boundary
    if rank==0: bcl_type = cfg.lookup('soln-bcs-xlo', 'type')

    # and the last rank comm.size-1 gets the right boundary
    if rank==comm.size-1:  bcr_type = cfg.lookup('soln-bcs-xhi', 'type')
    
    # prepare kernels for left boundary    
    bcl_cls = subclass_where(DGFSBCBi, type=bcl_type)
    bcl = bcl_cls(xmesh[0], -1., vm, cfg, 'soln-bcs-xlo')
    updateBC_L_Op = bcl.updateBCKern
    applyBC_L_Op = bcl.applyBCKern
    
    # prepare kernels for right boundary
    bcr_cls = subclass_where(DGFSBCBi, type=bcr_type)
    bcr = bcr_cls(xmesh[-1], 1., vm, cfg, 'soln-bcs-xhi')
    updateBC_R_Op = bcr.updateBCKern
    applyBC_R_Op = bcr.applyBCKern

    # flux kernel
    flux = get_kernel(kernmod, "flux", 'PPPPP')
    flux_Op = lambda d_uL, d_uR, d_jL, d_jR: flux.prepared_call(
            grid_Nv, block, 
            d_uL.ptr, d_uR.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # multiply the derivative by the advection velocity
    mulbyadv = get_kernel(kernmod, "mul_by_adv", 'PP')
    mulbyadv_Op = lambda d_ux: mulbyadv.prepared_call(
                    grid_KNeNv, block, vm.d_cvx().ptr, d_ux.ptr)

    # multiply the coefficient by the inverse jacobian
    mulbyinvjac = get_kernel(kernmod, "mul_by_invjac", 'P')
    mulbyinvjac_Op = lambda d_ux: mulbyinvjac.prepared_call(
                    grid_Nv, block, d_ux.ptr)

    # \alpha AX + \beta Y kernel (for operations on coefficients)
    axnpbyCoeff = get_axnpby_kerns(2, range(K), NeNv, cfg.dtype)
    axnpbyCoeff_Op = lambda a0, x0, a1, x1: axnpbyCoeff.prepared_call(
                    grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # \alpha AX + \beta Y kernel (for operations on physical solutions)
    axnpbySol = get_axnpby_kerns(2, range(Nq), NeNv, cfg.dtype)
    axnpbySol_Op = lambda a0, x0, a1, x1: axnpbySol.prepared_call(
                    grid_NeNv, block, x0.ptr, x1.ptr, a0, a1)

    # total flux kernel (sums up surface and volume terms)
    totalFlux = get_kernel(kernmod, "totalFlux", 'PPPP')
    totalFlux_Op = lambda d_ux, d_jL, d_jR: totalFlux.prepared_call(
            grid_Nv, block, d_ux.ptr, vm.d_cvx().ptr, d_jL.ptr, d_jR.ptr)

    # allocations on gpu
    d_usol = gpuarray.empty(NqNeNv, dtype=cfg.dtype)
    d_usolF = gpuarray.empty(NqfNeNv, dtype=cfg.dtype)
    d_uL = gpuarray.empty(NfNv, dtype=cfg.dtype) 
    d_uR = gpuarray.empty(NfNv, dtype=cfg.dtype) 
    d_jL = gpuarray.empty(NfNv, dtype=cfg.dtype) 
    d_jR = gpuarray.empty(NfNv, dtype=cfg.dtype) 
    d_bcL = gpuarray.empty(Nv, dtype=cfg.dtype) 
    d_bcR = gpuarray.empty(Nv, dtype=cfg.dtype) 
    d_bcT = gpuarray.empty(Nv, dtype=cfg.dtype) 
    d_ux = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_f = gpuarray.empty(KNeNv, dtype=cfg.dtype)
    d_g = gpuarray.empty(KNeNv, dtype=cfg.dtype)

    d_ucoeffs = [gpuarray.empty_like(d_ux) for p in range(vm.nspcs())]
    d_ucoeffPrevs = [gpuarray.empty_like(d_ux) for p in range(vm.nspcs())]

    # check if this is a new run
    if hasattr(args, 'process_run'):
        usol = np.empty((Nq, Ne, Nv), dtype=cfg.dtype)  # temporary storage

        # load the initial condition model
        icn = cfg.lookup('soln-ics', 'type')
        initcondcls = subclass_where(DGFSInitConditionBi, model=icn)
        ic = initcondcls(cfg, vm, 'soln-ics')
        
        for p in range(vm.nspcs()):
            ic.apply_init_vals(p, usol, Nq, Ne, xsol)

            # transfer the information to the gpu
            d_usol.set(usol.ravel())

            # forward transform to coefficient space
            fwdTrans_Op(d_usol, d_ucoeffs[p])

    # check if we are restarting
    if hasattr(args, 'process_restart'):
        import h5py as h5py
        check(len(args.dist[0])==comm.size, "No. of distributions != nranks")
        with h5py.File(args.dist[0][rank].name, 'r') as h5f:
            for p, d_ucoeff in enumerate(d_ucoeffs): 
                dst = h5f['coeff'+str(p)]
                ti = dst.attrs['time']
                d_ucoeff.set(dst[:])
                check(dst.attrs['K']==K, "Inconsistent distribution K")
                check(dst.attrs['Ne']==Ne, "Inconsistent distribution Ne")
                check(dst.attrs['Nv']==Nv, "Inconsistent distribution N")

                # backward transform to solution space
                #bwdTrans_Op(d_ucoeff, d_usol)
    
    # prepare the post-processing handlers    
    # For computing moments
    moments = DGFSMomWriterBi(ti, basis.interpMat, xcoeff, d_ucoeffs, vm, cfg, 
        'dgfsmomwriter')

    # For computing residual
    residual = DGFSResidualBi(cfg, 'dgfsresidual')

    # For writing distribution function
    distribution = DGFSDistributionBi(ti, (K, Ne, Nv), cfg, 
        'dgfsdistwriter')

    # Actual algorithm
    # allocation for time integrators

    # initialize
    for p in range(vm.nspcs()):
        axnpbyCoeff_Op(0., d_ucoeffPrevs[p], 1., d_ucoeffs[p])
    sigModes = basis.sigModes

    # define the neighbours
    from mpi4py import MPI
    down_nbr, up_nbr = comm.rank - 1, comm.rank + 1;
    if up_nbr >= comm.size: up_nbr = MPI.PROC_NULL
    if down_nbr < 0: down_nbr = MPI.PROC_NULL

    # define the ode rhs    
    def rhs(p, time, d_ucoeffs_in, d_ucoeff_out):

        # reconstruct solution at faces
        bwdTransFace_Op(d_ucoeffs_in[p], d_usolF)

        # Step:1 extract the solution at faces
        extLeft_Op(d_usolF, d_uL)
        extRight_Op(d_usolF, d_uR)
        
        # transfer left boundary information in send buffer
        transferBC_L_Op(d_uL, d_bcL)       # Transfer the left ghost BC info
        transferBC_R_Op(d_uR, d_bcR)       # Transfer the right ghost BC info

        # this can be adjusted in case of RDMA enabled MPI support
        h_bcL, h_bcR = d_bcL.get(), d_bcR.get()
        #h_bcL, h_bcR = map(lambda v: v.gpudata.as_buffer(v.nbytes), 
        #               (d_bcL, d_bcR))
        
        # send information
        req1 = comm.isend(d_bcR, dest=up_nbr)  # to upstream neighbour
        req2 = comm.isend(d_bcL, dest=down_nbr)  # to downstream neighbour 

        # recieve information
        h_bcL = comm.recv(source=down_nbr)  # from downstream neighbour
        h_bcR = comm.recv(source=up_nbr)    # from upstream neighbour
        MPI.Request.Waitall([req1, req2])
        
        # set information at left boundary
        if h_bcL:
            d_bcL.set(h_bcL)
        else:
            transferBC_L_Op(d_uL, d_bcL)  # Transfer the ghost BC info
        
        # set information at right boundary
        if h_bcR:
            d_bcR.set(h_bcR)
        else:
            transferBC_R_Op(d_uR, d_bcR)  # Transfer the ghost BC info

        # At left boundary        
        #transferBC_L_Op(d_uL, d_bcL)       # Transfer the ghost BC info
        updateBC_L_Op[p](d_bcL, time)         # now update boundary info 
        applyBC_L_Op[p](d_bcL, d_bcT, time)   # apply boundary condition
        insertBC_L_Op(d_bcT, d_uL)         # insert info to global face-flux

        # At right boundary        
        #transferBC_R_Op(d_uR, d_bcL)       # Transfer the ghost BC info
        updateBC_R_Op[p](d_bcR, time)         # now update boundary info 
        applyBC_R_Op[p](d_bcR, d_bcT, time)   # apply boundary condition
        insertBC_R_Op(d_bcT, d_uR)         # insert info to global face-flux

        # Step:2 Compute the flux and jumps (all operations in single call)
        #fL, fR = cvx*uL, cvx*uR
        #fupw = 0.5*(fL + fR) + 0.5*np.abs(cvx)*(uL - uR)
        #jL = fupw - fL  # Compute the jump at left boundary
        #jR = fupw - fR  # Compute the jump at right boundary
        flux_Op(d_uL, d_uR, d_jL, d_jR)

        # Step:3 evaluate the derivative 
        # ux = -cvx*np.einsum("ml,em->el", Sx, ucoeff)
        deriv_Op(d_ucoeffs_in[p], d_ux)
        mulbyadv_Op(d_ux)

        # Compute the continuous flux for each element in strong form
        totalFlux_Op(d_ux, d_jL, d_jR)
        
        # multiply by the inverse jacobian
        mulbyinvjac_Op(d_ux)

        # Step:4 Add collision kernel contribution
        #ux += Q(\sum U^{m}_{ar} ucoeff_{aej}, \sum V^{m}_{ra} ucoeff_{aej})
        cases = [str(p)+str(q) for q in range(vm.nspcs())]
        for m in range(K):
            trans_U_Op[m](d_ucoeffs_in[p], d_f)
            for q in range(vm.nspcs()):            
                trans_V_Op[m](d_ucoeffs_in[q], d_g)
                for r, e in it.product(sigModes[m], range(Ne)):
                    sm.fs(cases[q], d_f, d_g, d_ux, e, r, m)
        
        #for q in range(vm.nspcs()):            
        #  for r, e in it.product(range(K), range(Ne)):
        #    sm.fs(cases[q], d_ucoeffs_in[p], d_ucoeffs_in[q], d_ux, e, r, r)

        # Step:5 Multiply by inverse mass matrix
        invMass_Op(d_ux, d_ucoeff_out)

        
    # define a time-integrator
    odestype = cfg.lookup('time-integrator', 'scheme')
    odescls = subclass_where(DGFSIntegratorBi, intg_kind=odestype)
    odes = odescls(rhs, (K, Ne, Nv), cfg.dtype, vm.nspcs())

    # Finally start everything
    time = ti  # initialize time in case of restart
    nacptsteps = 0 # number of elasped steps in the current run

    # start timer
    start = timer()

    while(time < tf):

        # March in time 
        odes.integrate(time, dt, d_ucoeffs)

        # increment time
        time += dt 
        nacptsteps += 1

        # Final step: post processing routines
        residual(time, nacptsteps, d_ucoeffPrevs, d_ucoeffs)
        moments(dt, time, d_ucoeffs)
        distribution(dt, time, d_ucoeffs)

        # copy the solution for the next time step
        for p in range(vm.nspcs()):
            cuda.memcpy_dtod(d_ucoeffPrevs[p].ptr, d_ucoeffs[p].ptr, 
                d_ucoeffs[p].nbytes)

    # print elasped time
    end = timer()
    elapsed = np.array([end - start])
    if rank==root:
        comm.Allreduce(get_mpi('in_place'), elapsed, op=get_mpi('sum'))
        avgtime = elapsed[0]/comm.size
        print("Nsteps", nacptsteps, ", elapsed time", avgtime, "s")