Example #1
    def constructMaxwellian(self, U, M, Ut):
        nupts, ldim, _ = M.traits
        nvars, neles = M.ioshape[1:]

        grid = get_grid_for_block(self.block, U.ioshape[0])
        self.momentNormKern.prepared_call(grid, self.block, U.ioshape[0], U,
                                          Ut)

        grid = get_grid_for_block(self.block, nupts * nvars * neles)
        self.cmaxwellianKern.prepared_call(grid, self.block, nupts, ldim,
                                           nvars, neles, self.d_cvx.ptr,
                                           self.d_cvy.ptr, self.d_cvz.ptr, M,
                                           Ut)
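
Every example here sizes its launch with get_grid_for_block. Its definition is not shown in these snippets; a minimal sketch of what such a helper presumably computes, assuming a grid rounded up to cover all requested work items with the given thread block, is:

# Hypothetical sketch only; the backend's actual helper may differ in detail.
def get_grid_for_block(block, nrow, ncol=1):
    # Round each dimension up to a whole number of thread blocks
    return (int((nrow + (-nrow % block[0])) // block[0]),
            int((ncol + (-ncol % block[1])) // block[1]))
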
Example #2
    def swap_axes(self, fin, fout):
        nupts, ldim, _ = fin.traits
        nvars, neles = fout.ioshape[1:]

        grid_swap = get_grid_for_block(self.block, nupts * nvars * neles)
        self.swapKern.prepared_call(grid_swap, self.block, nupts, ldim, nvars,
                                    neles, fin, fout)
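
For context, swapKern and the other kernels invoked in these methods are PyCUDA "prepared" functions. A minimal self-contained sketch of that prepare/prepared_call pattern, using a stand-in copy kernel rather than the actual swap_axes source:

# Illustrative sketch of the prepare/prepared_call pattern these methods rely on;
# the kernel body here is a stand-in, not the real swap_axes kernel.
import numpy as np
import pycuda.autoinit
from pycuda import compiler, gpuarray

src = '''
__global__ void copy(int n, const double* in, double* out)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i];
}
'''
kern = compiler.SourceModule(src).get_function('copy')
kern.prepare('iPP')                      # one int32 argument, then two pointers

d_in = gpuarray.to_gpu(np.arange(8, dtype=np.float64))
d_out = gpuarray.empty_like(d_in)
kern.prepared_call((1, 1), (128, 1, 1), 8, d_in.gpudata, d_out.gpudata)
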
Example #3
    def mul(self, a, b, out, alpha=1.0, beta=0.0):
        # Ensure the matrices are compatible
        if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
            raise ValueError('Incompatible matrices for out = a*b')

        # Check that A is constant
        if 'const' not in a.tags:
            raise NotSuitableError('GiMMiK requires a constant a matrix')

        # Check that A is reasonably sparse
        if np.count_nonzero(a.get()) > self.max_nnz:
            raise NotSuitableError('Matrix too dense for GiMMiK')

        # Generate
        src = generate_mm(a.get(),
                          dtype=a.dtype,
                          platform='cuda',
                          alpha=alpha,
                          beta=beta)

        # Build
        fun = self._build_kernel('gimmik_mm', src, 'iPiPi')

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, b.ncol)

        class MulKernel(ComputeKernel):
            def run(self, queue):
                fun.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                        b.ncol, b, b.leaddim, out, out.leaddim)

        return MulKernel()
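
GiMMiK's generate_mm emits bespoke CUDA source with the constant operator matrix baked into the kernel. A sketch of driving it directly with PyCUDA, mirroring mul() above; the operator matrix here is illustrative:

# Sketch only: assumes GiMMiK's generate_mm API as used in mul() above.
import numpy as np
import pycuda.autoinit
from pycuda import compiler
from gimmik import generate_mm

a = np.eye(8)                                        # constant (and sparse) operator
src = generate_mm(a, dtype=a.dtype, platform='cuda', alpha=1.0, beta=0.0)

# The generated kernel is named 'gimmik_mm' and takes (n, b, ldb, out, ldout),
# matching the 'iPiPi' signature prepared in mul() above
fun = compiler.SourceModule(src).get_function('gimmik_mm')
fun.prepare('iPiPi')
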
Example #4
    def copy_to_reg(self, *arr, subdims=None):
        nrow, ldimr, dtype = arr[1].traits
        _, ldimf, _ = arr[0].traits
        ncolar, ncolb = arr[1].ioshape[1:]
        ncolaf, _ = arr[0].ioshape[1:]

        # Render the kernel template
        src = self.backend.lookup.get_template('copy_dgfs').render(
            subdims=subdims or range(ncolar), ncola0=ncolaf, ncola1=ncolar)

        # Build the kernel
        kern = self._build_kernel('copy_dgfs', src, [np.int32] * 4 +
                                  [np.intp] * 2 + [np.int32] * 4)

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, ncolb, nrow)

        class CopyToDGFSKernel(ComputeKernel):
            def run(self, queue, *consts):
                args = list(arr) + list(consts)

                kern.prepared_call(grid, block, nrow, ncolb, ldimf, ldimr,
                                   *args)

        return CopyToDGFSKernel()
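
The rendered copy_dgfs kernel is not shown here; judging from the template parameters, it presumably copies the selected variables (subdims) from one array into the other. A rough NumPy sketch of that intent, ignoring the backend's padded storage layout and the extra int32 constants passed via *consts:

# Rough reference only; array names and shapes are illustrative.
import numpy as np

def copy_dgfs_reference(dst, src, subdims):
    # dst, src: (nupts, nvars, neles) arrays; subdims: indices of variables to copy
    sd = list(subdims)
    dst[:, sd, :] = src[:, sd, :]
    return dst
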
Example #5
    def axnpby_dgfs_full(self, *arr, subdims=None):
        if any(arr[0].traits != x.traits for x in arr[1:]):
            raise ValueError('Incompatible matrix types')

        nv = len(arr)
        nrow, ldim, dtype = arr[0].traits
        ncola, ncolb = arr[0].ioshape[1:]
        size = nrow * ldim

        # Render the kernel template
        src = self.backend.lookup.get_template('axnpby_dgfs_full').render(
            nv=nv)

        # Build the kernel
        kern = self._build_kernel('axnpby_dgfs_full', src, [np.int32] * 2 +
                                  [np.intp] * nv + [dtype] * nv)

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, size)

        class AxnpbyDGFSFullKernel(ComputeKernel):
            def run(self, queue, *consts):
                args = list(arr) + list(consts)

                # Note: launched synchronously here; previously this used
                # prepared_async_call on queue.cuda_stream_comp
                kern.prepared_call(grid, block, size, *args)

        return AxnpbyDGFSFullKernel()
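
By convention, axnpby-style kernels overwrite the first array with a linear combination of all of them; a NumPy sketch of the element-wise operation the generated kernel presumably performs:

# Reference sketch (not the generated CUDA):
#   arr[0] <- consts[0]*arr[0] + consts[1]*arr[1] + ... + consts[nv-1]*arr[nv-1]
import numpy as np

def axnpby_reference(consts, *arrs):
    acc = sum(c*a for c, a in zip(consts, arrs))
    arrs[0][...] = acc
    return arrs[0]
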
Example #6
    def updateMomentARS(self, dt, *args):
        # the size of args should be 4*q+1 for ARS scheme
        q = (len(args) - 1) // 4
        assert len(args) == 4 * q + 1, "Inconsistency in number of parameters"

        lda = int(args[-1].ioshape[0])
        grid = get_grid_for_block(self.block, lda)
        self.updateMomKernsARS[q - 1].prepared_call(grid, self.block,
                                                    self._prefactor, lda, dt,
                                                    *args)
Example #7
    def updateDistARS(self, dt, *args):
        # the size of args should be 6*q+2 for ARS scheme
        q = (len(args) - 2) // 6
        assert len(args) == 6 * q + 2, "Inconsistency in number of parameters"

        nupts, ldim, _ = args[-1].traits
        nvars, neles = args[-1].ioshape[1:]
        grid = get_grid_for_block(self.block, nupts * nvars * neles)
        self.updateDistKernsARS[q - 1].prepared_call(grid, self.block,
                                                     self._prefactor, nupts,
                                                     ldim, nvars, neles, dt,
                                                     *args)
Example #8
    def updateDistBDF(self, dt, *args):
        # the size of args should be 4*q+5 for BDF scheme
        q = (len(args) - 5) // 4
        assert len(args) == 4 * q + 5, "Inconsistency in number of parameters"

        nupts, ldim, _ = args[1].traits
        nvars, neles = args[1].ioshape[1:]
        grid = get_grid_for_block(self.block, nupts * nvars * neles)
        self.updateDistKernsBDF[q - 1].prepared_call(grid, self.block,
                                                     self._prefactor, nupts,
                                                     ldim, nvars, neles, dt,
                                                     *args)
Example #9
    def pack(self, mv):
        # An exchange view is simply a regular view plus an exchange matrix
        m, v = mv.xchgmat, mv.view

        # Render the kernel template
        src = self.backend.lookup.get_template('pack').render()

        # Build
        kern = self._build_kernel('pack_view', src, 'iiiPPPP')

        # Compute the grid and thread-block size
        block = (128, 1, 1)
        grid = get_grid_for_block(block, v.n)

        # If MPI is CUDA aware then we just need to pack the buffer
        if self.backend.mpitype == 'cuda-aware':

            class PackXchgViewKernel(ComputeKernel):
                def run(self, queue):
                    scomp = queue.cuda_stream_comp

                    # Pack
                    kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                                             v.nvcol, v.basedata, v.mapping,
                                             v.rstrides or 0, m)

        # Otherwise, we need to both pack the buffer and copy it back
        else:
            # Create a CUDA event
            event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

            class PackXchgViewKernel(ComputeKernel):
                def run(self, queue):
                    scomp = queue.cuda_stream_comp
                    scopy = queue.cuda_stream_copy

                    # Pack
                    kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                                             v.nvcol, v.basedata, v.mapping,
                                             v.rstrides or 0, m)

                    # Copy the packed buffer to the host
                    event.record(scomp)
                    scopy.wait_for_event(event)
                    cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)

        return PackXchgViewKernel()
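
The non-CUDA-aware branch orders the device-to-host copy after the pack by recording an event on the compute stream and making the copy stream wait on it. A self-contained PyCUDA sketch of that synchronisation pattern:

# Minimal sketch of the event-based cross-stream ordering used above: work queued
# on scomp completes before the async copy queued on scopy starts.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray

scomp, scopy = cuda.Stream(), cuda.Stream()
event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

d_buf = gpuarray.to_gpu(np.arange(16, dtype=np.float64))
h_buf = cuda.pagelocked_empty(16, np.float64)

# ... packing kernels would be launched on scomp here ...
event.record(scomp)                      # mark completion of the packing work
scopy.wait_for_event(event)              # copy stream waits for that point
cuda.memcpy_dtoh_async(h_buf, d_buf.gpudata, scopy)
scopy.synchronize()
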
Example #10
    def errest(self, x, y, z, *, norm):
        if not (x.traits == y.traits == z.traits):
            raise ValueError('Incompatible matrix types')

        nrow, ldim, dtype = x.traits
        ncola, ncolb = x.ioshape[1:]

        # Reduction block dimensions
        block = (128, 1, 1)

        # Determine the grid size
        grid = get_grid_for_block(block, ncolb)

        # Empty result buffer on host with shape (nvars, nblocks)
        err_host = cuda.pagelocked_empty((ncola, grid[0]), dtype, 'C')

        # Device memory allocation
        err_dev = cuda.mem_alloc(err_host.nbytes)

        # Get the kernel template
        src = self.backend.lookup.get_template('errest').render(
            norm=norm, ncola=ncola, sharesz=block[0])

        # Build the reduction kernel
        rkern = self._build_kernel('errest', src, [np.int32] * 3 +
                                   [np.intp] * 4 + [dtype] * 2)

        # Norm type
        reducer = np.max if norm == 'uniform' else np.sum

        class ErrestKernel(ComputeKernel):
            @property
            def retval(self):
                return reducer(err_host, axis=1)

            def run(self, queue, atol, rtol):
                rkern.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                          nrow, ncolb, ldim, err_dev, x, y, z,
                                          atol, rtol)
                cuda.memcpy_dtoh_async(err_host, err_dev,
                                       queue.cuda_stream_comp)

        return ErrestKernel()
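
The errest kernel reduces within each thread block and leaves one partial result per block and variable in err_host; retval then finishes the reduction on the host, as in this NumPy sketch with illustrative sizes:

# Host-side completion of the two-stage reduction in ErrestKernel.retval:
# err_host holds one partial result per (variable, thread block).
import numpy as np

err_host = np.random.rand(4, 10)        # illustrative (ncola, nblocks) partials
reducer = np.max                        # np.max for norm == 'uniform', else np.sum
errest = reducer(err_host, axis=1)      # one error estimate per field variable
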
Example #11
    def perform_precomputation(self):
        # Precompute aa, bb1, bb2 (required for kernel)
        # compute l
        Nv = self.vm.Nv()
        Nrho = self.vm.Nrho()
        M = self.vm.M()
        L = self.vm.L()
        qz = self.vm.qz()
        qw = self.vm.qw()
        sz = self.vm.sz()
        sw = self.vm.sw()
        vsize = self.vm.vsize()
        szpre = self._szpre
        swpre = self._swpre

        # precision control
        dint = np.int32
        dfloat = np.float64
        dcplx = np.complex128

        l0 = np.concatenate((np.arange(0, Nv//2, dtype=dint),
                             np.arange(-Nv//2, 0, dtype=dint)))
        # Map each flat velocity-space index onto its 3D wavenumber triplet
        l = np.zeros((3, vsize), dtype=dint)
        for idv in range(vsize):
            I = idv // (Nv*Nv)
            J = (idv % (Nv*Nv)) // Nv
            K = (idv % (Nv*Nv)) % Nv
            l[0, idv] = l0[I]
            l[1, idv] = l0[J]
            l[2, idv] = l0[K]
        d_lx = gpuarray.to_gpu(np.ascontiguousarray(l[0,:]))
        d_ly = gpuarray.to_gpu(np.ascontiguousarray(l[1,:]))
        d_lz = gpuarray.to_gpu(np.ascontiguousarray(l[2,:]))

        # transfer sphere points to gpu
        d_sz_x = gpuarray.to_gpu(np.ascontiguousarray(sz[:,0]))
        d_sz_y = gpuarray.to_gpu(np.ascontiguousarray(sz[:,1]))
        d_sz_z = gpuarray.to_gpu(np.ascontiguousarray(sz[:,2]))
        
        # define complex to complex plan
        rank = 3
        n = np.array([Nv, Nv, Nv], dtype=np.int32)

        self.planZ2Z_MNrho = cufftPlanMany(rank, n.ctypes.data,
                                           None, 1, vsize,
                                           None, 1, vsize,
                                           CUFFT_Z2Z, M*Nrho)
        self.planZ2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_Z2Z)

        dfltargs = dict(
            Nrho=Nrho, M=M, 
            vsize=vsize, sw=sw, prefac=self._prefactor, 
            soasz=self.backend.soasz,
            cases=self._cases, masses=self.vm.masses(),
            qw=qw, qz=qz, 
            L=L, sz=sz, 
            gamma=self._gamma, eta=self._eta,
            Mpre=self._Mpre, szpre=szpre, swpre=swpre
        )
        src = DottedTemplateLookup(
            'frfs.solvers.dgfsbi.kernels.scattering', dfltargs
        ).get_template(self.scattering_model).render()

        # Compile the source code and retrieve the kernel
        print("\nCompiling scattering kernels, this may take some time ...")
        module = compiler.SourceModule(src)

        self.block = (256, 1, 1)
        self.grid = get_grid_for_block(self.block, vsize)

        print("Starting precomputation, this may take some time ...")
        start, end = cuda.Event(), cuda.Event()
        cuda.Context.synchronize()
        start.record()
        start.synchronize()

        self.d_aa = gpuarray.empty(Nrho*M*vsize, dtype=dfloat)
        precompute_aa = module.get_function("precompute_a")
        precompute_aa.prepare('PPPP')
        precompute_aa.set_cache_config(cuda.func_cache.PREFER_L1)
        precompute_aa.prepared_call(self.grid, self.block, 
            d_lx.ptr, d_ly.ptr, d_lz.ptr, self.d_aa.ptr)

        self.d_bb1 = {}
        self.d_bb2 = {}
        precompute_bb = {}
        for cp, cq in self._cases:
            cpcq = str(cp)+str(cq)
            self.d_bb1[cpcq] = gpuarray.empty(Nrho*M*vsize, dtype=dcplx)
            self.d_bb2[cpcq] = gpuarray.zeros(vsize, dtype=dcplx)
            precompute_bb[cpcq] = module.get_function("precompute_bc_"+cpcq)
            precompute_bb[cpcq].prepare('IIdddPPPPPPPP')
            precompute_bb[cpcq].set_cache_config(cuda.func_cache.PREFER_L1)

            for p in range(Nrho):
                fac = np.pi/L*qz[p]
                fac_b = swpre*pow(qz[p], self._gamma[cpcq]+2)
                fac_c = qw[p]*sw*fac_b
                for q in range(M):
                    precompute_bb[cpcq].prepared_call(self.grid, self.block,
                        dint(p), dint(q), dfloat(fac), 
                        dfloat(fac_b), dfloat(fac_c),
                        d_lx.ptr, d_ly.ptr, d_lz.ptr, 
                        d_sz_x.ptr, d_sz_y.ptr, d_sz_z.ptr, 
                        self.d_bb1[cpcq].ptr, self.d_bb2[cpcq].ptr
                    )

        end.record()
        end.synchronize()
        secs = start.time_till(end)*1e-3
        print("Finished precomputation in: %fs" % (secs))

        # transform scalar to complex
        self.r2zKern = module.get_function("r2z")
        self.r2zKern.prepare('IIIIIIPP')
        self.r2zKern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Prepare the cosSinMul kernel for execution
        self.cosSinMultKern = {}
        self.outKern = {}
        for cp, cq in self._cases:
            idx = str(cp) + str(cq)
            self.cosSinMultKern[idx] = module.get_function("cosSinMul_"+idx)
            self.cosSinMultKern[idx].prepare('PPPPP')
            self.cosSinMultKern[idx].set_cache_config(
                cuda.func_cache.PREFER_L1)

            self.outKern[idx] = module.get_function("output_"+idx)
            self.outKern[idx].prepare('IIIIIIPPPP')
            self.outKern[idx].set_cache_config(
                cuda.func_cache.PREFER_L1)

        # prepare the computeQG kernel
        self.computeQGKern = module.get_function("computeQG")
        self.computeQGKern.prepare('PPP')
        self.computeQGKern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Prepare the prodKern kernel for execution
        self.prodKern = module.get_function("prod")
        self.prodKern.prepare('PPP')
        self.prodKern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Prepare the ax kernel for execution
        self.ax2Kern = module.get_function("ax2")
        self.ax2Kern.prepare('PPP')
        self.ax2Kern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Define scratch spaces
        self.d_FTf = gpuarray.empty(vsize, dtype=dcplx)
        self.d_FTg = gpuarray.empty(vsize, dtype=dcplx)
        self.d_f1C = gpuarray.empty_like(self.d_FTf)
        self.d_f2C = gpuarray.empty_like(self.d_FTf)
        self.d_QG = gpuarray.empty_like(self.d_FTf)
        self.d_t1 = gpuarray.empty(M*Nrho*vsize, dtype=dcplx)
        self.d_t2 = gpuarray.empty_like(self.d_t1)
        self.d_t3 = gpuarray.empty_like(self.d_t1)
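
As an aside, the explicit loop in perform_precomputation that maps each flat velocity index to its (lx, ly, lz) wavenumber triplet can be written in vectorised NumPy; a sketch, assuming vsize equals Nv**3 and C-order index flattening:

# Vectorised sketch of the wavenumber construction above.
import numpy as np

def build_wavenumbers(Nv, dint=np.int32):
    l0 = np.concatenate((np.arange(0, Nv//2, dtype=dint),
                         np.arange(-Nv//2, 0, dtype=dint)))
    idx = np.mgrid[0:Nv, 0:Nv, 0:Nv].reshape(3, -1)   # flat (I, J, K) triplets
    return l0[idx]                                    # shape (3, Nv**3)
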