Code example #1
File: gimmik.py  Project: FreddieWitherden/PyFR
    def mul(self, a, b, out, alpha=1.0, beta=0.0):
        # Ensure the matrices are compatible
        if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
            raise ValueError('Incompatible matrices for out = a*b')

        # Check that A is constant
        if 'const' not in a.tags:
            raise NotSuitableError('GiMMiK requires a constant a matrix')

        # Fetch the matrix and tally up the number of non-zeros
        arr = a.get()
        nnz, nuq = np.count_nonzero(arr), len(np.unique(np.abs(arr)))

        # Check that A is suitable; decline GiMMiK when the matrix has
        # both many unique magnitudes (> 28) and a density above 15%
        if nuq > 28 and nnz / arr.size > 0.15:
            raise NotSuitableError('Matrix is inappropriate for GiMMiK')

        # Generate
        src = generate_mm(arr, dtype=a.dtype, platform='cuda',
                          alpha=alpha, beta=beta)

        # Build
        fun = self._build_kernel('gimmik_mm', src,
                                 [np.int32, np.intp]*2 + [np.int32])

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, b.ncol)

        class MulKernel(ComputeKernel):
            def run(self, queue):
                fun.exec_async(grid, block, queue.stream_comp, b.ncol, b,
                               b.leaddim, out, out.leaddim)

        return MulKernel()
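
Every snippet in this collection sizes its launch with get_grid_for_block, which is not shown here. A minimal sketch of what such a helper plausibly does, assuming a ceiling division over the first block dimension with the optional second argument passed through as the y-dimension (the actual PyFR helper may differ):

    def get_grid_for_block(block, nx, ny=1):
        # Enough blocks along x to cover nx work items; ny becomes
        # the y-dimension of the grid unchanged
        return ((nx + block[0] - 1) // block[0], ny, 1)
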
Code example #2
File: blasext.py  Project: zwghit/PyFR
    def axnpby(self, *arr, subdims=None):
        if any(arr[0].traits != x.traits for x in arr[1:]):
            raise ValueError('Incompatible matrix types')

        nv = len(arr)
        nrow, ldim, dtype = arr[0].traits
        ncola, ncolb = arr[0].ioshape[1:]

        # Render the kernel template
        src = self.backend.lookup.get_template('axnpby').render(
            subdims=subdims or range(ncola), ncola=ncola, nv=nv
        )

        # Build the kernel
        kern = self._build_kernel('axnpby', src,
                                  [np.int32]*3 + [np.intp]*nv + [dtype]*nv)

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, ncolb, nrow)

        class AxnpbyKernel(ComputeKernel):
            def run(self, queue, *consts):
                args = list(arr) + list(consts)

                kern.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                         nrow, ncolb, ldim, *args)

        return AxnpbyKernel()
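
A hypothetical usage of the returned kernel, assuming x0, x1, x2 are matrices with identical traits, queue exposes the cuda_stream_comp used above, and the rendered template accumulates into its first argument; the constants pair up positionally with the matrices:

    # Computes x0 = 2*x0 + 3*x1 - x2 in place (under the assumptions above)
    kern = provider.axnpby(x0, x1, x2)
    kern.run(queue, 2.0, 3.0, -1.0)
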
Code example #3
File: gimmik.py  Project: tarikdzanic/PyFR-RC
    def mul(self, a, b, out, alpha=1.0, beta=0.0):
        # Ensure the matrices are compatible
        if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
            raise ValueError('Incompatible matrices for out = a*b')

        # Check that A is constant
        if 'const' not in a.tags:
            raise NotSuitableError('GiMMiK requires a constant a matrix')

        # Check that A is reasonably sparse
        if np.count_nonzero(a.get()) > self.max_nnz:
            raise NotSuitableError('Matrix too dense for GiMMiK')

        # Generate
        src = generate_mm(a.get(),
                          dtype=a.dtype,
                          platform='cuda',
                          alpha=alpha,
                          beta=beta)

        # Build
        fun = self._build_kernel('gimmik_mm', src, 'iPiPi')

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, b.ncol)

        class MulKernel(ComputeKernel):
            def run(self, queue):
                fun.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                        b.ncol, b, b.leaddim, out, out.leaddim)

        return MulKernel()
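
The 'iPiPi' string is a PyCUDA prepare() format: struct-module characters, 'i' for a 32-bit int and 'P' for a pointer, matching the (b.ncol, b, b.leaddim, out, out.leaddim) argument list passed above. A minimal standalone sketch of the same prepare/prepared_call pattern:

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void scale(int n, double *x, double a)
    {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n)
            x[i] *= a;
    }
    """)

    fun = mod.get_function('scale')
    fun.prepare('iPd')  # 'i' -> 32-bit int, 'P' -> pointer, 'd' -> double

    x = cuda.to_device(np.ones(128))
    fun.prepared_call((1, 1), (128, 1, 1), 128, x, 2.0)
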
Code example #4
File: gimmik.py  Project: GwenaelGabard/PyFR
    def _mul_gimmik(self, a, b, out, alpha=1.0, beta=0.0):
        # Ensure the matrices are compatible
        if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
            raise ValueError('Incompatible matrices for out = a*b')

        # Check that A is constant
        if 'const' not in a.tags:
            raise NotSuitableError('GiMMiK requires a constant a matrix')

        # Check that A is reasonably sparse
        if np.count_nonzero(a.get()) > self.max_nnz:
            raise NotSuitableError('Matrix too dense for GiMMiK')

        # Generate
        src = self._gen_gimmik(
            a.get(), 'cuda', alpha=alpha, beta=beta,
            double=a.dtype == np.float64, reduced=True,
        )

        # Build
        fun = self._build_kernel('gimmik_mm', src, 'PPiii')

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, b.ncol)

        class MulKernel(ComputeKernel):
            def run(self, queue):
                fun.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                        b, out, b.ncol, b.leaddim,
                                        out.leaddim)

        return MulKernel()
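
NotSuitableError exists so a specialised provider can decline a request and let the backend fall back to a general one such as cuBLAS. A sketch of that dispatch pattern, with hypothetical provider attribute names:

    def mul(self, a, b, out, alpha=1.0, beta=0.0):
        # Try the specialised provider first, then the general fallback
        for provider in (self._gimmik, self._cublas):
            try:
                return provider.mul(a, b, out, alpha=alpha, beta=beta)
            except NotSuitableError:
                continue

        raise RuntimeError('No suitable provider for out = a*b')
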
Code example #5
File: packing.py  Project: fbob/PyFR
    def pack(self, mv):
        # An MPI view is simply a regular view plus an MPI matrix
        m, v = mv.mpimat, mv.view

        # Render the kernel template
        tpl = self.backend.lookup.get_template('pack')
        src = tpl.render(dtype=npdtype_to_ctype(m.dtype))

        # Build
        kern = self._build_kernel('pack_view', src, 'iiiPPPPP')

        # Compute the grid and thread-block size
        block = (128, 1, 1)
        grid = get_grid_for_block(block, v.n)

        # Create a CUDA event
        event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

        class PackMPIViewKernel(ComputeKernel):
            def run(self, queue):
                scomp = queue.cuda_stream_comp
                scopy = queue.cuda_stream_copy

                # Pack
                kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                                         v.nvcol, v.basedata, v.mapping,
                                         v.cstrides or 0, v.rstrides or 0, m)

                # Copy the packed buffer to the host
                event.record(scomp)
                scopy.wait_for_event(event)
                cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)

        return PackMPIViewKernel()
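
The device-to-host copy is issued on a second stream and gated on an event, so the compute stream is never stalled and the host never blocks. The same pattern in bare PyCUDA, with illustrative buffer sizes:

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda

    host_buf = cuda.pagelocked_empty(128, np.float64)
    dev_buf = cuda.mem_alloc(host_buf.nbytes)

    scomp, scopy = cuda.Stream(), cuda.Stream()
    event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

    # ... enqueue the kernel that fills dev_buf on scomp ...

    event.record(scomp)          # mark the point where the pack completes
    scopy.wait_for_event(event)  # copy stream waits for that point only
    cuda.memcpy_dtoh_async(host_buf, dev_buf, scopy)
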
Code example #6
File: gimmik.py  Project: vincentlab/PyFR
    def mul(self, a, b, out, alpha=1.0, beta=0.0):
        # Ensure the matrices are compatible
        if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
            raise ValueError("Incompatible matrices for out = a*b")

        # Check that A is constant
        if "const" not in a.tags:
            raise NotSuitableError("GiMMiK requires a constant a matrix")

        # Check that A is reasonably sparse
        if np.count_nonzero(a.get()) > self.max_nnz:
            raise NotSuitableError("Matrix too dense for GiMMiK")

        # Generate
        src = generate_mm(a.get(), dtype=a.dtype, platform="cuda", alpha=alpha, beta=beta)

        # Build
        fun = self._build_kernel("gimmik_mm", src, "iPiPi")

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, b.ncol)

        class MulKernel(ComputeKernel):
            def run(self, queue):
                fun.prepared_async_call(grid, block, queue.cuda_stream_comp, b.ncol, b, b.leaddim, out, out.leaddim)

        return MulKernel()
Code example #7
File: blasext.py  Project: vavrines/PyFR
    def reduction(self, *rs, method, norm, dt_mat=None):
        if any(r.traits != rs[0].traits for r in rs[1:]):
            raise ValueError('Incompatible matrix types')

        cuda = self.backend.cuda
        nrow, ncol, ldim, dtype = rs[0].traits[1:]
        ncola, ncolb = rs[0].ioshape[1:]

        # Reduction block dimensions
        block = (128, 1, 1)

        # Determine the grid size
        grid = get_grid_for_block(block, ncolb, ncola)

        # Empty result buffer on the device
        reduced_dev = cuda.mem_alloc(ncola*grid[0]*rs[0].itemsize)

        # Empty result buffer on the host
        reduced_host = cuda.pagelocked_empty((ncola, grid[0]), dtype)

        tplargs = dict(norm=norm, sharesz=block[0], method=method)

        if method == 'resid':
            tplargs['dt_type'] = 'matrix' if dt_mat else 'scalar'

        # Get the kernel template
        src = self.backend.lookup.get_template('reduction').render(**tplargs)

        regs = [*rs, dt_mat] if dt_mat else list(rs)

        # Argument types for reduction kernel
        if method == 'errest':
            argt = [np.int32]*3 + [np.intp]*4 + [dtype]*2
        elif method == 'resid' and dt_mat:
            argt = [np.int32]*3 + [np.intp]*4 + [dtype]
        else:
            argt = [np.int32]*3 + [np.intp]*3 + [dtype]

        # Build the reduction kernel
        rkern = self._build_kernel('reduction', src, argt)

        # Norm type
        reducer = np.max if norm == 'uniform' else np.sum

        class ReductionKernel(Kernel):
            @property
            def retval(self):
                return reducer(reduced_host, axis=1)

            def run(self, queue, *facs):
                rkern.exec_async(grid, block, queue.stream,
                                 nrow, ncolb, ldim, reduced_dev, *regs, *facs)
                cuda.memcpy(reduced_host, reduced_dev, reduced_dev.nbytes,
                            queue.stream)

        return ReductionKernel()
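
The kernel writes one partial result per variable and thread block into reduced_host, shaped (ncola, grid[0]); retval then collapses the block axis on the host. The final step, with made-up partials:

    import numpy as np

    # Two variables, three blocks' worth of partial results
    reduced_host = np.array([[1.0, 4.0, 2.0],
                             [0.5, 0.25, 3.0]])

    np.max(reduced_host, axis=1)  # norm == 'uniform' -> array([4., 3.])
    np.sum(reduced_host, axis=1)  # otherwise         -> array([7., 3.75])
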
Code example #8
    def pack(self, mv):
        cuda = self.backend.cuda

        # An exchange view is simply a regular view plus an exchange matrix
        m, v = mv.xchgmat, mv.view

        # Render the kernel template
        src = self.backend.lookup.get_template('pack').render()

        # Build
        kern = self._build_kernel('pack_view', src,
                                  [np.int32] * 3 + [np.intp] * 4)

        # Compute the grid and thread-block size
        block = (128, 1, 1)
        grid = get_grid_for_block(block, v.n)

        # If MPI is CUDA aware then we just need to pack the buffer
        if self.backend.mpitype == 'cuda-aware':

            class PackXchgViewKernel(ComputeKernel):
                def run(self, queue):
                    scomp = queue.cuda_stream_comp

                    # Pack
                    kern.exec_async(grid, block, scomp, v.n, v.nvrow, v.nvcol,
                                    v.basedata, v.mapping, v.rstrides or 0, m)

        # Otherwise, we need to both pack the buffer and copy it back
        else:
            # Create a CUDA event
            event = cuda.create_event()

            class PackXchgViewKernel(ComputeKernel):
                def run(self, queue):
                    scomp = queue.cuda_stream_comp
                    scopy = queue.cuda_stream_copy

                    # Pack
                    kern.exec_async(grid, block, scomp, v.n, v.nvrow, v.nvcol,
                                    v.basedata, v.mapping, v.rstrides or 0, m)

                    # Copy the packed buffer to the host
                    event.record(scomp)
                    scopy.wait_for_event(event)
                    cuda.memcpy_async(m.hdata, m.data, m.nbytes, scopy)

        return PackXchgViewKernel()
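
The branch hinges on what the MPI library can address: a CUDA-aware build can read the exchange matrix straight from device memory, so only the pack is needed, while a plain build must be handed the page-locked host copy staged by memcpy_async. A hypothetical sketch of the sender-side consequence (buffer attributes as in the snippet above; comm and peer are assumed):

    # Pick the buffer that MPI is actually able to read from
    if backend.mpitype == 'cuda-aware':
        buf = m.data   # device buffer, consumed directly by MPI
    else:
        buf = m.hdata  # page-locked host staging buffer

    req = comm.Isend(buf, dest=peer)
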
Code example #9
File: packing.py  Project: pv101/PyFR
    def pack(self, mv):
        # An exchange view is simply a regular view plus an exchange matrix
        m, v = mv.xchgmat, mv.view

        # Render the kernel template
        src = self.backend.lookup.get_template('pack').render()

        # Build
        kern = self._build_kernel('pack_view', src, 'iiiPPPP')

        # Compute the grid and thread-block size
        block = (128, 1, 1)
        grid = get_grid_for_block(block, v.n)

        # If MPI is CUDA aware then we just need to pack the buffer
        if self.backend.mpitype == 'cuda-aware':
            class PackXchgViewKernel(ComputeKernel):
                def run(self, queue):
                    scomp = queue.cuda_stream_comp

                    # Pack
                    kern.prepared_async_call(
                        grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
                        v.mapping, v.rstrides or 0, m
                    )
        # Otherwise, we need to both pack the buffer and copy it back
        else:
            # Create a CUDA event
            event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

            class PackXchgViewKernel(ComputeKernel):
                def run(self, queue):
                    scomp = queue.cuda_stream_comp
                    scopy = queue.cuda_stream_copy

                    # Pack
                    kern.prepared_async_call(
                        grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
                        v.mapping, v.rstrides or 0, m
                    )

                    # Copy the packed buffer to the host
                    event.record(scomp)
                    scopy.wait_for_event(event)
                    cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)

        return PackXchgViewKernel()
Code example #10
File: blasext.py  Project: yifanb/PyFR
    def errest(self, x, y, z, *, norm):
        if not (x.traits == y.traits == z.traits):
            raise ValueError('Incompatible matrix types')

        nrow, ldim, dtype = x.traits
        ncola, ncolb = x.ioshape[1:]

        # Reduction block dimensions
        block = (128, 1, 1)

        # Determine the grid size
        grid = get_grid_for_block(block, ncolb)

        # Empty result buffer on host with shape (nvars, nblocks)
        err_host = cuda.pagelocked_empty((ncola, grid[0]), dtype, 'C')

        # Device memory allocation
        err_dev = cuda.mem_alloc(err_host.nbytes)

        # Get the kernel template
        src = self.backend.lookup.get_template('errest').render(
            norm=norm, ncola=ncola, sharesz=block[0]
        )

        # Build the reduction kernel
        rkern = self._build_kernel(
            'errest', src, [np.int32]*3 + [np.intp]*4 + [dtype]*2
        )

        # Norm type
        reducer = np.max if norm == 'uniform' else np.sum

        class ErrestKernel(ComputeKernel):
            @property
            def retval(self):
                return reducer(err_host, axis=1)

            def run(self, queue, atol, rtol):
                rkern.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                          nrow, ncolb, ldim, err_dev, x, y, z,
                                          atol, rtol)
                cuda.memcpy_dtoh_async(err_host, err_dev,
                                       queue.cuda_stream_comp)

        return ErrestKernel()
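
A hypothetical usage, assuming queue exposes the cuda_stream_comp used above; once the stream has been synchronised, retval holds one reduced error value per field variable:

    kern = provider.errest(x, y, z, norm='uniform')
    kern.run(queue, atol, rtol)
    queue.cuda_stream_comp.synchronize()  # assumed synchronisation point

    err = kern.retval  # shape (ncola,), one entry per variable
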
Code example #11
File: blasext.py  Project: bartwozniak/PyFR
    def axnpby(self, y, *xn):
        if any(y.traits != x.traits for x in xn):
            raise ValueError('Incompatible matrix types')

        opts = dict(n=len(xn), dtype=npdtype_to_ctype(y.dtype))
        fn = self._get_function('axnpby', 'axnpby', [np.int32] +
                                [np.intp, y.dtype]*(1 + len(xn)), opts)

        # Determine the total element count in the matrices
        cnt = y.leaddim*y.nrow

        # Compute a suitable block and grid
        block = (1024, 1, 1)
        grid = get_grid_for_block(block, cnt)

        class AxnpbyKernel(ComputeKernel):
            def run(self, scomp, scopy, beta, *alphan):
                args = [i for axn in zip(xn, alphan) for i in axn]
                fn.prepared_async_call(grid, block, scomp, cnt, y, beta, *args)

        return AxnpbyKernel()
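
This variant flattens each matrix into cnt = leaddim*nrow elements and launches one thread per element. A plausible rendering of the templated kernel for two input matrices in double precision (the real axnpby template is generated from opts and may differ):

    axnpby_src = """
    __global__ void axnpby(int n, double *y, double beta,
                           double *x0, double a0, double *x1, double a1)
    {
        int i = blockIdx.x*blockDim.x + threadIdx.x;

        if (i < n)
            y[i] = beta*y[i] + a0*x0[i] + a1*x1[i];
    }
    """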