Ejemplo n.º 1
0
    def errest(self, x, y, z, *, norm):
        """Return a kernel which reduces a scaled error estimate.

        Every element contributes
        ``(x[i]/(atol + rtol*max(|y[i]|, |z[i]|)))**2``.  The contributions
        are combined by summation when *norm* is ``'l2'`` and by taking the
        maximum for any other value of *norm*.

        The returned kernel exposes ``run(queue, atol, rtol)``, which
        launches the reduction on ``queue.cuda_stream_comp``, and a
        ``retval`` property which fetches the result of the most recent
        run with ``.get()``.

        Raises:
            ValueError: if the traits of *x*, *y* and *z* are not all equal.
        """
        # NOTE: a chained ``a != b != c`` means ``a != b and b != c`` and so
        # only fires when *both* adjacent pairs differ; use ``or`` so that
        # any single mismatch is rejected.
        if x.traits != y.traits or y.traits != z.traits:
            raise ValueError('Incompatible matrix types')

        # Wrap the matrices as flat arrays of leaddim*nrow elements
        xarr = GPUArray(x.leaddim*x.nrow, x.dtype, gpudata=x)
        yarr = GPUArray(y.leaddim*y.nrow, y.dtype, gpudata=y)
        zarr = GPUArray(z.leaddim*z.nrow, z.dtype, gpudata=z)

        # Norm type; anything other than 'l2' selects the max reduction
        reduce_expr = 'a + b' if norm == 'l2' else 'max(a, b)'

        # Build the reduction kernel
        rkern = ReductionKernel(
            x.dtype, neutral='0', reduce_expr=reduce_expr,
            map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
            arguments='{0}* x, {0}* y, {0}* z, {0} atol, {0} rtol'
                      .format(npdtype_to_ctype(x.dtype))
        )

        class ErrestKernel(ComputeKernel):
            @property
            def retval(self):
                # Only meaningful once run() has stored a result
                return self._retarr.get()

            def run(self, queue, atol, rtol):
                self._retarr = rkern(xarr, yarr, zarr, atol, rtol,
                                     stream=queue.cuda_stream_comp)

        return ErrestKernel()
Ejemplo n.º 2
0
    def errest(self, x, y, z, *, norm):
        """Return a kernel which reduces a scaled error estimate.

        Every element contributes
        ``(x[i]/(atol + rtol*max(|y[i]|, |z[i]|)))**2``.  The contributions
        are combined by summation when *norm* is ``'l2'`` and by taking the
        maximum for any other value of *norm*.

        The returned kernel exposes ``run(queue, atol, rtol)``, which wraps
        the matrix buffers on ``queue.cl_queue_comp`` and launches the
        reduction there, and a ``retval`` property which fetches the result
        of the most recent run with ``.get()``.

        Raises:
            ValueError: if the traits of *x*, *y* and *z* are not all equal.
        """
        # NOTE: a chained ``a != b != c`` means ``a != b and b != c`` and so
        # only fires when *both* adjacent pairs differ; use ``or`` so that
        # any single mismatch is rejected.
        if x.traits != y.traits or y.traits != z.traits:
            raise ValueError('Incompatible matrix types')

        # Element count and data type shared by the three wrapped arrays
        cnt = x.leaddim*x.nrow
        dtype = x.dtype

        # Norm type; anything other than 'l2' selects the max reduction
        reduce_expr = 'a + b' if norm == 'l2' else 'max(a, b)'

        # Build the reduction kernel
        rkern = ReductionKernel(
            self.backend.ctx, dtype, neutral='0', reduce_expr=reduce_expr,
            map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
            arguments='__global {0}* x, __global {0}* y, __global {0}* z, '
                      '{0} atol, {0} rtol'.format(npdtype_to_ctype(dtype))
        )

        class ErrestKernel(ComputeKernel):
            @property
            def retval(self):
                # Only meaningful once run() has stored a result
                return self._retarr.get()

            def run(self, queue, atol, rtol):
                qcomp = queue.cl_queue_comp

                # The arrays are bound to a queue, so wrap at launch time
                xarr = Array(qcomp, cnt, dtype, data=x.data)
                yarr = Array(qcomp, cnt, dtype, data=y.data)
                zarr = Array(qcomp, cnt, dtype, data=z.data)

                self._retarr = rkern(xarr, yarr, zarr, atol, rtol,
                                     queue=qcomp)

        return ErrestKernel()
Ejemplo n.º 3
0
Archivo: packing.py Proyecto: fbob/PyFR
    def pack(self, mv):
        """Return a kernel which packs an MPI view and copies it to the host.

        The returned kernel's ``run(queue)`` launches the ``pack_view``
        kernel on ``queue.cuda_stream_comp`` and then, once that launch has
        been reached on the compute stream, issues an asynchronous
        device-to-host copy of the MPI matrix on ``queue.cuda_stream_copy``.
        """
        # Split the MPI view into its MPI matrix and its regular view
        mpimat, view = mv.mpimat, mv.view

        # Generate the kernel source for the matrix data type
        template = self.backend.lookup.get_template('pack')
        ctype = npdtype_to_ctype(mpimat.dtype)
        source = template.render(dtype=ctype)

        # Compile it
        packer = self._build_kernel('pack_view', source, 'iiiPPPPP')

        # Launch configuration: 128 threads per block
        threads = (128, 1, 1)
        nblocks = get_grid_for_block(threads, view.n)

        # Event used to order the copy stream after the compute stream
        packed = cuda.Event(cuda.event_flags.DISABLE_TIMING)

        class PackMPIViewKernel(ComputeKernel):
            def run(self, queue):
                comp_stream = queue.cuda_stream_comp
                copy_stream = queue.cuda_stream_copy

                # View attributes are read at launch time; falsy strides
                # are passed as 0
                kargs = (view.n, view.nvrow, view.nvcol, view.basedata,
                         view.mapping, view.cstrides or 0,
                         view.rstrides or 0, mpimat)
                packer.prepared_async_call(nblocks, threads, comp_stream,
                                           *kargs)

                # Make the copy stream wait for the pack before copying
                packed.record(comp_stream)
                copy_stream.wait_for_event(packed)
                cuda.memcpy_dtoh_async(mpimat.hdata, mpimat.data,
                                       copy_stream)

        return PackMPIViewKernel()
Ejemplo n.º 4
0
    def errest(self, x, y, z):
        """Return a kernel which sums a scaled squared error estimate.

        Every element contributes
        ``(x[i]/(atol + rtol*max(|y[i]|, |z[i]|)))**2`` and the
        contributions are added together.

        The returned kernel exposes ``run(queue, atol, rtol)``, which wraps
        the matrix buffers on ``queue.cl_queue_comp`` and launches the
        reduction there, and a ``retval`` property which fetches the result
        of the most recent run with ``.get()``.

        Raises:
            ValueError: if the traits of *x*, *y* and *z* are not all equal.
        """
        # NOTE: a chained ``a != b != c`` means ``a != b and b != c`` and so
        # only fires when *both* adjacent pairs differ; use ``or`` so that
        # any single mismatch is rejected.
        if x.traits != y.traits or y.traits != z.traits:
            raise ValueError('Incompatible matrix types')

        # Element count and data type shared by the three wrapped arrays
        cnt = x.leaddim * x.nrow
        dtype = x.dtype

        # Build the reduction kernel
        rkern = ReductionKernel(
            self.backend.ctx,
            dtype,
            neutral='0',
            reduce_expr='a + b',
            map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
            arguments='__global {0}* x, __global {0}* y, __global {0}* z, '
            '{0} atol, {0} rtol'.format(npdtype_to_ctype(dtype)))

        class ErrestKernel(ComputeKernel):
            @property
            def retval(self):
                # Only meaningful once run() has stored a result
                return self._retarr.get()

            def run(self, queue, atol, rtol):
                qcomp = queue.cl_queue_comp

                # The arrays are bound to a queue, so wrap at launch time
                xarr = Array(qcomp, cnt, dtype, data=x.data)
                yarr = Array(qcomp, cnt, dtype, data=y.data)
                zarr = Array(qcomp, cnt, dtype, data=z.data)

                self._retarr = rkern(xarr, yarr, zarr, atol, rtol, queue=qcomp)

        return ErrestKernel()
Ejemplo n.º 5
0
    def pack(self, mv):
        """Return a kernel which packs an MPI view into its MPI matrix.

        The ``pack`` template is rendered for the MPI matrix's data type and
        built as ``pack_view``; the returned kernel's ``run()`` invokes it
        with the view's attributes and the MPI matrix.
        """
        # Split the MPI view into its MPI matrix and its regular view
        mpimat, view = mv.mpimat, mv.view

        # Generate the kernel source for the matrix data type
        template = self.backend.lookup.get_template("pack")
        ctype = npdtype_to_ctype(mpimat.dtype)
        source = template.render(dtype=ctype)

        # Compile it
        packer = self._build_kernel("pack_view", source, "iiiPPPPP")

        class PackMPIViewKernel(ComputeKernel):
            def run(self):
                # View attributes are read at call time; falsy strides are
                # passed as 0
                kargs = (
                    view.n,
                    view.nvrow,
                    view.nvcol,
                    view.basedata,
                    view.mapping,
                    view.cstrides or 0,
                    view.rstrides or 0,
                    mpimat,
                )
                packer(*kargs)

        return PackMPIViewKernel()
Ejemplo n.º 6
0
    def pack(self, mv):
        """Return a kernel which packs an MPI view into its MPI matrix.

        The ``pack`` template is rendered for the MPI matrix's data type and
        built as ``pack_view``; the returned kernel's ``run()`` invokes it
        with the view's attributes and the MPI matrix.
        """
        # Split the MPI view into its MPI matrix and its regular view
        mpimat, view = mv.mpimat, mv.view

        # Generate the kernel source for the matrix data type
        template = self.backend.lookup.get_template('pack')
        ctype = npdtype_to_ctype(mpimat.dtype)
        source = template.render(dtype=ctype)

        # Compile it
        packer = self._build_kernel('pack_view', source, 'iiiPPPPP')

        class PackMPIViewKernel(ComputeKernel):
            def run(self):
                # View attributes are read at call time; falsy strides are
                # passed as 0
                kargs = (view.n, view.nvrow, view.nvcol, view.basedata,
                         view.mapping, view.cstrides or 0,
                         view.rstrides or 0, mpimat)
                packer(*kargs)

        return PackMPIViewKernel()
Ejemplo n.º 7
0
    def mul(self, a, b, out, alpha=1.0, beta=0.0):
        """Return a kernel which performs a GEMM on *a*, *b* and *out*.

        The double precision GEMM wrapper is used when ``a.dtype`` is
        ``np.float64`` and the single precision one otherwise.  *alpha* and
        *beta* are forwarded to the GEMM call as its scaling factors.

        Raises:
            ValueError: if the row/column counts of *a*, *b* and *out* are
                not compatible for ``out = a*b``.
        """
        # Reject operands whose shapes cannot form out = a*b
        shapes_ok = (a.nrow == out.nrow and a.ncol == b.nrow and
                     b.ncol == out.ncol)
        if not shapes_ok:
            raise ValueError('Incompatible matrices for out = a*b')

        m, n, k = a.nrow, b.ncol, a.ncol

        # Select the GEMM wrapper matching the precision of a
        gemm = (self._wrappers.cblas_dgemm if a.dtype == np.float64
                else self._wrappers.cblas_sgemm)

        # Unless the BLAS library is single threaded we simply let it
        # handle any parallelization itself (which may, or may not, use
        # OpenMP)
        if self._cblas_type != 'cblas-st':
            class MulKernel(ComputeKernel):
                def run(self):
                    gemm(CBlasOrder.ROW_MAJOR, CBlasTranspose.NO_TRANS,
                         CBlasTranspose.NO_TRANS, m, n, k,
                         alpha, a, a.leaddim, b, b.leaddim,
                         beta, out, out.leaddim)

            return MulKernel()

        # Single threaded BLAS: invoke our own parallelization kernel which
        # uses OpenMP to partition the operation along b.ncol (which works
        # extremely well for the extremely long matrices encountered by
        # PyFR)

        # Argument types and template params for par_gemm
        argtypes = ([np.intp] + [np.int32]*3 +
                    [a.dtype, np.intp, np.int32, np.intp, np.int32] +
                    [a.dtype, np.intp, np.int32])
        tplargs = dict(dtype=npdtype_to_ctype(a.dtype))

        par_gemm = self._get_function('par_gemm', 'par_gemm', None,
                                      argtypes, tplargs)

        # Raw address of the BLAS library GEMM function
        gemm_addr = cast(gemm, c_void_p).value

        class MulKernel(ComputeKernel):
            def run(self):
                par_gemm(gemm_addr, m, n, k, alpha, a, a.leaddim,
                         b, b.leaddim, beta, out, out.leaddim)

        return MulKernel()
Ejemplo n.º 8
0
    def axnpby(self, y, *xn):
        """Return a kernel which launches ``axnpby`` over *y* and *xn*.

        The returned kernel's ``run(scomp, scopy, beta, *alphan)`` launches
        the function on ``scomp`` with the element count, *y*, ``beta`` and
        then each matrix of *xn* followed by its entry from ``alphan``.

        Raises:
            ValueError: if any matrix in *xn* has traits differing from
                those of *y*.
        """
        for x in xn:
            if y.traits != x.traits:
                raise ValueError('Incompatible matrix types')

        # Template parameters and argument types for the function
        tplargs = dict(n=len(xn), dtype=npdtype_to_ctype(y.dtype))
        argtypes = [np.int32] + [np.intp, y.dtype]*(1 + len(xn))
        axnpby_fn = self._get_function('axnpby', 'axnpby', argtypes, tplargs)

        # Total element count in the matrices
        nelems = y.leaddim*y.nrow

        # Launch configuration: 1024 threads per block
        threads = (1024, 1, 1)
        nblocks = get_grid_for_block(threads, nelems)

        class AxnpbyKernel(ComputeKernel):
            def run(self, scomp, scopy, beta, *alphan):
                # Interleave each matrix with its coefficient
                xargs = []
                for pair in zip(xn, alphan):
                    xargs.extend(pair)

                axnpby_fn.prepared_async_call(nblocks, threads, scomp,
                                              nelems, y, beta, *xargs)

        return AxnpbyKernel()
Ejemplo n.º 9
0
    def axnpby(self, y, *xn):
        """Return a kernel which launches ``axnpby`` over *y* and *xn*.

        The returned kernel's ``run(scomp, scopy, beta, *alphan)`` launches
        the built kernel on ``scomp`` with the element count, *y*, ``beta``
        and then each matrix of *xn* followed by its entry from ``alphan``.

        Raises:
            ValueError: if any matrix in *xn* has traits differing from
                those of *y*.
        """
        for x in xn:
            if y.traits != x.traits:
                raise ValueError('Incompatible matrix types')

        # Number of x matrices and total element count
        nx = len(xn)
        nelems = y.leaddim*y.nrow

        # Generate the kernel source
        template = self.backend.lookup.get_template('axnpby')
        source = template.render(n=nx, dtype=npdtype_to_ctype(y.dtype))

        # Compile it
        argtypes = [np.int32] + [np.intp, y.dtype]*(1 + nx)
        axnpby_kern = self._build_kernel('axnpby', source, argtypes)

        # Launch configuration
        nblocks, threads = splay(nelems)

        class AxnpbyKernel(ComputeKernel):
            def run(self, scomp, scopy, beta, *alphan):
                # Interleave each matrix with its coefficient
                xargs = []
                for pair in zip(xn, alphan):
                    xargs.extend(pair)

                axnpby_kern.prepared_async_call(nblocks, threads, scomp,
                                                nelems, y, beta, *xargs)

        return AxnpbyKernel()
Ejemplo n.º 10
0
    def axnpby(self, y, *xn):
        """Return a kernel which launches ``axnpby`` over *y* and *xn*.

        The returned kernel's ``run(queue, beta, *alphan)`` launches the
        built kernel on ``queue.cuda_stream_comp`` with the element count,
        *y*, ``beta`` and then each matrix of *xn* followed by its entry
        from ``alphan``.

        Raises:
            ValueError: if any matrix in *xn* has traits differing from
                those of *y*.
        """
        for x in xn:
            if y.traits != x.traits:
                raise ValueError('Incompatible matrix types')

        # Number of x matrices and total element count
        nx = len(xn)
        nelems = y.leaddim*y.nrow

        # Generate the kernel source
        template = self.backend.lookup.get_template('axnpby')
        source = template.render(n=nx, dtype=npdtype_to_ctype(y.dtype))

        # Compile it
        argtypes = [np.int32] + [np.intp, y.dtype]*(1 + nx)
        axnpby_kern = self._build_kernel('axnpby', source, argtypes)

        # Launch configuration
        nblocks, threads = splay(nelems)

        class AxnpbyKernel(ComputeKernel):
            def run(self, queue, beta, *alphan):
                # Interleave each matrix with its coefficient
                xargs = []
                for pair in zip(xn, alphan):
                    xargs.extend(pair)

                axnpby_kern.prepared_async_call(nblocks, threads,
                                                queue.cuda_stream_comp,
                                                nelems, y, beta, *xargs)

        return AxnpbyKernel()
Ejemplo n.º 11
0
 def _packmodopts(self, mpiview):
     """Return the ``dtype`` and ``vlen`` options for an MPI view.

     ``dtype`` is ``npdtype_to_ctype`` applied to the data type of the
     view's MPI matrix; ``vlen`` is taken from the underlying view.
     """
     ctype = npdtype_to_ctype(mpiview.mpimat.dtype)
     return {'dtype': ctype, 'vlen': mpiview.view.vlen}
Ejemplo n.º 12
0
def npdtype_to_ctype(context, dtype):
    """Delegate to ``nputil.npdtype_to_ctype`` for *dtype*.

    The *context* argument is accepted but not used by this function.
    """
    ctype = nputil.npdtype_to_ctype(dtype)
    return ctype
Ejemplo n.º 13
0
def npdtype_to_ctype(context, dtype):
    """Delegate to ``nputil.npdtype_to_ctype`` for *dtype*.

    The *context* argument is accepted but not used by this function.
    """
    ctype = nputil.npdtype_to_ctype(dtype)
    return ctype