def errest(self, x, y, z, *, norm):
    if x.traits != y.traits != z.traits:
        raise ValueError('Incompatible matrix types')

    # Wrap
    xarr = GPUArray(x.leaddim*x.nrow, x.dtype, gpudata=x)
    yarr = GPUArray(y.leaddim*y.nrow, y.dtype, gpudata=y)
    zarr = GPUArray(z.leaddim*z.nrow, z.dtype, gpudata=z)

    # Norm type
    reduce_expr = 'a + b' if norm == 'l2' else 'max(a, b)'

    # Build the reduction kernel
    rkern = ReductionKernel(
        x.dtype, neutral='0', reduce_expr=reduce_expr,
        map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
        arguments='{0}* x, {0}* y, {0}* z, {0} atol, {0} rtol'
                  .format(npdtype_to_ctype(x.dtype))
    )

    class ErrestKernel(ComputeKernel):
        @property
        def retval(self):
            return self._retarr.get()

        def run(self, queue, atol, rtol):
            self._retarr = rkern(xarr, yarr, zarr, atol, rtol,
                                 stream=queue.cuda_stream_comp)

    return ErrestKernel()
def errest(self, x, y, z, *, norm):
    if x.traits != y.traits != z.traits:
        raise ValueError('Incompatible matrix types')

    cnt = x.leaddim*x.nrow
    dtype = x.dtype

    # Norm type
    reduce_expr = 'a + b' if norm == 'l2' else 'max(a, b)'

    # Build the reduction kernel
    rkern = ReductionKernel(
        self.backend.ctx, dtype, neutral='0', reduce_expr=reduce_expr,
        map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
        arguments='__global {0}* x, __global {0}* y, __global {0}* z, '
                  '{0} atol, {0} rtol'.format(npdtype_to_ctype(dtype))
    )

    class ErrestKernel(ComputeKernel):
        @property
        def retval(self):
            return self._retarr.get()

        def run(self, queue, atol, rtol):
            qcomp = queue.cl_queue_comp

            xarr = Array(qcomp, cnt, dtype, data=x.data)
            yarr = Array(qcomp, cnt, dtype, data=y.data)
            zarr = Array(qcomp, cnt, dtype, data=z.data)

            self._retarr = rkern(xarr, yarr, zarr, atol, rtol, queue=qcomp)

    return ErrestKernel()
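# For reference, the reductions above correspond to a weighted error norm of
# the kind used for adaptive step-size control.  A minimal NumPy sketch of the
# assumed semantics; errest_ref is illustrative only and not part of any
# backend.
import numpy as np

def errest_ref(x, y, z, atol, rtol, norm='l2'):
    # Scaled elementwise error terms, matching map_expr above
    e = (x/(atol + rtol*np.maximum(np.abs(y), np.abs(z))))**2

    # An 'l2' norm sums the terms; any other norm takes their maximum
    return e.sum() if norm == 'l2' else e.max()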
def pack(self, mv):
    # An MPI view is simply a regular view plus an MPI matrix
    m, v = mv.mpimat, mv.view

    # Render the kernel template
    tpl = self.backend.lookup.get_template('pack')
    src = tpl.render(dtype=npdtype_to_ctype(m.dtype))

    # Build
    kern = self._build_kernel('pack_view', src, 'iiiPPPPP')

    # Compute the grid and thread-block size
    block = (128, 1, 1)
    grid = get_grid_for_block(block, v.n)

    # Create a CUDA event
    event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

    class PackMPIViewKernel(ComputeKernel):
        def run(self, queue):
            scomp = queue.cuda_stream_comp
            scopy = queue.cuda_stream_copy

            # Pack
            kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                                     v.nvcol, v.basedata, v.mapping,
                                     v.cstrides or 0, v.rstrides or 0, m)

            # Copy the packed buffer to the host
            event.record(scomp)
            scopy.wait_for_event(event)
            cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)

    return PackMPIViewKernel()
def errest(self, x, y, z):
    if x.traits != y.traits != z.traits:
        raise ValueError('Incompatible matrix types')

    cnt = x.leaddim*x.nrow
    dtype = x.dtype

    # Build the reduction kernel
    rkern = ReductionKernel(
        self.backend.ctx, dtype, neutral='0', reduce_expr='a + b',
        map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
        arguments='__global {0}* x, __global {0}* y, __global {0}* z, '
                  '{0} atol, {0} rtol'.format(npdtype_to_ctype(dtype))
    )

    class ErrestKernel(ComputeKernel):
        @property
        def retval(self):
            return self._retarr.get()

        def run(self, queue, atol, rtol):
            qcomp = queue.cl_queue_comp

            xarr = Array(qcomp, cnt, dtype, data=x.data)
            yarr = Array(qcomp, cnt, dtype, data=y.data)
            zarr = Array(qcomp, cnt, dtype, data=z.data)

            self._retarr = rkern(xarr, yarr, zarr, atol, rtol, queue=qcomp)

    return ErrestKernel()
def pack(self, mv):
    # An MPI view is simply a regular view plus an MPI matrix
    m, v = mv.mpimat, mv.view

    # Render the kernel template
    tpl = self.backend.lookup.get_template('pack')
    src = tpl.render(dtype=npdtype_to_ctype(m.dtype))

    # Build
    kern = self._build_kernel('pack_view', src, 'iiiPPPPP')

    class PackMPIViewKernel(ComputeKernel):
        def run(self):
            kern(v.n, v.nvrow, v.nvcol, v.basedata, v.mapping,
                 v.cstrides or 0, v.rstrides or 0, m)

    return PackMPIViewKernel()
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    m, n, k = a.nrow, b.ncol, a.ncol

    if a.dtype == np.float64:
        cblas_gemm = self._wrappers.cblas_dgemm
    else:
        cblas_gemm = self._wrappers.cblas_sgemm

    # If our BLAS library is single threaded then invoke our own
    # parallelization kernel which uses OpenMP to partition the
    # operation along b.ncol (which works extremely well for the
    # extremely long matrices encountered by PyFR).  Otherwise, we
    # let the BLAS library handle parallelization itself (which
    # may, or may not, use OpenMP).
    if self._cblas_type == 'cblas-st':
        # Argument types and template params for par_gemm
        argt = [np.intp, np.int32, np.int32, np.int32,
                a.dtype, np.intp, np.int32, np.intp, np.int32,
                a.dtype, np.intp, np.int32]
        opts = dict(dtype=npdtype_to_ctype(a.dtype))

        par_gemm = self._get_function('par_gemm', 'par_gemm', None,
                                      argt, opts)

        # Pointer to the BLAS library GEMM function
        cblas_gemm_ptr = cast(cblas_gemm, c_void_p).value

        class MulKernel(ComputeKernel):
            def run(self):
                par_gemm(cblas_gemm_ptr, m, n, k,
                         alpha, a, a.leaddim, b, b.leaddim,
                         beta, out, out.leaddim)
    else:
        class MulKernel(ComputeKernel):
            def run(self):
                cblas_gemm(CBlasOrder.ROW_MAJOR, CBlasTranspose.NO_TRANS,
                           CBlasTranspose.NO_TRANS, m, n, k,
                           alpha, a, a.leaddim, b, b.leaddim,
                           beta, out, out.leaddim)

    return MulKernel()
def axnpby(self, y, *xn):
    if any(y.traits != x.traits for x in xn):
        raise ValueError('Incompatible matrix types')

    opts = dict(n=len(xn), dtype=npdtype_to_ctype(y.dtype))
    fn = self._get_function('axnpby', 'axnpby',
                            [np.int32] + [np.intp, y.dtype]*(1 + len(xn)),
                            opts)

    # Determine the total element count in the matrices
    cnt = y.leaddim*y.nrow

    # Compute a suitable block and grid
    block = (1024, 1, 1)
    grid = get_grid_for_block(block, cnt)

    class AxnpbyKernel(ComputeKernel):
        def run(self, scomp, scopy, beta, *alphan):
            args = [i for axn in zip(xn, alphan) for i in axn]
            fn.prepared_async_call(grid, block, scomp, cnt, y, beta, *args)

    return AxnpbyKernel()
def axnpby(self, y, *xn):
    if any(y.traits != x.traits for x in xn):
        raise ValueError('Incompatible matrix types')

    nv, cnt = len(xn), y.leaddim*y.nrow

    # Render the kernel template
    tpl = self.backend.lookup.get_template('axnpby')
    src = tpl.render(n=nv, dtype=npdtype_to_ctype(y.dtype))

    # Build
    kern = self._build_kernel('axnpby', src,
                              [np.int32] + [np.intp, y.dtype]*(1 + nv))

    # Compute a suitable block and grid
    grid, block = splay(cnt)

    class AxnpbyKernel(ComputeKernel):
        def run(self, scomp, scopy, beta, *alphan):
            args = [i for axn in zip(xn, alphan) for i in axn]
            kern.prepared_async_call(grid, block, scomp, cnt, y, beta, *args)

    return AxnpbyKernel()
def axnpby(self, y, *xn):
    if any(y.traits != x.traits for x in xn):
        raise ValueError('Incompatible matrix types')

    nv, cnt = len(xn), y.leaddim*y.nrow

    # Render the kernel template
    tpl = self.backend.lookup.get_template('axnpby')
    src = tpl.render(n=nv, dtype=npdtype_to_ctype(y.dtype))

    # Build
    kern = self._build_kernel('axnpby', src,
                              [np.int32] + [np.intp, y.dtype]*(1 + nv))

    # Compute a suitable block and grid
    grid, block = splay(cnt)

    class AxnpbyKernel(ComputeKernel):
        def run(self, queue, beta, *alphan):
            args = [i for axn in zip(xn, alphan) for i in axn]
            kern.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                     cnt, y, beta, *args)

    return AxnpbyKernel()
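# As a point of reference, the axnpby kernels above are assumed to perform the
# in-place update y <- beta*y + sum_i alpha_i*x_i over the packed data.  A
# minimal sketch of that operation; axnpby_ref is illustrative only.
def axnpby_ref(y, beta, xn, alphan):
    # In-place update of the NumPy array y
    y *= beta
    for x, alpha in zip(xn, alphan):
        y += alpha*x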
def _packmodopts(self, mpiview):
    return dict(dtype=npdtype_to_ctype(mpiview.mpimat.dtype),
                vlen=mpiview.view.vlen)
def npdtype_to_ctype(context, dtype):
    return nputil.npdtype_to_ctype(dtype)
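# The nputil helper wrapped above is assumed to map the supported NumPy
# floating-point dtypes to their C type names; a minimal sketch of that
# behaviour, with npdtype_to_ctype_ref being illustrative only.
import numpy as np

def npdtype_to_ctype_ref(dtype):
    # Only single and double precision are used by the backends
    dtype = np.dtype(dtype)
    if dtype == np.float32:
        return 'float'
    elif dtype == np.float64:
        return 'double'
    else:
        raise ValueError('Unsupported data type')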