def mul(self, a, b, out, alpha=1.0, beta=0.0):
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Fetch the matrix and tally up the number of non-zeros
    arr = a.get()
    nnz, nuq = np.count_nonzero(arr), len(np.unique(np.abs(arr)))

    # Check that A is suitable
    if nuq > 28 and nnz / arr.size > 0.15:
        raise NotSuitableError('Matrix is inappropriate for GiMMiK')

    # Generate
    src = generate_mm(arr, dtype=a.dtype, platform='cuda',
                      alpha=alpha, beta=beta)

    # Build
    fun = self._build_kernel('gimmik_mm', src,
                             [np.int32, np.intp]*2 + [np.int32])

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, b.ncol)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            fun.exec_async(grid, block, queue.stream_comp, b.ncol,
                           b, b.leaddim, out, out.leaddim)

    return MulKernel()
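# NumPy reference sketch (illustration only; gimmik_mul_ref is a hypothetical
# helper, not backend API) of what the generated GiMMiK kernel computes and of
# the suitability heuristic applied above: GiMMiK bakes the non-zeros of the
# constant operator `a` into bespoke source for out = alpha*(a @ b) + beta*out.
def gimmik_mul_ref(a, b, out, alpha=1.0, beta=0.0):
    import numpy as np

    # Mirror the suitability check: too many distinct non-zeros at too high a
    # density and a generic GEMM is the better choice
    nnz, nuq = np.count_nonzero(a), len(np.unique(np.abs(a)))
    if nuq > 28 and nnz / a.size > 0.15:
        raise ValueError('Matrix is inappropriate for GiMMiK')

    out[:] = alpha*(a @ b) + beta*out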
def axnpby(self, *arr, subdims=None):
    if any(arr[0].traits != x.traits for x in arr[1:]):
        raise ValueError('Incompatible matrix types')

    nv = len(arr)
    nrow, ldim, dtype = arr[0].traits
    ncola, ncolb = arr[0].ioshape[1:]

    # Render the kernel template
    src = self.backend.lookup.get_template('axnpby').render(
        subdims=subdims or range(ncola), ncola=ncola, nv=nv
    )

    # Build the kernel
    kern = self._build_kernel('axnpby', src,
                              [np.int32]*3 + [np.intp]*nv + [dtype]*nv)

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, ncolb, nrow)

    class AxnpbyKernel(ComputeKernel):
        def run(self, queue, *consts):
            args = list(arr) + list(consts)
            kern.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                     nrow, ncolb, ldim, *args)

    return AxnpbyKernel()
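# NumPy reference sketch (illustration only; axnpby_ref is a hypothetical
# helper, not backend API) of the operation the axnpby kernel performs: the
# first matrix is overwritten in place with a linear combination of all of the
# matrices, restricted to the field variables listed in subdims:
#     arr[0] <- consts[0]*arr[0] + consts[1]*arr[1] + ...
def axnpby_ref(arrs, consts, subdims=None):
    sd = list(subdims) if subdims is not None else list(range(arrs[0].shape[1]))

    acc = sum(c*x[:, sd] for c, x in zip(consts, arrs))
    arrs[0][:, sd] = acc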
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Check that A is reasonably sparse
    if np.count_nonzero(a.get()) > self.max_nnz:
        raise NotSuitableError('Matrix too dense for GiMMiK')

    # Generate
    src = generate_mm(a.get(), dtype=a.dtype, platform='cuda',
                      alpha=alpha, beta=beta)

    # Build
    fun = self._build_kernel('gimmik_mm', src, 'iPiPi')

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, b.ncol)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            fun.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                    b.ncol, b, b.leaddim, out, out.leaddim)

    return MulKernel()
def _mul_gimmik(self, a, b, out, alpha=1.0, beta=0.0):
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Check that A is reasonably sparse
    if np.count_nonzero(a.get()) > self.max_nnz:
        raise NotSuitableError('Matrix too dense for GiMMiK')

    # Generate
    src = self._gen_gimmik(
        a.get(), 'cuda', alpha=alpha, beta=beta,
        double=a.dtype == np.float64, reduced=True
    )

    # Build
    fun = self._build_kernel('gimmik_mm', src, 'PPiii')

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, b.ncol)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            fun.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                    b, out, b.ncol, b.leaddim, out.leaddim)

    return MulKernel()
def pack(self, mv):
    # An MPI view is simply a regular view plus an MPI matrix
    m, v = mv.mpimat, mv.view

    # Render the kernel template
    tpl = self.backend.lookup.get_template('pack')
    src = tpl.render(dtype=npdtype_to_ctype(m.dtype))

    # Build
    kern = self._build_kernel('pack_view', src, 'iiiPPPPP')

    # Compute the grid and thread-block size
    block = (128, 1, 1)
    grid = get_grid_for_block(block, v.n)

    # Create a CUDA event
    event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

    class PackMPIViewKernel(ComputeKernel):
        def run(self, queue):
            scomp = queue.cuda_stream_comp
            scopy = queue.cuda_stream_copy

            # Pack
            kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                                     v.nvcol, v.basedata, v.mapping,
                                     v.cstrides or 0, v.rstrides or 0, m)

            # Copy the packed buffer to the host
            event.record(scomp)
            scopy.wait_for_event(event)
            cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)

    return PackMPIViewKernel()
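# The event/stream handoff above is the standard CUDA pattern for ordering a
# device-to-host copy behind work enqueued on another stream.  A minimal,
# self-contained PyCUDA sketch of the same idea (assumes pycuda and an active
# context; copy_after_pack is a hypothetical helper, not backend API):
import pycuda.driver as cuda

def copy_after_pack(launch_pack, scomp, scopy, hdata, ddata):
    event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

    # Enqueue the packing kernel on the compute stream
    launch_pack(scomp)

    # Have the copy stream wait until the packing kernel has completed
    event.record(scomp)
    scopy.wait_for_event(event)

    # Asynchronous device-to-host copy of the packed buffer
    cuda.memcpy_dtoh_async(hdata, ddata, scopy)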
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError("Incompatible matrices for out = a*b")

    # Check that A is constant
    if "const" not in a.tags:
        raise NotSuitableError("GiMMiK requires a constant a matrix")

    # Check that A is reasonably sparse
    if np.count_nonzero(a.get()) > self.max_nnz:
        raise NotSuitableError("Matrix too dense for GiMMiK")

    # Generate
    src = generate_mm(a.get(), dtype=a.dtype, platform="cuda",
                      alpha=alpha, beta=beta)

    # Build
    fun = self._build_kernel("gimmik_mm", src, "iPiPi")

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, b.ncol)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            fun.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                    b.ncol, b, b.leaddim, out, out.leaddim)

    return MulKernel()
def reduction(self, *rs, method, norm, dt_mat=None):
    if any(r.traits != rs[0].traits for r in rs[1:]):
        raise ValueError('Incompatible matrix types')

    cuda = self.backend.cuda
    nrow, ncol, ldim, dtype = rs[0].traits[1:]
    ncola, ncolb = rs[0].ioshape[1:]

    # Reduction block dimensions
    block = (128, 1, 1)

    # Determine the grid size
    grid = get_grid_for_block(block, ncolb, ncola)

    # Empty result buffer on the device
    reduced_dev = cuda.mem_alloc(ncola*grid[0]*rs[0].itemsize)

    # Empty result buffer on the host
    reduced_host = cuda.pagelocked_empty((ncola, grid[0]), dtype)

    tplargs = dict(norm=norm, sharesz=block[0], method=method)

    if method == 'resid':
        tplargs['dt_type'] = 'matrix' if dt_mat else 'scalar'

    # Get the kernel template
    src = self.backend.lookup.get_template('reduction').render(**tplargs)

    regs = (list(rs) + [dt_mat]) if dt_mat else rs

    # Argument types for reduction kernel
    if method == 'errest':
        argt = [np.int32]*3 + [np.intp]*4 + [dtype]*2
    elif method == 'resid' and dt_mat:
        argt = [np.int32]*3 + [np.intp]*4 + [dtype]
    else:
        argt = [np.int32]*3 + [np.intp]*3 + [dtype]

    # Build the reduction kernel
    rkern = self._build_kernel('reduction', src, argt)

    # Norm type
    reducer = np.max if norm == 'uniform' else np.sum

    class ReductionKernel(Kernel):
        @property
        def retval(self):
            return reducer(reduced_host, axis=1)

        def run(self, queue, *facs):
            rkern.exec_async(grid, block, queue.stream, nrow, ncolb, ldim,
                             reduced_dev, *regs, *facs)
            cuda.memcpy(reduced_host, reduced_dev, reduced_dev.nbytes,
                        queue.stream)

    return ReductionKernel()
def pack(self, mv):
    cuda = self.backend.cuda

    # An exchange view is simply a regular view plus an exchange matrix
    m, v = mv.xchgmat, mv.view

    # Render the kernel template
    src = self.backend.lookup.get_template('pack').render()

    # Build
    kern = self._build_kernel('pack_view', src,
                              [np.int32]*3 + [np.intp]*4)

    # Compute the grid and thread-block size
    block = (128, 1, 1)
    grid = get_grid_for_block(block, v.n)

    # If MPI is CUDA aware then we just need to pack the buffer
    if self.backend.mpitype == 'cuda-aware':
        class PackXchgViewKernel(ComputeKernel):
            def run(self, queue):
                scomp = queue.cuda_stream_comp

                # Pack
                kern.exec_async(grid, block, scomp, v.n, v.nvrow, v.nvcol,
                                v.basedata, v.mapping, v.rstrides or 0, m)
    # Otherwise, we need to both pack the buffer and copy it back
    else:
        # Create a CUDA event
        event = cuda.create_event()

        class PackXchgViewKernel(ComputeKernel):
            def run(self, queue):
                scomp = queue.cuda_stream_comp
                scopy = queue.cuda_stream_copy

                # Pack
                kern.exec_async(grid, block, scomp, v.n, v.nvrow, v.nvcol,
                                v.basedata, v.mapping, v.rstrides or 0, m)

                # Copy the packed buffer to the host
                event.record(scomp)
                scopy.wait_for_event(event)
                cuda.memcpy_async(m.hdata, m.data, m.nbytes, scopy)

    return PackXchgViewKernel()
def pack(self, mv):
    # An exchange view is simply a regular view plus an exchange matrix
    m, v = mv.xchgmat, mv.view

    # Render the kernel template
    src = self.backend.lookup.get_template('pack').render()

    # Build
    kern = self._build_kernel('pack_view', src, 'iiiPPPP')

    # Compute the grid and thread-block size
    block = (128, 1, 1)
    grid = get_grid_for_block(block, v.n)

    # If MPI is CUDA aware then we just need to pack the buffer
    if self.backend.mpitype == 'cuda-aware':
        class PackXchgViewKernel(ComputeKernel):
            def run(self, queue):
                scomp = queue.cuda_stream_comp

                # Pack
                kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                                         v.nvcol, v.basedata, v.mapping,
                                         v.rstrides or 0, m)
    # Otherwise, we need to both pack the buffer and copy it back
    else:
        # Create a CUDA event
        event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

        class PackXchgViewKernel(ComputeKernel):
            def run(self, queue):
                scomp = queue.cuda_stream_comp
                scopy = queue.cuda_stream_copy

                # Pack
                kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                                         v.nvcol, v.basedata, v.mapping,
                                         v.rstrides or 0, m)

                # Copy the packed buffer to the host
                event.record(scomp)
                scopy.wait_for_event(event)
                cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)

    return PackXchgViewKernel()
def errest(self, x, y, z, *, norm):
    # Require all three matrices to share the same traits
    if x.traits != y.traits or x.traits != z.traits:
        raise ValueError('Incompatible matrix types')

    nrow, ldim, dtype = x.traits
    ncola, ncolb = x.ioshape[1:]

    # Reduction block dimensions
    block = (128, 1, 1)

    # Determine the grid size
    grid = get_grid_for_block(block, ncolb)

    # Empty result buffer on host with shape (nvars, nblocks)
    err_host = cuda.pagelocked_empty((ncola, grid[0]), dtype, 'C')

    # Device memory allocation
    err_dev = cuda.mem_alloc(err_host.nbytes)

    # Get the kernel template
    src = self.backend.lookup.get_template('errest').render(
        norm=norm, ncola=ncola, sharesz=block[0]
    )

    # Build the reduction kernel
    rkern = self._build_kernel(
        'errest', src, [np.int32]*3 + [np.intp]*4 + [dtype]*2
    )

    # Norm type
    reducer = np.max if norm == 'uniform' else np.sum

    class ErrestKernel(ComputeKernel):
        @property
        def retval(self):
            return reducer(err_host, axis=1)

        def run(self, queue, atol, rtol):
            rkern.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                      nrow, ncolb, ldim, err_dev,
                                      x, y, z, atol, rtol)
            cuda.memcpy_dtoh_async(err_host, err_dev,
                                   queue.cuda_stream_comp)

    return ErrestKernel()
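# NumPy sketch (illustration only; finalise_errest is a hypothetical helper)
# of the host-side finalisation performed by ErrestKernel.retval: the device
# kernel leaves one partial result per thread block in err_dev, laid out as
# (nvars, nblocks), and the final reduction over blocks happens on the host.
def finalise_errest(err_host, norm):
    import numpy as np

    reducer = np.max if norm == 'uniform' else np.sum
    return reducer(err_host, axis=1)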
def axnpby(self, y, *xn):
    if any(y.traits != x.traits for x in xn):
        raise ValueError('Incompatible matrix types')

    opts = dict(n=len(xn), dtype=npdtype_to_ctype(y.dtype))
    fn = self._get_function('axnpby', 'axnpby',
                            [np.int32] + [np.intp, y.dtype]*(1 + len(xn)),
                            opts)

    # Determine the total element count in the matrices
    cnt = y.leaddim*y.nrow

    # Compute a suitable block and grid
    block = (1024, 1, 1)
    grid = get_grid_for_block(block, cnt)

    class AxnpbyKernel(ComputeKernel):
        def run(self, scomp, scopy, beta, *alphan):
            args = [i for axn in zip(xn, alphan) for i in axn]
            fn.prepared_async_call(grid, block, scomp, cnt, y, beta, *args)

    return AxnpbyKernel()