def axnpby(self, *arr, subdims=None):
    if any(arr[0].traits != x.traits for x in arr[1:]):
        raise ValueError('Incompatible matrix types')

    nv = len(arr)
    nrow, ncol, ldim, dtype = arr[0].traits[1:]
    ncola, ncolb = arr[0].ioshape[1:]

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, ncolb, nrow)

    # Render the kernel template
    src = self.backend.lookup.get_template('axnpby').render(
        block=block, subdims=subdims or range(ncola), ncola=ncola, nv=nv
    )

    # Build the kernel
    kern = self._build_kernel('axnpby', src,
                              [np.int32]*3 + [np.intp]*nv + [dtype]*nv)

    class AxnpbyKernel(ComputeKernel):
        def run(self, queue, *consts):
            kern.exec_async(grid, block, queue.stream_comp, nrow, ncolb,
                            ldim, *arr, *consts)

    return AxnpbyKernel()
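# The get_grid_for_block helper used throughout this section is not defined
# here. A minimal sketch, assuming it simply rounds each problem extent up
# to a whole number of thread blocks (the name and the fixed z-extent of 1
# are assumptions, not confirmed by this section):
def _get_grid_for_block_sketch(block, nx, ny=1):
    # Ceiling division of each extent by the corresponding block dimension
    return (-(-nx // block[0]), -(-ny // block[1]), 1)

# Example: 1000 columns with 128-wide blocks need ceil(1000/128) = 8 blocks
assert _get_grid_for_block_sketch((128, 1, 1), 1000) == (8, 1, 1)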
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Fetch the matrix and tally up the number of non-zeros
    arr = a.get()
    nnz, nuq = np.count_nonzero(arr), len(np.unique(np.abs(arr)))

    # Check that A is suitable for GiMMiK
    if nuq > 28 and nnz / arr.size > 0.15:
        raise NotSuitableError('Matrix is inappropriate for GiMMiK')

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, b.ncol)

    # Generate the kernel source
    src = generate_mm(arr, dtype=a.dtype, platform='hip',
                      alpha=alpha, beta=beta)

    # Build the kernel
    fun = self._build_kernel('gimmik_mm', src,
                             [np.int32, np.intp]*2 + [np.int32])

    class MulKernel(Kernel):
        def run(self, queue):
            fun.exec_async(grid, block, queue.stream, b.ncol, b, b.leaddim,
                           out, out.leaddim)

    return MulKernel()
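# A short, self-contained illustration of the suitability heuristic in
# mul() above. GiMMiK generates bespoke kernels with the constant operator
# matrix baked into the source, so mul() rejects a matrix once it has more
# than 28 unique magnitudes *and* a density above 15%. The toy operator
# below is made up purely for illustration.
import numpy as np

op = np.zeros((16, 16))
op[np.diag_indices(16)] = 2.5        # sparse, with few unique magnitudes

nnz, nuq = np.count_nonzero(op), len(np.unique(np.abs(op)))
suitable = not (nuq > 28 and nnz / op.size > 0.15)

print(nnz, nuq, suitable)            # 16 2 True -> the GiMMiK path is taken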
def reduction(self, *rs, method, norm, dt_mat=None):
    if any(r.traits != rs[0].traits for r in rs[1:]):
        raise ValueError('Incompatible matrix types')

    hip = self.backend.hip
    nrow, ncol, ldim, dtype = rs[0].traits[1:]
    ncola, ncolb = rs[0].ioshape[1:]

    # Reduction block dimensions
    block = (128, 1, 1)

    # Determine the grid size
    grid = get_grid_for_block(block, ncolb, ncola)

    # Empty result buffer on the device
    reduced_dev = hip.mem_alloc(ncola*grid[0]*rs[0].itemsize)

    # Empty result buffer on the host
    reduced_host = hip.pagelocked_empty((ncola, grid[0]), dtype)

    tplargs = dict(norm=norm, blocksz=block[0], method=method)

    if method == 'resid':
        tplargs['dt_type'] = 'matrix' if dt_mat else 'scalar'

    # Get the kernel template
    src = self.backend.lookup.get_template('reduction').render(**tplargs)

    regs = [*rs, dt_mat] if dt_mat else rs

    # Argument types for the reduction kernel
    if method == 'errest':
        argt = [np.int32]*3 + [np.intp]*4 + [dtype]*2
    elif method == 'resid' and dt_mat:
        argt = [np.int32]*3 + [np.intp]*4 + [dtype]
    else:
        argt = [np.int32]*3 + [np.intp]*3 + [dtype]

    # Build the reduction kernel
    rkern = self._build_kernel('reduction', src, argt)

    # Norm type
    reducer = np.max if norm == 'uniform' else np.sum

    class ReductionKernel(ComputeKernel):
        @property
        def retval(self):
            return reducer(reduced_host, axis=1)

        def run(self, queue, *facs):
            rkern.exec_async(grid, block, queue.stream_comp, nrow, ncolb,
                             ldim, reduced_dev, *regs, *facs)
            hip.memcpy_async(reduced_host, reduced_dev, reduced_dev.nbytes,
                             queue.stream_comp)

    return ReductionKernel()
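# The reduction above is two-stage: each thread block writes one partial
# value per field variable into reduced_dev, giving an (ncola, grid[0])
# array, and the host then applies the final reducer across blocks via
# retval. A NumPy emulation of that host-side stage (the partials here
# are made up):
import numpy as np

partials = np.array([[1.0, 4.0, 2.0],    # per-block partials, variable 0
                     [3.0, 0.5, 1.5]])   # per-block partials, variable 1

print(np.max(partials, axis=1))  # uniform norm -> [4.  3. ]
print(np.sum(partials, axis=1))  # summed norm  -> [7.  5. ]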
def pack(self, mv):
    hip = self.backend.hip

    # An exchange view is simply a regular view plus an exchange matrix
    m, v = mv.xchgmat, mv.view

    # Render the kernel template
    src = self.backend.lookup.get_template('pack').render()

    # Build the kernel
    kern = self._build_kernel('pack_view', src, [np.int32]*3 + [np.intp]*4)

    # Compute the grid and thread-block size
    block = (128, 1, 1)
    grid = get_grid_for_block(block, v.n)

    # If MPI is HIP aware then we just need to pack the buffer
    if self.backend.mpitype == 'hip-aware':
        class PackXchgViewKernel(ComputeKernel):
            def run(self, queue):
                scomp = queue.hip_stream_comp

                # Pack
                kern.exec_async(
                    grid, block, scomp, v.n, v.nvrow, v.nvcol,
                    v.basedata, v.mapping, v.rstrides or 0, m
                )
    # Otherwise, we need to both pack the buffer and copy it back
    else:
        # Create a HIP event
        event = hip.create_event()

        class PackXchgViewKernel(ComputeKernel):
            def run(self, queue):
                scomp = queue.hip_stream_comp
                scopy = queue.hip_stream_copy

                # Pack
                kern.exec_async(
                    grid, block, scomp, v.n, v.nvrow, v.nvcol,
                    v.basedata, v.mapping, v.rstrides or 0, m
                )

                # Copy the packed buffer to the host
                event.record(scomp)
                scopy.wait_for_event(event)
                hip.memcpy_async(m.hdata, m.data, m.nbytes, scopy)

    return PackXchgViewKernel()
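# In essence, the pack_view kernel above is a gather: each thread follows
# v.mapping into v.basedata and writes the element into the contiguous
# exchange matrix m, ready for the MPI send. The event.record/wait_for_event
# pair in the non-HIP-aware branch merely orders the device-to-host copy
# after the pack without blocking the host. A NumPy sketch of the gather,
# simplified to a single row (the mapping values are made up):
import numpy as np

basedata = np.arange(10.0)            # flat backing storage
mapping = np.array([7, 2, 9, 0])      # hypothetical view indices

packed = basedata[mapping]            # what the kernel does in parallel
print(packed)                         # [7. 2. 9. 0.]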
def errest(self, x, y, z, *, norm):
    # Note: a chained x.traits != y.traits != z.traits would miss the case
    # where x and y match but z does not
    if not (x.traits == y.traits == z.traits):
        raise ValueError('Incompatible matrix types')

    hip = self.backend.hip
    nrow, ncol, ldim, dtype = x.traits
    ncola, ncolb = x.ioshape[1:]

    # Reduction block dimensions
    block = (128, 1, 1)

    # Determine the grid size
    grid = get_grid_for_block(block, ncolb, ncola)

    # Empty result buffer on the device
    err_dev = hip.mem_alloc(ncola*grid[0]*x.itemsize)

    # Empty result buffer on the host
    err_host = hip.pagelocked_empty((ncola, grid[0]), dtype)

    # Get the kernel template
    src = self.backend.lookup.get_template('errest').render(
        norm=norm, sharesz=block[0]
    )

    # Build the reduction kernel
    rkern = self._build_kernel(
        'errest', src, [np.int32]*3 + [np.intp]*4 + [dtype]*2
    )

    # Norm type
    reducer = np.max if norm == 'uniform' else np.sum

    class ErrestKernel(ComputeKernel):
        @property
        def retval(self):
            return reducer(err_host, axis=1)

        def run(self, queue, atol, rtol):
            rkern.exec_async(grid, block, queue.hip_stream_comp, nrow,
                             ncolb, ldim, err_dev, x, y, z, atol, rtol)
            hip.memcpy_async(err_host, err_dev, err_dev.nbytes,
                             queue.hip_stream_comp)

    return ErrestKernel()
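# Why the compatibility check in errest() avoids a chained comparison: in
# Python, `a != b != c` means `a != b and b != c`, so it fails to flag the
# case where the first two operands match but the third does not.
a, b, c = 1, 1, 2

print(a != b != c)          # False -- the mismatch with c goes undetected
print(not (a == b == c))    # True  -- the check errest() actually needs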