def mul(self, a, b, out, alpha=1.0, beta=0.0):
    """Create a kernel computing ``out = a*b`` via a GiMMiK CUDA kernel.

    Kernel source is generated by ``generate_mm`` from the contents of
    ``a`` and compiled with ``self._build_kernel``.

    Args:
        a: Left operand; must carry the ``'const'`` tag and have at most
            ``self.max_nnz`` non-zero entries.
        b: Right operand; requires ``b.nrow == a.ncol``.
        out: Result matrix; requires ``out.nrow == a.nrow`` and
            ``out.ncol == b.ncol``.
        alpha: Forwarded to ``generate_mm`` (presumably the scale applied
            to ``a*b`` -- confirm against GiMMiK).
        beta: Forwarded to ``generate_mm`` (presumably the scale applied
            to the existing ``out`` -- confirm against GiMMiK).

    Returns:
        A ``ComputeKernel`` whose ``run(queue)`` issues
        ``prepared_async_call`` on ``queue.cuda_stream_comp``.

    Raises:
        ValueError: If the matrix dimensions are incompatible.
        NotSuitableError: If ``a`` is not constant or is too dense.
    """
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Fetch the matrix once; both the sparsity check and the code
    # generator need it, so avoid a second a.get() round trip
    arr = a.get()

    # Check that A is reasonably sparse
    if np.count_nonzero(arr) > self.max_nnz:
        raise NotSuitableError('Matrix too dense for GiMMiK')

    # Generate
    src = generate_mm(arr, dtype=a.dtype, platform='cuda',
                      alpha=alpha, beta=beta)

    # Build
    fun = self._build_kernel('gimmik_mm', src, 'iPiPi')

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, b.ncol)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            fun.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                    b.ncol, b, b.leaddim,
                                    out, out.leaddim)

    return MulKernel()
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    """Create a kernel computing ``out = a*b`` via a GiMMiK OpenCL kernel.

    Kernel source is generated by ``generate_mm`` from the contents of
    ``a`` and compiled with ``self._build_kernel``; the kernel arguments
    are bound once here via ``set_args``.

    Args:
        a: Left operand; must carry the ``'const'`` tag and have at most
            ``self.max_nnz`` non-zero entries.
        b: Right operand; requires ``b.nrow == a.ncol``.
        out: Result matrix; requires ``out.nrow == a.nrow`` and
            ``out.ncol == b.ncol``.
        alpha: Forwarded to ``generate_mm`` (presumably the scale applied
            to ``a*b`` -- confirm against GiMMiK).
        beta: Forwarded to ``generate_mm`` (presumably the scale applied
            to the existing ``out`` -- confirm against GiMMiK).

    Returns:
        A ``Kernel`` constructed with ``mats=[b, out]`` whose
        ``run(queue)`` calls ``exec_async`` on ``queue.cmd_q``.

    Raises:
        ValueError: If the matrix dimensions are incompatible.
        NotSuitableError: If ``a`` is not constant or is too dense.
    """
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Fetch the matrix once; both the sparsity check and the code
    # generator need it, so avoid a second a.get() round trip
    arr = a.get()

    # Check that A is reasonably sparse
    if np.count_nonzero(arr) > self.max_nnz:
        raise NotSuitableError('Matrix too dense for GiMMiK')

    # Generate
    src = generate_mm(arr, dtype=a.dtype, platform='opencl',
                      alpha=alpha, beta=beta)

    # Build
    fun = self._build_kernel('gimmik_mm', src,
                             [np.int32] + [np.intp, np.int32]*2)
    fun.set_args(b.ncol, b, b.leaddim, out, out.leaddim)

    class MulKernel(Kernel):
        def run(self, queue):
            # (b.ncol,) and None are presumably the global and local
            # work sizes -- confirm against exec_async
            fun.exec_async(queue.cmd_q, (b.ncol,), None)

    return MulKernel(mats=[b, out])
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    """Create a kernel computing ``out = a*b`` via a GiMMiK CUDA kernel.

    Suitability of ``a`` is judged from the number of distinct absolute
    values it holds and the fraction of its entries that are non-zero.

    Args:
        a: Left operand; must carry the ``'const'`` tag.
        b: Right operand; requires ``b.nrow == a.ncol``.
        out: Result matrix; requires ``out.nrow == a.nrow`` and
            ``out.ncol == b.ncol``.
        alpha: Forwarded to ``generate_mm``.
        beta: Forwarded to ``generate_mm``.

    Returns:
        A ``ComputeKernel`` whose ``run(queue)`` calls ``exec_async`` on
        ``queue.stream_comp``.

    Raises:
        ValueError: If the matrix dimensions are incompatible.
        NotSuitableError: If ``a`` is not constant, or it has more than
            28 distinct absolute values while being over 15% non-zero.
    """
    # The operand shapes must chain together as (m x k) * (k x n) = (m x n)
    shapes_agree = (a.nrow == out.nrow and a.ncol == b.nrow
                    and b.ncol == out.ncol)
    if not shapes_agree:
        raise ValueError('Incompatible matrices for out = a*b')

    # Only constant matrices can be handed to the generator
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Pull the matrix contents and gather the statistics used below
    arr = a.get()
    nonzero_count = np.count_nonzero(arr)
    unique_count = len(np.unique(np.abs(arr)))

    # Reject matrices with many distinct values that are also dense
    if unique_count > 28 and nonzero_count / arr.size > 0.15:
        raise NotSuitableError('Matrix is inappropriate for GiMMiK')

    # Produce the kernel source and compile it
    kernel_src = generate_mm(arr, dtype=a.dtype, platform='cuda',
                             alpha=alpha, beta=beta)
    arg_types = [np.int32, np.intp, np.int32, np.intp, np.int32]
    fun = self._build_kernel('gimmik_mm', kernel_src, arg_types)

    # Launch configuration
    block = (128, 1, 1)
    grid = get_grid_for_block(block, b.ncol)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            fun.exec_async(grid, block, queue.stream_comp,
                           b.ncol, b, b.leaddim, out, out.leaddim)

    return MulKernel()
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    """Create a kernel computing ``out = a*b`` via a GiMMiK CUDA kernel.

    Kernel source is generated by ``generate_mm`` from the contents of
    ``a`` and compiled with ``self._build_kernel``.

    Args:
        a: Left operand; must carry the ``"const"`` tag and have at most
            ``self.max_nnz`` non-zero entries.
        b: Right operand; requires ``b.nrow == a.ncol``.
        out: Result matrix; requires ``out.nrow == a.nrow`` and
            ``out.ncol == b.ncol``.
        alpha: Forwarded to ``generate_mm`` (presumably the scale applied
            to ``a*b`` -- confirm against GiMMiK).
        beta: Forwarded to ``generate_mm`` (presumably the scale applied
            to the existing ``out`` -- confirm against GiMMiK).

    Returns:
        A ``ComputeKernel`` whose ``run(queue)`` issues
        ``prepared_async_call`` on ``queue.cuda_stream_comp``.

    Raises:
        ValueError: If the matrix dimensions are incompatible.
        NotSuitableError: If ``a`` is not constant or is too dense.
    """
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError("Incompatible matrices for out = a*b")

    # Check that A is constant
    if "const" not in a.tags:
        raise NotSuitableError("GiMMiK requires a constant a matrix")

    # Fetch the matrix once; both the sparsity check and the code
    # generator need it, so avoid a second a.get() round trip
    arr = a.get()

    # Check that A is reasonably sparse
    if np.count_nonzero(arr) > self.max_nnz:
        raise NotSuitableError("Matrix too dense for GiMMiK")

    # Generate
    src = generate_mm(arr, dtype=a.dtype, platform="cuda",
                      alpha=alpha, beta=beta)

    # Build
    fun = self._build_kernel("gimmik_mm", src, "iPiPi")

    # Determine the grid/block
    block = (128, 1, 1)
    grid = get_grid_for_block(block, b.ncol)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            fun.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                    b.ncol, b, b.leaddim,
                                    out, out.leaddim)

    return MulKernel()
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    """Create a kernel computing ``out = a*b`` via a GiMMiK OpenCL kernel.

    Kernel source is generated by ``generate_mm`` from the contents of
    ``a`` and compiled with ``self._build_kernel``.

    Args:
        a: Left operand; must carry the ``'const'`` tag and have at most
            ``self.max_nnz`` non-zero entries.
        b: Right operand; requires ``b.nrow == a.ncol``. Its ``data``
            attribute is passed to the kernel.
        out: Result matrix; requires ``out.nrow == a.nrow`` and
            ``out.ncol == b.ncol``. Its ``data`` attribute is passed to
            the kernel.
        alpha: Forwarded to ``generate_mm`` (presumably the scale applied
            to ``a*b`` -- confirm against GiMMiK).
        beta: Forwarded to ``generate_mm`` (presumably the scale applied
            to the existing ``out`` -- confirm against GiMMiK).

    Returns:
        A ``ComputeKernel`` whose ``run(queue)`` invokes the compiled
        kernel on ``queue.cl_queue_comp``.

    Raises:
        ValueError: If the matrix dimensions are incompatible.
        NotSuitableError: If ``a`` is not constant or is too dense.
    """
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Fetch the matrix once; both the sparsity check and the code
    # generator need it, so avoid a second a.get() round trip
    arr = a.get()

    # Check that A is reasonably sparse
    if np.count_nonzero(arr) > self.max_nnz:
        raise NotSuitableError('Matrix too dense for GiMMiK')

    # Generate
    src = generate_mm(arr, dtype=a.dtype, platform='opencl',
                      alpha=alpha, beta=beta)

    # Build
    fun = self._build_kernel('gimmik_mm', src,
                             [np.int32] + [np.intp, np.int32]*2)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            # (b.ncol,) and None are presumably the global and local
            # work sizes -- confirm against the kernel call convention
            fun(queue.cl_queue_comp, (b.ncol,), None, b.ncol,
                b.data, b.leaddim, out.data, out.leaddim)

    return MulKernel()
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    """Create a kernel computing ``out = a*b`` via a GiMMiK C/OpenMP kernel.

    Kernel source is generated by ``generate_mm`` (platform ``'c-omp'``)
    from the contents of ``a`` and compiled with ``self._build_kernel``.

    Args:
        a: Left operand; must carry the ``'const'`` tag and have at most
            ``self.max_nnz`` non-zero entries.
        b: Right operand; requires ``b.nrow == a.ncol``.
        out: Result matrix; requires ``out.nrow == a.nrow`` and
            ``out.ncol == b.ncol``.
        alpha: Forwarded to ``generate_mm`` (presumably the scale applied
            to ``a*b`` -- confirm against GiMMiK).
        beta: Forwarded to ``generate_mm`` (presumably the scale applied
            to the existing ``out`` -- confirm against GiMMiK).

    Returns:
        A ``ComputeKernel`` whose ``run(queue)`` calls the compiled
        ``gimmik_mm`` function directly; ``queue`` is not used.

    Raises:
        ValueError: If the matrix dimensions are incompatible.
        NotSuitableError: If ``a`` is not constant or is too dense.
    """
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Fetch the matrix once; both the sparsity check and the code
    # generator need it, so avoid a second a.get() round trip
    arr = a.get()

    # Check that A is reasonably sparse
    if np.count_nonzero(arr) > self.max_nnz:
        raise NotSuitableError('Matrix too dense for GiMMiK')

    # Generate the GiMMiK kernel
    src = generate_mm(arr, dtype=a.dtype, platform='c-omp',
                      alpha=alpha, beta=beta)
    gimmik_mm = self._build_kernel('gimmik_mm', src,
                                   [np.int32] + [np.intp, np.int32] * 2)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            gimmik_mm(b.ncol, b, b.leaddim, out, out.leaddim)

    return MulKernel()
def get_mm_kernel(mat, alpha=1., beta=0., tol=1e-15):
    """Generate, compile and return the CUDA ``gimmik_mm`` kernel for ``mat``.

    Args:
        mat: Matrix handed to ``filter_tol`` before code generation; its
            ``dtype`` selects the generated kernel's data type.
        alpha: Forwarded to ``generate_mm``.
        beta: Forwarded to ``generate_mm``.
        tol: Forwarded to ``filter_tol`` (presumably the magnitude below
            which entries are treated as zero -- confirm against
            ``filter_tol``).

    Returns:
        The object returned by ``get_kernel`` for the ``gimmik_mm`` entry
        point with the ``'iPiPi'`` argument signature.
    """
    # Clean the matrix, then turn it into CUDA source
    filtered = filter_tol(mat, tol=tol)
    source = generate_mm(filtered, dtype=mat.dtype, platform='cuda',
                         alpha=alpha, beta=beta)

    # Compile the source and look up the entry point
    module = compiler.SourceModule(source)
    return get_kernel(module, "gimmik_mm", 'iPiPi')
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    """Create a kernel computing ``out = a*b`` via a batched GiMMiK C kernel.

    A ``gimmik_mm`` function is generated by ``generate_mm`` (platform
    ``'c'``) from the contents of ``a``. Its address is then handed to a
    ``batch_gemm`` wrapper kernel rendered from the ``'batch-gemm'``
    template.

    Args:
        a: Left operand; must carry the ``'const'`` tag and have at most
            ``self.max_nnz`` non-zero entries.
        b: Right operand; requires ``b.nrow == a.ncol``.
        out: Result matrix; requires ``out.nrow == a.nrow`` and
            ``out.ncol == b.ncol``.
        alpha: Forwarded to ``generate_mm`` (presumably the scale applied
            to ``a*b`` -- confirm against GiMMiK).
        beta: Forwarded to ``generate_mm`` (presumably the scale applied
            to the existing ``out`` -- confirm against GiMMiK).

    Returns:
        A ``ComputeKernel`` whose ``run(queue)`` calls ``batch_gemm``;
        ``queue`` is not used.

    Raises:
        ValueError: If the matrix dimensions are incompatible.
        NotSuitableError: If ``a`` is not constant or is too dense.
    """
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Fetch the matrix once; both the sparsity check and the code
    # generator need it, so avoid a second a.get() round trip
    arr = a.get()

    # Check that A is reasonably sparse
    if np.count_nonzero(arr) > self.max_nnz:
        raise NotSuitableError('Matrix too dense for GiMMiK')

    # Generate the GiMMiK kernel
    src = generate_mm(arr, dtype=a.dtype, platform='c',
                      alpha=alpha, beta=beta)
    gimmik_mm = self._build_kernel('gimmik_mm', src,
                                   [np.int32] + [np.intp, np.int32] * 2)

    # Raw address of the compiled function, passed to the wrapper below
    gimmik_ptr = cast(gimmik_mm, c_void_p).value

    tplargs = {'lib': 'gimmik'}

    # Render our parallel wrapper kernel
    src = self.backend.lookup.get_template('batch-gemm').render(**tplargs)

    # Argument types for batch_gemm
    argt = [np.intp] + [np.int32] * 2 + [np.intp, np.int32] * 2

    # Build
    batch_gemm = self._build_kernel('batch_gemm', src, argt)

    class MulKernel(ComputeKernel):
        def run(self, queue):
            batch_gemm(gimmik_ptr, b.leaddim, b.nblocks, b, b.blocksz,
                       out, out.blocksz)

    return MulKernel()
def mul(self, a, b, out, alpha=1.0, beta=0.0):
    """Create a kernel computing ``out = a*b`` via a batched GiMMiK C kernel.

    A ``gimmik_mm`` function is generated by ``generate_mm`` (platform
    ``'c'``) from the contents of ``a`` and compiled with
    ``self._build_function``. Its address is bound, together with the
    operand details, as the arguments of a ``batch_gemm`` wrapper kernel
    rendered from the ``'batch-gemm'`` template.

    Args:
        a: Left operand; must carry the ``'const'`` tag and have at most
            ``self.max_nnz`` non-zero entries.
        b: Right operand; requires ``b.nrow == a.ncol``.
        out: Result matrix; requires ``out.nrow == a.nrow`` and
            ``out.ncol == b.ncol``.
        alpha: Forwarded to ``generate_mm`` (presumably the scale applied
            to ``a*b`` -- confirm against GiMMiK).
        beta: Forwarded to ``generate_mm`` (presumably the scale applied
            to the existing ``out`` -- confirm against GiMMiK).

    Returns:
        A ``Kernel`` constructed with ``mats=[b, out]`` and
        ``misc=[gimmik_mm]`` whose ``run(queue)`` calls ``batch_gemm``
        with its pre-bound arguments.

    Raises:
        ValueError: If the matrix dimensions are incompatible.
        NotSuitableError: If ``a`` is not constant or is too dense.
    """
    # Ensure the matrices are compatible
    if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
        raise ValueError('Incompatible matrices for out = a*b')

    # Check that A is constant
    if 'const' not in a.tags:
        raise NotSuitableError('GiMMiK requires a constant a matrix')

    # Fetch the matrix once; both the sparsity check and the code
    # generator need it, so avoid a second a.get() round trip
    arr = a.get()

    # Check that A is reasonably sparse
    if np.count_nonzero(arr) > self.max_nnz:
        raise NotSuitableError('Matrix too dense for GiMMiK')

    # Generate and compile the GiMMiK function
    src = generate_mm(arr, dtype=a.dtype, platform='c',
                      alpha=alpha, beta=beta)
    gimmik_mm = self._build_function('gimmik_mm', src, 'iPiPi')

    # Raw address of the compiled function, passed to the wrapper below
    gimmik_ptr = cast(gimmik_mm, c_void_p).value

    # Render our parallel wrapper kernel
    src = self.backend.lookup.get_template('batch-gemm').render(
        lib='gimmik')

    # Build
    batch_gemm = self._build_kernel('batch_gemm', src, 'PiiPiPi')
    batch_gemm.set_args(gimmik_ptr, b.leaddim, b.nblocks, b, b.blocksz,
                        out, out.blocksz)

    class MulKernel(Kernel):
        def run(self, queue):
            batch_gemm()

    # NOTE(review): misc=[gimmik_mm] presumably keeps the compiled
    # function alive while only its raw address is in use -- confirm
    return MulKernel(mats=[b, out], misc=[gimmik_mm])