def todense(self, out=None, allocator=mem_alloc, stream=None):
    """Convert this CSR matrix into a dense GPU array.

    Uses the single-precision cuSPARSE/cuBLAS routines (``Scsr2dense`` and
    ``Sgeam``), so the data is presumably float32 -- TODO confirm.

    Args:
        out: Optional preallocated dense array to fill. When ``None`` a new
            C-ordered array of shape ``self.shape`` and dtype ``self.dtype``
            is allocated with ``allocator``.
        allocator: Allocator passed to ``gpuarray.empty`` for the result and
            for the scratch buffer.
        stream: Optional stream. When given, the cuSPARSE and cuBLAS handles
            are bound to ``stream.handle`` for the duration of the call and
            reset to stream 0 afterwards (also when a call raises).

    Returns:
        ``out`` -- either the array passed in or the newly allocated one.
    """
    if out is None:
        out = gpuarray.empty(self.shape, allocator=allocator,
                             dtype=self.dtype, order="C")

    if self.nnz == 0:  # weird but happens
        out.fill(0.0, stream=stream)
        return out

    # The conversion result has to be transposed out-of-place to obtain a
    # row-major output, so a row-major `out` needs a scratch buffer; otherwise
    # the conversion can write into `out` directly.
    row_major = out.flags.c_contiguous
    if row_major:
        tmp = gpuarray.empty(self.shape, allocator=allocator,
                             dtype=self.dtype, order="C")
    else:
        tmp = out

    try:
        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, stream.handle)
            cublas.cublasSetStream(cublas_handle, stream.handle)

        # Argument order: handle, m, n, descrA, csrValA, csrRowPtrA,
        # csrColIndA, A, lda.  The values array (csrValA) was missing.
        cusparse.cusparseScsr2dense(
            cusparse_handle, self.shape[0], self.shape[1], self.descr,
            self.data.gpudata, self.indptr.gpudata, self.indices.gpudata,
            tmp.gpudata, tmp.shape[0])

        if row_major:
            # out = 1.0 * tmp^T; beta is 0.0, so a null pointer is passed
            # for the second operand.
            cublas.cublasSgeam(
                cublas_handle, 1, 1, tmp.shape[1], tmp.shape[0],
                1.0, tmp.gpudata, tmp.shape[0],
                0.0, 0, tmp.shape[0],
                out.gpudata, out.shape[1])
    finally:
        # Never leave the shared handles bound to the caller's stream.
        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, 0)
            cublas.cublasSetStream(cublas_handle, 0)

    return out
def test_cublasSgeam(self):
    """Check cublasSgeam against ``alpha * a + beta * b`` computed with NumPy."""
    rows, cols = 2, 3

    # Random float32 operands on the host and their device copies.
    lhs = np.random.rand(rows, cols).astype(np.float32)
    rhs = np.random.rand(rows, cols).astype(np.float32)
    lhs_gpu = gpuarray.to_gpu(lhs.copy())
    rhs_gpu = gpuarray.to_gpu(rhs.copy())
    result_gpu = gpuarray.zeros_like(lhs_gpu)

    # Random float32 scaling factors.
    alpha = np.float32(np.random.rand())
    beta = np.float32(np.random.rand())

    cublas.cublasSgeam(
        self.cublas_handle, 'n', 'n', rows, cols,
        alpha, lhs_gpu.gpudata, rows,
        beta, rhs_gpu.gpudata, rows,
        result_gpu.gpudata, rows,
    )

    expected = alpha * lhs + beta * rhs
    assert np.allclose(result_gpu.get(), expected)
def todense(self, out=None, allocator=mem_alloc, stream=None):
    """Convert this CSR matrix into a dense GPU array.

    Uses the single-precision cuSPARSE/cuBLAS routines (``Scsr2dense`` and
    ``Sgeam``), so the data is presumably float32 -- TODO confirm.

    Args:
        out: Optional preallocated dense array to fill. When ``None`` a new
            C-ordered array of shape ``self.shape`` and dtype ``self.dtype``
            is allocated with ``allocator``.
        allocator: Allocator passed to ``gpuarray.empty`` for the result and
            for the scratch buffer.
        stream: Optional stream. When given, the cuSPARSE and cuBLAS handles
            are bound to ``stream.handle`` for the duration of the call and
            reset to stream 0 afterwards (also when a call raises).

    Returns:
        ``out`` -- either the array passed in or the newly allocated one.
    """
    if out is None:
        out = gpuarray.empty(
            self.shape, allocator=allocator, dtype=self.dtype, order="C"
        )

    if self.nnz == 0:  # weird but happens
        out.fill(0.0, stream=stream)
        return out

    # The conversion result has to be transposed out-of-place to obtain a
    # row-major output, so a row-major `out` needs a scratch buffer; otherwise
    # the conversion can write into `out` directly.
    needs_transpose = out.flags.c_contiguous
    if needs_transpose:
        tmp = gpuarray.empty(
            self.shape, allocator=allocator, dtype=self.dtype, order="C"
        )
    else:
        tmp = out

    try:
        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, stream.handle)
            cublas.cublasSetStream(cublas_handle, stream.handle)

        # Argument order: handle, m, n, descrA, csrValA, csrRowPtrA,
        # csrColIndA, A, lda.  The values array (csrValA) was missing.
        cusparse.cusparseScsr2dense(
            cusparse_handle,
            self.shape[0],
            self.shape[1],
            self.descr,
            self.data.gpudata,
            self.indptr.gpudata,
            self.indices.gpudata,
            tmp.gpudata,
            tmp.shape[0],
        )

        if needs_transpose:
            # out = 1.0 * tmp^T; beta is 0.0, so a null pointer is passed
            # for the second operand.
            cublas.cublasSgeam(
                cublas_handle,
                1,
                1,
                tmp.shape[1],
                tmp.shape[0],
                1.0,
                tmp.gpudata,
                tmp.shape[0],
                0.0,
                0,
                tmp.shape[0],
                out.gpudata,
                out.shape[1],
            )
    finally:
        # Never leave the shared handles bound to the caller's stream.
        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, 0)
            cublas.cublasSetStream(cublas_handle, 0)

    return out
def test_cublasSgeam(self):
    """Verify that cublasSgeam yields ``alpha*a + beta*b`` for random inputs."""
    shape = (2, 3)
    n_rows, n_cols = shape

    # Host-side random float32 matrices, then uploaded copies.
    host_a = np.random.rand(*shape).astype(np.float32)
    host_b = np.random.rand(*shape).astype(np.float32)
    dev_a = gpuarray.to_gpu(host_a.copy())
    dev_b = gpuarray.to_gpu(host_b.copy())
    dev_c = gpuarray.zeros_like(dev_a)

    # Random float32 coefficients for the linear combination.
    alpha = np.float32(np.random.rand())
    beta = np.float32(np.random.rand())

    cublas.cublasSgeam(self.cublas_handle, 'n', 'n', n_rows, n_cols,
                       alpha, dev_a.gpudata, n_rows,
                       beta, dev_b.gpudata, n_rows,
                       dev_c.gpudata, n_rows)

    reference = alpha * host_a + beta * host_b
    assert np.allclose(dev_c.get(), reference)
def csrmm2(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    '''
    Calculates C = alpha * op(A) * op(B) + beta * C.
    Where A is sparse (CSR) and both A and B can be transposed.

    Uses the single-precision routines (``cusparseScsrmm2`` / ``cublasSgeam``),
    so the operands are presumably float32 -- TODO confirm.

    Args:
        A_gpu: Sparse CSR matrix exposing ``shape``, ``nnz``, ``descr``,
            ``data``, ``indptr`` and ``indices``.
        B_gpu: Dense GPU matrix, row- or column-major.
        C_gpu: Dense GPU output matrix, row- or column-major; updated in place.
        transA: Use ``A^T`` instead of ``A``.
        transB: Use ``B^T`` instead of ``B``.
        alpha: Scale applied to the product.
        beta: Scale applied to the previous contents of ``C_gpu``.

    Returns:
        ``C_gpu``.

    Raises:
        AssertionError: if the inner dimensions differ or ``C_gpu`` does not
            have the shape of the product.
    '''
    if transA:
        ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        n, inner_a = A_gpu.shape[1], A_gpu.shape[0]
    else:
        ta = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n, inner_a = A_gpu.shape

    # cuSPARSE reads dense operands as column-major, so a row-major B already
    # looks transposed to it; pick the flag that yields the requested op(B).
    if (B_gpu.flags.c_contiguous and transB) or \
            (B_gpu.flags.f_contiguous and not transB):
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    else:
        tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE

    k, m = (B_gpu.shape[1], B_gpu.shape[0]) if transB else B_gpu.shape
    assert (inner_a == k) and (n, m) == C_gpu.shape

    ldb = B_gpu.shape[1] if B_gpu.flags.c_contiguous else B_gpu.shape[0]
    ldc = C_gpu.shape[0]

    # if C-major, save result into a temp array and transpose afterwards
    c_is_row_major = C_gpu.flags.c_contiguous
    if c_is_row_major:
        out = __cuda_get_temp_matrix(C_gpu.shape, C_gpu.dtype)
        if beta != 0.0:
            # `out` is consumed as a column-major (n x m) matrix, so the
            # row-major contents of C must be transposed into it. A raw byte
            # copy would make beta scale a wrongly laid-out matrix.
            cublas.cublasSgeam(cublas_handle, 1, 1, n, m,
                               1.0, C_gpu.gpudata, m,
                               0.0, 0, m,
                               out.gpudata, n)
    else:
        out = C_gpu

    # Argument order: handle, transA, transB, m, n, k, nnz, alpha, descrA,
    # csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc.  The values
    # array (csrValA) was missing.
    cusparse.cusparseScsrmm2(cusparse_handle, ta, tb, n, m, k, A_gpu.nnz,
                             alpha, A_gpu.descr, A_gpu.data.gpudata,
                             A_gpu.indptr.gpudata, A_gpu.indices.gpudata,
                             B_gpu.gpudata, ldb, beta, out.gpudata, ldc)

    if c_is_row_major:
        # Transpose the column-major scratch result back into row-major C.
        cublas.cublasSgeam(cublas_handle, 1, 1, m, n,
                           1.0, out.gpudata, C_gpu.shape[0],
                           0.0, 0, C_gpu.shape[0],
                           C_gpu.gpudata, C_gpu.shape[1])
    return C_gpu
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    '''
    Calculates C = alpha * op(A) * op(B) + beta * C.
    Where B is sparse (CSR) and both A and B can be transposed.

    Note: cuSPARSE only allows for sparse A, so we need some tricks:
    Essentially, we will compute C^T = B^T * A^T
    By enforcing C to be row-major, can drop its transpose since cuSPARSE
    assumes column-major. Thus, we only need to compute
    C = op(B)^T * op(A)^T

    Args:
        A_gpu: Dense GPU matrix (its leading dimension is taken as
            ``A_gpu.shape[1]``, i.e. it is presumably row-major -- TODO confirm).
        B_gpu: Sparse CSR matrix exposing ``shape``, ``nnz``, ``descr``,
            ``data``, ``indptr`` and ``indices``.
        C_gpu: Dense, row-major GPU output matrix; updated in place.
        transA: Use ``A^T`` instead of ``A``.
        transB: Use ``B^T`` instead of ``B``.
        alpha: Scale applied to the product.
        beta: Scale applied to the previous contents of ``C_gpu``.

    Returns:
        ``C_gpu``.

    Raises:
        AssertionError: if ``C_gpu`` is not C-contiguous.
    '''
    assert C_gpu.flags.c_contiguous
    m, k = B_gpu.shape

    # The sparse operand is used as op(B)^T, hence the inverted flag.
    if transB:
        ta = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    else:
        ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE

    if not transA:
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]
    elif ta == cusparse.CUSPARSE_OPERATION_TRANSPOSE:
        # we can't have ta and tb true at the same time according to cuSPARSE
        # docs, so materialise A^T explicitly and pass it untransposed.
        out = __cuda_get_temp_matrix(A_gpu.shape, A_gpu.dtype)
        cublas.cublasSgeam(cublas_handle, 1, 1, A_gpu.shape[0], A_gpu.shape[1],
                           1.0, A_gpu.gpudata, A_gpu.shape[1],
                           0.0, A_gpu.gpudata, A_gpu.shape[1],
                           out.gpudata, A_gpu.shape[0])
        # Relabel the scratch buffer so it describes the transposed matrix.
        out.shape = A_gpu.shape[1], A_gpu.shape[0]
        out.strides = gpuarray._c_contiguous_strides(out.dtype.itemsize, out.shape)
        A_gpu = out
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]
    else:
        tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        n = A_gpu.shape[1]

    ldb = A_gpu.shape[1]
    ldc = C_gpu.shape[1]

    # Argument order: handle, transA, transB, m, n, k, nnz, alpha, descrA,
    # csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc.  The values
    # array (csrValA) was missing.
    cusparse.cusparseScsrmm2(cusparse_handle, ta, tb, m, n, k, B_gpu.nnz,
                             alpha, B_gpu.descr, B_gpu.data.gpudata,
                             B_gpu.indptr.gpudata, B_gpu.indices.gpudata,
                             A_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)
    return C_gpu
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    """
    Calculates C = alpha * op(A) * op(B) + beta * C.
    Where B is sparse (CSR) and both A and B can be transposed.

    Note: cuSPARSE only allows for sparse A, so we need some tricks:
    Essentially, we will compute C^T = B^T * A^T
    By enforcing C to be row-major, can drop its transpose since cuSPARSE
    assumes column-major. Thus, we only need to compute
    C = op(B)^T * op(A)^T

    Args:
        A_gpu: Dense GPU matrix (its leading dimension is taken as
            ``A_gpu.shape[1]``, i.e. it is presumably row-major -- TODO confirm).
        B_gpu: Sparse CSR matrix exposing ``shape``, ``nnz``, ``descr``,
            ``data``, ``indptr`` and ``indices``.
        C_gpu: Dense, row-major GPU output matrix; updated in place.
        transA: Use ``A^T`` instead of ``A``.
        transB: Use ``B^T`` instead of ``B``.
        alpha: Scale applied to the product.
        beta: Scale applied to the previous contents of ``C_gpu``.

    Returns:
        ``C_gpu``.

    Raises:
        AssertionError: if ``C_gpu`` is not C-contiguous.
    """
    assert C_gpu.flags.c_contiguous
    m, k = B_gpu.shape

    # The sparse operand is used as op(B)^T, hence the inverted flag.
    ta = (
        cusparse.CUSPARSE_OPERATION_TRANSPOSE
        if not transB
        else cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    )
    sparse_is_transposed = ta == cusparse.CUSPARSE_OPERATION_TRANSPOSE

    if transA and sparse_is_transposed:
        # we can't have ta and tb true at the same time according to cuSPARSE
        # docs, so materialise A^T explicitly and pass it untransposed.
        out = __cuda_get_temp_matrix(A_gpu.shape, A_gpu.dtype)
        cublas.cublasSgeam(
            cublas_handle,
            1,
            1,
            A_gpu.shape[0],
            A_gpu.shape[1],
            1.0,
            A_gpu.gpudata,
            A_gpu.shape[1],
            0.0,
            A_gpu.gpudata,
            A_gpu.shape[1],
            out.gpudata,
            A_gpu.shape[0],
        )
        # Relabel the scratch buffer so it describes the transposed matrix.
        out.shape = A_gpu.shape[1], A_gpu.shape[0]
        out.strides = gpuarray._c_contiguous_strides(out.dtype.itemsize, out.shape)
        A_gpu = out
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]
    elif transA:
        tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        n = A_gpu.shape[1]
    else:
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]

    ldb = A_gpu.shape[1]
    ldc = C_gpu.shape[1]

    # Argument order: handle, transA, transB, m, n, k, nnz, alpha, descrA,
    # csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc.  The values
    # array (csrValA) was missing.
    cusparse.cusparseScsrmm2(
        cusparse_handle,
        ta,
        tb,
        m,
        n,
        k,
        B_gpu.nnz,
        alpha,
        B_gpu.descr,
        B_gpu.data.gpudata,
        B_gpu.indptr.gpudata,
        B_gpu.indices.gpudata,
        A_gpu.gpudata,
        ldb,
        beta,
        C_gpu.gpudata,
        ldc,
    )
    return C_gpu
def csrmm2(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    """
    Calculates C = alpha * op(A) * op(B) + beta * C.
    Where A is sparse (CSR) and both A and B can be transposed.

    Uses the single-precision routines (``cusparseScsrmm2`` / ``cublasSgeam``),
    so the operands are presumably float32 -- TODO confirm.

    Args:
        A_gpu: Sparse CSR matrix exposing ``shape``, ``nnz``, ``descr``,
            ``data``, ``indptr`` and ``indices``.
        B_gpu: Dense GPU matrix, row- or column-major.
        C_gpu: Dense GPU output matrix, row- or column-major; updated in place.
        transA: Use ``A^T`` instead of ``A``.
        transB: Use ``B^T`` instead of ``B``.
        alpha: Scale applied to the product.
        beta: Scale applied to the previous contents of ``C_gpu``.

    Returns:
        ``C_gpu``.

    Raises:
        AssertionError: if the inner dimensions differ or ``C_gpu`` does not
            have the shape of the product.
    """
    if transA:
        ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        n, inner_a = A_gpu.shape[1], A_gpu.shape[0]
    else:
        ta = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n, inner_a = A_gpu.shape

    # cuSPARSE reads dense operands as column-major, so a row-major B already
    # looks transposed to it; pick the flag that yields the requested op(B).
    b_flags = B_gpu.flags
    if (b_flags.c_contiguous and transB) or (b_flags.f_contiguous and not transB):
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    else:
        tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE

    k, m = (B_gpu.shape[1], B_gpu.shape[0]) if transB else B_gpu.shape
    assert (inner_a == k) and (n, m) == C_gpu.shape

    ldb = B_gpu.shape[1] if B_gpu.flags.c_contiguous else B_gpu.shape[0]
    ldc = C_gpu.shape[0]

    # if C-major, save result into a temp array and transpose afterwards
    c_is_row_major = C_gpu.flags.c_contiguous
    if c_is_row_major:
        out = __cuda_get_temp_matrix(C_gpu.shape, C_gpu.dtype)
        if beta != 0.0:
            # `out` is consumed as a column-major (n x m) matrix, so the
            # row-major contents of C must be transposed into it. A raw byte
            # copy would make beta scale a wrongly laid-out matrix.
            cublas.cublasSgeam(
                cublas_handle,
                1,
                1,
                n,
                m,
                1.0,
                C_gpu.gpudata,
                m,
                0.0,
                0,
                m,
                out.gpudata,
                n,
            )
    else:
        out = C_gpu

    # Argument order: handle, transA, transB, m, n, k, nnz, alpha, descrA,
    # csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc.  The values
    # array (csrValA) was missing.
    cusparse.cusparseScsrmm2(
        cusparse_handle,
        ta,
        tb,
        n,
        m,
        k,
        A_gpu.nnz,
        alpha,
        A_gpu.descr,
        A_gpu.data.gpudata,
        A_gpu.indptr.gpudata,
        A_gpu.indices.gpudata,
        B_gpu.gpudata,
        ldb,
        beta,
        out.gpudata,
        ldc,
    )

    if c_is_row_major:
        # Transpose the column-major scratch result back into row-major C.
        cublas.cublasSgeam(
            cublas_handle,
            1,
            1,
            m,
            n,
            1.0,
            out.gpudata,
            C_gpu.shape[0],
            0.0,
            0,
            C_gpu.shape[0],
            C_gpu.gpudata,
            C_gpu.shape[1],
        )
    return C_gpu