def test_cusparseScsrmm2_notranspose():
    """Check the raw cusparseScsrmm2 call without any transposition.

    Builds a random 5x3 sparse matrix and a random 3x6 Fortran-ordered dense
    matrix, runs ``1.0 * A * B + 0.5 * C`` (C filled with ones) on the GPU and
    compares against the same expression evaluated on the host.
    """
    # Host-side operands. Entries below 0.1 are zeroed before the CSR
    # conversion.
    dense_a = np.random.laplace(size=(5, 3)).astype(np.float32)
    dense_a[dense_a < 0.1] = 0
    sp_a = sparse.csr_matrix(dense_a, dtype=np.float32)
    host_b = np.random.normal(size=(3, 6)).astype(np.float32, order="f")
    host_c = np.ones((sp_a.shape[0], host_b.shape[1]), dtype=np.float32)
    expected = (sp_a * host_b) + 0.5 * host_c

    # Upload the three CSR arrays and the dense right-hand side.
    dev_values = gpu.to_gpu(sp_a.data)
    dev_rowptr = gpu.to_gpu(sp_a.indptr)
    dev_colidx = gpu.to_gpu(sp_a.indices)
    dev_b = gpu.to_gpu(host_b)

    # NOTE(review): neither the handle nor the descriptor is released in this
    # test.
    handle = cusparse.cusparseCreate()
    descriptor = cusparse.cusparseCreateMatDescr()

    # The result buffer is allocated with swapped dimensions and transposed
    # after download, so it is compared in the host layout of `expected`.
    dev_c = gpu.empty((host_c.shape[1], host_c.shape[0]), dtype=sp_a.dtype)
    dev_c.fill(1.0)

    no_trans = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    out_rows = dev_c.shape[1]
    out_cols = dev_c.shape[0]
    inner = dev_b.shape[0]
    cusparse.cusparseScsrmm2(
        handle,
        no_trans,
        no_trans,
        out_rows,
        out_cols,
        inner,
        sp_a.nnz,
        1.0,
        descriptor,
        dev_values.gpudata,
        dev_rowptr.gpudata,
        dev_colidx.gpudata,
        dev_b.gpudata,
        inner,
        0.5,
        dev_c.gpudata,
        out_rows,
    )

    assert_allclose(dev_c.get().T, expected, rtol=1e-4)
def csrmm2(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    '''
    Calculates C = alpha * op(A) * op(B) + beta * C (single precision).

    A is sparse (CSR), B and C are dense. ``op(X)`` is the transpose of ``X``
    when the matching ``trans`` flag is set and ``X`` itself otherwise.

    Parameters:
        A_gpu: sparse matrix exposing ``shape``, ``nnz``, ``descr``, ``data``,
            ``indptr`` and ``indices``.
        B_gpu: dense matrix, either C- or Fortran-contiguous.
        C_gpu: dense output matrix of shape (rows of op(A), cols of op(B)).
            It is overwritten with the result.
        transA, transB: whether A / B enter the product transposed.
        alpha, beta: scalar factors of the product and of the previous C.

    Returns:
        C_gpu, updated in place.

    Raises:
        AssertionError: if the operand shapes are not compatible.

    NOTE(review): the cuSPARSE documentation states that op(B) = B^T is only
    supported together with op(A) = A. That combination is not checked here.
    '''
    a_rows, a_cols = A_gpu.shape
    if transA:
        ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        n, inner = a_cols, a_rows
    else:
        ta = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n, inner = a_rows, a_cols

    # cuSPARSE reads B as column-major. A C-contiguous B is therefore already
    # "transposed" from its point of view.
    if (B_gpu.flags.c_contiguous and transB) or (B_gpu.flags.f_contiguous and not transB):
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    else:
        tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE

    k, m = (B_gpu.shape[1], B_gpu.shape[0]) if transB else B_gpu.shape
    assert (inner == k) and (n, m) == C_gpu.shape

    ldb = B_gpu.shape[1] if B_gpu.flags.c_contiguous else B_gpu.shape[0]
    ldc = C_gpu.shape[0]

    # if C-major, save result into a temp array and transpose afterwards
    if C_gpu.flags.c_contiguous:
        out = __cuda_get_temp_matrix(C_gpu.shape, C_gpu.dtype)
        if beta != 0.0:
            # The previous C must reach cuSPARSE in column-major layout with
            # leading dimension ldc. A plain byte copy would keep the
            # row-major order and scramble the beta * C term, so transpose.
            cublas.cublasSgeam(cublas_handle, 1, 1, n, m,
                               1.0, C_gpu.gpudata, C_gpu.shape[1],
                               0.0, 0, C_gpu.shape[1],
                               out.gpudata, C_gpu.shape[0])
    else:
        out = C_gpu

    # cuSPARSE expects the row/column counts of the stored sparse matrix A,
    # not those of op(A), so pass A's own shape even when transA is set.
    cusparse.cusparseScsrmm2(cusparse_handle, ta, tb, a_rows, m, a_cols,
                             A_gpu.nnz, alpha, A_gpu.descr,
                             A_gpu.data.gpudata, A_gpu.indptr.gpudata,
                             A_gpu.indices.gpudata,
                             B_gpu.gpudata, ldb, beta, out.gpudata, ldc)

    if C_gpu.flags.c_contiguous:
        # Transpose the column-major temp result back into row-major C.
        cublas.cublasSgeam(cublas_handle, 1, 1, m, n,
                           1.0, out.gpudata, C_gpu.shape[0],
                           0.0, 0, C_gpu.shape[0],
                           C_gpu.gpudata, C_gpu.shape[1])
    return C_gpu
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    '''
    Calculates C = alpha * op(A) * op(B) + beta * C where B is the sparse
    operand and both A and B may be transposed.

    cuSPARSE only accepts the sparse matrix on the left, so the product is
    evaluated in transposed form: C^T = op(B)^T * op(A)^T. C is required to
    be row-major; since cuSPARSE assumes column-major storage, the buffer of
    C already is C^T from its point of view and no extra transpose of the
    result is needed.

    Returns C_gpu, which is updated in place. Raises AssertionError when
    C_gpu is not C-contiguous.

    NOTE(review): the leading dimension used for A presumes A_gpu is stored
    row-major as well. This is not checked.
    '''
    assert C_gpu.flags.c_contiguous
    sparse_rows, sparse_cols = B_gpu.shape

    # The sparse operand enters as op(B)^T, hence the inverted flag.
    if transB:
        op_sparse = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    else:
        op_sparse = cusparse.CUSPARSE_OPERATION_TRANSPOSE

    if not transA:
        op_dense = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        dense_cols = A_gpu.shape[0]
    elif not op_sparse:
        op_dense = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        dense_cols = A_gpu.shape[1]
    else:
        # Both operands would need a transpose flag, which cuSPARSE does not
        # allow according to its docs. Transpose A into a scratch buffer
        # first and pass that buffer untransposed.
        flipped = __cuda_get_temp_matrix(A_gpu.shape, A_gpu.dtype)
        cublas.cublasSgeam(cublas_handle, 1, 1,
                           A_gpu.shape[0], A_gpu.shape[1],
                           1.0, A_gpu.gpudata, A_gpu.shape[1],
                           0.0, A_gpu.gpudata, A_gpu.shape[1],
                           flipped.gpudata, A_gpu.shape[0])
        flipped.shape = A_gpu.shape[1], A_gpu.shape[0]
        flipped.strides = gpuarray._c_contiguous_strides(
            flipped.dtype.itemsize, flipped.shape)
        A_gpu = flipped
        op_dense = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        dense_cols = A_gpu.shape[0]

    lead_dense = A_gpu.shape[1]
    lead_out = C_gpu.shape[1]
    cusparse.cusparseScsrmm2(cusparse_handle, op_sparse, op_dense,
                             sparse_rows, dense_cols, sparse_cols,
                             B_gpu.nnz, alpha, B_gpu.descr,
                             B_gpu.data.gpudata, B_gpu.indptr.gpudata,
                             B_gpu.indices.gpudata,
                             A_gpu.gpudata, lead_dense, beta,
                             C_gpu.gpudata, lead_out)
    return C_gpu
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    """
    Calculates C = alpha * op(A) * op(B) + beta * C with a sparse B; A and B
    can each be transposed.

    Note: cuSPARSE only supports a sparse left operand, so the transposed
    identity C^T = op(B)^T * op(A)^T is used instead. C must be row-major:
    cuSPARSE assumes column-major storage, so a row-major C is read as C^T
    and the outer transpose disappears.

    Returns C_gpu after it has been updated in place. Raises AssertionError
    if C_gpu is not C-contiguous.

    NOTE(review): A_gpu appears to be expected in row-major layout too (its
    second dimension is used as leading dimension). Confirm with callers.
    """
    assert C_gpu.flags.c_contiguous
    m, k = B_gpu.shape

    # Flag for the sparse operand, which takes part as op(B)^T.
    sparse_op = (
        cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        if transB
        else cusparse.CUSPARSE_OPERATION_TRANSPOSE
    )

    if transA and sparse_op:
        # cuSPARSE docs rule out transposing both operands at once, so A is
        # transposed explicitly into a temporary matrix beforehand.
        src_shape = A_gpu.shape
        scratch = __cuda_get_temp_matrix(src_shape, A_gpu.dtype)
        cublas.cublasSgeam(
            cublas_handle,
            1,
            1,
            src_shape[0],
            src_shape[1],
            1.0,
            A_gpu.gpudata,
            src_shape[1],
            0.0,
            A_gpu.gpudata,
            src_shape[1],
            scratch.gpudata,
            src_shape[0],
        )
        scratch.shape = src_shape[1], src_shape[0]
        scratch.strides = gpuarray._c_contiguous_strides(
            scratch.dtype.itemsize, scratch.shape
        )
        A_gpu = scratch
        dense_op = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]
    elif transA:
        dense_op = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        n = A_gpu.shape[1]
    else:
        dense_op = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]

    cusparse.cusparseScsrmm2(
        cusparse_handle,
        sparse_op,
        dense_op,
        m,
        n,
        k,
        B_gpu.nnz,
        alpha,
        B_gpu.descr,
        B_gpu.data.gpudata,
        B_gpu.indptr.gpudata,
        B_gpu.indices.gpudata,
        A_gpu.gpudata,
        A_gpu.shape[1],  # leading dimension of the dense operand
        beta,
        C_gpu.gpudata,
        C_gpu.shape[1],  # leading dimension of the output
    )
    return C_gpu
def csrmm2(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    """
    Calculates C = alpha * op(A) * op(B) + beta * C (single precision).

    A is sparse (CSR), B and C are dense. ``op(X)`` is the transpose of ``X``
    when the matching ``trans`` flag is set and ``X`` itself otherwise.

    Parameters:
        A_gpu: sparse matrix exposing ``shape``, ``nnz``, ``descr``, ``data``,
            ``indptr`` and ``indices``.
        B_gpu: dense matrix, either C- or Fortran-contiguous.
        C_gpu: dense output matrix of shape (rows of op(A), cols of op(B)).
            It is overwritten with the result.
        transA, transB: whether A / B enter the product transposed.
        alpha, beta: scalar factors of the product and of the previous C.

    Returns:
        C_gpu, updated in place.

    Raises:
        AssertionError: if the operand shapes are not compatible.

    NOTE(review): the cuSPARSE documentation states that op(B) = B^T is only
    supported together with op(A) = A. That combination is not checked here.
    """
    a_rows, a_cols = A_gpu.shape
    if transA:
        ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE
        n, inner = a_cols, a_rows
    else:
        ta = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n, inner = a_rows, a_cols

    # cuSPARSE reads B as column-major. A C-contiguous B is therefore already
    # "transposed" from its point of view.
    if (B_gpu.flags.c_contiguous and transB) or (
        B_gpu.flags.f_contiguous and not transB
    ):
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    else:
        tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE

    k, m = (B_gpu.shape[1], B_gpu.shape[0]) if transB else B_gpu.shape
    assert (inner == k) and (n, m) == C_gpu.shape

    ldb = B_gpu.shape[1] if B_gpu.flags.c_contiguous else B_gpu.shape[0]
    ldc = C_gpu.shape[0]

    # if C-major, save result into a temp array and transpose afterwards
    if C_gpu.flags.c_contiguous:
        out = __cuda_get_temp_matrix(C_gpu.shape, C_gpu.dtype)
        if beta != 0.0:
            # The previous C must reach cuSPARSE in column-major layout with
            # leading dimension ldc. A plain byte copy would keep the
            # row-major order and scramble the beta * C term, so transpose.
            cublas.cublasSgeam(
                cublas_handle,
                1,
                1,
                n,
                m,
                1.0,
                C_gpu.gpudata,
                C_gpu.shape[1],
                0.0,
                0,
                C_gpu.shape[1],
                out.gpudata,
                C_gpu.shape[0],
            )
    else:
        out = C_gpu

    # cuSPARSE expects the row/column counts of the stored sparse matrix A,
    # not those of op(A), so pass A's own shape even when transA is set.
    cusparse.cusparseScsrmm2(
        cusparse_handle,
        ta,
        tb,
        a_rows,
        m,
        a_cols,
        A_gpu.nnz,
        alpha,
        A_gpu.descr,
        A_gpu.data.gpudata,
        A_gpu.indptr.gpudata,
        A_gpu.indices.gpudata,
        B_gpu.gpudata,
        ldb,
        beta,
        out.gpudata,
        ldc,
    )

    if C_gpu.flags.c_contiguous:
        # Transpose the column-major temp result back into row-major C.
        cublas.cublasSgeam(
            cublas_handle,
            1,
            1,
            m,
            n,
            1.0,
            out.gpudata,
            C_gpu.shape[0],
            0.0,
            0,
            C_gpu.shape[0],
            C_gpu.gpudata,
            C_gpu.shape[1],
        )
    return C_gpu