def _inv_gpu(b):
    """Compute the inverse of each matrix in the batch ``b`` on the GPU.

    Runs a cuBLAS batched LU factorization (``getrfBatched``) followed by
    the batched inversion routine (``getriBatched``).

    Returns:
        A pair ``(c, info)`` where ``c`` holds the inverses and ``info``
        holds one cuBLAS status code per matrix (nonzero means the
        factorization failed, e.g. the matrix was singular).
    """
    # View ``b`` as a size-1 minibatch if necessary, and work on a copy
    # because getrfBatched factorizes its input in place.
    batch = matmul._as_batch_mat(b).copy()
    size = batch.shape[1]
    count = len(batch)
    # Pivot indices produced by the LU factorization.
    pivots = cuda.cupy.empty((size, count), dtype=numpy.int32)
    # Destination array for the inverses.
    out = cuda.cupy.empty_like(batch)
    # Per-matrix execution status written by cuBLAS.
    info = cuda.cupy.empty(count, dtype=numpy.int32)
    in_ptrs = matmul._mat_ptrs(batch)
    out_ptrs = matmul._mat_ptrs(out)
    _, lda = matmul._get_ld(batch)
    _, ldo = matmul._get_ld(out)
    handle = cuda.Device().cublas_handle
    # Select the single- or double-precision cuBLAS entry points.
    if b.dtype == numpy.float32:
        getrf = cuda.cublas.sgetrfBatched
        getri = cuda.cublas.sgetriBatched
    elif b.dtype == numpy.float64:
        getrf = cuda.cublas.dgetrfBatched
        getri = cuda.cublas.dgetriBatched
    else:
        assert False
    getrf(handle, size, in_ptrs.data.ptr, lda, pivots.data.ptr,
          info.data.ptr, count)
    getri(handle, size, in_ptrs.data.ptr, lda, pivots.data.ptr,
          out_ptrs.data.ptr, ldo, info.data.ptr, count)
    return out, info
def _inv_gpu(b):
    """Invert every matrix of the batch ``b`` on the GPU via cuBLAS.

    NOTE(review): this definition duplicates an earlier ``_inv_gpu`` in
    this file and shadows it at import time — confirm which copy is
    intended and remove the other.

    Returns:
        A pair ``(c, info)``: the batch of inverses and a per-matrix
        status array (nonzero entries flag singular matrices).
    """
    # Promote to a batch of matrices (size-1 minibatch if necessary) and
    # copy, since the LU factorization overwrites the elements in place.
    mats = matmul._as_batch_mat(b).copy()
    dim = mats.shape[1]
    n_mats = len(mats)
    # Pivot indices for the LU factorization.
    piv = cuda.cupy.empty((dim, n_mats), dtype=numpy.int32)
    # Output buffer for the inverses.
    inv = cuda.cupy.empty_like(mats)
    # Per-matrix execution status (success, or singular-matrix indicator).
    status = cuda.cupy.empty(n_mats, dtype=numpy.int32)
    mats_p = matmul._mat_ptrs(mats)
    inv_p = matmul._mat_ptrs(inv)
    _, ld_a = matmul._get_ld(mats)
    _, ld_c = matmul._get_ld(inv)
    h = cuda.Device().cublas_handle
    if b.dtype == numpy.float32:
        cuda.cublas.sgetrfBatched(
            h, dim, mats_p.data.ptr, ld_a, piv.data.ptr,
            status.data.ptr, n_mats)
        cuda.cublas.sgetriBatched(
            h, dim, mats_p.data.ptr, ld_a, piv.data.ptr,
            inv_p.data.ptr, ld_c, status.data.ptr, n_mats)
    elif b.dtype == numpy.float64:
        cuda.cublas.dgetrfBatched(
            h, dim, mats_p.data.ptr, ld_a, piv.data.ptr,
            status.data.ptr, n_mats)
        cuda.cublas.dgetriBatched(
            h, dim, mats_p.data.ptr, ld_a, piv.data.ptr,
            inv_p.data.ptr, ld_c, status.data.ptr, n_mats)
    else:
        assert False
    return inv, status
def _det_gpu(b):
    """Compute the determinant of each matrix in the batch ``b`` on the GPU.

    Runs a cuBLAS batched LU factorization and forms the determinant as
    the product of the diagonal of the factored matrix, with the sign
    corrected by the parity of the row pivots.

    Returns:
        A pair ``(det, info)``: the determinants and a per-matrix status
        array (a nonzero entry means the factorization hit a zero pivot,
        i.e. the matrix was singular and ``det`` is unreliable there).
    """
    # Change the shape of the array to be a size=1 minibatch if necessary.
    # Also copy the matrix as the elements will be modified in-place by
    # the LU factorization.
    a = matmul._as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array: row i receives the pivot indices of matrix i.
    p = cuda.cupy.zeros((n_matrices, n), dtype='int32')
    # Per-matrix execution status. cuBLAS getrfBatched writes one 32-bit
    # int per matrix, so this buffer must be int32: the previous
    # numpy.intp (64-bit on most platforms) mismatched the element size
    # and produced garbled status values. Also matches _inv_gpu.
    info = cuda.cupy.zeros(n_matrices, dtype=numpy.int32)
    ap = matmul._mat_ptrs(a)
    _, lda = matmul._get_ld(a)
    if b.dtype == numpy.float32:
        cuda.cublas.sgetrfBatched(
            cuda.Device().cublas_handle, n, ap.data.ptr, lda, p.data.ptr,
            info.data.ptr, n_matrices)
    elif b.dtype == numpy.float64:
        cuda.cublas.dgetrfBatched(
            cuda.Device().cublas_handle, n, ap.data.ptr, lda, p.data.ptr,
            info.data.ptr, n_matrices)
    else:
        assert False
    # After factorization the diagonal of ``a`` holds the diagonal of U;
    # its product is the determinant up to the pivot sign.
    det = cuda.cupy.prod(a.diagonal(axis1=1, axis2=2), axis=1)
    # Pivot entries are 1-based; p[i, j] != j + 1 marks an actual row
    # swap, and each swap flips the determinant's sign.
    rng = cuda.cupy.arange(1, n + 1, dtype='int32')
    parity = cuda.cupy.sum(p != rng, axis=1) % 2
    sign = 1. - 2. * parity.astype(b.dtype, copy=False)
    return det * sign, info