def perform(self, node, inputs, outputs):
    context = inputs[0][0].context

    # Input matrix.
    A = inputs[0]

    l, n = A.shape
    if l != n:
        raise ValueError('A must be a square matrix')

    lda = max(1, n)

    # cusolver operates on F ordered matrices
    if not self.inplace:
        LU = pygpu.array(A, copy=True, order='F')
    else:
        LU = A.T if A.flags['C_CONTIGUOUS'] else A

    LU_ptr = LU.gpudata

    with context:
        workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
            context.cusolver_handle, n, n, LU_ptr, lda)

        workspace = pygpu.zeros(workspace_size, dtype='float32',
                                context=context)

        pivots = pygpu.zeros(n, dtype='int32', context=context)

        dev_info = pygpu.zeros((1,), dtype='int32', context=context)

        workspace_ptr = workspace.gpudata
        pivots_ptr = pivots.gpudata
        dev_info_ptr = dev_info.gpudata

        cusolver.cusolverDnSgetrf(
            context.cusolver_handle, n, n, LU_ptr, lda,
            workspace_ptr, pivots_ptr, dev_info_ptr)

        if self.check_output:
            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('LU decomposition failed')

    outputs[1][0] = pivots
    outputs[0][0] = LU
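# A minimal host-side sketch (not part of the op above) of how the packed LU
# output and the pivot vector returned by getrf are laid out. scipy's
# lu_factor stands in for cusolverDnSgetrf here, since both follow the LAPACK
# getrf convention; all names below are illustrative only.
import numpy as np
from scipy.linalg import lu_factor

A_host = np.random.rand(4, 4).astype(np.float32)
lu, piv = lu_factor(A_host)          # packed L\U factors and 0-based pivots

L = np.tril(lu, -1) + np.eye(4, dtype=np.float32)   # unit lower triangle
U = np.triu(lu)                                      # upper triangle

# Turn the pivot swaps into a row permutation. cusolver's pivots are 1-based
# while scipy's are 0-based, but the swap semantics are the same: row i was
# interchanged with row piv[i].
perm = np.arange(4)
for i, p in enumerate(piv):
    perm[[i, p]] = perm[[p, i]]

print(np.allclose(A_host[perm], L @ U, atol=1e-5))   # P*A == L*U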
def thunk():
    context = inputs[0][0].context

    # Size of the matrices to invert.
    z = outputs[0]

    # Matrix.
    A = inputs[0][0]

    # Solution vectors.
    b = inputs[1][0]

    assert len(A.shape) == 2
    assert len(b.shape) == 2

    if self.trans in ['T', 'C']:
        trans = 1
        l, n = A.shape
        k, m = b.shape
    elif self.trans == 'N':
        trans = 0
        n, l = A.shape
        k, m = b.shape
    else:
        raise ValueError('Invalid value for trans')
    if l != n:
        raise ValueError('A must be a square matrix')
    if n != k:
        raise ValueError('A and b must be aligned.')

    lda = max(1, n)
    # Leading dimension of the F-ordered copy of b made below.
    ldb = max(1, k)

    # We copy A and b as cusolver operates inplace
    b = gpuarray.array(b, copy=True, order='F')
    if not self.inplace:
        A = gpuarray.array(A, copy=True)
    A_ptr = A.gpudata
    b_ptr = b.gpudata

    # cusolver expects a F ordered matrix, but A is not explicitly
    # converted between C and F order, instead we switch the
    # "transpose" flag.
    if A.flags['C_CONTIGUOUS']:
        trans = 1 - trans

    workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
        cusolver_handle, n, n, A_ptr, lda)

    if (thunk.workspace is None or
            thunk.workspace.size != workspace_size):
        thunk.workspace = gpuarray.zeros((workspace_size,),
                                         dtype='float32',
                                         context=context)

    # getrf returns one integer pivot per row of the n x n matrix.
    if thunk.pivots is None or thunk.pivots.size != n:
        thunk.pivots = gpuarray.zeros((n,), dtype='int32',
                                      context=context)

    if thunk.dev_info is None:
        thunk.dev_info = gpuarray.zeros((1,), dtype='int32',
                                        context=context)

    workspace_ptr = thunk.workspace.gpudata
    pivots_ptr = thunk.pivots.gpudata
    dev_info_ptr = thunk.dev_info.gpudata

    cusolver.cusolverDnSgetrf(
        cusolver_handle, n, n, A_ptr, lda,
        workspace_ptr, pivots_ptr, dev_info_ptr)

    cusolver.cusolverDnSgetrs(
        cusolver_handle, trans, n, m, A_ptr, lda,
        pivots_ptr, b_ptr, ldb, dev_info_ptr)

    z[0] = b
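# A self-contained sketch (not Theano code) of the buffer-caching idiom used
# by the thunk above: scratch buffers are stored as attributes on the thunk
# closure itself, start out as None, and are only reallocated when the
# required size changes, so repeated calls reuse the same device memory.
import numpy as np

def make_cached_thunk():
    def thunk(n):
        if thunk.workspace is None or thunk.workspace.size != n:
            thunk.workspace = np.zeros(n, dtype='float32')
        return thunk.workspace
    # Attributes must exist before the first call.
    thunk.workspace = None
    return thunk

thunk = make_cached_thunk()
buf1 = thunk(8)
buf2 = thunk(8)
print(buf1 is buf2)   # True: the buffer is reused across calls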
import numpy as np
import scipy as sp
import scipy.linalg
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

# cuSOLVER dense handle.
h = solver.cusolverDnCreate()

x = np.asarray([[1.80, 2.88, 2.05, -0.89],
                [5.25, -2.95, -0.95, -3.80],
                [1.58, -2.69, -2.90, -1.04],
                [-1.11, -0.66, -0.59, 0.80]]).astype(np.float32)

# Need to copy transposed matrix because T only returns a view:
m, n = x.shape
x_gpu = gpuarray.to_gpu(x.T.copy())

# Set up work buffers:
Lwork = solver.cusolverDnSgetrf_bufferSize(h, m, n, x_gpu.gpudata, m)
workspace_gpu = gpuarray.zeros(Lwork, np.float32)
devipiv_gpu = gpuarray.zeros(min(m, n), np.int32)
devinfo_gpu = gpuarray.zeros(1, np.int32)

# Compute:
solver.cusolverDnSgetrf(h, m, n, x_gpu.gpudata, m, workspace_gpu.gpudata,
                        devipiv_gpu.gpudata, devinfo_gpu.gpudata)

# Confirm that the solution is correct by checking against the result obtained
# with scipy; adjust the dimensions of the computed lower/upper triangular
# matrices to facilitate comparison if the original matrix was not square:
l_cuda = np.tril(x_gpu.get().T, -1)
u_cuda = np.triu(x_gpu.get().T)
if m < n:
    l_cuda = l_cuda[:, :m]
else:
    u_cuda = u_cuda[:n, :]
p, l, u = sp.linalg.lu(x)

# Only check values in the lower triangle starting from the first off-diagonal:
print('lower triangular matrix is correct:',
      np.allclose(np.tril(l, -1), l_cuda))
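# A possible continuation of the check above (not part of the original
# snippet): verify the factorization directly by rebuilding P*A from the
# packed factors and the 1-based pivot indices returned in devipiv_gpu.
# Assumes the square example matrix used above.
ipiv = devipiv_gpu.get()
perm = np.arange(m)
for i, piv in enumerate(ipiv):
    perm[[i, piv - 1]] = perm[[piv - 1, i]]        # row i was swapped with row piv-1
l_full = l_cuda + np.eye(m, dtype=np.float32)      # getrf stores a unit lower triangle
print('P*A == L*U:', np.allclose(x[perm], l_full.dot(u_cuda), atol=1e-4))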
def perform(self, node, inputs, outputs):
    context = inputs[0][0].context

    # Size of the matrices to invert.
    z = outputs[0]

    # Matrix.
    A = inputs[0]

    # Solution vectors.
    b = inputs[1]

    assert len(A.shape) == 2
    assert len(b.shape) == 2

    if self.trans in ['T', 'C']:
        trans = 1
        l, n = A.shape
        k, m = b.shape
    elif self.trans == 'N':
        trans = 0
        n, l = A.shape
        k, m = b.shape
    else:
        raise ValueError('Invalid value for trans')
    if l != n:
        raise ValueError('A must be a square matrix')
    if n != k:
        raise ValueError('A and b must be aligned.')

    lda = max(1, n)
    ldb = max(1, k)

    # We copy A and b as cusolver operates inplace
    b = pygpu.array(b, copy=True, order='F')
    if not self.inplace:
        A = pygpu.array(A, copy=True)
    A_ptr = A.gpudata
    b_ptr = b.gpudata

    # cusolver expects a F ordered matrix, but A is not explicitly
    # converted between C and F order, instead we switch the
    # "transpose" flag.
    if A.flags['C_CONTIGUOUS']:
        trans = 1 - trans

    if self.A_structure == 'symmetric':
        with context:
            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                context.cusolver_handle, 0, n, A_ptr, lda)

        workspace = pygpu.zeros(workspace_size, dtype='float32',
                                context=context)

        dev_info = pygpu.zeros((1,), dtype='int32', context=context)

        workspace_ptr = workspace.gpudata
        dev_info_ptr = dev_info.gpudata

        with context:
            cusolver.cusolverDnSpotrf(
                context.cusolver_handle, 0, n, A_ptr, lda,
                workspace_ptr, workspace_size, dev_info_ptr)
            self.check_dev_info(dev_info)

            cusolver.cusolverDnSpotrs(
                context.cusolver_handle, 0, n, m, A_ptr, lda,
                b_ptr, ldb, dev_info_ptr)

    else:
        # general case for A
        with context:
            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                context.cusolver_handle, n, n, A_ptr, lda)

        workspace = pygpu.zeros(workspace_size, dtype='float32',
                                context=context)

        pivots = pygpu.zeros(n, dtype='int32', context=context)

        dev_info = pygpu.zeros((1,), dtype='int32', context=context)

        workspace_ptr = workspace.gpudata
        pivots_ptr = pivots.gpudata
        dev_info_ptr = dev_info.gpudata

        with context:
            cusolver.cusolverDnSgetrf(
                context.cusolver_handle, n, n, A_ptr, lda,
                workspace_ptr, pivots_ptr, dev_info_ptr)
            self.check_dev_info(dev_info)

            cusolver.cusolverDnSgetrs(
                context.cusolver_handle, trans, n, m, A_ptr, lda,
                pivots_ptr, b_ptr, ldb, dev_info_ptr)

    z[0] = b
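# A small CPU-side reference (scipy, illustrative only) of what the two
# branches above compute: the 'symmetric' path is a Cholesky solve
# (Spotrf + Spotrs, lower fill mode since uplo=0), while the general path is
# an LU solve with partial pivoting (Sgetrf + Sgetrs). For a symmetric
# positive definite A both give the same solution.
import numpy as np
from scipy.linalg import cho_factor, cho_solve, lu_factor, lu_solve

rng = np.random.RandomState(0)
M = rng.rand(4, 4).astype(np.float32)
A = M @ M.T + 4 * np.eye(4, dtype=np.float32)   # symmetric positive definite
b = rng.rand(4, 2).astype(np.float32)

x_chol = cho_solve(cho_factor(A, lower=True), b)   # symmetric branch
x_lu = lu_solve(lu_factor(A), b)                   # general branch
print(np.allclose(x_chol, x_lu, atol=1e-4))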