def gpu_skcuda_cusolver_cusolverDnDgesvd_S(input): coloring_print("\nGPU: skcuda.cusolver.cusolverDnDgesvd() 'S' option") # #縦横を逆(≒転地)してcolumn-majorにし、GPUのcusolverDnDgesvd()に対応させる。結果配列のU,Vが逆になる n, m = input.shape # change function by data type if input.dtype == np.dtype('float64'): get_buffer = solver.cusolverDnDgesvd_bufferSize cusolver_svd = solver.cusolverDnDgesvd elif input.dtype == np.dtype('float32'): get_buffer = solver.cusolverDnSgesvd_bufferSize cusolver_svd = solver.cusolverDnSgesvd else: print "Error: data type must be float64 or float32" h2d_start = time.time() input_gpu = gpuarray.to_gpu(input) h2d_end = time.time() print "H2D: ", h2d_end - h2d_start, "[sec]" # Set up work buffers: h = solver.cusolverDnCreate() Lwork = get_buffer(h, m, n) workspace_gpu = gpuarray.zeros(Lwork, input.dtype) devInfo_gpu = gpuarray.zeros(1, np.int32) # Set up output buffers: s_gpu = gpuarray.zeros(min(m, n), input.dtype) u_gpu = gpuarray.zeros((n, n), input.dtype) vh_gpu = gpuarray.zeros((m, m), input.dtype) # 'S': the first min(m,n) columns of U (the left singular vectors) are returned in the array U cusolver_S_svd_start = time.time() status = cusolver_svd(h, 'S', 'S', m, n, input_gpu.gpudata, m, s_gpu.gpudata, u_gpu.gpudata, m, vh_gpu.gpudata, n, workspace_gpu.gpudata, Lwork, 0, devInfo_gpu.gpudata) cusolver_S_svd_end = time.time() print "solver.cusolverDnSgesvd() 'S' option", cusolver_S_svd_end - cusolver_S_svd_start, "[sec]" print "Total: ", cusolver_S_svd_end - h2d_start, "[sec]" # u and s is swapped (数学的に正しいかはわからない) check_result(input, vh_gpu.get(), s_gpu.get(), u_gpu.get()) solver.cusolverDnDestroy(h)
def gpu_skcuda_cusolver_cusolverDnDgesvd_N(input): coloring_print("\nGPU: skcuda.cusolver.cusolverDnDgesvd() 'N' option") # #縦横を逆(≒転地)してcolumn-majorにし、GPUのcusolverDnDgesvd()に対応させる。結果配列のU,Vが逆になる n, m = input.shape # change function by data type if input.dtype == np.dtype('float64'): get_buffer = solver.cusolverDnDgesvd_bufferSize cusolver_svd = solver.cusolverDnDgesvd elif input.dtype == np.dtype('float32'): get_buffer = solver.cusolverDnSgesvd_bufferSize cusolver_svd = solver.cusolverDnSgesvd else: print "Error: data type must be float64 or float32" h2d_start = time.time() input_gpu = gpuarray.to_gpu(input) h2d_end = time.time() print "H2D: ", h2d_end - h2d_start, "[sec]" # Set up work buffers: h = solver.cusolverDnCreate() Lwork = get_buffer(h, m, n) workspace_gpu = gpuarray.zeros(Lwork, input.dtype) devInfo_gpu = gpuarray.zeros(1, np.int32) # Set up output buffers: s_gpu = gpuarray.zeros(min(m, n), input.dtype) # 'N': no columns of U (no left singular vectors) are computed. cusolver_N_svd_start = time.time() status = cusolver_svd(h, 'N', 'N', m, n, input_gpu.gpudata, m, s_gpu.gpudata, 0, m, 0, n, workspace_gpu.gpudata, 0, 0, devInfo_gpu.gpudata) cusolver_N_svd_end = time.time() print "solver.cusolverDnSgesvd() 'N' option: ", cusolver_N_svd_end - cusolver_N_svd_start, "[sec]" print "Total: ", cusolver_N_svd_end - h2d_start, "[sec]" print "only s is computed" # print s_gpu.get() solver.cusolverDnDestroy(h)
def make_thunk(self, node, storage_map, _, no_recycling=[], impl=None):
    """Build a thunk that solves A x = b on the GPU via cuSOLVER getrf/getrs.

    The returned closure LU-factorizes the float32 matrix A and
    back-substitutes the right-hand sides b, writing the solution into the
    op's output storage. Work buffers are cached on the thunk between calls.

    Raises:
        RuntimeError: when cuSOLVER is not available.
    """
    if not cusolver_available:
        raise RuntimeError('CUSOLVER is not available and '
                           'GpuCusolverSolve Op can not be constructed.')

    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    # One cuSOLVER handle is shared lazily across all thunks.
    global cusolver_handle
    if cusolver_handle is None:
        cusolver_handle = cusolver.cusolverDnCreate()

    def thunk():
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0][0]

        # Solution vectors.
        b = inputs[1][0]

        assert (len(A.shape) == 2)
        assert (len(b.shape) == 2)

        if self.trans in ['T', 'C']:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == 'N':
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError('Invalid value for trans')
        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k, m)

        # We copy A and b as cusolver operates inplace
        b = gpuarray.array(b, copy=True, order='F')
        if not self.inplace:
            A = gpuarray.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags['C_CONTIGUOUS']:
            trans = 1 - trans

        workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
            cusolver_handle, n, n, A_ptr, lda)

        # Reuse the cached workspace unless the required size changed.
        if (thunk.workspace is None or
                thunk.workspace.size != workspace_size):
            thunk.workspace = gpuarray.zeros((workspace_size, ),
                                             dtype='float32',
                                             context=context)

        # BUG FIX: cusolverDnSgetrf/getrs take integer arrays for the pivot
        # indices (devIpiv) and status (devInfo); these buffers were
        # previously allocated as float32, which only worked by accident of
        # the matching 4-byte element width.
        if thunk.pivots is None or thunk.pivots.size != n:
            thunk.pivots = gpuarray.zeros((n, ), dtype='int32',
                                          context=context)

        if thunk.dev_info is None:
            thunk.dev_info = gpuarray.zeros((1, ), dtype='int32',
                                            context=context)

        workspace_ptr = thunk.workspace.gpudata
        pivots_ptr = thunk.pivots.gpudata
        dev_info_ptr = thunk.dev_info.gpudata

        # LU factorization of A, then triangular solves for every column of b.
        cusolver.cusolverDnSgetrf(cusolver_handle, n, n, A_ptr, lda,
                                  workspace_ptr, pivots_ptr, dev_info_ptr)

        cusolver.cusolverDnSgetrs(cusolver_handle, trans, n, m, A_ptr, lda,
                                  pivots_ptr, b_ptr, ldb, dev_info_ptr)

        z[0] = b

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False
    # Lazily-(re)allocated GPU scratch buffers, cached between calls.
    thunk.workspace = None
    thunk.pivots = None
    thunk.dev_info = None

    return thunk
#!/usr/bin/env python """ Demo of how to call low-level CUSOLVER wrappers to perform LU decomposition. """ import numpy as np import scipy.linalg import scipy as sp import pycuda.autoinit import pycuda.gpuarray as gpuarray import skcuda.cusolver as solver h = solver.cusolverDnCreate() x = np.asarray([[1.80, 2.88, 2.05, -0.89], [5.25, -2.95, -0.95, -3.80], [1.58, -2.69, -2.90, -1.04], [-1.11, -0.66, -0.59, 0.80]]).astype(np.float32) # Need to copy transposed matrix because T only returns a view: m, n = x.shape x_gpu = gpuarray.to_gpu(x.T.copy()) # Set up work buffers: Lwork = solver.cusolverDnSgetrf_bufferSize(h, m, n, x_gpu.gpudata, m) workspace_gpu = gpuarray.zeros(Lwork, np.float32) devipiv_gpu = gpuarray.zeros(min(m, n), np.int32) devinfo_gpu = gpuarray.zeros(1, np.int32) # Compute: solver.cusolverDnSgetrf(h, m, n, x_gpu.gpudata, m, workspace_gpu.gpudata, devipiv_gpu.gpudata, devinfo_gpu.gpudata)
def attach_cusolver_handle_to_context(ctx):
    """Lazily create and cache a cuSOLVER dense handle on *ctx*.

    Does nothing when the context already carries a handle.
    """
    if getattr(ctx, 'cusolver_handle', None) is None:
        with ctx:
            ctx.cusolver_handle = cusolver.cusolverDnCreate()
#!/usr/bin/env python """ Demo of how to call low-level CUSOLVER wrappers to perform eigen decomposition for a batch of small Hermitian matrices. """ import numpy as np import pycuda.autoinit import pycuda.gpuarray as gpuarray import skcuda.cusolver as solver handle = solver.cusolverDnCreate() batchSize = 100 n = 9 A = np.empty((n * batchSize, n), dtype=np.complex64) B = np.empty((n * batchSize, n), dtype=A.dtype) for i in range(batchSize): x = np.random.randn(n, n) + 1j * np.random.randn(n, n) x = x + x.conj().T x = x.astype(np.complex64) A[i * n:(i + 1) * n, :] = x # Need to reverse dimensions because CUSOLVER expects column-major matrices: B[i * n:(i + 1) * n, :] = x.T.copy() x_gpu = gpuarray.to_gpu(B) # Set up output buffers: w_gpu = gpuarray.empty((batchSize, n), dtype=np.float32)
def attach_cusolver_handle_to_context(ctx):
    """Attach a cuSOLVER dense handle to *ctx* unless one is already present."""
    existing = getattr(ctx, "cusolver_handle", None)
    if existing is not None:
        return
    with ctx:
        ctx.cusolver_handle = cusolver.cusolverDnCreate()
#!/usr/bin/env python """ Demo of how to call low-level CUSOLVER wrappers to perform SVD decomposition. """ import numpy as np import pycuda.autoinit import pycuda.gpuarray as gpuarray import skcuda.cusolver as solver h = solver.cusolverDnCreate() x = np.asarray([[1.80, 2.88, 2.05, -0.89], [5.25, -2.95, -0.95, -3.80], [1.58, -2.69, -2.90, -1.04], [-1.11, -0.66, -0.59, 0.80]]).astype(np.float32) # Need to reverse dimensions because CUSOLVER expects column-major matrices: n, m = x.shape x_gpu = gpuarray.to_gpu(x) # Set up work buffers: Lwork = solver.cusolverDnSgesvd_bufferSize(h, m, n) workspace_gpu = gpuarray.zeros(Lwork, np.float32) devInfo_gpu = gpuarray.zeros(1, np.int32) # Set up output buffers: s_gpu = gpuarray.zeros(min(m, n), np.float32) u_gpu = gpuarray.zeros((m, m), np.float32) vh_gpu = gpuarray.zeros((n, n), np.float32)
def prepare_node(self, node, storage_map, compute_map, impl):
    """Make sure the GPU context of the op's first input owns a cuSOLVER handle."""
    ctx = node.inputs[0].type.context
    if getattr(ctx, 'cusolver_handle', None) is None:
        with ctx:
            ctx.cusolver_handle = cusolver.cusolverDnCreate()
def make_thunk(self, node, storage_map, _, no_recycling=[], impl=None):
    """Build a thunk that solves A x = b on the GPU via cuSOLVER getrf/getrs.

    The returned closure LU-factorizes the float32 matrix A and
    back-substitutes the right-hand sides b, writing the solution into the
    op's output storage. Work buffers are cached on the thunk between calls.

    Raises:
        RuntimeError: when cuSOLVER is not available.
    """
    if not cusolver_available:
        raise RuntimeError('CUSOLVER is not available and '
                           'GpuCusolverSolve Op can not be constructed.')

    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    # One cuSOLVER handle is shared lazily across all thunks.
    global cusolver_handle
    if cusolver_handle is None:
        cusolver_handle = cusolver.cusolverDnCreate()

    def thunk():
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0][0]

        # Solution vectors.
        b = inputs[1][0]

        assert(len(A.shape) == 2)
        assert(len(b.shape) == 2)

        # For 'T'/'C' the matrix is used transposed (trans flag 1); for 'N'
        # it is used as-is (flag 0). Shapes are read accordingly.
        if self.trans in ['T', 'C']:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == 'N':
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError('Invalid value for trans')
        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k, m)

        # We copy A and b as cusolver operates inplace
        b = gpuarray.array(b, copy=True, order='F')
        if not self.inplace:
            A = gpuarray.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags['C_CONTIGUOUS']:
            trans = 1 - trans

        workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
            cusolver_handle, n, n, A_ptr, lda)

        # Reuse the cached workspace unless the required size changed.
        if (thunk.workspace is None or
                thunk.workspace.size != workspace_size):
            thunk.workspace = gpuarray.zeros((workspace_size,),
                                             dtype='float32',
                                             context=context)

        # NOTE(review): cusolverDnSgetrf/getrs document devIpiv and devInfo
        # as int arrays; float32 here matches only in byte width (4 bytes) --
        # confirm whether these should be int32.
        if thunk.pivots is None or thunk.pivots.size != min(n, n):
            thunk.pivots = gpuarray.zeros((min(n, n),), dtype='float32',
                                          context=context)

        if thunk.dev_info is None:
            thunk.dev_info = gpuarray.zeros((1,), dtype='float32',
                                            context=context)

        workspace_ptr = thunk.workspace.gpudata
        pivots_ptr = thunk.pivots.gpudata
        dev_info_ptr = thunk.dev_info.gpudata

        # LU factorization of A, then triangular solves for every column of b.
        cusolver.cusolverDnSgetrf(
            cusolver_handle, n, n, A_ptr, lda,
            workspace_ptr, pivots_ptr, dev_info_ptr)

        cusolver.cusolverDnSgetrs(
            cusolver_handle, trans, n, m, A_ptr, lda,
            pivots_ptr, b_ptr, ldb, dev_info_ptr)

        z[0] = b

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False
    # Lazily-(re)allocated GPU scratch buffers, cached between calls.
    thunk.workspace = None
    thunk.pivots = None
    thunk.dev_info = None

    return thunk
def prepare_node(self, node, storage_map, compute_map, impl):
    """Ensure a cuSOLVER dense handle exists on the first input's GPU context."""
    target = node.inputs[0].type.context
    handle = getattr(target, 'cusolver_handle', None)
    if handle is not None:
        return
    with target:
        target.cusolver_handle = cusolver.cusolverDnCreate()
#!/usr/bin/env python """ Demo of how to call low-level CUSOLVER wrappers to perform eigen decomposition for a batch of small symmetric matrices. """ import numpy as np import pycuda.autoinit import pycuda.gpuarray as gpuarray import skcuda.cusolver as solver handle = solver.cusolverDnCreate() batchSize = 100 n = 9 A = np.empty((n*batchSize, n), dtype = np.double) for i in range(batchSize): x = np.random.randn(n, n) x = x+x.T A[i*n:(i+1)*n, :] = x x_gpu = gpuarray.to_gpu(A) # Set up output buffers: w_gpu = gpuarray.empty((batchSize, n), dtype = A.dtype) # Set up parameters params = solver.cusolverDnCreateSyevjInfo() solver.cusolverDnXsyevjSetTolerance(params, 1e-7)