def gpu_skcuda_cusolver_cusolverDnDgesvd_S(input):
    coloring_print("\nGPU: skcuda.cusolver.cusolverDnDgesvd() 'S' option")
    # Swap the dimensions (i.e. treat the row-major array as its transpose) so the data
    # is column-major, as the GPU's cusolverDnDgesvd() expects. As a result, U and V in
    # the output arrays are swapped.
    n, m = input.shape

    # Choose the cuSOLVER functions according to the data type
    if input.dtype == np.dtype('float64'):
        get_buffer = solver.cusolverDnDgesvd_bufferSize
        cusolver_svd = solver.cusolverDnDgesvd
    elif input.dtype == np.dtype('float32'):
        get_buffer = solver.cusolverDnSgesvd_bufferSize
        cusolver_svd = solver.cusolverDnSgesvd
    else:
        raise ValueError("data type must be float64 or float32")

    h2d_start = time.time()
    input_gpu = gpuarray.to_gpu(input)
    h2d_end = time.time()
    print "H2D: ", h2d_end - h2d_start, "[sec]"

    # Set up work buffers:
    h = solver.cusolverDnCreate()
    Lwork = get_buffer(h, m, n)
    workspace_gpu = gpuarray.zeros(Lwork, input.dtype)
    devInfo_gpu = gpuarray.zeros(1, np.int32)

    # Set up output buffers:
    s_gpu = gpuarray.zeros(min(m, n), input.dtype)
    u_gpu = gpuarray.zeros((n, n), input.dtype)
    vh_gpu = gpuarray.zeros((m, m), input.dtype)

    # 'S': the first min(m,n) columns of U (the left singular vectors) are returned in the array U
    cusolver_S_svd_start = time.time()
    status = cusolver_svd(h, 'S', 'S', m, n, input_gpu.gpudata, m, s_gpu.gpudata,
                          u_gpu.gpudata, m, vh_gpu.gpudata, n,
                          workspace_gpu.gpudata, Lwork, 0, devInfo_gpu.gpudata)
    cusolver_S_svd_end = time.time()
    print "cusolver gesvd() 'S' option: ", cusolver_S_svd_end - cusolver_S_svd_start, "[sec]"
    print "Total: ", cusolver_S_svd_end - h2d_start, "[sec]"

    # u and vh are swapped (not sure whether this is mathematically correct)
    check_result(input, vh_gpu.get(), s_gpu.get(), u_gpu.get())
    solver.cusolverDnDestroy(h)

def gpu_skcuda_cusolver_cusolverDnDgesvd_N(input):
    coloring_print("\nGPU: skcuda.cusolver.cusolverDnDgesvd() 'N' option")
    # Swap the dimensions (i.e. treat the row-major array as its transpose) so the data
    # is column-major, as the GPU's cusolverDnDgesvd() expects. As a result, U and V in
    # the output arrays are swapped.
    n, m = input.shape

    # Choose the cuSOLVER functions according to the data type
    if input.dtype == np.dtype('float64'):
        get_buffer = solver.cusolverDnDgesvd_bufferSize
        cusolver_svd = solver.cusolverDnDgesvd
    elif input.dtype == np.dtype('float32'):
        get_buffer = solver.cusolverDnSgesvd_bufferSize
        cusolver_svd = solver.cusolverDnSgesvd
    else:
        raise ValueError("data type must be float64 or float32")

    h2d_start = time.time()
    input_gpu = gpuarray.to_gpu(input)
    h2d_end = time.time()
    print "H2D: ", h2d_end - h2d_start, "[sec]"

    # Set up work buffers:
    h = solver.cusolverDnCreate()
    Lwork = get_buffer(h, m, n)
    workspace_gpu = gpuarray.zeros(Lwork, input.dtype)
    devInfo_gpu = gpuarray.zeros(1, np.int32)

    # Set up output buffers:
    s_gpu = gpuarray.zeros(min(m, n), input.dtype)

    # 'N': no columns of U (no left singular vectors) are computed.
    cusolver_N_svd_start = time.time()
    status = cusolver_svd(h, 'N', 'N', m, n, input_gpu.gpudata, m, s_gpu.gpudata,
                          0, m, 0, n,
                          workspace_gpu.gpudata, Lwork, 0, devInfo_gpu.gpudata)
    cusolver_N_svd_end = time.time()
    print "cusolver gesvd() 'N' option: ", cusolver_N_svd_end - cusolver_N_svd_start, "[sec]"
    print "Total: ", cusolver_N_svd_end - h2d_start, "[sec]"

    print "only s is computed"
    # print s_gpu.get()
    solver.cusolverDnDestroy(h)

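# Hypothetical usage sketch (not part of the original code): run both variants on a
# random square single-precision matrix. Assumes the same imports as above
# (numpy as np, time, pycuda.autoinit, pycuda.gpuarray as gpuarray,
# skcuda.cusolver as solver) and the helpers coloring_print() / check_result()
# defined elsewhere in this script. The 1000 x 1000 size is an arbitrary example.
a = np.random.rand(1000, 1000).astype(np.float32)
gpu_skcuda_cusolver_cusolverDnDgesvd_S(a)  # singular values and singular vectors
gpu_skcuda_cusolver_cusolverDnDgesvd_N(a)  # singular values only
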
# Fragment: assumes handle, params, x_gpu, w_gpu, A, n, batchSize and lwork have
# already been set up (see the Dsyevj example further below for the analogous calls).
workspace_gpu = gpuarray.zeros(lwork, dtype=A.dtype)
info = gpuarray.zeros(batchSize, dtype=np.int32)

# Compute:
solver.cusolverDnCheevjBatched(handle, 'CUSOLVER_EIG_MODE_VECTOR', 'u', n,
                               x_gpu.gpudata, n, w_gpu.gpudata,
                               workspace_gpu.gpudata, lwork, info.gpudata,
                               params, batchSize)

# Print info
tmp = info.get()
if any(tmp):
    print "the following job did not converge:", np.nonzero(tmp)[0]
else:
    print "all jobs converged"

# Destroy handle
solver.cusolverDnDestroySyevjInfo(params)
solver.cusolverDnDestroy(handle)

Q = x_gpu.get()
W = w_gpu.get()
print 'maximum error in A * Q - Q * Lambda is:'
for i in range(batchSize):
    q = Q[i * n:(i + 1) * n, :].T.copy()
    x = A[i * n:(i + 1) * n, :].copy()
    w = W[i, :].copy()
    print '{}th matrix'.format(i), np.abs(np.dot(x, q) - np.dot(q, np.diag(w))).max()

h = solver.cusolverDnCreate()
x = np.asarray([[1.80, 2.88, 2.05, -0.89],
                [5.25, -2.95, -0.95, -3.80],
                [1.58, -2.69, -2.90, -1.04],
                [-1.11, -0.66, -0.59, 0.80]]).astype(np.float32)

# Need to reverse dimensions because CUSOLVER expects column-major matrices:
n, m = x.shape
x_gpu = gpuarray.to_gpu(x)

# Set up work buffers:
Lwork = solver.cusolverDnSgesvd_bufferSize(h, m, n)
workspace_gpu = gpuarray.zeros(Lwork, np.float32)
devInfo_gpu = gpuarray.zeros(1, np.int32)

# Set up output buffers:
s_gpu = gpuarray.zeros(min(m, n), np.float32)
u_gpu = gpuarray.zeros((m, m), np.float32)
vh_gpu = gpuarray.zeros((n, n), np.float32)

# Compute:
status = solver.cusolverDnSgesvd(h, 'A', 'A', m, n, x_gpu.gpudata, m,
                                 s_gpu.gpudata, u_gpu.gpudata, m,
                                 vh_gpu.gpudata, n, workspace_gpu.gpudata,
                                 Lwork, 0, devInfo_gpu.gpudata)

# Confirm that the solution is correct by ensuring that the original matrix can be
# obtained from the decomposition:
print 'correct solution: ', np.allclose(
    x, np.dot(vh_gpu.get(), np.dot(np.diag(s_gpu.get()), u_gpu.get())), 1e-4)
solver.cusolverDnDestroy(h)

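# Hypothetical host-side cross-check (not in the original example): NumPy's SVD also
# returns the singular values in descending order, so they can be compared directly
# with the values computed by cuSOLVER.
s_cpu = np.linalg.svd(x, compute_uv=False)
print 'singular values match: ', np.allclose(s_cpu, s_gpu.get(), 1e-4)
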
lwork = solver.cusolverDnDsyevjBatched_bufferSize(handle, 'CUSOLVER_EIG_MODE_VECTOR',
                                                  'u', n, x_gpu.gpudata, n,
                                                  w_gpu.gpudata, params, batchSize)
workspace_gpu = gpuarray.zeros(lwork, dtype=A.dtype)
info = gpuarray.zeros(batchSize, dtype=np.int32)

# Compute:
solver.cusolverDnDsyevjBatched(handle, 'CUSOLVER_EIG_MODE_VECTOR', 'u', n,
                               x_gpu.gpudata, n, w_gpu.gpudata,
                               workspace_gpu.gpudata, lwork, info.gpudata,
                               params, batchSize)

# Print info
tmp = info.get()
if any(tmp):
    print("the following job did not converge: %r" % np.nonzero(tmp)[0])

# Destroy handle
solver.cusolverDnDestroySyevjInfo(params)
solver.cusolverDnDestroy(handle)

Q = x_gpu.get()
W = w_gpu.get()
print('maximum error in A * Q - Q * Lambda is:')
for i in range(batchSize):
    q = Q[i * n:(i + 1) * n, :].T.copy()
    x = A[i * n:(i + 1) * n, :].copy()
    w = W[i, :].copy()
    print('{}th matrix {!r}'.format(i, np.abs(np.dot(x, q) - np.dot(q, np.diag(w))).max()))

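# Hypothetical setup sketch for the two batched eigensolver fragments above, which
# start from an existing handle, syevj parameter struct and device arrays. The matrix
# data and sizes below are illustrative, not from the original code; for the Cheevj
# fragment the matrices would instead need to be complex Hermitian (complex64).
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

n = 3          # order of each matrix
batchSize = 2  # number of matrices in the batch

# batchSize symmetric n x n matrices stacked vertically; because each block is
# symmetric, the row-major host layout is also valid column-major input.
A = np.array([[1.,  2.,  3.],
              [2.,  5.,  4.],
              [3.,  4.,  6.],
              [10., -2.,  3.],
              [-2., 40.,  5.],
              [3.,  5., 60.]], dtype=np.float64)

x_gpu = gpuarray.to_gpu(A)                          # overwritten with the eigenvectors
w_gpu = gpuarray.empty((batchSize, n), np.float64)  # one row of eigenvalues per matrix

handle = solver.cusolverDnCreate()
params = solver.cusolverDnCreateSyevjInfo()         # Jacobi (syevj) solver parameters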