def test_cublasSgetriBatched(self):
    l, m = 11, 7
    np.random.seed(1)
    A = np.random.rand(l, m, m).astype(np.float32)
    a_gpu = gpuarray.to_gpu(A)
    a_arr = bptrs(a_gpu)
    c_gpu = gpuarray.empty((l, m, m), np.float32)
    c_arr = bptrs(c_gpu)
    p_gpu = gpuarray.empty((l, m), np.int32)
    i_gpu = gpuarray.zeros(l, np.int32)
    # LU-factorize the batch in place, then invert using the factorization
    cublas.cublasSgetrfBatched(self.cublas_handle, m, a_arr.gpudata, m,
                               p_gpu.gpudata, i_gpu.gpudata, l)
    cublas.cublasSgetriBatched(self.cublas_handle, m, a_arr.gpudata, m,
                               p_gpu.gpudata, c_arr.gpudata, m,
                               i_gpu.gpudata, l)
    X = np.array([np.linalg.inv(a) for a in A])  # host-side reference inverses
    X_ = c_gpu.get()
    assert np.allclose(X, X_, 6)
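# All of the snippets here rely on a `bptrs` helper that is not shown: the
# cuBLAS *Batched routines take a device array containing one pointer per
# matrix in the batch rather than a single contiguous buffer. Below is a
# minimal host-side sketch of such a helper; the implementation (computing
# pointer values on the host as np.uint64 and uploading them) is an
# assumption, and it requires that the batch is contiguous along the first
# axis of the GPUArray and that a CUDA context already exists.
import numpy as np
import pycuda.gpuarray as gpuarray

def bptrs(a):
    # base pointer + i * stride of the leading axis gives the address of matrix i;
    # uploading the pointer values lets .gpudata be passed as the Aarray argument.
    ptrs = np.array([a.ptr + i * a.strides[0] for i in range(a.shape[0])],
                    dtype=np.uint64)
    return gpuarray.to_gpu(ptrs)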
def thunk():
    input_shape = inputs[0][0].shape
    size = input_shape[1]  # matrices to invert are (size x size)
    batch_size = input_shape[0]
    z = outputs[0]
    # only allocate if there is no previous allocation of the right size.
    if z[0] is None or z[0].shape != input_shape:
        z[0] = cuda.CudaNdarray.zeros(input_shape)
        pivot_alloc[0] = pycuda.gpuarray.empty((batch_size, size), np.int32)
        info_alloc[0] = pycuda.gpuarray.zeros(batch_size, np.int32)
    input_pycuda = to_gpuarray(inputs[0][0])
    output_pycuda = to_gpuarray(z[0])
    pivot = pivot_alloc[0]
    info = info_alloc[0]
    if not self.destructive:
        input_pycuda = input_pycuda.copy()  # to prevent destruction of the input
    # construct pointer arrays for batched operations
    input_arr = bptrs(input_pycuda)
    output_arr = bptrs(output_pycuda)
    handle = scikits.cuda.misc._global_cublas_handle
    # perform LU factorization
    cublas.cublasSgetrfBatched(handle, size, input_arr.gpudata, size,
                               pivot.gpudata, info.gpudata, batch_size)
    # the LU factorization is now in input_pycuda (destructive operation!)
    # use factorization to perform inversion
    cublas.cublasSgetriBatched(handle, size, input_arr.gpudata, size,
                               pivot.gpudata, output_arr.gpudata, size,
                               info.gpudata, batch_size)
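# A hypothetical host-side check for the batched inverse computed by the thunk
# above (the function name and tolerance are assumptions, not part of the op):
# np.linalg.inv broadcasts over leading axes, so it inverts the whole batch at once.
import numpy as np

def check_batched_inverse(host_input, gpu_output, atol=1e-3):
    # host_input: the (batch, n, n) array fed to the op
    # gpu_output: the op's result pulled back to the host
    reference = np.linalg.inv(host_input.astype(np.float64))
    return np.allclose(reference, gpu_output, atol=atol)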
def solve_gpu(As, Bs):
    batch_size, num_factors = As.shape
    if allocated_shape[0] == As.shape:  # reuse previous allocations
        As_gpu, Bs_gpu, P_gpu, info_gpu, Cs_gpu, Rs_gpu, A_arr, B_arr, C_arr, R_arr = allocations[0]
        As_gpu.set(As)
        Bs_gpu.set(Bs)
    else:  # allocate
        # transfer As and Bs to GPU
        As_gpu = pycuda.gpuarray.to_gpu(As.astype('float32'))
        Bs_gpu = pycuda.gpuarray.to_gpu(Bs.astype('float32'))
        # allocate arrays
        P_gpu = pycuda.gpuarray.empty((batch_size, num_factors), np.int32)
        info_gpu = pycuda.gpuarray.zeros(batch_size, np.int32)
        Cs_gpu = pycuda.gpuarray.empty_like(Bs_gpu)  # inverted Bs.
        Rs_gpu = pycuda.gpuarray.empty_like(As_gpu)  # final output, As * inverted Bs.
        # get pointer arrays
        A_arr = bptrs(As_gpu)
        B_arr = bptrs(Bs_gpu)
        C_arr = bptrs(Cs_gpu)
        R_arr = bptrs(Rs_gpu)
        allocated_shape[0] = As.shape
        allocations[0] = (As_gpu, Bs_gpu, P_gpu, info_gpu, Cs_gpu, Rs_gpu,
                          A_arr, B_arr, C_arr, R_arr)

    handle = scikits.cuda.misc._global_cublas_handle

    # perform LU factorization
    cublas.cublasSgetrfBatched(handle, num_factors, B_arr.gpudata, num_factors,
                               P_gpu.gpudata, info_gpu.gpudata, batch_size)
    # the LU factorization is now in Bs_gpu!

    # use factorization to perform inversion
    cublas.cublasSgetriBatched(handle, num_factors, B_arr.gpudata, num_factors,
                               P_gpu.gpudata, C_arr.gpudata, num_factors,
                               info_gpu.gpudata, batch_size)
    # the inverted matrices are now in Cs_gpu!

    # compute dot products dot(A, C) = dot(A, Binv). Note that the As are actually vectors!
    transb = 'n'
    transa = 'n'
    N, k, m = Cs_gpu.shape
    N2, l = As_gpu.shape
    n = 1
    # As_gpu is a batch of vectors, not matrices, but we treat it as a batch of
    # matrices with leading dimension 1. Kind of tricky, but it seems to work.
    # The same goes for the output array Rs_gpu.
    lda = max(1, m)
    ldb = max(1, k)
    ldc = max(1, m)
    alpha = np.float32(1.0)
    beta = np.float32(0.0)
    cublas.cublasSgemmBatched(handle, transb, transa, m, n, k, alpha,
                              C_arr.gpudata, lda, A_arr.gpudata, ldb, beta,
                              R_arr.gpudata, ldc, N)
    # the resulting batch of vectors is now in Rs_gpu.
    return Rs_gpu.get()
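# solve_gpu computes dot(A_b, inv(B_b)) for each batch element b, as its own
# comments state. A hypothetical host-side reference for testing it: since
# a @ inv(B) equals solve(B.T, a), the inverse never has to be formed explicitly.
import numpy as np

def solve_cpu_reference(As, Bs):
    # As: (batch_size, num_factors), Bs: (batch_size, num_factors, num_factors)
    return np.stack([np.linalg.solve(B.T, a) for a, B in zip(As, Bs)])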
def test_cublasSgetrfBatched(self):
    from scipy.linalg import lu_factor
    l, m = 11, 7
    A = np.random.rand(l, m, m).astype(np.float32)
    A = np.array([np.matrix(a) * np.matrix(a).T for a in A])
    a_gpu = gpuarray.to_gpu(A)
    a_arr = bptrs(a_gpu)
    p_gpu = gpuarray.empty((l, m), np.int32)
    i_gpu = gpuarray.zeros(l, np.int32)  # one status entry per matrix in the batch
    X = np.array([lu_factor(a)[0] for a in A])
    cublas.cublasSgetrfBatched(self.cublas_handle, m, a_arr.gpudata, m,
                               p_gpu.gpudata, i_gpu.gpudata, l)
    X_ = np.array([a.T for a in a_gpu.get()])
    assert np.allclose(X, X_)
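# cublasSgetrfBatched writes one status code per matrix into the info array
# (0 on success; a positive value i means U(i,i) is exactly zero, i.e. the
# matrix is singular). The tests above never inspect it; a sketch of such a
# check, reusing the i_gpu array from the test:
info = i_gpu.get()
if np.any(info != 0):
    bad = np.nonzero(info)[0]
    raise RuntimeError("LU factorization failed for batch indices %s" % (bad,))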