def _perform_sgemv(self, mat, v, vec_out, nvecs, dim):
    '''
    NOTES: cuBLAS assumes Fortran (column-major) layout.
    cublasSgemv multiplies a matrix by a vector (Level-2 BLAS):
        cublas_handle   -> handle to the cuBLAS library context
        't'             -> transpose the matrix
        dim             -> number of columns of the matrix
        nvecs           -> number of rows of the matrix
        alpha           -> scalar used for multiplication of mat
        mat.gpudata     -> matrix mat
        dim             -> leading dimension of the matrix
        v.gpudata       -> vector v
        incx            -> stride within x; e.g. if incx is 7, every 7th element is used
        beta            -> scalar used for multiplication of vec_out
        vec_out.gpudata -> result vector
        incy            -> stride within y; e.g. if incy is 7, every 7th element is used
    Read more: http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemv
    '''
    alpha = np.float32(1.0)
    beta = np.float32(0.0)
    incx = 1
    incy = 1
    cublas_handle = cublas.cublasCreate()
    cublas.cublasSgemv(cublas_handle, 't', dim, nvecs, alpha,
                       mat.gpudata, dim, v.gpudata, incx,
                       beta, vec_out.gpudata, incy)
    cublas.cublasDestroy(cublas_handle)
    return vec_out
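# A minimal standalone sketch (not part of the original class) of the same
# call pattern, assuming the usual imports; the variable names below are
# illustrative. A C-ordered (nvecs, dim) array is, byte for byte, a
# column-major (dim, nvecs) matrix, so passing 't' with lda=dim makes
# cuBLAS compute mat @ v.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas

nvecs, dim = 4, 3
mat = np.random.rand(nvecs, dim).astype(np.float32)
v = np.random.rand(dim).astype(np.float32)
mat_gpu = gpuarray.to_gpu(mat)              # C order on the device
v_gpu = gpuarray.to_gpu(v)
out_gpu = gpuarray.empty((nvecs,), np.float32)

handle = cublas.cublasCreate()
cublas.cublasSgemv(handle, 't', dim, nvecs, np.float32(1.0),
                   mat_gpu.gpudata, dim, v_gpu.gpudata, 1,
                   np.float32(0.0), out_gpu.gpudata, 1)
cublas.cublasDestroy(handle)
assert np.allclose(out_gpu.get(), mat.dot(v))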
def test_cublasSgemv(self):
    a = np.random.rand(2, 3).astype(np.float32)
    x = np.random.rand(3, 1).astype(np.float32)
    a_gpu = gpuarray.to_gpu(a.T.copy())
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.empty((2, 1), np.float32)
    alpha = np.float32(1.0)
    beta = np.float32(0.0)
    cublas.cublasSgemv(self.cublas_handle, 'n', 2, 3, alpha,
                       a_gpu.gpudata, 2, x_gpu.gpudata, 1,
                       beta, y_gpu.gpudata, 1)
    assert np.allclose(y_gpu.get(), np.dot(a, x))
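# A hedged companion sketch (not in the original excerpt): the same check in
# double precision via cublasDgemv, assuming the same self.cublas_handle
# fixture and imports as test_cublasSgemv above.
def test_cublasDgemv(self):
    a = np.random.rand(2, 3).astype(np.float64)
    x = np.random.rand(3, 1).astype(np.float64)
    a_gpu = gpuarray.to_gpu(a.T.copy())
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.empty((2, 1), np.float64)
    alpha = np.float64(1.0)
    beta = np.float64(0.0)
    cublas.cublasDgemv(self.cublas_handle, 'n', 2, 3, alpha,
                       a_gpu.gpudata, 2, x_gpu.gpudata, 1,
                       beta, y_gpu.gpudata, 1)
    assert np.allclose(y_gpu.get(), np.dot(a, x))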
def product_Ax(A_d, X, N, handler, precision):
    # A_d must hold A in C (row-major) order: the "t" below makes cuBLAS,
    # which assumes column-major storage, read it as the untransposed A.
    if precision == 1:
        RP = np.float32
    else:
        RP = np.float64
    X_d = ga.to_gpu(X.reshape((N, 1)).astype(RP))
    res_d = ga.to_gpu(np.zeros((N, 1)).astype(RP))
    alpha = 1
    if precision == 1:
        cublas.cublasSgemv(handler, "t", N, N, alpha,
                           A_d.gpudata, N, X_d.gpudata, 1,
                           0, res_d.gpudata, 1)
    else:
        cublas.cublasDgemv(handler, "t", N, N, alpha,
                           A_d.gpudata, N, X_d.gpudata, 1,
                           0, res_d.gpudata, 1)
    result = res_d.get()
    return result[:, 0]
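# A hypothetical usage sketch for product_Ax; the setup below is assumed and
# not part of the original. A is uploaded in C (row-major) order, so the "t"
# inside product_Ax makes cuBLAS read it as the untransposed A, and the call
# computes A @ X.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as ga
from skcuda import cublas

N = 4
A = np.random.rand(N, N)
X = np.random.rand(N)
A_d = ga.to_gpu(A.astype(np.float64))
handler = cublas.cublasCreate()
result = product_Ax(A_d, X, N, handler, precision=2)
cublas.cublasDestroy(handler)
assert np.allclose(result, A.dot(X))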
# trans refers to the structure of the matrix: we can specify whether we want
# to use the original matrix, a direct transpose, or a conjugate transpose
# (for complex matrices). Since we now have the column-wise matrix stored
# properly on the GPU, we can set the trans variable to not take the
# transpose by using the _CUBLAS_OP dictionary.
trans = cublas._CUBLAS_OP['N']

# lda indicates the leading dimension of the matrix; the total size of the
# matrix in memory is actually lda x n (if lda < m, problems arise).
lda = m

# x and its stride, incx: x is the underlying C pointer of the vector being
# multiplied by A. Remember, x will have to be of size n.
incx = 1

# y and its stride, incy, are the last parameters. We should remember that y
# should be of size m, or the number of rows.
incy = 1

handle = cublas.cublasCreate()  # refers to the cuBLAS context

# (A, x, their gpuarrays, and m, n, alpha, beta are set up earlier.)
cublas.cublasSgemv(handle, trans, m, n, alpha, A_gpu.gpudata, lda,
                   x_gpu.gpudata, incx, beta, y_gpu.gpudata, incy)

cublas.cublasDestroy(handle)
print('cuBLAS returned the correct value: %s' % np.allclose(np.dot(A, x), y_gpu.get()))

"""Level-3 GEMM (general matrix-matrix)"""
# We use GEMM as a performance metric for our GPU to determine the number of
# floating point operations per second (FLOPS) it can perform, measured as
# two separate values: one for single precision and one for double precision.
# m, n, and k variables for our matrix sizes
m = 5000
n = 10000
k = 10000
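# A minimal sketch of the single-precision FLOPS measurement described above,
# assuming the same pycuda/skcuda imports as the earlier snippets; the
# host-side timing with time.time() plus a context synchronize is an
# assumption, not necessarily how the original measured it. A GEMM of sizes
# (m, k) x (k, n) performs roughly 2*m*n*k floating point operations.
import time
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas

m, n, k = 5000, 10000, 10000   # same sizes as above
# C-ordered (k, m), (n, k), (n, m) buffers are column-major (m, k), (k, n),
# and (m, n) matrices, which is exactly what cuBLAS expects for C = A * B.
A_gpu = gpuarray.to_gpu(np.random.rand(k, m).astype(np.float32))
B_gpu = gpuarray.to_gpu(np.random.rand(n, k).astype(np.float32))
C_gpu = gpuarray.zeros((n, m), np.float32)

handle = cublas.cublasCreate()
t0 = time.time()
cublas.cublasSgemm(handle, 'n', 'n', m, n, k, np.float32(1.0),
                   A_gpu.gpudata, m, B_gpu.gpudata, k,
                   np.float32(0.0), C_gpu.gpudata, m)
pycuda.autoinit.context.synchronize()  # wait for the GEMM to finish
elapsed = time.time() - t0
cublas.cublasDestroy(handle)

print('Single-precision performance: %.2f GFLOPS'
      % (2 * m * n * k / (elapsed * 1e9)))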
print("\nComputing matrix vector mutliplication in CPU:\n") print("a = \n", a_cpu, "\n") print("b = \n", b_cpu, "\n") print("Product =\n", c_cpu, "\n") # allocating and converting (a) to a gpuarray: a_gpu = gpuarray.to_gpu(a_cpu) # allocating and converting (b) to a gpuarray: b_gpu = gpuarray.to_gpu(b_cpu) # allocating c as a 2x1 matrix filled with zeros: c_gpu = gpuarray.zeros((2,1), dtype = np.float32) # Computing matrix product of gpu (a) and (b) and storing it in gpu (c): cublas.cublasSgemv(handle = cublas.cublasCreate(), trans = 'n', m = 2, n = 3, alpha = 1.0, A = a_gpu.gpudata, lda = 2, x = b_gpu.gpudata, incx = 1, beta = 1.0, y = c_gpu.gpudata, incy = 1) # printing the process(GPU): print("\nComputing matrix vector mutliplication in GPU:\n") print("a = \n", a_gpu.get(), "\n") print("b = \n", b_gpu.get(), "\n") print("Product =\n", c_gpu.get(), "\n") #print(a_gpu, "\n", b_gpu) #print(c_gpu)