Example #1
    def _perform_sgemv(self, mat, v, vec_out, nvecs, dim):
        '''
        NOTE: cuBLAS assumes Fortran (column-major) layout, so the row-major
        (nvecs, dim) matrix `mat` is seen by cuBLAS as a (dim, nvecs) matrix;
        passing 't' therefore computes mat @ v.
        cublasSgemv performs a matrix-vector multiplication (Level 2 BLAS).

        cublas_handle   -> handle to the cuBLAS library context
        't'             -> operate on the transpose of the matrix as cuBLAS sees it
        dim             -> m: rows of the matrix as cuBLAS sees it (columns of mat)
        nvecs           -> n: columns of the matrix as cuBLAS sees it (rows of mat)
        alpha           -> scalar applied to mat
        mat.gpudata     -> matrix mat
        dim             -> lda: leading dimension of mat
        v.gpudata       -> vector v
        incx            -> stride within x; e.g. if incx is 7, every 7th element is used
        beta            -> scalar applied to vec_out before accumulation
        vec_out.gpudata -> result vector
        incy            -> stride within y; e.g. if incy is 7, every 7th element is used

        Read more       -> http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemv
        '''
        alpha = np.float32(1.0)
        beta = np.float32(0.0)

        incx = 1
        incy = 1

        cublas_handle = cublas.cublasCreate()

        cublas.cublasSgemv(cublas_handle, 't', dim, nvecs, alpha, mat.gpudata,
                           dim, v.gpudata, incx, beta, vec_out.gpudata, incy)

        cublas.cublasDestroy(cublas_handle)

        return vec_out
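
Not part of the original code: a NumPy-only illustration of the layout reasoning in the docstring above. A C-ordered (row-major) array of shape (nvecs, dim) occupies memory exactly like a column-major (dim, nvecs) array, so cuBLAS sees mat as mat.T, and asking for the transpose with 't' recovers mat @ v.

# illustration only (no GPU needed): how cuBLAS interprets the row-major buffer
import numpy as np

nvecs, dim = 4, 3
mat = np.random.rand(nvecs, dim).astype(np.float32)   # row-major, as in _perform_sgemv

# reinterpret the same buffer as a column-major (dim, nvecs) matrix,
# which is what cuBLAS sees when handed mat.gpudata with lda = dim
as_seen_by_cublas = mat.ravel().reshape((dim, nvecs), order='F')

print(np.allclose(as_seen_by_cublas, mat.T))          # True: cuBLAS sees the transpose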
Example #2
 def test_cublasSgemv(self):
     a = np.random.rand(2, 3).astype(np.float32)
     x = np.random.rand(3, 1).astype(np.float32)
     # a.T.copy() stores a in column-major order, as cuBLAS expects
     a_gpu = gpuarray.to_gpu(a.T.copy())
     x_gpu = gpuarray.to_gpu(x)
     y_gpu = gpuarray.empty((2, 1), np.float32)
     alpha = np.float32(1.0)
     beta = np.float32(0.0)
     # y = alpha*a*x + beta*y with m=2 rows, n=3 columns, lda=2
     cublas.cublasSgemv(self.cublas_handle, 'n', 2, 3, alpha, a_gpu.gpudata,
                        2, x_gpu.gpudata, 1, beta, y_gpu.gpudata, 1)
     assert np.allclose(y_gpu.get(), np.dot(a, x))
Example #3
def product_Ax(A_d, X, N, handler, precision):
    # Computes A @ X for an N x N matrix A_d already resident on the GPU.
    # precision == 1 selects single precision, precision == 2 double precision.
    if precision == 2:
        RP = np.float64
    else:
        RP = np.float32

    X_d = ga.to_gpu(X.reshape((N, 1)).astype(RP))
    res_d = ga.to_gpu(np.zeros((N, 1)).astype(RP))

    alpha = RP(1.0)
    beta = RP(0.0)
    # cuBLAS is column-major, so if A_d holds a row-major (C-ordered) array it is
    # seen as A.T; requesting the transpose ("t") then yields A @ X.
    if precision == 1:
        cublas.cublasSgemv(handler, "t", N, N, alpha, A_d.gpudata, N,
                           X_d.gpudata, 1, beta, res_d.gpudata, 1)
    else:
        cublas.cublasDgemv(handler, "t", N, N, alpha, A_d.gpudata, N,
                           X_d.gpudata, 1, beta, res_d.gpudata, 1)

    result = res_d.get()

    return result[:, 0]
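
A possible way to call product_Ax (not from the original; the setup and names below are invented for illustration, assuming the usual aliases import pycuda.gpuarray as ga and skcuda.cublas as cublas):

# hypothetical usage sketch: upload A row-major and let the "t" flag inside
# product_Ax account for cuBLAS's column-major view, giving y = A @ X
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as ga
import skcuda.cublas as cublas

N = 4
A = np.random.rand(N, N).astype(np.float32)
X = np.random.rand(N).astype(np.float32)

A_d = ga.to_gpu(A)
handle = cublas.cublasCreate()
y = product_Ax(A_d, X, N, handle, precision=1)
cublas.cublasDestroy(handle)

print(np.allclose(y, A.dot(X)))   # expected: True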
Example #4
# trans refers to the structure of the matrix: we can specify whether to use the
# original matrix, its direct transpose, or its conjugate transpose (for complex matrices).
# Since the matrix is already stored column-wise on the GPU, we select "no transpose"
# via the _CUBLAS_OP dictionary.
trans = cublas._CUBLAS_OP['N']

# lda indicates the leading dimension of the matrix; the matrix is assumed to
# occupy lda x n elements in memory, and lda must be at least m.
lda = m

# x is the underlying C pointer of the vector being multiplied by A, and incx is its
# stride; x must have length n (the number of columns of A).
incx = 1
# y and its stride incy are the last parameters; y must have length m (the number of rows of A).
incy = 1
handle = cublas.cublasCreate()  # refers to the cuBLAS context.

# compute y = alpha * A x + beta * y on the GPU

cublas.cublasSgemv(handle, trans, m, n, alpha, A_gpu.gpudata, lda,
                   x_gpu.gpudata, incx, beta, y_gpu.gpudata, incy)

cublas.cublasDestroy(handle)
print('cuBLAS returned the correct value: %s' % np.allclose(
    np.dot(A, x), y_gpu.get()))
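
The excerpt above assumes the host-side arrays and GPU uploads were prepared earlier. A plausible setup (hypothetical sizes and names matching those used above; A is copied column-major so that trans = 'N' with lda = m is correct) might look like:

# hypothetical setup assumed by the GEMV snippet above (not part of the original excerpt)
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas

m, n = 10, 100
alpha = np.float32(1.0)
beta = np.float32(0.0)

A = np.random.rand(m, n).astype(np.float32)
x = np.random.rand(n).astype(np.float32)

A_gpu = gpuarray.to_gpu(A.T.copy())            # column-major copy of A on the GPU
x_gpu = gpuarray.to_gpu(x)                     # input vector of length n
y_gpu = gpuarray.zeros((m,), dtype=np.float32) # output vector of length m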
"""Level-3 GEMM (general matrix-matrix)"""

# We use GEMM as a performance metric to determine the number of floating-point
# operations per second (FLOPS) the GPU can perform, measured as two separate
# values: one for single precision and one for double precision.

# m, n, and k set the matrix sizes: A is m x k, B is k x n, and C is m x n
m = 5000
n = 10000
k = 10000
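
What follows is not in the original excerpt: a hedged sketch of how one might time cublasSgemm with the m, n, k defined above and convert the elapsed time into GFLOPS (call signature follows scikit-cuda's cublasSgemm; matrices are uploaded as column-major copies in the same style as the earlier examples).

# hypothetical timing sketch: estimate single-precision GEMM throughput in GFLOPS
from time import time
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas

A = np.random.rand(m, k).astype(np.float32)
B = np.random.rand(k, n).astype(np.float32)

# upload column-major copies so no transposition flags are needed
A_gpu = gpuarray.to_gpu(A.T.copy())            # cuBLAS sees an m x k matrix, lda = m
B_gpu = gpuarray.to_gpu(B.T.copy())            # cuBLAS sees a  k x n matrix, ldb = k
C_gpu = gpuarray.empty((n, m), np.float32)     # buffer for the m x n column-major result

handle = cublas.cublasCreate()
t_start = time()
cublas.cublasSgemm(handle, 'n', 'n', m, n, k, np.float32(1.0),
                   A_gpu.gpudata, m, B_gpu.gpudata, k,
                   np.float32(0.0), C_gpu.gpudata, m)
drv.Context.synchronize()                      # GEMM is asynchronous; wait before timing
elapsed = time() - t_start
cublas.cublasDestroy(handle)

# a GEMM of these sizes performs roughly 2*m*n*k floating-point operations
print('single-precision throughput: %.2f GFLOPS' % (2 * m * n * k / elapsed / 1e9))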

Example #5
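This example assumes host arrays a_cpu, b_cpu, and c_cpu were defined earlier. A plausible setup (hypothetical values; a_cpu is stored column-major so that trans = 'n' with lda = 2 below is valid) is:

# hypothetical setup for the host arrays used below (not part of the original snippet)
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas

# a 2x3 matrix stored in column-major (Fortran) order, matching lda = 2 and trans = 'n'
a_cpu = np.asfortranarray(np.random.rand(2, 3).astype(np.float32))
b_cpu = np.random.rand(3, 1).astype(np.float32)
c_cpu = np.dot(a_cpu, b_cpu)   # CPU reference result
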
print("\nComputing matrix vector mutliplication in CPU:\n")
print("a = \n", a_cpu, "\n")
print("b = \n", b_cpu, "\n")
print("Product =\n", c_cpu, "\n")

# allocating and converting (a) to a gpuarray:
a_gpu = gpuarray.to_gpu(a_cpu)
# allocating and converting (b) to a gpuarray:
b_gpu = gpuarray.to_gpu(b_cpu)
# allocating c as a 2x1 matrix filled with zeros:
c_gpu = gpuarray.zeros((2,1), dtype = np.float32)
# Computing matrix product of gpu (a) and (b) and storing it in gpu (c):
# create a cuBLAS context, run the GEMV (c = alpha*a*b + beta*c), then release the context
handle = cublas.cublasCreate()
cublas.cublasSgemv(handle, trans = 'n',
                   m = 2, n = 3,
                   alpha = 1.0,
                   A = a_gpu.gpudata, lda = 2,
                   x = b_gpu.gpudata, incx = 1,
                   beta = 1.0, y = c_gpu.gpudata, incy = 1)
cublas.cublasDestroy(handle)

# printing the process(GPU):
print("\nComputing matrix vector mutliplication in GPU:\n")
print("a = \n", a_gpu.get(), "\n")
print("b = \n", b_gpu.get(), "\n")
print("Product =\n", c_gpu.get(), "\n")



#print(a_gpu, "\n", b_gpu)

#print(c_gpu)