Example #1
def forward(self, matrix1, matrix2):
    with torch.cuda.device_of(matrix1):
        dim1, dim2 = matrix1.size()
        dim2, dim3 = matrix2.size()
        output = matrix1.new(dim1, dim3)
        # reuse PyTorch's cuBLAS handle and run on the current stream
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream().cuda_stream
        cublas.cublasSetStream(handle, stream)
        # cuBLAS is column-major, so compute output^T = matrix2^T * matrix1^T
        # by passing the operands in swapped order with swapped dimensions
        if isinstance(matrix1, torch.cuda.FloatTensor):
            cublas.cublasSgemm(handle, 'n', 'n', dim3, dim1, dim2, 1,
                               matrix2.data_ptr(), dim3,
                               matrix1.data_ptr(), dim2, 0,
                               output.data_ptr(), dim3)
        elif isinstance(matrix1, torch.cuda.DoubleTensor):
            cublas.cublasDgemm(handle, 'n', 'n', dim3, dim1, dim2, 1,
                               matrix2.data_ptr(), dim3,
                               matrix1.data_ptr(), dim2, 0,
                               output.data_ptr(), dim3)
    self.save_for_backward(matrix1, matrix2)
    return output
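For completeness, a matching backward pass can be written with plain tensor products instead of raw cuBLAS calls. This is only a minimal sketch, assuming the same old-style torch.autograd.Function API as the forward above; for output = matrix1 @ matrix2 the gradients are grad_output @ matrix2^T and matrix1^T @ grad_output.

def backward(self, grad_output):
    matrix1, matrix2 = self.saved_tensors
    # d(matrix1) = grad_output @ matrix2^T
    grad_matrix1 = grad_output.mm(matrix2.t())
    # d(matrix2) = matrix1^T @ grad_output
    grad_matrix2 = matrix1.t().mm(grad_output)
    return grad_matrix1, grad_matrix2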
Example #2
def mult_BLAS():
	# assumes module-level GPU arrays (T_gpu, v_*_gpu, U_*_gpu) and a `ud`
	# module holding the problem dimensions
	alpha 	= np.float64(1.0) # no prefactor
	beta 	= np.float64(0.0) # C matrix is not involved so beta = 0.0
	#m, k, n = ud.basis_size, ud.basis_size, ud.basis_size**2
	handle = cublas.cublasCreate() # create one cuBLAS handle and reuse it for every call
	t0 = time.perf_counter()
	for a in range(100):
		cublas.cublasDgemm(handle = handle, 
							transa = 'n', transb = 'n',
							m 	= ud.i, n 	= ud.j_k, 		k = ud.i_prime,
							lda = ud.i, ldb = ud.i_prime, ldc = ud.i,
							alpha = alpha,  beta = beta, 
							A = T_gpu.gpudata, 
							B = v_x_gpu.gpudata, 
							C = U_x_gpu.gpudata, )
		cublas.cublasDgemm(handle = handle, 
							transa = 'n', transb = 'n',
							m 	= ud.i, n 	= ud.j_k, 		k = ud.i_prime,
							lda = ud.i, ldb = ud.i_prime, ldc = ud.i,
							alpha = alpha,  beta = beta, 
							A = T_gpu.gpudata, 
							B = v_y_gpu.gpudata, 
							C = U_y_gpu.gpudata, )
		cublas.cublasDgemm(handle = handle, 
							transa = 'n', transb = 'n',
							m 	= ud.i, n 	= ud.j_k, 		k = ud.i_prime,
							lda = ud.i, ldb = ud.i_prime, ldc = ud.i,
							alpha = alpha,  beta = beta, 
							A = T_gpu.gpudata, 
							B = v_z_gpu.gpudata, 
							C = U_z_gpu.gpudata, )
		'''cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= ud.i, n 	= ud.j_k, 		k = ud.i_prime,
							lda = ud.i, ldb = ud.i_prime, ldc = ud.i,
							alpha = alpha,  beta = beta, 
							A = pot_gpu.gpudata, 
							B = v_x_gpu.gpudata, 
							C = potential_gpu.gpudata, )'''
	print(time.perf_counter() - t0, "mult_BLAS timer")
	cublas.cublasDestroy(handle)
	return
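Note that cuBLAS launches are asynchronous, so a wall-clock timer around the calls above mostly measures launch overhead. A minimal sketch of a more honest timing loop, assuming the PyCUDA context from pycuda.autoinit; the helper name timed_gemms is made up for illustration:

import time
import pycuda.autoinit

def timed_gemms(gemm_call, iterations=100):
    # gemm_call: any zero-argument callable that queues cuBLAS work
    pycuda.autoinit.context.synchronize()   # make sure nothing is still pending
    t0 = time.perf_counter()
    for _ in range(iterations):
        gemm_call()
    pycuda.autoinit.context.synchronize()   # wait for the queued GEMMs to finish
    return time.perf_counter() - t0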
Example #3
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel
import skcuda.cublas as cublas


def diag_gpu(A, v1):
    # create one cuBLAS handle for the whole routine
    current_handle = cublas.cublasCreate()

    m = A.shape[0]
    Q = np.zeros((m, m), dtype=np.float64)
    # Q[0, :] = 0.0 # implied
    Q[1, :] = v1.copy()
    beta = np.zeros(m, dtype=np.float64)
    alpha = np.zeros(m, dtype=np.float64) 

    # move data onto the GPU
    A_gpu = gpuarray.to_gpu(A)
    Q_gpu = gpuarray.to_gpu(Q)
    beta_gpu = gpuarray.to_gpu(beta)
    alpha_gpu = gpuarray.to_gpu(alpha)
    w = gpuarray.zeros(m, dtype=np.float64)

    # we define three kernels for simple arithmetic
    w_scale = ElementwiseKernel(
        arguments="double *w, double *alpha, double *beta, double *Q1, double *Q2, int loop_index",
        operation="w[i] = w[i] - (alpha[loop_index] * Q1[i]) - (beta[loop_index] * Q2[i])",
        name="element_wise_w_building")
    # using -= to do inplace subtraction gives an incorrect answer


    norm_krnl = ReductionKernel(np.float64, neutral="0.0", reduce_expr="a+b", 
        map_expr="x[i]*x[i]", arguments="double *x")

    ediv = ElementwiseKernel(
        arguments="double *a, double *b, double *c, int loop_index",
        operation="a[i] = b[i] / c[loop_index+1]",
        name="element_wise_division")
    # note that the kernel name must not contain spaces

    for i in range(1, m-1):
        # w = A . Q[i]  (trans='T' on the row-major buffer yields the untransposed product)
        cublas.cublasDgemv(handle = current_handle, trans = 'T',
                            m = m, n = m, # Hermitian matrix
                            alpha = 1.0, 
                            beta = 0.0,
                            A = A_gpu.gpudata, 
                            lda = m,
                            x = Q_gpu[i, :].gpudata, 
                            incx = 1,    
                            y = w.gpudata, 
                            incy = 1,
                            )

        # dot product w . Q[i], written straight into alpha_gpu[i] via a 1x1 GEMM
        cublas.cublasDgemm(handle = current_handle, 
                            transa = 'n', transb = 'n',
                            m   = 1, n  = 1,      k = m,
                            lda = 1, ldb = m, ldc = 1,
                            alpha = 1.0,  beta = 0.0, 
                            A = w.gpudata, 
                            B = Q_gpu[i, :].gpudata, 
                            C = alpha_gpu[i].gpudata)


        w_scale(w, alpha_gpu, beta_gpu, Q_gpu[i, :], Q_gpu[i-1, :], i)
        beta_gpu[i+1] = cumath.sqrt(norm_krnl(w))
        ediv(Q_gpu[i+1, :], w, beta_gpu, i)
    # end of loop

    # last 2 steps
    cublas.cublasDgemv(handle = current_handle, trans = 'T',
                            m = m, n = m, # Hermitian matrix
                            alpha = 1.0,
                            beta = 0.0,
                            A = A_gpu.gpudata, 
                            lda = m,
                            x = Q_gpu[-1, :].gpudata,
                            incx = 1,    
                            y = w.gpudata,
                            incy = 1,)

    cublas.cublasDgemm(handle = current_handle, 
                        transa = 'n', transb = 'n',
                        m   = 1, n  = 1,  k = m,
                        lda = 1, ldb = m, ldc = 1,
                        alpha = 1.0,  beta = 0.0, 
                        A = w.gpudata, 
                        B = Q_gpu[-1, :].gpudata, 
                        C = alpha_gpu[-1].gpudata)

    # retrieve the alphas and betas
    alpha_cpu = alpha_gpu.get()
    beta_cpu = beta_gpu.get()

    print("GPU: ", alpha_cpu, beta_cpu, sep="\n\n")
    # make a tridiagonal matrix out of alpha and beta (see the sketch below)
    # Tri = np.zeros(matrix_size)
    cublas.cublasDestroy(current_handle)
    return
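A minimal sketch of that final commented-out step (building and diagonalising the tridiagonal Lanczos matrix on the CPU), assuming the indexing convention used in diag_gpu where index 0 of alpha and beta is unused, alpha_cpu[1:] holds the diagonal and beta_cpu[2:] the off-diagonal entries; the helper name tridiag_eigs is made up for illustration:

from scipy.linalg import eigh_tridiagonal

def tridiag_eigs(alpha_cpu, beta_cpu):
    diag = alpha_cpu[1:]     # m-1 diagonal entries
    offdiag = beta_cpu[2:]   # m-2 off-diagonal entries, one fewer than diag
    # eigendecomposition of the symmetric tridiagonal Lanczos matrix
    evals, evecs = eigh_tridiagonal(diag, offdiag)
    return evals, evecs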
Example #4
	# allocate space on gpu for results
	U_x_gpu = gpuarray.zeros((basis_size, basis_size*basis_size), np.float64) # an empty matrix of the right size
	U_y_gpu = gpuarray.zeros((basis_size, basis_size*basis_size), np.float64) # an empty matrix of the right size
	U_z_gpu = gpuarray.zeros((basis_size, basis_size*basis_size), np.float64) # an empty matrix of the right size

#m, k, n = ud.basis_size, ud.basis_size, ud.basis_size**2
for basis in range(BASIS_SIZE):
	prepare_gpu(basis) # set it up
	i, j_k, i_prime = basis_size, basis_size*basis_size, basis_size
	initial_time = time.perf_counter()
	for num_iter in range(ITERATIONS):
		cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= i, n 	= j_k, 		k = i_prime,
							lda = i, ldb = i_prime, ldc = i,
							alpha = ud.alpha,  beta = ud.beta, 
							A = T_gpu.gpudata, 
							B = v_x_gpu.gpudata, 
							C = U_x_gpu.gpudata, )
		cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= i, n 	= j_k, 		k = i_prime,
							lda = i, ldb = i_prime, ldc = i,
							alpha = ud.alpha,  beta = ud.beta, 
							A = T_gpu.gpudata, 
							B = v_y_gpu.gpudata, 
							C = U_y_gpu.gpudata, )
		cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= i, n 	= j_k, 		k = i_prime,
							lda = i, ldb = i_prime, ldc = i,
Example #5
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas

# build the inputs in Fortran (column-major) order, which is what cuBLAS expects
A = np.array(([1, 2, 3], [4, 5, 6]), order='F').astype(np.float64)
B = np.array(([7, 8, 1, 5], [9, 10, 0, 9], [11, 12, 5, 5]),
             order='F').astype(np.float64)

A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)

m, k = A_gpu.shape
k, n = B_gpu.shape

C_gpu = gpuarray.empty((m, n), np.float64)

alpha = np.float64(1.0)
beta = np.float64(0.0)

cublas_handle = cublas.cublasCreate()
cublas.cublasDgemm(cublas_handle, 'n', 'n', m, n, k, alpha, A_gpu.gpudata, m,
                   B_gpu.gpudata, k, beta, C_gpu.gpudata, m)
cublas.cublasDestroy(cublas_handle)

# reinterpret the column-major GEMM output so it lines up with the NumPy result
C_gpu = C_gpu.reshape(C_gpu.shape, order='F')

print(np.dot(A, B))
print(C_gpu)
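Example #5 sidesteps the layout question by building the inputs in Fortran order. An alternative, the same trick Example #1 uses from PyTorch, is to keep the default C-ordered (row-major) NumPy arrays and let cuBLAS compute the transposed product by swapping the operands. A minimal sketch under that assumption:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas

A = np.random.rand(2, 3)   # row-major, NumPy's default layout
B = np.random.rand(3, 4)
m, k = A.shape
_, n = B.shape

A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)
C_gpu = gpuarray.empty((m, n), np.float64)   # row-major result buffer

handle = cublas.cublasCreate()
# Column-major cuBLAS sees the row-major buffers as A^T and B^T, so computing
# C^T = B^T * A^T (operands swapped, leading dimensions equal to the row lengths)
# leaves a buffer that reads back, row-major, as C = A * B.
cublas.cublasDgemm(handle, 'n', 'n', n, m, k, 1.0,
                   B_gpu.gpudata, n,
                   A_gpu.gpudata, k,
                   0.0,
                   C_gpu.gpudata, n)
cublas.cublasDestroy(handle)

print(np.allclose(C_gpu.get(), A.dot(B)))   # expected: True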