def _perform_sgemv(self, mat, v, vec_out, nvecs, dim): ''' NOTES: cuBLAS uses Fortran layout cublas_sgemv is used to multiply matrix and vector (LEVEl 2 BLAS) cublas_handle -> handle to the cuBLAS library context t -> transpose dim -> number of columns of matrix nvecs -> number of rows of matrix alpha -> scalar used for multiplication of mat mat.gpudata -> matrix mat dim -> columns of matrix v.gpudata -> vector v incX -> Stride within X. For example, if incX is 7, every 7th element is used. beta -> scalar used for multiplication of v v_out.gpudata -> result incY -> Stride within Y. For example, if incx is 7, every 7th element is used Readmore -> http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemv ''' alpha = np.float32(1.0) beta = np.float32(0.0) incx = 1 incy = 1 cublas_handle = cublas.cublasCreate() cublas.cublasSgemv(cublas_handle, 't', dim, nvecs, alpha, mat.gpudata, dim, v.gpudata, incx, beta, vec_out.gpudata, incy) cublas.cublasDestroy(cublas_handle) return vec_out
def _perform_sgemm(self, mat_a, mat_b, mat_out): nvecs_a, dim = mat_a.shape nvecs_b, dim = mat_b.shape alpha = np.float32(1.0) beta = np.float32(0.0) ''' cublas_sgemm is used to multiply matrix and matrix(LEVEL 3 BLAS) cublas_handle -> handle to the cuBLAS-library context t -> transpose mat_b n -> notranspose mat_a nvecs_b -> rows of mat_b nvecs_a -> rows of mat_a dim -> Common dimensions in mat_a and mat_b alpha -> scaling factor for multiplication of mat_a and mat_b mat_b.gpudata -> matrix mat_b dim -> columns of mat_b mat_a.gpudata -> matrix mat_a dim -> columns of mat_a beta -> scaling factor for r_gpu mat_out.gpudata -> matirx mat_out nvecs_b -> rows of mat_b Read more -> http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm ''' cublas_handle = cublas.cublasCreate() cublas.cublasSgemm(cublas_handle, 't', 'n', nvecs_b, nvecs_a, dim, alpha, mat_b.gpudata, dim, mat_a.gpudata, dim, beta, mat_out.gpudata, nvecs_b) cublas.cublasDestroy(cublas_handle) return mat_out
def ijk_lu_decomposer_opt_gpu(M): m = M.shape[0] n = M.shape[1] import pycuda.autoinit import pycuda.gpuarray as gpuarray from skcuda.cublas import cublasCreate, cublasDestroy, cublasDdot#, cublasDscal import skcuda.misc as misc #import skcuda.linalg as linalg #linalg.init() N_gpu = gpuarray.to_gpu(M) h = cublasCreate() for i in range(0,n): for j in range(0,n): #N[i,j] -= N[i,:min(i,j)].dot(N[:min(i,j),j]) N_gpu[i,j] -= cublasDdot(h, N_gpu[i,:min(i,j)].size, N_gpu[i,:min(i,j)].gpudata, 1, N_gpu[:min(i,j),j].gpudata, n) if j<i: N_gpu[i,j] /= N_gpu[j,j] #cublasDscal(h, N_gpu[i,j].size, 1.0/np.float64(N_gpu[j,j].get()), N_gpu[i,j].gpudata, 1) #Move from GPU to CPU N = N_gpu.get() cublasDestroy(h) return N
def jki_lu_decomposer_opt_gpu(M): m = M.shape[0] n = M.shape[1] import pycuda.autoinit import pycuda.gpuarray as gpuarray from skcuda.cublas import cublasCreate, cublasDaxpy, cublasDscal, cublasDestroy import skcuda.misc as misc N_gpu = gpuarray.to_gpu(M) h = cublasCreate() for j in range(0,n): for k in range(0,j): #N[k+1:,j] = N[k+1:,j] - N[k+1:,k] * N[k,j] cublasDaxpy(h, N_gpu[k+1:,k].size, -np.float64(N_gpu[k,j].get()), N_gpu[k+1:,k].gpudata, n, N_gpu[k+1:,j].gpudata, n) #N[j+1:,j] /= N[j,j] cublasDscal(h, N_gpu[j+1:,j].size, 1.0/np.float64(N_gpu[j,j].get()), N_gpu[j+1:,j].gpudata, n) #Move from GPU to CPU N = N_gpu.get() cublasDestroy(h) return N
def kij_lu_decomposer_opt_gpu(M): m = M.shape[0] n = M.shape[1] import pycuda.autoinit import pycuda.gpuarray as gpuarray from skcuda.cublas import cublasCreate, cublasDaxpy, cublasDestroy import skcuda.misc as misc N_gpu = gpuarray.to_gpu(M) h = cublasCreate() for k in range(0,n): for i in range(k+1,n): N_gpu[i,k] = N_gpu[i,k] / N_gpu[k,k] #N[i,k+1:] -= N[i,k] * N[k,k+1:] cublasDaxpy(h, N_gpu[k,k+1:].size, -np.float64(N_gpu[i,k].get()), N_gpu[k,k+1:].gpudata, 1, N_gpu[i,k+1:].gpudata, 1) #Move from GPU to CPU N = N_gpu.get() cublasDestroy(h) return N
def GMRES_d(A_d, B, X0, tol, Nmax, precision): handler = cublas.cublasCreate() # Initialisation N = B.shape[0] Q = np.zeros((N, Nmax + 1)) H = np.zeros((Nmax + 1, Nmax + 1)) cs = np.zeros(Nmax) sn = np.zeros(Nmax) e1 = np.zeros(Nmax + 1) #Test preliminaire normB = np.linalg.norm(B) if normB == 0: print("Nombre d'iterations : 0") print("Erreur : ", 0) if precision == 2: return np.zeros(N, np.float64) else: return np.zeros(N, np.float32) # Residu initial Ax = product_Ax(A_d, X0, N, handler, precision) r0 = B - Ax Q[:, 0] = r0 / np.linalg.norm(r0) e1[0] = 1 error = np.linalg.norm(r0) / np.linalg.norm(B) print("Error:", error) beta = np.linalg.norm(r0) * e1 k = 0 while (error > tol) and k < min(Nmax, N) - 1: Arnoldi_d(Q, A_d, H, k, handler, precision) apply_rotation(H, cs, sn, k) beta[k + 1] = -sn[k] * beta[k] beta[k] = cs[k] * beta[k] error = abs(beta[k + 1]) / np.linalg.norm(B) k += 1 print("Nombre d'iterations : {}".format(k)) print("Erreur :", error) Hinv = np.linalg.inv(H[:k, :k]) y = np.matmul(Hinv[:k, :k], beta[:k]) X = np.dot(Q[:, :k], y) + X0 return X
def init(): import atexit def _shutdown_gpucsrarray(): cublas.cublasDestroy(cublas_handle) cusparse.cusparseDestroy(cusparse_handle) global cublas_handle, cusparse_handle if cublas_handle is None or cusparse_handle is None: cublas_handle = cublas.cublasCreate() cusparse_handle = cusparse.cusparseCreate() atexit.register(_shutdown_gpucsrarray)
def mult_BLAS(): alpha = np.float64(1.0) # no prefactor beta = np.float64(0.0) # C matrix is not involved so beta = 0.0 #m, k, n = ud.basis_size, ud.basis_size, ud.basis_size**2 t0 = time.clock() for a in range(100): cublas.cublasDgemm(handle = cublas.cublasCreate(), transa = 'n', transb = 'n', m = ud.i, n = ud.j_k, k = ud.i_prime, lda = ud.i, ldb = ud.i_prime, ldc = ud.i, alpha = alpha, beta = beta, A = T_gpu.gpudata, B = v_x_gpu.gpudata, C = U_x_gpu.gpudata, ) cublas.cublasDgemm(handle = cublas.cublasCreate(), transa = 'n', transb = 'n', m = ud.i, n = ud.j_k, k = ud.i_prime, lda = ud.i, ldb = ud.i_prime, ldc = ud.i, alpha = alpha, beta = beta, A = T_gpu.gpudata, B = v_y_gpu.gpudata, C = U_y_gpu.gpudata, ) cublas.cublasDgemm(handle = cublas.cublasCreate(), transa = 'n', transb = 'n', m = ud.i, n = ud.j_k, k = ud.i_prime, lda = ud.i, ldb = ud.i_prime, ldc = ud.i_prime, alpha = alpha, beta = beta, A = T_gpu.gpudata, B = v_z_gpu.gpudata, C = U_z_gpu.gpudata, ) '''cublas.cublasDgemm(handle = cublas.cublasCreate(), transa = 'n', transb = 'n', m = ud.i, n = ud.j_k, k = ud.i_prime, lda = ud.i, ldb = ud.i_prime, ldc = ud.i, alpha = alpha, beta = beta, A = pot_gpu.gpudata, B = v_x_gpu.gpudata, C = potential_gpu.gpudata, )''' print(time.clock() - t0, "mult_BLAS timer") return
def get_cublas_handle(): """Gets CUBLAS handle for the current device. Returns: CUBLAS handle. """ global _cublas_handles device = Context.get_device() if device in _cublas_handles: return _cublas_handles[device] handle = cublas.cublasCreate() _cublas_handles[device] = handle return handle
def compute_gflops(precision='S'): if precision=='S': float_type = 'float32' elif precision=='D': float_type = 'float64' else: return -1 A = np.random.randn(m, k).astype(float_type) B = np.random.randn(k, n).astype(float_type) C = np.random.randn(m, n).astype(float_type) A_cm = A.T.copy() B_cm = B.T.copy() C_cm = C.T.copy() A_gpu = gpuarray.to_gpu(A_cm) B_gpu = gpuarray.to_gpu(B_cm) C_gpu = gpuarray.to_gpu(C_cm) alpha = np.random.randn() beta = np.random.randn() transa = cublas._CUBLAS_OP['N'] transb = cublas._CUBLAS_OP['N'] lda = m ldb = k ldc = m t = time() handle = cublas.cublasCreate() exec('cublas.cublas%sgemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda, \ B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)' % precision) cublas.cublasDestroy(handle) t = time() - t gflops = 2*m*n*(k+1)*(10**-9) / t return gflops
def count_triangles_cublas(adjacency_list): driver.init() context = tools.make_default_context() h = cublas.cublasCreate() n = len(adjacency_list) A = np.zeros([n,n], dtype=np.float64) for row_idx, neighbor_list in adjacency_list: A[row_idx, neighbor_list] = 1.0 a_gpu = gpuarray.to_gpu(A) b_gpu = gpuarray.empty(A.shape, A.dtype) c_gpu = gpuarray.empty(A.shape, A.dtype) one = np.float64(1.0) zero = np.float64(0.0) cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, a_gpu.gpudata, n, zero, b_gpu.gpudata, n) cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, b_gpu.gpudata, n, zero, c_gpu.gpudata, n) trace = linalg.trace(c_gpu, h) cublas.cublasDestroy(h) context.detach() return int(trace/6)
def compute_gflops(precision='S'): if precision == 'S': float_type = 'float32' elif precision == 'D': float_type = 'float64' else: return -1 # some random matrices that are of the appropriate precision that we will use for timing A = np.random.randn(m, k).astype(float_type) B = np.random.randn(k, n).astype(float_type) C = np.random.randn(m, n).astype(float_type) # wie gehabt A_cm = A.T.copy() B_cm = B.T.copy() C_cm = C.T.copy() A_gpu = gpuarray.to_gpu(A_cm) B_gpu = gpuarray.to_gpu(B_cm) C_gpu = gpuarray.to_gpu(C_cm) alpha = np.random.randn() beta = np.random.randn() transa = cublas._CUBLAS_OP['N'] transb = cublas._CUBLAS_OP['N'] lda = m ldb = k ldc = m t = time() handle = cublas.cublasCreate() # two different (relevant) precision modes, D and S exec( 'cublas.cublas%sgemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda, \ B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)' % precision) cublas.cublasDestroy(handle) t = time() - t # a total of 2kmn - mn + 3mn = 2kmn + 2mn = 2mn(k+1) floating point operations in a given GEMM operation gflops = 2 * m * n * (k + 1) * (10**-9) / t return gflops
def matmul(self, mat, return_time=False): '''Matrix multiplication between two matrices''' # check dimensions first: if self.ncols != mat.nrows: raise ValueError("Dimensions {0} and {1} do not match.".format( self.ncols, mat.nrows)) # move matrices to gpu # somehow we need to transpose this to make it work, not sure why tho a_gpu = gpuarray.to_gpu(self.arr.T.copy()) b_gpu = gpuarray.to_gpu(mat.arr.T.copy()) c_gpu = gpuarray.to_gpu( np.zeros((self.nrows, mat.ncols)).astype(np.float32).T.copy()) # initialize culas context h = cublasCreate() # evaluate the matrix multiplication # cubals syntax as follows: # h = handle, "n" : op(A) = A (for op(A) = A^t, use "t"), # then list m, n, k if we have m x k dot k x n # then first value is scalar in front of product (set to 1) # then give array data followed by row dim of each matrix # last value is for adding another matrix, set to 0. # also record the time it takes for the evaluation t0 = time.perf_counter_ns() cublasSgemm(h, "n", "n", self.nrows, mat.ncols, self.ncols, np.float32(1.0), a_gpu.gpudata, self.nrows, b_gpu.gpudata, mat.nrows, np.float32(0.0), c_gpu.gpudata, self.nrows) t1 = time.perf_counter_ns() eval_time = (t1 - t0) * (1e-9) # time for each matmul evaluation # move from device to host prod_arr = c_gpu.get().T # free allocated memory for handle cublasDestroy(h) return (cublasMatrix(prod_arr), eval_time) if return_time else cublasMatrix(prod_arr)
def __init__(self, inputs, outputs, norm=None, precision=np.float64): super(SLFNSkCUDA, self).__init__(inputs, outputs, norm, precision) # startup GPU #self.ctx = misc.init_context(misc.init_device(nDevice)) # NO NO NO, crashes and does not release memory # use CUDA_DEVICE=0 python my-script.py try: linalg.init() except OSError as e: pass # no 'cusolver' library which is paid and not needed # print "error initializing scikit-cuda: %s" % e # print "ignore if toolbox works" # precision-dependent stuff if precision is np.float64: self.posv = lapack.dposv else: self.posv = lapack.sposv self.handle = cublas.cublasCreate() # prepare GPU function kernels kernel = """ __global__ void dev_sigm(%s *a) { unsigned idx = blockDim.x * blockIdx.x + threadIdx.x; a[idx] = 1.0 / ( exp(a[idx]) + 1 ); } """ kernel = kernel % "double" if self.precision is np.float64 else kernel % "float" self.dev_sigm = SourceModule(kernel).get_function("dev_sigm") self.dev_sigm.prepare("P") # GPU transformation functions self.func["lin"] = self._dev_lin self.func["sigm"] = self._dev_sigm self.func["tanh"] = self._dev_tanh self.func["rbf_l1"] = self._dev_rbfl1 self.func["rbf_l2"] = self._dev_rbfl2 self.func["rbf_linf"] = self._dev_rbflinf
def cor_mat_2(BOLD, upper_tri, N, L): # preprocessing fMRI data in CPU start_time = time.time() BOLD = preprocessing(BOLD, N, L) stop_time = time.time() delta = stop_time - start_time print("Running time for preprocessing: ", delta, "\n") alpha = np.float32(1.0) beta = np.float32(0.0) # passaggio su device start_time = time.time() BOLD_device = gpuarray.to_gpu(BOLD) result = np.zeros((BOLD.shape[0], BOLD.shape[0]), np.float32) result_device = gpuarray.to_gpu(result) # print("BOLD_device shape:", BOLD_device.shape) # print("result_device shape:", result_device.shape) stop_time = time.time() delta = stop_time - start_time print("Running time matrices to device: ", delta, "\n") start_time = time.time() h = cublas.cublasCreate() cublas.cublasSgemm(h, 'T', 'n', N, N, L, alpha, BOLD_device.gpudata, L, BOLD_device.gpudata, L, beta, result_device.gpudata, N) stop_time = time.time() delta = stop_time - start_time print("Running time core function: ", delta, "\n") start_time = time.time() threads_per_block = 1024 blocks_per_grid = int(math.ceil(1 + ((N*N - 1) / threads_per_block))) mod = pycuda.compiler.SourceModule(""" __global__ void ker(float * cormat, float * upper,int n1,int n) { long idx = blockDim.x*blockIdx.x+threadIdx.x; long i = idx%n1; long j = idx/n1; if(i<j && i<n1 && j<n) { long tmp=i; tmp*=(i+1); tmp/=2; long tmp_2=i; tmp_2*=n; tmp_2=tmp_2-tmp; tmp_2+=j; tmp_2-=i; upper[tmp_2-1]=cormat[j*n+i]; } } """) result_device = result_device.reshape(-1) # print("result device shape:", result_device.shape) upper_tri_device = gpuarray.to_gpu(upper_tri) funct = mod.get_function("ker") funct(result_device, upper_tri_device, np.int32(N), np.int32(N), block=(threads_per_block, 1, 1), grid=(blocks_per_grid, 1) ) upper_tri = upper_tri_device.get() stop_time = time.time() delta = stop_time - start_time print("Running time to get upper tri: ", delta, "\n") cublas.cublasDestroy(h) return upper_tri
def cor_mat_3(BOLD, upper_tri, N, L, OOO): # calcolo memoria disponibile meminfo = cuda.mem_get_info() print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1])) available_mem = float(meminfo[0]) available_mem /= np.dtype(np.float32).itemsize available_mem -= N * L # print("Available memory: ", available_mem) # preprocessing fMRI data in CPU start_time = time.time() BOLD = preprocessing(BOLD, N, L) stop_time = time.time() delta = stop_time - start_time print("Running time for preprocessing: ", delta, "\n") # passaggio di BOLD in device start_time = time.time() BOLD_device = gpuarray.to_gpu(BOLD) stop_time = time.time() delta = stop_time - start_time print("Running time matrices to device: ", delta, "\n") # calcolo memoria disponibile # meminfo = cuda.mem_get_info() # print("After BOLD_device free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1])) # inizializzazione variabili flag = 1 ii=0 upper_size = (N-1) * N / 2 block = OOO N_prime = N temp = 0 temp2 = 0 temp3 = 0 pak = 0 so_far = 0 count = 1 temp4 = 0 alpha = np.float32(1.0) beta = np.float32(0.0) while flag is 1: print("###### ITERAZIONE ", count, " #####") # calcolo memoria disponibile # meminfo = cuda.mem_get_info() # print("After BOLD_device free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1])) # print("block: ", block) # print("N_prime: ", N_prime) # checking for the last chunk if block == N_prime: flag = 0 if pak is not 0: del dev_upper del result_device temp = block temp *= (block + 1) temp /= 2 # M1 is the size of upper triangle part of chunk M1 = N_prime M1 *= block M1 -= temp M1 = int(M1) # print("M1: ", M1) pak += 1 # print("so_far*L: ", so_far*L) start_time = time.time() result = np.zeros((block, N_prime), np.float32) # print("result shape: ", result.shape) BOLD_device = BOLD_device.reshape(-1) # allocate memory on the device for the result result_device = gpuarray.to_gpu(result) # print("result_device shape: ", result_device.shape) # # calcolo memoria disponibile # meminfo = cuda.mem_get_info() # print("Before cublasSgemm free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1])) stop_time = time.time() delta = stop_time - start_time print("Running time matrices to device: ", delta, "\n") start_time = time.time() h = cublas.cublasCreate() cublas.cublasSgemm(h, 'T', 'n', block, N_prime, L, alpha, BOLD_device[so_far*L:].gpudata, L, BOLD_device[so_far*L:].gpudata, L, beta, result_device.gpudata, block) stop_time = time.time() delta = stop_time - start_time print("Running time core function: ", delta, "\n") temp2 = block temp2 *= N_prime # calcolo memoria disponibile # meminfo = cuda.mem_get_info() # print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1])) # result_device = gpuarray.to_gpu(result1) start_time = time.time() threads_per_block = 1024 blocks_per_grid = 1 + math.ceil(((temp2-1) / threads_per_block)) grid = (blocks_per_grid, 1) # print("temp2:", temp2) # print("threads_per_block: ", threads_per_block) # print("blocks_per_grid: ", blocks_per_grid) upper = np.zeros(M1, np.float32) # print("upper shape:", upper.shape) dev_upper = gpuarray.to_gpu(upper) # print("dev_upper shape: ", dev_upper.shape) # print("result_device shape: ", result_device.shape) mod = pycuda.compiler.SourceModule(""" __global__ void ker2(float * cormat, float * upper,int n1,int n,long long upper_size,int N,int i_so_far,long long M1) { long long idx = blockDim.x; idx*=blockIdx.x; idx+=threadIdx.x; long i = idx/n; long j = idx%n; if(i<j && i<n1 && j<n)// &&i<N &&j<N && idx<(n1*n)) { long long tmp=i; tmp*=(i+1); tmp/=2; long long tmp_2=i; tmp_2*=n; tmp_2=tmp_2-tmp; tmp_2+=j; tmp_2-=i; long long indexi=n1; indexi*=j; indexi=indexi+i; upper[tmp_2-1]=cormat[indexi]; } } """) # calcolo memoria disponibile # meminfo = cuda.mem_get_info() # print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1])) funct = mod.get_function("ker2") funct(result_device, dev_upper, np.int32(block), np.int32(N_prime), np.int64(upper_size), np.int32(N), np.int32(ii), np.int64(M1), block=(threads_per_block, 1, 1), grid=grid ) temp3+=M1 # print("upper_tri shape:", upper_tri.shape) upper_tri[temp4:temp3] = dev_upper.get() stop_time = time.time() delta = stop_time - start_time print("Running time to get upper tri: ", delta, "\n") temp4 += M1 ii += block cublas.cublasDestroy(h) so_far += block if N_prime > block: N_prime = N_prime - block block = remaining_N2(N_prime, L, available_mem) if N_prime < block: block = N_prime count += 1 # liberare la memoria del BOLD_device del result_device del dev_upper # calcolo memoria disponibile # meminfo = cuda.mem_get_info() # print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1])) return upper_tri
def setUpClass(cls): cls.ctx = make_default_context() cls.cublas_handle = cublas.cublasCreate()
def register_multiple_images_subpix_cuda(stack, template): import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.cumath as cumath import skcuda.fft as cu_fft import skcuda.linalg as lin import skcuda.cublas as cub from numpy import pi, newaxis, floor import cmath from pycuda.elementwise import ElementwiseKernel from pycuda.compiler import SourceModule from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix from numpy.fft import fftshift, ifftshift fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1, use_numpy_fft=False) mod = SourceModule(""" #include <pycuda-complex.hpp>" __global__ void load_convert(unsigned short *a, float *b,int f, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; int offset = f * imlen; if (idx <imlen) { b[idx] = (float)a[offset+idx]; } } __global__ void convert_export(float *a, unsigned short *b,int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { b[idx] = (unsigned short)(a[idx]>0 ? a[idx] : 0) ; } } __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { z[idx] = x[idx] * y[idx]; } } __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { y[idx]._M_re = x[idx]._M_re; y[idx]._M_im = -x[idx]._M_im; } } __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { y[idx]._M_re = 0; y[idx]._M_im = x[idx] * sx; } } __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen, int nlargeh, int nh) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; int offset = imlenl*3/4; if (idx<imlen) { int target_ind = (offset+(idx/nh)*nlargeh + (idx % nh))%imlenl; x[target_ind] = y[idx]; } } __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy,float dg, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { z[idx]._M_re = 0; z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg; } } __global__ void sub_float(float *x, float *y, float sv, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { x[idx] = y[idx]-sv; } } """) load_convert_kernel = mod.get_function('load_convert') convert_export_kernel = mod.get_function('convert_export') convert_multiply_kernel = mod.get_function('convert_multiply') multiply_float_kernel = mod.get_function('multiply_comp_float') transfer_array_kernel = mod.get_function('transfer_array') calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix') conj_kernel = mod.get_function('calc_conj') sub_float_kernel = mod.get_function('sub_float') Z = stack.shape[0] M = stack.shape[1] N = stack.shape[2] max_memsize = 4200000000 imlen = M * N half_imlen = M * (N // 2 + 1) grid_dim = (64, int(imlen / (512 * 64)) + 1, 1) block_dim = (512, 1, 1) #512 threads per block stack_bin = int(max_memsize / (M * N * stack.itemsize)) stack_ite = int(Z / stack_bin) + 1 usfac = 100 ## needs to be bigger than 10 if not template.shape == stack.shape[1:]: raise ValueError("Images must have same shape.") if np.any(np.isnan(template)): template = template.copy() template[template != template] = 0 if np.any(np.isnan(stack)): stack = stack.copy() stack[stack != stack] = 0 mlarge = M * 2 nlarge = N * 2 t = time.time() plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64) plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32) plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32) cub_h = cub.cublasCreate() template_gpu = gpuarray.to_gpu(template.astype('float32')) source_gpu = gpuarray.empty((M, N), np.float32) ifft_gpu = gpuarray.empty((M, N), np.float32) result_gpu = gpuarray.empty((M, N), np.uint16) templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64) sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64) prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64) prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64) shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64) cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True) templatef_gpu = templatef_gpu.conj() move_list = np.zeros((Z, 2)) largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64) largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32) imlenl = mlarge * (nlarge // 2 + 1) zoom_factor = 1.5 dftshiftG = trunc(ceil(usfac * zoom_factor) / 2) #% Center of output array at dftshift+1 upsample_dim = int(ceil(usfac * zoom_factor)) term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)). T[:, newaxis]) / N # fftfreq # output points term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :] term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis] term2r = (ifftshift(np.arange(M, dtype='float')) - floor(M / 2))[newaxis, :] # fftfreq term1c_gpu = gpuarray.to_gpu(term1c[:int(floor(N / 2) + 1), :].astype('float32')) term2c_gpu = gpuarray.to_gpu(term2c.astype('float32')) term1r_gpu = gpuarray.to_gpu(term1r.astype('float32')) term2r_gpu = gpuarray.to_gpu(term2r.astype('float32')) term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32')) term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32')) kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32) kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32) kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64) kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64) Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M)) Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N)) [Nc, Nr] = np.meshgrid(Nc, Nr) Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32')) Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32')) upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64) upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64) source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype) copy = drv.Memcpy3D() copy.set_src_host(stack.data) copy.set_dst_device(source_stack.gpudata) copy.width_in_bytes = copy.src_pitch = stack.strides[1] copy.src_height = copy.height = M for zb in range(stack_ite): zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z)) copy.depth = len(zrange) copy.src_z = int(zrange[0]) copy() for i in range(len(zrange)): t = zb * stack_bin + i load_convert_kernel(source_stack, source_gpu.gpudata, np.int32(i), np.int32(imlen), block=block_dim, grid=grid_dim) cu_fft.fft(source_gpu, sourcef_gpu, plan_forward, scale=True) multiply_float_kernel(sourcef_gpu, templatef_gpu, prod_gpu1, np.int32(half_imlen), block=block_dim, grid=grid_dim) transfer_array_kernel(largearray1_gpu, prod_gpu1, np.int32(imlenl), np.int32(half_imlen), np.int32(nlarge // 2 + 1), np.int32(N // 2 + 1), block=block_dim, grid=grid_dim) cu_fft.ifft(largearray1_gpu, largearray2_gpu, plan_inverse_big, scale=True) peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size, largearray2_gpu.gpudata, 1) rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape) md2 = trunc(mlarge / 2) nd2 = trunc(nlarge / 2) if rloc > md2: row_shift2 = rloc - mlarge else: row_shift2 = rloc if cloc > nd2: col_shift2 = cloc - nlarge else: col_shift2 = cloc row_shiftG = row_shift2 / 2. col_shiftG = col_shift2 / 2. # Initial shift estimate in upsampled grid row_shiftG0 = round(row_shiftG * usfac) / usfac col_shiftG0 = round(col_shiftG * usfac) / usfac # Matrix multiply DFT around the current shift estimate roffG = dftshiftG - row_shiftG0 * usfac coffG = dftshiftG - col_shiftG0 * usfac sub_float_kernel(term2c_gpu, term2c_gpu_ori, np.float32(coffG / usfac), np.int32(term2c_gpu.size), block=block_dim, grid=grid_dim) sub_float_kernel(term1r_gpu, term1r_gpu_ori, np.float32(roffG), np.int32(term1r_gpu.size), block=block_dim, grid=grid_dim) lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu) lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu) convert_multiply_kernel(kernc_gpu, kernc_gpuc, np.float32(-2 * pi), np.int32(kernc_gpu.size), block=block_dim, grid=grid_dim) convert_multiply_kernel(kernr_gpu, kernr_gpuc, np.float32(-2 * pi / (M * usfac)), np.int32(kernr_gpu.size), block=block_dim, grid=grid_dim) cumath.exp(kernc_gpuc, out=kernc_gpuc) cumath.exp(kernr_gpuc, out=kernr_gpuc) conj_kernel(prod_gpu1, prod_gpu2, np.int32(half_imlen), block=block_dim, grid=grid_dim) lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1) lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2) CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2) rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape) CCGmax = CCG[rlocG, clocG] rlocG = rlocG - dftshiftG #+ 1 # +1 # questionable/failed hack + 1; clocG = clocG - dftshiftG #+ 1 # -1 # questionable/failed hack - 1; row_shiftG = row_shiftG0 + rlocG / usfac col_shiftG = col_shiftG0 + clocG / usfac diffphaseG = arctan2(imag(CCGmax), real(CCGmax)) # Compute registered version of source stack calc_shiftmatrix_kernel(Nr_gpu, Nc_gpu, shiftmatrix, np.float32(row_shiftG * 2 * np.pi), np.float32(col_shiftG * 2 * np.pi), np.float32(diffphaseG), np.int32(half_imlen), block=block_dim, grid=grid_dim) cumath.exp(shiftmatrix, out=shiftmatrix) multiply_float_kernel(sourcef_gpu, shiftmatrix, prod_gpu1, np.int32(half_imlen), block=block_dim, grid=grid_dim) cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse) convert_export_kernel(ifft_gpu, result_gpu, np.int32(imlen), block=block_dim, grid=grid_dim) move_list[t, :] = (row_shiftG, col_shiftG) stack[t, :, :] = result_gpu.get() cub.cublasDestroy(cub_h) return (stack, move_list)
def attach_cublas_handle_to_context(ctx): handle = getattr(ctx, "cublas_handle", None) if handle is None: with ctx: ctx.cublas_handle = cublas.cublasCreate()
def create(self): if self.handle is None: self.handle = cublas.cublasCreate()
import pycuda.autoinit from pycuda import gpuarray import numpy as np from skcuda import cublas a = np.float32(10) x = np.float32([1, 2, 3]) y = np.float32([-.345, 8.15, -15.867]) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.to_gpu(y) cublas_context_h = cublas.cublasCreate() cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1, y_gpu.gpudata, 1) cublas.cublasDestroy(cublas_context_h) print("This is close to the NumPy approximation: {}".format(np.allclose(a * x + y, y_gpu.get())))
b_g = gpuarray.to_gpu(b_c) # allocating a 2x2 matrix filled with zeros: c_g = gpuarray.zeros((2,2), dtype = np.float32) # printing the process(GPU): print('GPU matrix-matrix multiplication:\n') print('a_g = \n', a_g.get(), '\n') print('b_g = \n', b_g.get(), '\n') # storing matrix-matrix product of (a_g) and (b_g) in (c_g): start_gpu = drv.Event() end_gpu = drv.Event() start_gpu.record() cublas.cublasSgemm(cublas.cublasCreate(), transa = 'n', transb = 't', m = 2, n = 2, k = 2, alpha = np.float32(1.0), A = a_g.gpudata, lda = 2, B = b_g.gpudata, ldb = 2, beta = np.float32(0.0), C = c_g.gpudata, ldc = 2) end_gpu.record() end_gpu.synchronize() gpu_time = end_gpu.time_till(start_gpu) # printing GPU time and product: print('GPU time:', gpu_time, '\n')
def __init__(self, input_shape=None, i_gpu=_DEFAULT_GPU, cudaDevice=None, cudaContext=None, cudnnContext=None, cublasContext=None, verbose=None, maxiter=None, keeptime=None, keepvars=None, stepsize=None, parameters=None): """ Initialize contexts if needed """ self.i_gpu = i_gpu self.floatX = _DEFAULT_FLOATX self.verbose = verbose self.maxiter = maxiter self.keeptime = keeptime self.keepvars = keepvars self.stepsize = stepsize self.input_shape = input_shape self.cudaDevice = cudaDevice self.cudaContext = cudaContext self.cudnnContext = cudnnContext self.cublasContext = cublasContext ############################################################ # define circuit parameters as a constant named tuple rather # than a dict to avoid inappropriate modification by user ############################################################ try: for pkey, pval in parameters.iteritems(): _DEFAULT_PARAMETERS_TEMPLATE[pkey] = pval except AttributeError: pass finally: self.parameters = CircuitParameters(**_DEFAULT_PARAMETERS_TEMPLATE) if self.verbose is None: self.verbose = _DEFAULT_VERBOSE if self.maxiter is None: self.maxiter = _DEFAULT_MAXITER if self.keeptime is None: self.keeptime = _DEFAULT_KEEPTIME if self.keepvars is None: self.keepvars = _DEFAULT_KEEPVARS if self.stepsize is None: self.stepsize = _DEFAULT_STEPSIZE if self.input_shape is not None: self._sanity_check() if self.cudaDevice is None: cuda_driver.init() self.cudaDevice = cuda_driver.Device(self.i_gpu) if self.cudaContext is None: self.cudaContext = self.cudaDevice.make_context() if self.cudnnContext is None: self.cudnnContext = cudnn.cudnnCreate() if self.cublasContext is None: self.cublasContext = cublas.cublasCreate() # if input shape is known, initialize now if self.input_shape is not None: self._prepare_kernels() self._prepare_tensors()
def omnical(ggu_indices, gains, ubls, data, wgts, conv_crit=1e-10, maxiter=50, check_every=4, check_after=1, gain=0.3, nthreads=NTHREADS, precision=1, verbose=False): '''CPU-side function for organizing GPU-acceleration primitives into a cohesive omnical algorithm. Args: ggu_indices: (nbls,3) array of (i,j,k) indices denoting data order as gains[i] * gains[j].conj() * ubl[k] gains: (ndata, nants) array of estimated complex gains ubls: (ndata, nubls) array of estimated complex unique baselines data: (ndata, nbls) array of data to be calibrated wgts: (ndata, nbls) array of weights for each data conv_crit: maximum allowed relative change in solutions to be considered converged maxiter: maximum number of omnical iterations allowed before it gives up. check_every: Compute convergence every Nth iteration (saves computation). Default 4. check_after: Start computing convergence only after N iterations. Default 1. gain: The fractional step made toward the new solution each iteration. Values in the range 0.1 to 0.5 are generally safe. Increasing values trade speed for stability. Default is 0.3. nthreads: Number of GPU threads to use. Default NTHREADS=1024 precision: 1=float32, 2=double (float64). Default 1. verbose: If True, print things. Default False. Returns: info dictionary of {'gains':gains, 'ubls':ubls, 'chisq':chisq, 'iters':iters, 'conv':conv} ''' # Sanity check input array dimensions nbls = ggu_indices.shape[0] assert ggu_indices.shape == (nbls, 3) ndata = data.shape[0] assert data.shape == (ndata, nbls) assert wgts.shape == (ndata, nbls) nants = gains.shape[1] assert gains.shape == (ndata, nants) nubls = ubls.shape[1] assert ubls.shape == (ndata, nubls) assert precision in (1, 2) assert check_every > 1 if verbose: print('PRECISION:', precision) print('NDATA:', ndata) print('NANTS:', nants) print('NUBLS:', nubls) print('NBLS:', nbls) # Choose between float/double primitives if precision == 1: real_dtype, complex_dtype = np.float32, np.complex64 DTYPE, CDTYPE = 'float', 'cuFloatComplex' CMULT, CONJ, CSUB = 'cuCmulf', 'cuConjf', 'cuCsubf' COPY = cublasCcopy else: real_dtype, complex_dtype = np.float64, np.complex128 DTYPE, CDTYPE = 'double', 'cuDoubleComplex' CMULT, CONJ, CSUB = 'cuCmul', 'cuConj', 'cuCsub' COPY = cublasZcopy # ensure data types ggu_indices = ggu_indices.astype(np.uint32) data = data.astype(complex_dtype) wgts = wgts.astype(real_dtype) gains = gains.astype(complex_dtype) ubls = ubls.astype(complex_dtype) # Profiling info used to determine optimal chunk size: # Model: 15.3 s + ndata/140 => px=8192 in 75 s on Quadro T2000 # px=2000, nants=126, precision=2 #chunk_size = min(nthreads // 2, ndata) # 8.5 s #chunk_size = min(nthreads // 4, ndata) # 6.7 s #chunk_size = min(nthreads // 8, ndata) # 5.6 s #chunk_size = min(nthreads // 16, ndata) # 5.2 s #chunk_size = min(nthreads // 32, ndata) # 5.4 s #chunk_size = min(nthreads // 64, ndata) # 5.9 s # px=2000, nants=54, precision=2 #chunk_size = min(nthreads // 2, ndata) # 1.5 s #chunk_size = min(nthreads // 4, ndata) # 1.3 s #chunk_size = min(nthreads // 8, ndata) # 1.3 s #chunk_size = min(nthreads // 16, ndata) # 1.5 s #chunk_size = min(nthreads // 32, ndata) # 1.9 s #chunk_size = min(nthreads // 64, ndata) # 3.0 s # px=2000, nants=180, precision=2 #chunk_size = min(nthreads // 2, ndata) # 16.8 s #chunk_size = min(nthreads // 4, ndata) # 13.8 s #chunk_size = min(nthreads // 8, ndata) # 11.1 s #chunk_size = min(nthreads // 16, ndata) # 9.9 s #chunk_size = min(nthreads // 32, ndata) # 10.0 s #chunk_size = min(nthreads // 64, ndata) # 10.14 s # px=2000, nants=180, precision=1 #chunk_size = min(nthreads // 2, ndata) # 7.6 s #chunk_size = min(nthreads // 4, ndata) # 8.3 s #chunk_size = min(nthreads // 8, ndata) # 7.4 s #chunk_size = min(nthreads // 16, ndata) # 7.3 s #chunk_size = min(nthreads // 32, ndata) # 7.4 s #chunk_size = min(nthreads // 64, ndata) # 7.5 s # Upshot: nthreads // 16 seems to cover most cases, no need to tune. # Build the CUDA code gpu_code = GPU_TEMPLATE.format( **{ 'NBLS': nbls, 'NUBLS': nubls, 'NANTS': nants, 'GAIN': gain, 'CMULT': CMULT, 'CONJ': CONJ, 'CSUB': CSUB, 'DTYPE': DTYPE, 'CDTYPE': CDTYPE, }) # Extract functions from CUDA, suffix _cuda indicates GPU operation gpu_module = compiler.SourceModule(gpu_code) gen_dmdl_cuda = gpu_module.get_function("gen_dmdl") calc_chisq_cuda = gpu_module.get_function("calc_chisq") calc_dwgts_cuda = gpu_module.get_function("calc_dwgts") calc_gu_wgt_cuda = gpu_module.get_function("calc_gu_wgt") calc_gu_buf_cuda = gpu_module.get_function("calc_gu_buf") clear_complex_cuda = gpu_module.get_function("clear_complex") set_val_real_cuda = gpu_module.get_function("set_val_real") set_val_uint_cuda = gpu_module.get_function("set_val_uint") update_gains_cuda = gpu_module.get_function("update_gains") update_ubls_cuda = gpu_module.get_function("update_ubls") calc_conv_cuda = gpu_module.get_function("calc_conv") update_active_cuda = gpu_module.get_function("update_active") h = cublasCreate() # handle for managing cublas, used for buffer copies # define GPU buffers, suffix _g indicates GPU buffer chunk_size = min(nthreads // 16, ndata) data_chunks = 1 ANT_SHAPE = (chunk_size, nants) UBL_SHAPE = (chunk_size, nubls) BLS_SHAPE = (chunk_size, nbls) block = (chunk_size, int(np.floor(nthreads / chunk_size)), 1) ant_grid = (data_chunks, int(np.ceil(nants / block[1]))) ubl_grid = (data_chunks, int(np.ceil(nubls / block[1]))) bls_grid = (data_chunks, int(np.ceil(nbls / block[1]))) conv_grid = (data_chunks, int(np.ceil(max(nants, nubls) / block[1]))) if verbose: print('GPU block:', block) print('ANT grid:', ant_grid) print('UBL grid:', ubl_grid) print('BLS grid:', bls_grid) ggu_indices_g = gpuarray.empty(shape=ggu_indices.shape, dtype=np.uint32) active_g = gpuarray.empty(shape=(chunk_size, ), dtype=np.uint32) iters_g = gpuarray.empty(shape=(chunk_size, ), dtype=np.uint32) gains_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype) new_gains_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype) gbuf_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype) gwgt_g = gpuarray.empty(shape=ANT_SHAPE, dtype=real_dtype) ubls_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype) new_ubls_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype) ubuf_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype) uwgt_g = gpuarray.empty(shape=UBL_SHAPE, dtype=real_dtype) data_g = gpuarray.empty(shape=BLS_SHAPE, dtype=complex_dtype) dmdl_g = gpuarray.empty(shape=BLS_SHAPE, dtype=complex_dtype) wgts_g = gpuarray.empty(shape=BLS_SHAPE, dtype=real_dtype) dwgts_g = gpuarray.empty(shape=BLS_SHAPE, dtype=real_dtype) chisq_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype) new_chisq_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype) conv_sum_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype) conv_wgt_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype) conv_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype) # Define return buffers chisq = np.empty((ndata, ), dtype=real_dtype) conv = np.empty((ndata, ), dtype=real_dtype) iters = np.empty((ndata, ), dtype=np.uint32) active = np.empty((chunk_size, ), dtype=np.uint32) # upload data indices ggu_indices_g.set_async(ggu_indices) # initialize structures used to time code event_order = ('start', 'upload', 'dmdl', 'calc_chisq', 'loop_top', 'calc_gu_wgt', 'calc_gu_buf', 'copy_gains', 'copy_ubls', 'update_gains', 'update_ubls', 'dmdl2', 'chisq2', 'calc_conv', 'update_active', 'get_active', 'end') event_pairs = list((event_order[i], event_order[i + 1]) for i in range(len(event_order[:-1]))) cum_time = {} t0 = time.time() # Loop over chunks of parallel omnical problems for px in range(0, ndata, chunk_size): events = {e: driver.Event() for e in event_order} events['start'].record() end = min(ndata, px + chunk_size) beg = end - chunk_size offset = px - beg gains_g.set_async(gains[beg:end]) ubls_g.set_async(ubls[beg:end]) data_g.set_async(data[beg:end]) wgts_g.set_async(wgts[beg:end]) active = np.ones((chunk_size, ), dtype=np.uint32) if offset > 0: active[:offset] = 0 active_g.set_async(active) set_val_real_cuda(conv_sum_g, np.uint32(chunk_size), real_dtype(0), block=(chunk_size, 1, 1), grid=(1, 1)) set_val_real_cuda(conv_wgt_g, np.uint32(chunk_size), real_dtype(0), block=(chunk_size, 1, 1), grid=(1, 1)) events['upload'].record() gen_dmdl_cuda(ggu_indices_g, gains_g, ubls_g, dmdl_g, active_g, block=block, grid=bls_grid) events['dmdl'].record() set_val_real_cuda(chisq_g, np.uint32(chunk_size), real_dtype(0), block=(chunk_size, 1, 1), grid=(1, 1)) calc_chisq_cuda(data_g, dmdl_g, wgts_g, chisq_g, active_g, block=block, grid=bls_grid) events['calc_chisq'].record() if TIME_IT: events['calc_chisq'].synchronize() for (e1, e2) in event_pairs[:3]: cum_time[(e1,e2)] = cum_time.get((e1,e2), 0) + \ events[e2].time_since(events[e1]) # Loop over iterations within an omnical problem for i in range(1, maxiter + 1): events['loop_top'].record() if (i % check_every) == 1: # Per standard omnical algorithm, only update gwgt/uwgt # every few iterations to save compute. calc_dwgts_cuda(dmdl_g, wgts_g, dwgts_g, active_g, block=block, grid=bls_grid) set_val_real_cuda( gwgt_g, np.uint32(nants * chunk_size), real_dtype(0), block=(nthreads, 1, 1), grid=(int(np.ceil(nants * chunk_size / nthreads)), 1)) set_val_real_cuda( uwgt_g, np.uint32(nubls * chunk_size), real_dtype(0), block=(nthreads, 1, 1), grid=(int(np.ceil(nubls * chunk_size / nthreads)), 1)) calc_gu_wgt_cuda(ggu_indices_g, dmdl_g, dwgts_g, gwgt_g, uwgt_g, active_g, block=block, grid=bls_grid) events['calc_gu_wgt'].record() clear_complex_cuda(gbuf_g, np.uint32(nants * chunk_size), block=(nthreads, 1, 1), grid=(int(np.ceil(nants * chunk_size / nthreads)), 1)) clear_complex_cuda(ubuf_g, np.uint32(nubls * chunk_size), block=(nthreads, 1, 1), grid=(int(np.ceil(nubls * chunk_size / nthreads)), 1)) # This is 75% of the compute load calc_gu_buf_cuda(ggu_indices_g, data_g, dwgts_g, dmdl_g, gbuf_g, ubuf_g, active_g, block=block, grid=bls_grid) events['calc_gu_buf'].record() if (i < maxiter) and (i < check_after or (i % check_every != 0)): # Fast branch: don't check convergence/divergence events['copy_gains'].record() events['copy_ubls'].record() update_gains_cuda(gbuf_g, gwgt_g, gains_g, np.float32(gain), active_g, block=block, grid=ant_grid) events['update_gains'].record() update_ubls_cuda(ubuf_g, uwgt_g, ubls_g, np.float32(gain), active_g, block=block, grid=ubl_grid) events['update_ubls'].record() gen_dmdl_cuda(ggu_indices_g, gains_g, ubls_g, dmdl_g, active_g, block=block, grid=bls_grid) events['dmdl2'].record() events['chisq2'].record() events['calc_conv'].record() events['update_active'].record() events['get_active'].record() else: # Slow branch: check convergence/divergence COPY(h, nants * chunk_size, gains_g.gpudata, 1, new_gains_g.gpudata, 1) events['copy_gains'].record() COPY(h, nubls * chunk_size, ubls_g.gpudata, 1, new_ubls_g.gpudata, 1) events['copy_ubls'].record() update_gains_cuda(gbuf_g, gwgt_g, new_gains_g, np.float32(gain), active_g, block=block, grid=ant_grid) events['update_gains'].record() update_ubls_cuda(ubuf_g, uwgt_g, new_ubls_g, np.float32(gain), active_g, block=block, grid=ubl_grid) events['update_ubls'].record() gen_dmdl_cuda(ggu_indices_g, new_gains_g, new_ubls_g, dmdl_g, active_g, block=block, grid=bls_grid) events['dmdl2'].record() set_val_real_cuda(new_chisq_g, np.uint32(chunk_size), real_dtype(0), block=(chunk_size, 1, 1), grid=(1, 1)) calc_chisq_cuda(data_g, dmdl_g, wgts_g, new_chisq_g, active_g, block=block, grid=bls_grid) events['chisq2'].record() set_val_real_cuda(conv_sum_g, np.uint32(chunk_size), real_dtype(0), block=(chunk_size, 1, 1), grid=(1, 1)) set_val_real_cuda(conv_wgt_g, np.uint32(chunk_size), real_dtype(0), block=(chunk_size, 1, 1), grid=(1, 1)) calc_conv_cuda(new_gains_g, gains_g, new_ubls_g, ubls_g, conv_sum_g, conv_wgt_g, active_g, block=block, grid=conv_grid) events['calc_conv'].record() update_active_cuda(new_gains_g, gains_g, new_ubls_g, ubls_g, conv_sum_g, conv_wgt_g, conv_g, real_dtype(conv_crit), new_chisq_g, chisq_g, iters_g, np.uint32(i), active_g, block=block, grid=conv_grid) events['update_active'].record() active_g.get_async(ary=active) events['get_active'].record() if not np.any(active): break events['end'].record() if TIME_IT: events['end'].synchronize() for (e1, e2) in event_pairs[4:]: cum_time[(e1,e2)] = cum_time.get((e1,e2), 0) + \ events[e2].time_since(events[e1]) # Download final answers into buffers returned to user # use offset to trim off parts of chunk that were never active _chisq = np.empty((chunk_size, ), dtype=real_dtype) chisq_g.get_async(ary=_chisq) chisq[px:end] = _chisq[offset:] _iters = np.empty((chunk_size, ), dtype=np.uint32) iters_g.get_async(ary=_iters) iters[px:end] = _iters[offset:] _conv = np.empty((chunk_size, ), dtype=real_dtype) conv_g.get_async(ary=_conv) conv[px:end] = _conv[offset:] _gains = np.empty(ANT_SHAPE, dtype=complex_dtype) gains_g.get_async(ary=_gains) gains[px:end, :] = _gains[offset:, :] _ubls = np.empty(UBL_SHAPE, dtype=complex_dtype) ubls_g.get_async(ary=_ubls) ubls[px:end, :] = _ubls[offset:, :] t1 = time.time() if TIME_IT: print('Final, nthreads=%d' % nthreads) for (e1, e2) in event_pairs: try: print('%6.3f' % cum_time[(e1, e2)], e1, e2) except (KeyError): pass print(t1 - t0) print() cublasDestroy(h) # teardown GPU configuration return { 'gains': gains, 'ubls': ubls, 'chisq': chisq, 'iters': iters, 'conv': conv }
def diag_gpu(A, v1): # handle current_handle = cublas.cublasCreate() m = A.shape[0] Q = np.zeros((m, m), dtype=np.float64) # Q[0, :] = 0.0 # implied Q[1, :] = v1.copy() beta = np.zeros(m, dtype=np.float64) alpha = np.zeros(m, dtype=np.float64) # move data onto the GPU A_gpu = gpuarray.to_gpu(A) Q_gpu = gpuarray.to_gpu(Q) beta_gpu = gpuarray.to_gpu(beta) alpha_gpu = gpuarray.to_gpu(alpha) w = gpuarray.zeros(m, dtype=np.float64) # we define three kernels for simple arithmetic w_scale = ElementwiseKernel( arguments="double *w, double *alpha, double *beta, double *Q1, double *Q2, int loop_index", operation="w[i] = w[i] - (alpha[loop_index] * Q1[i]) - (beta[loop_index] * Q2[i])", name="element_wise_w_building") # using -= to do inplace subtraction gives an incorrect answer norm_krnl = ReductionKernel(np.float64, neutral="0.0", reduce_expr="a+b", map_expr="x[i]*x[i]", arguments="double *x") ediv = ElementwiseKernel( arguments="double *a, double *b, double *c, int loop_index", operation="a[i] = b[i] / c[loop_index+1]", name="element_wise_division") # the name must not have spaces!!!! for i in range(1, m-1): cublas.cublasDgemv(handle = current_handle, trans = 'T', m = m, n = m, # Hermitian matrix alpha = 1.0, beta = 0.0, A = A_gpu.gpudata, lda = m, x = Q_gpu[i, :].gpudata, incx = 1, y = w.gpudata, incy = 1, ) cublas.cublasDgemm(handle = current_handle, transa = 'n', transb = 'n', m = 1, n = 1, k = m, lda = 1, ldb = m, ldc = 1, alpha = 1.0, beta = 0.0, A = w.gpudata, B = Q_gpu[i, :].gpudata, C = alpha_gpu[i].gpudata) w_scale(w, alpha_gpu, beta_gpu, Q_gpu[i, :], Q_gpu[i-1, :], i) beta_gpu[i+1] = cumath.sqrt(norm_krnl(w)) ediv(Q_gpu[i+1, :], w, beta_gpu, i) # end of loop # last 2 steps cublas.cublasDgemv(handle = current_handle, trans = 'T', m = m, n = m, # Hermitian matrix alpha = 1.0, beta = 0.0, A = A_gpu.gpudata, lda = m, x = Q_gpu[-1, :].gpudata, incx = 1, y = w.gpudata, incy = 1,) cublas.cublasDgemm(handle = current_handle, transa = 'n', transb = 'n', m = 1, n = 1, k = m, lda = 1, ldb = m, ldc = 1, alpha = 1.0, beta = 0.0, A = w.gpudata, B = Q_gpu[-1, :].gpudata, C = alpha_gpu[-1].gpudata) # retrive the alpha's and betas alpha_cpu = alpha_gpu.get() beta_cpu = beta_gpu.get() print("GPU: ", alpha_cpu, beta_cpu, sep="\n\n") # make tridiagonal matrix out of alpha and B # Tri = np.zeros(matrix_size) return
def setUpClass(cls): cls.cublas_handle = cublas.cublasCreate()
flag = 1 p_hat_gpu.gpudata.free() s_hat_gpu.gpudata.free() v_gpu.gpudata.free() t_gpu.gpudata.free() return xcg_gpu, 0, 0, 0 ######## # MAIN # ######## culinalg.init() cuda.init() cublas_handle = cublas.cublasCreate() start = cuda.Event() end = cuda.Event() # --- Wave propagation c = np.float32(3e8) # --- Frequency definitions fmin = np.float32(1e9) fmax = np.float32(2.e9) freq = np.arange(fmin, fmax + .5e9, .5e9, dtype=np.float32) lambdamin = np.float32(c / fmax) lambdamax = np.float32(c / fmin)
# transfer kinetic portion to gpu T_gpu = gpuarray.to_gpu(T_i) # allocate space on gpu for results U_x_gpu = gpuarray.zeros((basis_size, basis_size*basis_size), np.float64) # an empty matrix of the right size U_y_gpu = gpuarray.zeros((basis_size, basis_size*basis_size), np.float64) # an empty matrix of the right size U_z_gpu = gpuarray.zeros((basis_size, basis_size*basis_size), np.float64) # an empty matrix of the right size #m, k, n = ud.basis_size, ud.basis_size, ud.basis_size**2 for basis in range(BASIS_SIZE): prepare_gpu(basis) # set it up i, j_k, i_prime = basis_size, basis_size*basis_size, basis_size initial_time = time.clock() for num_iter in range(ITERATIONS): cublas.cublasDgemm(handle = cublas.cublasCreate(), transa = 'n', transb = 'n', m = i, n = j_k, k = i_prime, lda = i, ldb = i_prime, ldc = i, alpha = ud.alpha, beta = ud.beta, A = T_gpu.gpudata, B = v_x_gpu.gpudata, C = U_x_gpu.gpudata, ) cublas.cublasDgemm(handle = cublas.cublasCreate(), transa = 'n', transb = 'n', m = i, n = j_k, k = i_prime, lda = i, ldb = i_prime, ldc = i, alpha = ud.alpha, beta = ud.beta, A = T_gpu.gpudata, B = v_y_gpu.gpudata, C = U_y_gpu.gpudata, )
x = np.ones(N, dtype=np.float32) y = 2. * np.ones(N, dtype=np.float32) nStreams = 2 streams = [cuda.Stream() for i in range(nStreams)] x_pin = [ cuda.register_host_memory(x[i * N / nStreams:(i + 1) * N / nStreams]) for i in range(nStreams) ] y_pin = [ cuda.register_host_memory(y[i * N / nStreams:(i + 1) * N / nStreams]) for i in range(nStreams) ] h = cublas.cublasCreate() x_gpu = np.empty(nStreams, dtype=object) y_gpu = np.empty(nStreams, dtype=object) ans = np.empty(nStreams, dtype=object) for i in range(nStreams): cublas.cublasSetStream(h, streams[i].handle) x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i]) y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i]) cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1, y_gpu[i].gpudata, 1) ans[i] = y_gpu[i].get_async(stream=streams[i])
def attach_cublas_handle_to_context(ctx): handle = getattr(ctx, 'cublas_handle', None) if handle is None: with ctx: ctx.cublas_handle = cublas.cublasCreate()
# coding=utf-8 import pycuda.autoinit from pycuda import gpuarray import numpy as np from skcuda import cublas from time import time a = np.float32(10) x = np.float32([1, 2, 3]) y = np.float32([-.345, 8.15, -15.867]) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.to_gpu(y) # create a cuBLAS context. This is similar in nature to CUDA contexts cublas_context_h = cublas.cublasCreate() """Level-1 AXPY (vector-vector)""" # this is a direct wrapper to a low-level C function, so the input may seem more like a C function than a true Python function. # In short, this performed an "AXPY" operation, ultimately putting the output data into the y_gpu array # first input is always the CUDA context handle. We then have to specify the size of the vectors, since this function will be ultimately # operating on C pointers; we can do this by using the size parameter of a gpuarray. Having typecasted our scalar already to a NumPy float32 variable, # we can pass the a variable right over as the scalar parameter. We then hand the underlying C pointer of the x_gpu array to this function using the gpudata # parameter. Then we specify the stride of the first array as 1: the stride specifies how many steps we should take between each input value. # (In contrast, if you were using a vector from a column in a row-wise matrix, you would set the stride to the width of the matrix.) # We then put in the pointer to the y_gpu array, and set its stride to 1 as well #We can now use the cublasSaxpy function. The S stands for single precision, which is what we will need since we are working with 32-bit floating point arrays: cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1, y_gpu.gpudata, 1) print(y_gpu.get())
def cublas_handle(): cublas_handle = cublas.cublasCreate() cublas.cublasSetStream(cublas_handle, stream().handle) return cublas_handle
def vis_gpu(antpos, freq, eq2tops, crd_eq, I_sky, bm_cube, nthreads=NTHREADS, max_memory=MAX_MEMORY, real_dtype=np.float32, complex_dtype=np.complex64, verbose=False): # ensure shapes nant = antpos.shape[0] assert (antpos.shape == (nant, 3)) npix = crd_eq.shape[1] assert (crd_eq.shape == (3, npix)) assert (I_sky.shape == (npix, )) beam_px = bm_cube.shape[1] assert (bm_cube.shape == (nant, beam_px, beam_px)) ntimes = eq2tops.shape[0] assert (eq2tops.shape == (ntimes, 3, 3)) # ensure data types antpos = antpos.astype(real_dtype) eq2tops = eq2tops.astype(real_dtype) crd_eq = crd_eq.astype(real_dtype) Isqrt = np.sqrt(I_sky).astype(real_dtype) bm_cube = bm_cube.astype(real_dtype) # XXX complex? chunk = max(min(npix, MIN_CHUNK), 2**int(ceil(np.log2(float(nant * npix) / max_memory / 2)))) npixc = npix / chunk # blocks of threads are mapped to (pixels,ants,freqs) block = (max(1, nthreads / nant), min(nthreads, nant), 1) grid = (int(ceil(npixc / float(block[0]))), int(ceil(nant / float(block[1])))) gpu_code = GPU_TEMPLATE % { 'NANT': nant, 'NPIX': npixc, 'BEAM_PX': beam_px, 'BLOCK_PX': block[0], } gpu_module = compiler.SourceModule(gpu_code) bm_interp = gpu_module.get_function("InterpolateBeam") meas_eq = gpu_module.get_function("MeasEq") bm_texref = gpu_module.get_texref("bm_tex") import pycuda.autoinit h = cublasCreate() # handle for managing cublas # define GPU buffers and transfer initial values bm_texref.set_array( numpy3d_to_array(bm_cube) ) # never changes, transpose happens in copy so cuda bm_tex is (BEAM_PX,BEAM_PX,NANT) antpos_gpu = gpuarray.to_gpu( antpos) # never changes, set to -2*pi*antpos/c Isqrt_gpu = gpuarray.empty(shape=(npixc, ), dtype=real_dtype) A_gpu = gpuarray.empty(shape=(nant, npixc), dtype=real_dtype) # will be set on GPU by bm_interp crd_eq_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype) eq2top_gpu = gpuarray.empty(shape=(3, 3), dtype=real_dtype) # sent from CPU each time crdtop_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype) # will be set on GPU tau_gpu = gpuarray.empty(shape=(nant, npixc), dtype=real_dtype) # will be set on GPU v_gpu = gpuarray.empty(shape=(nant, npixc), dtype=complex_dtype) # will be set on GPU vis_gpus = [ gpuarray.empty(shape=(nant, nant), dtype=complex_dtype) for i in xrange(chunk) ] # output CPU buffers for downloading answers vis_cpus = [ np.empty(shape=(nant, nant), dtype=complex_dtype) for i in xrange(chunk) ] streams = [driver.Stream() for i in xrange(chunk)] event_order = ('start', 'upload', 'eq2top', 'tau', 'interpolate', 'meas_eq', 'vis', 'end') vis = np.empty((ntimes, nant, nant), dtype=complex_dtype) for t in xrange(ntimes): if verbose: print '%d/%d' % (t + 1, ntimes) eq2top_gpu.set( eq2tops[t]) # defines sky orientation for this time step events = [{e: driver.Event() for e in event_order} for i in xrange(chunk)] for c in xrange(chunk + 2): cc = c - 1 ccc = c - 2 if 0 <= ccc < chunk: stream = streams[ccc] vis_gpus[ccc].get_async(ary=vis_cpus[ccc], stream=stream) events[ccc]['end'].record(stream) if 0 <= cc < chunk: stream = streams[cc] cublasSetStream(h, stream.handle) ## compute crdtop = dot(eq2top,crd_eq) # cublas arrays are in Fortran order, so P=M*N is actually # peformed as P.T = N.T * M.T cublasSgemm(h, 'n', 'n', npixc, 3, 3, 1., crd_eq_gpu.gpudata, npixc, eq2top_gpu.gpudata, 3, 0., crdtop_gpu.gpudata, npixc) events[cc]['eq2top'].record(stream) ## compute tau = dot(antpos,crdtop) cublasSgemm(h, 'n', 'n', npixc, nant, 3, 1., crdtop_gpu.gpudata, npixc, antpos_gpu.gpudata, 3, 0., tau_gpu.gpudata, npixc) events[cc]['tau'].record(stream) ## interpolate bm_tex at specified topocentric coords, store interpolation in A ## threads are parallelized across pixel axis bm_interp(crdtop_gpu, A_gpu, grid=grid, block=block, stream=stream) events[cc]['interpolate'].record(stream) # compute v = A * I * exp(1j*tau*freq) meas_eq(A_gpu, Isqrt_gpu, tau_gpu, real_dtype(freq), v_gpu, grid=grid, block=block, stream=stream) events[cc]['meas_eq'].record(stream) # compute vis = dot(v, v.T) # transpose below incurs about 20% overhead cublasCgemm(h, 'c', 'n', nant, nant, npixc, 1., v_gpu.gpudata, npixc, v_gpu.gpudata, npixc, 0., vis_gpus[cc].gpudata, nant) events[cc]['vis'].record(stream) if c < chunk: stream = streams[c] events[c]['start'].record(stream) crd_eq_gpu.set_async(crd_eq[:, c * npixc:(c + 1) * npixc], stream=stream) Isqrt_gpu.set_async(Isqrt[c * npixc:(c + 1) * npixc], stream=stream) events[c]['upload'].record(stream) events[chunk - 1]['end'].synchronize() vis[t] = sum(vis_cpus) if verbose: for c in xrange(chunk): print '%d:%d START->END:' % ( c, chunk), events[c]['start'].time_till( events[c]['end']) * 1e-3 #for i,e in enumerate(event_order[:-1]): # print c, e,'->',event_order[i+1], ':', events[c][e].time_till(events[c][event_order[i+1]]) * 1e-3 print 'TOTAL:', events[0]['start'].time_till( events[chunk - 1]['end']) * 1e-3 # teardown GPU configuration cublasDestroy(h) return vis
def setUp(self): np.random.seed(23) # For reproducible tests. self.cublas_handle = cublas.cublasCreate()
c_cpu = np.dot(a_cpu, b_cpu) # printing the process (CPU): print("\nComputing matrix vector mutliplication in CPU:\n") print("a = \n", a_cpu, "\n") print("b = \n", b_cpu, "\n") print("Product =\n", c_cpu, "\n") # allocating and converting (a) to a gpuarray: a_gpu = gpuarray.to_gpu(a_cpu) # allocating and converting (b) to a gpuarray: b_gpu = gpuarray.to_gpu(b_cpu) # allocating c as a 2x1 matrix filled with zeros: c_gpu = gpuarray.zeros((2,1), dtype = np.float32) # Computing matrix product of gpu (a) and (b) and storing it in gpu (c): cublas.cublasSgemv(handle = cublas.cublasCreate(), trans = 'n', m = 2, n = 3, alpha = 1.0, A = a_gpu.gpudata, lda = 2, x = b_gpu.gpudata, incx = 1, beta = 1.0, y = c_gpu.gpudata, incy = 1) # printing the process(GPU): print("\nComputing matrix vector mutliplication in GPU:\n") print("a = \n", a_gpu.get(), "\n") print("b = \n", b_gpu.get(), "\n") print("Product =\n", c_gpu.get(), "\n")