def _perform_sgemv(self, mat, v, vec_out, nvecs, dim):
    '''
    NOTES: cuBLAS uses Fortran (column-major) layout.
    cublasSgemv multiplies a matrix by a vector (Level 2 BLAS).
        cublas_handle   -> handle to the cuBLAS library context
        't'             -> transpose
        dim             -> number of columns of the matrix
        nvecs           -> number of rows of the matrix
        alpha           -> scalar applied to mat
        mat.gpudata     -> matrix mat
        dim             -> leading dimension of the matrix
        v.gpudata       -> vector v
        incx            -> stride within x; e.g. if incx is 7, every 7th element is used
        beta            -> scalar applied to vec_out
        vec_out.gpudata -> result vector
        incy            -> stride within y; e.g. if incy is 7, every 7th element is used
    Read more -> http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemv
    '''
    alpha = np.float32(1.0)
    beta = np.float32(0.0)
    incx = 1
    incy = 1
    cublas_handle = cublas.cublasCreate()
    cublas.cublasSgemv(cublas_handle, 't', dim, nvecs, alpha, mat.gpudata,
                       dim, v.gpudata, incx, beta, vec_out.gpudata, incy)
    cublas.cublasDestroy(cublas_handle)
    return vec_out
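# A minimal standalone sketch of the same Sgemv call (names here are
# illustrative, not from the original class). Because cuBLAS is column-major,
# a C-ordered NumPy matrix is seen transposed, so 't' recovers the row-major
# product y = mat @ v.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas

nvecs, dim = 4, 3  # rows, columns of the row-major matrix
mat = gpuarray.to_gpu(np.random.rand(nvecs, dim).astype(np.float32))
v = gpuarray.to_gpu(np.random.rand(dim).astype(np.float32))
vec_out = gpuarray.empty((nvecs,), np.float32)

h = cublas.cublasCreate()
cublas.cublasSgemv(h, 't', dim, nvecs, np.float32(1.0), mat.gpudata, dim,
                   v.gpudata, 1, np.float32(0.0), vec_out.gpudata, 1)
cublas.cublasDestroy(h)
assert np.allclose(vec_out.get(), mat.get().dot(v.get()), atol=1e-5)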
def _perform_sgemm(self, mat_a, mat_b, mat_out):
    nvecs_a, dim = mat_a.shape
    nvecs_b, dim = mat_b.shape
    alpha = np.float32(1.0)
    beta = np.float32(0.0)
    '''
    cublasSgemm multiplies a matrix by a matrix (Level 3 BLAS).
        cublas_handle   -> handle to the cuBLAS library context
        't'             -> transpose mat_b
        'n'             -> do not transpose mat_a
        nvecs_b         -> rows of mat_b
        nvecs_a         -> rows of mat_a
        dim             -> common dimension of mat_a and mat_b
        alpha           -> scaling factor for the product of mat_a and mat_b
        mat_b.gpudata   -> matrix mat_b
        dim             -> columns of mat_b
        mat_a.gpudata   -> matrix mat_a
        dim             -> columns of mat_a
        beta            -> scaling factor for mat_out
        mat_out.gpudata -> matrix mat_out
        nvecs_b         -> rows of mat_b
    Read more -> http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
    '''
    cublas_handle = cublas.cublasCreate()
    cublas.cublasSgemm(cublas_handle, 't', 'n', nvecs_b, nvecs_a, dim,
                       alpha, mat_b.gpudata, dim, mat_a.gpudata, dim,
                       beta, mat_out.gpudata, nvecs_b)
    cublas.cublasDestroy(cublas_handle)
    return mat_out
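# A hedged sketch exercising the call pattern above with standalone arrays:
# on C-ordered inputs, the 't','n' combination yields mat_out = mat_a @ mat_b.T
# (all names below are illustrative, not part of the original class).
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas

a = np.random.rand(5, 3).astype(np.float32)  # nvecs_a=5, dim=3
b = np.random.rand(4, 3).astype(np.float32)  # nvecs_b=4, dim=3
a_gpu, b_gpu = gpuarray.to_gpu(a), gpuarray.to_gpu(b)
out_gpu = gpuarray.empty((5, 4), np.float32)

h = cublas.cublasCreate()
cublas.cublasSgemm(h, 't', 'n', 4, 5, 3, np.float32(1.0), b_gpu.gpudata, 3,
                   a_gpu.gpudata, 3, np.float32(0.0), out_gpu.gpudata, 4)
cublas.cublasDestroy(h)
assert np.allclose(out_gpu.get(), a.dot(b.T), atol=1e-5)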
def jki_lu_decomposer_opt_gpu(M):
    m = M.shape[0]
    n = M.shape[1]
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    from skcuda.cublas import cublasCreate, cublasDaxpy, cublasDscal, cublasDestroy
    import skcuda.misc as misc
    N_gpu = gpuarray.to_gpu(M)
    h = cublasCreate()
    for j in range(0, n):
        for k in range(0, j):
            # N[k+1:,j] = N[k+1:,j] - N[k+1:,k] * N[k,j]
            cublasDaxpy(h, N_gpu[k+1:, k].size, -np.float64(N_gpu[k, j].get()),
                        N_gpu[k+1:, k].gpudata, n, N_gpu[k+1:, j].gpudata, n)
        # N[j+1:,j] /= N[j,j]
        cublasDscal(h, N_gpu[j+1:, j].size, 1.0 / np.float64(N_gpu[j, j].get()),
                    N_gpu[j+1:, j].gpudata, n)
    # Move from GPU to CPU
    N = N_gpu.get()
    cublasDestroy(h)
    return N
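# A quick CPU-side check of the decomposer above (a sketch, assuming numpy is
# imported as np at module level, as the function expects, and that the input
# needs no pivoting): the returned array packs unit-lower L and upper U.
import numpy as np

M = np.random.rand(6, 6) + 6 * np.eye(6)  # diagonally dominant, safe pivots
N = jki_lu_decomposer_opt_gpu(M)
L = np.tril(N, -1) + np.eye(6)
U = np.triu(N)
assert np.allclose(L.dot(U), M)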
def ijk_lu_decomposer_opt_gpu(M):
    m = M.shape[0]
    n = M.shape[1]
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    from skcuda.cublas import cublasCreate, cublasDestroy, cublasDdot  #, cublasDscal
    import skcuda.misc as misc
    #import skcuda.linalg as linalg
    #linalg.init()
    N_gpu = gpuarray.to_gpu(M)
    h = cublasCreate()
    for i in range(0, n):
        for j in range(0, n):
            # N[i,j] -= N[i,:min(i,j)].dot(N[:min(i,j),j])
            N_gpu[i, j] -= cublasDdot(h, N_gpu[i, :min(i, j)].size,
                                      N_gpu[i, :min(i, j)].gpudata, 1,
                                      N_gpu[:min(i, j), j].gpudata, n)
            if j < i:
                N_gpu[i, j] /= N_gpu[j, j]
                #cublasDscal(h, N_gpu[i,j].size, 1.0/np.float64(N_gpu[j,j].get()), N_gpu[i,j].gpudata, 1)
    # Move from GPU to CPU
    N = N_gpu.get()
    cublasDestroy(h)
    return N
def shutdown(): """Finalizes CUDA global state. This function is automatically called by :mod:`atexit`. Multiple calls are allowed, so user can manually call this function if necessary. """ global _contexts, _cublas_handles, _pid, _pools _check_cuda_available() pid = os.getpid() if _pid != pid: # not initialized return for cublas_handle in six.itervalues(_cublas_handles): cublas.cublasDestroy(cublas_handle) _cublas_handles = {} cumisc.shutdown() _pools = {} for ctx in six.itervalues(_contexts): ctx.detach() _contexts = {} _pid = None # mark as uninitialized
def kij_lu_decomposer_opt_gpu(M):
    m = M.shape[0]
    n = M.shape[1]
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    from skcuda.cublas import cublasCreate, cublasDaxpy, cublasDestroy
    import skcuda.misc as misc
    N_gpu = gpuarray.to_gpu(M)
    h = cublasCreate()
    for k in range(0, n):
        for i in range(k+1, n):
            N_gpu[i, k] = N_gpu[i, k] / N_gpu[k, k]
            # N[i,k+1:] -= N[i,k] * N[k,k+1:]
            cublasDaxpy(h, N_gpu[k, k+1:].size, -np.float64(N_gpu[i, k].get()),
                        N_gpu[k, k+1:].gpudata, 1, N_gpu[i, k+1:].gpudata, 1)
    # Move from GPU to CPU
    N = N_gpu.get()
    cublasDestroy(h)
    return N
def shutdown(): """Finalizes CUDA global state. This function is automatically called by :mod:`atexit`. Multiple calls are allowed, so user can manually call this function if necessary. """ global _contexts, _cublas_handles, _pid, _pools pid = os.getpid() if _pid != pid: # not initialized return for cublas_handle in six.itervalues(_cublas_handles): cublas.cublasDestroy(cublas_handle) _cublas_handles = {} cumisc.shutdown() _pools = {} for ctx in six.itervalues(_contexts): ctx.detach() _contexts = {} _pid = None # mark as uninitialized
def count_triangles_cublas(adjacency_list):
    driver.init()
    context = tools.make_default_context()
    h = cublas.cublasCreate()
    n = len(adjacency_list)
    A = np.zeros([n, n], dtype=np.float64)
    for row_idx, neighbor_list in adjacency_list:
        A[row_idx, neighbor_list] = 1.0
    a_gpu = gpuarray.to_gpu(A)
    b_gpu = gpuarray.empty(A.shape, A.dtype)
    c_gpu = gpuarray.empty(A.shape, A.dtype)
    one = np.float64(1.0)
    zero = np.float64(0.0)
    cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n,
                       a_gpu.gpudata, n, zero, b_gpu.gpudata, n)
    cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n,
                       b_gpu.gpudata, n, zero, c_gpu.gpudata, n)
    trace = linalg.trace(c_gpu, h)
    cublas.cublasDestroy(h)
    context.detach()
    return int(trace / 6)
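# Illustrative call (a sketch, assuming the module-level imports the function
# relies on: pycuda driver and tools, gpuarray, numpy as np, skcuda.cublas and
# skcuda.linalg). trace(A^3)/6 counts triangles in an undirected graph; the
# adjacency_list format is an iterable of (row_index, neighbor_indices) pairs,
# as unpacked in the loop above.
adjacency_list = [(0, [1, 2]), (1, [0, 2]), (2, [0, 1])]  # one triangle
print(count_triangles_cublas(adjacency_list))  # expected: 1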
def compute_gflops(precision='S'):
    if precision == 'S':
        float_type = 'float32'
    elif precision == 'D':
        float_type = 'float64'
    else:
        return -1
    # some random matrices of the appropriate precision that we will use for timing
    A = np.random.randn(m, k).astype(float_type)
    B = np.random.randn(k, n).astype(float_type)
    C = np.random.randn(m, n).astype(float_type)
    # as before, convert to column-major order
    A_cm = A.T.copy()
    B_cm = B.T.copy()
    C_cm = C.T.copy()
    A_gpu = gpuarray.to_gpu(A_cm)
    B_gpu = gpuarray.to_gpu(B_cm)
    C_gpu = gpuarray.to_gpu(C_cm)
    alpha = np.random.randn()
    beta = np.random.randn()
    transa = cublas._CUBLAS_OP['N']
    transb = cublas._CUBLAS_OP['N']
    lda = m
    ldb = k
    ldc = m
    t = time()
    handle = cublas.cublasCreate()
    # two different (relevant) precision modes, D and S
    exec('cublas.cublas%sgemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda, '
         'B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)' % precision)
    cublas.cublasDestroy(handle)
    t = time() - t
    # a GEMM operation performs a total of 2kmn - mn + 3mn = 2kmn + 2mn = 2mn(k+1)
    # floating-point operations
    gflops = 2 * m * n * (k + 1) * (10**-9) / t
    return gflops
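# A sketch of an alternative to the exec()-based dispatch above: look the
# GEMM routine up by name instead of building a code string. This fragment
# is a drop-in replacement for the exec line inside compute_gflops (all
# names come from that scope); it is not part of the original.
gemm = getattr(cublas, 'cublas%sgemm' % precision)
gemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda,
     B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)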
def matmul(self, mat, return_time=False):
    '''Matrix multiplication between two matrices'''
    # check dimensions first:
    if self.ncols != mat.nrows:
        raise ValueError("Dimensions {0} and {1} do not match.".format(
            self.ncols, mat.nrows))
    # move matrices to the GPU; cuBLAS expects column-major (Fortran)
    # storage, so we transpose the C-ordered arrays on the way in and
    # transpose the result back on the way out
    a_gpu = gpuarray.to_gpu(self.arr.T.copy())
    b_gpu = gpuarray.to_gpu(mat.arr.T.copy())
    c_gpu = gpuarray.to_gpu(
        np.zeros((self.nrows, mat.ncols)).astype(np.float32).T.copy())
    # initialize cublas context
    h = cublasCreate()
    # evaluate the matrix multiplication
    # cublas syntax is as follows:
    # h = handle, "n": op(A) = A (for op(A) = A^T, use "t"),
    # then list m, n, k if we have (m x k) dot (k x n),
    # then the scalar in front of the product (set to 1),
    # then each array's data followed by its leading dimension;
    # the last scalar is for adding another matrix, set to 0.
    # also record the time the evaluation takes
    t0 = time.perf_counter_ns()
    cublasSgemm(h, "n", "n", self.nrows, mat.ncols, self.ncols,
                np.float32(1.0), a_gpu.gpudata, self.nrows,
                b_gpu.gpudata, mat.nrows, np.float32(0.0),
                c_gpu.gpudata, self.nrows)
    t1 = time.perf_counter_ns()
    eval_time = (t1 - t0) * (1e-9)  # time for each matmul evaluation
    # move from device to host
    prod_arr = c_gpu.get().T
    # free allocated memory for handle
    cublasDestroy(h)
    return (cublasMatrix(prod_arr), eval_time) if return_time else cublasMatrix(prod_arr)
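# Hypothetical usage of the method above (a sketch): the names cublasMatrix,
# .arr, .nrows, .ncols follow the attributes the method itself uses; the
# constructor is assumed to wrap a float32 NumPy array, as the return
# statement above implies.
import numpy as np
left = cublasMatrix(np.random.rand(8, 4).astype(np.float32))
right = cublasMatrix(np.random.rand(4, 6).astype(np.float32))
product, secs = left.matmul(right, return_time=True)
print(np.allclose(product.arr, left.arr.dot(right.arr), atol=1e-4), secs)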
def vis_gpu(antpos, freq, eq2tops, crd_eq, I_sky, bm_cube,
            nthreads=NTHREADS, max_memory=MAX_MEMORY,
            real_dtype=np.float32, complex_dtype=np.complex64,
            verbose=False):
    # ensure shapes
    nant = antpos.shape[0]
    assert (antpos.shape == (nant, 3))
    npix = crd_eq.shape[1]
    assert (crd_eq.shape == (3, npix))
    assert (I_sky.shape == (npix, ))
    beam_px = bm_cube.shape[1]
    assert (bm_cube.shape == (nant, beam_px, beam_px))
    ntimes = eq2tops.shape[0]
    assert (eq2tops.shape == (ntimes, 3, 3))
    # ensure data types
    antpos = antpos.astype(real_dtype)
    eq2tops = eq2tops.astype(real_dtype)
    crd_eq = crd_eq.astype(real_dtype)
    Isqrt = np.sqrt(I_sky).astype(real_dtype)
    bm_cube = bm_cube.astype(real_dtype)  # XXX complex?
    chunk = max(min(npix, MIN_CHUNK),
                2**int(ceil(np.log2(float(nant * npix) / max_memory / 2))))
    npixc = npix / chunk
    # blocks of threads are mapped to (pixels,ants,freqs)
    block = (max(1, nthreads / nant), min(nthreads, nant), 1)
    grid = (int(ceil(npixc / float(block[0]))),
            int(ceil(nant / float(block[1]))))
    gpu_code = GPU_TEMPLATE % {
        'NANT': nant,
        'NPIX': npixc,
        'BEAM_PX': beam_px,
        'BLOCK_PX': block[0],
    }
    gpu_module = compiler.SourceModule(gpu_code)
    bm_interp = gpu_module.get_function("InterpolateBeam")
    meas_eq = gpu_module.get_function("MeasEq")
    bm_texref = gpu_module.get_texref("bm_tex")
    import pycuda.autoinit
    h = cublasCreate()  # handle for managing cublas
    # define GPU buffers and transfer initial values
    bm_texref.set_array(numpy3d_to_array(bm_cube))  # never changes, transpose happens in copy so cuda bm_tex is (BEAM_PX,BEAM_PX,NANT)
    antpos_gpu = gpuarray.to_gpu(antpos)  # never changes, set to -2*pi*antpos/c
    Isqrt_gpu = gpuarray.empty(shape=(npixc, ), dtype=real_dtype)
    A_gpu = gpuarray.empty(shape=(nant, npixc), dtype=real_dtype)  # will be set on GPU by bm_interp
    crd_eq_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype)
    eq2top_gpu = gpuarray.empty(shape=(3, 3), dtype=real_dtype)  # sent from CPU each time
    crdtop_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype)  # will be set on GPU
    tau_gpu = gpuarray.empty(shape=(nant, npixc), dtype=real_dtype)  # will be set on GPU
    v_gpu = gpuarray.empty(shape=(nant, npixc), dtype=complex_dtype)  # will be set on GPU
    vis_gpus = [gpuarray.empty(shape=(nant, nant), dtype=complex_dtype)
                for i in xrange(chunk)]
    # output CPU buffers for downloading answers
    vis_cpus = [np.empty(shape=(nant, nant), dtype=complex_dtype)
                for i in xrange(chunk)]
    streams = [driver.Stream() for i in xrange(chunk)]
    event_order = ('start', 'upload', 'eq2top', 'tau', 'interpolate',
                   'meas_eq', 'vis', 'end')
    vis = np.empty((ntimes, nant, nant), dtype=complex_dtype)
    for t in xrange(ntimes):
        if verbose:
            print '%d/%d' % (t + 1, ntimes)
        eq2top_gpu.set(eq2tops[t])  # defines sky orientation for this time step
        events = [{e: driver.Event() for e in event_order}
                  for i in xrange(chunk)]
        for c in xrange(chunk + 2):
            cc = c - 1
            ccc = c - 2
            if 0 <= ccc < chunk:
                stream = streams[ccc]
                vis_gpus[ccc].get_async(ary=vis_cpus[ccc], stream=stream)
                events[ccc]['end'].record(stream)
            if 0 <= cc < chunk:
                stream = streams[cc]
                cublasSetStream(h, stream.handle)
                ## compute crdtop = dot(eq2top,crd_eq)
                # cublas arrays are in Fortran order, so P = M * N is
                # actually performed as P.T = N.T * M.T
                cublasSgemm(h, 'n', 'n', npixc, 3, 3, 1.,
                            crd_eq_gpu.gpudata, npixc, eq2top_gpu.gpudata, 3,
                            0., crdtop_gpu.gpudata, npixc)
                events[cc]['eq2top'].record(stream)
                ## compute tau = dot(antpos,crdtop)
                cublasSgemm(h, 'n', 'n', npixc, nant, 3, 1.,
                            crdtop_gpu.gpudata, npixc, antpos_gpu.gpudata, 3,
                            0., tau_gpu.gpudata, npixc)
                events[cc]['tau'].record(stream)
                ## interpolate bm_tex at specified topocentric coords,
                ## store interpolation in A
                ## threads are parallelized across pixel axis
                bm_interp(crdtop_gpu, A_gpu, grid=grid, block=block,
                          stream=stream)
                events[cc]['interpolate'].record(stream)
                # compute v = A * I * exp(1j*tau*freq)
                meas_eq(A_gpu, Isqrt_gpu, tau_gpu, real_dtype(freq), v_gpu,
                        grid=grid, block=block, stream=stream)
                events[cc]['meas_eq'].record(stream)
                # compute vis = dot(v, v.T)
                # transpose below incurs about 20% overhead
                cublasCgemm(h, 'c', 'n', nant, nant, npixc, 1.,
                            v_gpu.gpudata, npixc, v_gpu.gpudata, npixc, 0.,
                            vis_gpus[cc].gpudata, nant)
                events[cc]['vis'].record(stream)
            if c < chunk:
                stream = streams[c]
                events[c]['start'].record(stream)
                crd_eq_gpu.set_async(crd_eq[:, c * npixc:(c + 1) * npixc],
                                     stream=stream)
                Isqrt_gpu.set_async(Isqrt[c * npixc:(c + 1) * npixc],
                                    stream=stream)
                events[c]['upload'].record(stream)
        events[chunk - 1]['end'].synchronize()
        vis[t] = sum(vis_cpus)
        if verbose:
            for c in xrange(chunk):
                print '%d:%d START->END:' % (c, chunk), \
                    events[c]['start'].time_till(events[c]['end']) * 1e-3
                #for i,e in enumerate(event_order[:-1]):
                #    print c, e,'->',event_order[i+1], ':', events[c][e].time_till(events[c][event_order[i+1]]) * 1e-3
            print 'TOTAL:', events[0]['start'].time_till(
                events[chunk - 1]['end']) * 1e-3
    # teardown GPU configuration
    cublasDestroy(h)
    return vis
def _shutdown_gpucsrarray():
    cublas.cublasDestroy(cublas_handle)
    cusparse.cusparseDestroy(cusparse_handle)
print(np.allclose(a * x + y, y_gpu.get()))
w_gpu = gpuarray.to_gpu(x)
v_gpu = gpuarray.to_gpu(y)
# perform a dot product
dot_output = cublas.cublasSdot(cublas_context_h, v_gpu.size, v_gpu.gpudata,
                               1, w_gpu.gpudata, 1)
print(dot_output)
l2_output = cublas.cublasSnrm2(cublas_context_h, v_gpu.size,
                               v_gpu.gpudata, 1)
print(l2_output)
cublas.cublasDestroy(cublas_context_h)
# (If we want to operate on arrays of 64-bit real floating-point values
# (float64 in NumPy and PyCUDA), then we would use cublasDaxpy)

"""Level-2 GEMV (general matrix-vector)"""
# m and n are the number of rows and columns
m = 10
n = 100
# alpha is the floating-point value for the scalar α
alpha = 1
# beta is the floating-point value for the scalar β
beta = 0
# set alpha to 1 and beta to 0 to get a direct matrix multiplication with no scaling
A = np.random.rand(m, n).astype('float32')
x = np.random.rand(n).astype('float32')
y = np.zeros(m).astype('float32')
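# A hedged completion of the GEMV setup above (a sketch, not part of the
# original snippet): cuBLAS is column-major, so the C-ordered A is seen as
# A.T and passing 't' computes y = alpha*A*x + beta*y for the row-major A.
A_gpu = gpuarray.to_gpu(A)
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)
h = cublas.cublasCreate()
cublas.cublasSgemv(h, 't', n, m, np.float32(alpha), A_gpu.gpudata, n,
                   x_gpu.gpudata, 1, np.float32(beta), y_gpu.gpudata, 1)
print(np.allclose(y_gpu.get(), A.dot(x), atol=1e-4))
cublas.cublasDestroy(h)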
def tearDownClass(cls):
    cublas.cublasDestroy(cls.cublas_handle)
def register_multiple_images_subpix_cuda(stack, template):
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as drv
    import pycuda.cumath as cumath
    import skcuda.fft as cu_fft
    import skcuda.linalg as lin
    import skcuda.cublas as cub
    from numpy import pi, newaxis, floor
    import cmath
    from pycuda.elementwise import ElementwiseKernel
    from pycuda.compiler import SourceModule
    from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix
    from numpy.fft import fftshift, ifftshift

    fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1, use_numpy_fft=False)

    mod = SourceModule("""
    #include <pycuda-complex.hpp>
    __global__ void load_convert(unsigned short *a, float *b, int f, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y + blockIdx.x*blockDim.x + threadIdx.x;
        int offset = f * imlen;
        if (idx < imlen)
        {
            b[idx] = (float)a[offset+idx];
        }
    }

    __global__ void convert_export(float *a, unsigned short *b, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y + blockIdx.x*blockDim.x + threadIdx.x;
        if (idx < imlen)
        {
            b[idx] = (unsigned short)(a[idx] > 0 ? a[idx] : 0);
        }
    }

    __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y + blockIdx.x*blockDim.x + threadIdx.x;
        if (idx < imlen)
        {
            z[idx] = x[idx] * y[idx];
        }
    }

    __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y + blockIdx.x*blockDim.x + threadIdx.x;
        if (idx < imlen)
        {
            y[idx]._M_re = x[idx]._M_re;
            y[idx]._M_im = -x[idx]._M_im;
        }
    }

    __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y + blockIdx.x*blockDim.x + threadIdx.x;
        if (idx < imlen)
        {
            y[idx]._M_re = 0;
            y[idx]._M_im = x[idx] * sx;
        }
    }

    __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen, int nlargeh, int nh)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y + blockIdx.x*blockDim.x + threadIdx.x;
        int offset = imlenl*3/4;
        if (idx < imlen)
        {
            int target_ind = (offset + (idx/nh)*nlargeh + (idx % nh)) % imlenl;
            x[target_ind] = y[idx];
        }
    }

    __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy, float dg, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y + blockIdx.x*blockDim.x + threadIdx.x;
        if (idx < imlen)
        {
            z[idx]._M_re = 0;
            z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg;
        }
    }

    __global__ void sub_float(float *x, float *y, float sv, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y + blockIdx.x*blockDim.x + threadIdx.x;
        if (idx < imlen)
        {
            x[idx] = y[idx] - sv;
        }
    }
    """)

    load_convert_kernel = mod.get_function('load_convert')
    convert_export_kernel = mod.get_function('convert_export')
    convert_multiply_kernel = mod.get_function('convert_multiply')
    multiply_float_kernel = mod.get_function('multiply_comp_float')
    transfer_array_kernel = mod.get_function('transfer_array')
    calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix')
    conj_kernel = mod.get_function('calc_conj')
    sub_float_kernel = mod.get_function('sub_float')

    Z = stack.shape[0]
    M = stack.shape[1]
    N = stack.shape[2]
    max_memsize = 4200000000

    imlen = M * N
    half_imlen = M * (N // 2 + 1)
    grid_dim = (64, int(imlen / (512 * 64)) + 1, 1)
    block_dim = (512, 1, 1)  # 512 threads per block

    stack_bin = int(max_memsize / (M * N * stack.itemsize))
    stack_ite = int(Z / stack_bin) + 1

    usfac = 100  ## needs to be bigger than 10

    if not template.shape == stack.shape[1:]:
        raise ValueError("Images must have same shape.")

    if np.any(np.isnan(template)):
        template = template.copy()
        template[template != template] = 0
    if np.any(np.isnan(stack)):
        stack = stack.copy()
        stack[stack != stack] = 0

    mlarge = M * 2
    nlarge = N * 2

    t = time.time()

    plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64)
    plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32)
    plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32)
    cub_h = cub.cublasCreate()

    template_gpu = gpuarray.to_gpu(template.astype('float32'))
    source_gpu = gpuarray.empty((M, N), np.float32)
    ifft_gpu = gpuarray.empty((M, N), np.float32)
    result_gpu = gpuarray.empty((M, N), np.uint16)

    templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64)

    cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True)
    templatef_gpu = templatef_gpu.conj()

    move_list = np.zeros((Z, 2))

    largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64)
    largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32)
    imlenl = mlarge * (nlarge // 2 + 1)

    zoom_factor = 1.5
    dftshiftG = trunc(ceil(usfac * zoom_factor) / 2)  # center of output array at dftshift+1
    upsample_dim = int(ceil(usfac * zoom_factor))

    term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)).T[:, newaxis]) / N  # fftfreq
    term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :]  # output points
    term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis]
    term2r = (ifftshift(np.arange(M, dtype='float')) - floor(M / 2))[newaxis, :]  # fftfreq
    term1c_gpu = gpuarray.to_gpu(term1c[:int(floor(N / 2) + 1), :].astype('float32'))
    term2c_gpu = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu = gpuarray.to_gpu(term1r.astype('float32'))
    term2r_gpu = gpuarray.to_gpu(term2r.astype('float32'))
    term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32'))

    kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32)
    kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32)
    kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64)
    kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64)

    Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M))
    Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N))
    [Nc, Nr] = np.meshgrid(Nc, Nr)
    Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32'))
    Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32'))

    upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64)
    upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64)

    source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype)
    copy = drv.Memcpy3D()
    copy.set_src_host(stack.data)
    copy.set_dst_device(source_stack.gpudata)
    copy.width_in_bytes = copy.src_pitch = stack.strides[1]
    copy.src_height = copy.height = M

    for zb in range(stack_ite):
        zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z))
        copy.depth = len(zrange)
        copy.src_z = int(zrange[0])
        copy()

        for i in range(len(zrange)):
            t = zb * stack_bin + i
            load_convert_kernel(source_stack, source_gpu.gpudata, np.int32(i),
                                np.int32(imlen), block=block_dim, grid=grid_dim)
            cu_fft.fft(source_gpu, sourcef_gpu, plan_forward, scale=True)

            multiply_float_kernel(sourcef_gpu, templatef_gpu, prod_gpu1,
                                  np.int32(half_imlen), block=block_dim,
                                  grid=grid_dim)
            transfer_array_kernel(largearray1_gpu, prod_gpu1, np.int32(imlenl),
                                  np.int32(half_imlen), np.int32(nlarge // 2 + 1),
                                  np.int32(N // 2 + 1), block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(largearray1_gpu, largearray2_gpu, plan_inverse_big,
                        scale=True)
            peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size,
                                       largearray2_gpu.gpudata, 1)
            rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape)

            md2 = trunc(mlarge / 2)
            nd2 = trunc(nlarge / 2)
            if rloc > md2:
                row_shift2 = rloc - mlarge
            else:
                row_shift2 = rloc
            if cloc > nd2:
                col_shift2 = cloc - nlarge
            else:
                col_shift2 = cloc
            row_shiftG = row_shift2 / 2.
            col_shiftG = col_shift2 / 2.

            # Initial shift estimate in upsampled grid
            row_shiftG0 = round(row_shiftG * usfac) / usfac
            col_shiftG0 = round(col_shiftG * usfac) / usfac
            # Matrix multiply DFT around the current shift estimate
            roffG = dftshiftG - row_shiftG0 * usfac
            coffG = dftshiftG - col_shiftG0 * usfac

            sub_float_kernel(term2c_gpu, term2c_gpu_ori,
                             np.float32(coffG / usfac),
                             np.int32(term2c_gpu.size), block=block_dim,
                             grid=grid_dim)
            sub_float_kernel(term1r_gpu, term1r_gpu_ori, np.float32(roffG),
                             np.int32(term1r_gpu.size), block=block_dim,
                             grid=grid_dim)

            lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu)
            lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu)
            convert_multiply_kernel(kernc_gpu, kernc_gpuc,
                                    np.float32(-2 * pi),
                                    np.int32(kernc_gpu.size),
                                    block=block_dim, grid=grid_dim)
            convert_multiply_kernel(kernr_gpu, kernr_gpuc,
                                    np.float32(-2 * pi / (M * usfac)),
                                    np.int32(kernr_gpu.size),
                                    block=block_dim, grid=grid_dim)
            cumath.exp(kernc_gpuc, out=kernc_gpuc)
            cumath.exp(kernr_gpuc, out=kernr_gpuc)

            conj_kernel(prod_gpu1, prod_gpu2, np.int32(half_imlen),
                        block=block_dim, grid=grid_dim)

            lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1)
            lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2)

            CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2)
            rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape)
            CCGmax = CCG[rlocG, clocG]

            rlocG = rlocG - dftshiftG  #+ 1 # +1 # questionable/failed hack + 1;
            clocG = clocG - dftshiftG  #+ 1 # -1 # questionable/failed hack - 1;
            row_shiftG = row_shiftG0 + rlocG / usfac
            col_shiftG = col_shiftG0 + clocG / usfac
            diffphaseG = arctan2(imag(CCGmax), real(CCGmax))

            # Compute registered version of source stack
            calc_shiftmatrix_kernel(Nr_gpu, Nc_gpu, shiftmatrix,
                                    np.float32(row_shiftG * 2 * np.pi),
                                    np.float32(col_shiftG * 2 * np.pi),
                                    np.float32(diffphaseG),
                                    np.int32(half_imlen), block=block_dim,
                                    grid=grid_dim)
            cumath.exp(shiftmatrix, out=shiftmatrix)
            multiply_float_kernel(sourcef_gpu, shiftmatrix, prod_gpu1,
                                  np.int32(half_imlen), block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse)
            convert_export_kernel(ifft_gpu, result_gpu, np.int32(imlen),
                                  block=block_dim, grid=grid_dim)

            move_list[t, :] = (row_shiftG, col_shiftG)
            stack[t, :, :] = result_gpu.get()

    cub.cublasDestroy(cub_h)
    return (stack, move_list)
def destroy_contexts(self):
    """ Destroy contexts """
    cublas.cublasDestroy(self.cublasContext)
    cudnn.cudnnDestroy(self.cudnnContext)
    self.cudaContext.pop()
def destroy(self):
    if self.handle is not None:
        cublas.cublasDestroy(self.handle)
def tearDownClass(cls):
    cublas.cublasDestroy(cls.cublas_handle)
    cls.ctx.pop()
    clear_context_caches()
def cor_mat_3(BOLD, upper_tri, N, L, OOO):
    # compute available memory
    meminfo = cuda.mem_get_info()
    print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
    available_mem = float(meminfo[0])
    available_mem /= np.dtype(np.float32).itemsize
    available_mem -= N * L
    # print("Available memory: ", available_mem)
    # preprocessing fMRI data on the CPU
    start_time = time.time()
    BOLD = preprocessing(BOLD, N, L)
    stop_time = time.time()
    delta = stop_time - start_time
    print("Running time for preprocessing: ", delta, "\n")
    # transfer BOLD to the device
    start_time = time.time()
    BOLD_device = gpuarray.to_gpu(BOLD)
    stop_time = time.time()
    delta = stop_time - start_time
    print("Running time matrices to device: ", delta, "\n")
    # compute available memory
    # meminfo = cuda.mem_get_info()
    # print("After BOLD_device free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
    # initialize variables
    flag = 1
    ii = 0
    upper_size = (N-1) * N / 2
    block = OOO
    N_prime = N
    temp = 0
    temp2 = 0
    temp3 = 0
    pak = 0
    so_far = 0
    count = 1
    temp4 = 0
    alpha = np.float32(1.0)
    beta = np.float32(0.0)
    while flag == 1:
        print("###### ITERATION ", count, " #####")
        # compute available memory
        # meminfo = cuda.mem_get_info()
        # print("After BOLD_device free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
        # print("block: ", block)
        # print("N_prime: ", N_prime)
        # checking for the last chunk
        if block == N_prime:
            flag = 0
        if pak != 0:
            del dev_upper
            del result_device
        temp = block
        temp *= (block + 1)
        temp /= 2
        # M1 is the size of the upper-triangle part of the chunk
        M1 = N_prime
        M1 *= block
        M1 -= temp
        M1 = int(M1)
        # print("M1: ", M1)
        pak += 1
        # print("so_far*L: ", so_far*L)
        start_time = time.time()
        result = np.zeros((block, N_prime), np.float32)
        # print("result shape: ", result.shape)
        BOLD_device = BOLD_device.reshape(-1)
        # allocate memory on the device for the result
        result_device = gpuarray.to_gpu(result)
        # print("result_device shape: ", result_device.shape)
        # compute available memory
        # meminfo = cuda.mem_get_info()
        # print("Before cublasSgemm free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
        stop_time = time.time()
        delta = stop_time - start_time
        print("Running time matrices to device: ", delta, "\n")
        start_time = time.time()
        h = cublas.cublasCreate()
        cublas.cublasSgemm(h, 'T', 'n', block, N_prime, L, alpha,
                           BOLD_device[so_far*L:].gpudata, L,
                           BOLD_device[so_far*L:].gpudata, L,
                           beta, result_device.gpudata, block)
        stop_time = time.time()
        delta = stop_time - start_time
        print("Running time core function: ", delta, "\n")
        temp2 = block
        temp2 *= N_prime
        # compute available memory
        # meminfo = cuda.mem_get_info()
        # print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
        # result_device = gpuarray.to_gpu(result1)
        start_time = time.time()
        threads_per_block = 1024
        blocks_per_grid = 1 + math.ceil(((temp2-1) / threads_per_block))
        grid = (blocks_per_grid, 1)
        # print("temp2:", temp2)
        # print("threads_per_block: ", threads_per_block)
        # print("blocks_per_grid: ", blocks_per_grid)
        upper = np.zeros(M1, np.float32)
        # print("upper shape:", upper.shape)
        dev_upper = gpuarray.to_gpu(upper)
        # print("dev_upper shape: ", dev_upper.shape)
        # print("result_device shape: ", result_device.shape)
        mod = pycuda.compiler.SourceModule("""
        __global__ void ker2(float * cormat, float * upper, int n1, int n, long long upper_size, int N, int i_so_far, long long M1)
        {
            long long idx = blockDim.x;
            idx *= blockIdx.x;
            idx += threadIdx.x;
            long i = idx/n;
            long j = idx%n;
            if(i<j && i<n1 && j<n)// && i<N && j<N && idx<(n1*n))
            {
                long long tmp=i;
                tmp*=(i+1);
                tmp/=2;
                long long tmp_2=i;
                tmp_2*=n;
                tmp_2=tmp_2-tmp;
                tmp_2+=j;
                tmp_2-=i;
                long long indexi=n1;
                indexi*=j;
                indexi=indexi+i;
                upper[tmp_2-1]=cormat[indexi];
            }
        }
        """)
        # compute available memory
        # meminfo = cuda.mem_get_info()
        # print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
        funct = mod.get_function("ker2")
        funct(result_device, dev_upper, np.int32(block), np.int32(N_prime),
              np.int64(upper_size), np.int32(N), np.int32(ii), np.int64(M1),
              block=(threads_per_block, 1, 1), grid=grid)
        temp3 += M1
        # print("upper_tri shape:", upper_tri.shape)
        upper_tri[temp4:temp3] = dev_upper.get()
        stop_time = time.time()
        delta = stop_time - start_time
        print("Running time to get upper tri: ", delta, "\n")
        temp4 += M1
        ii += block
        cublas.cublasDestroy(h)
        so_far += block
        if N_prime > block:
            N_prime = N_prime - block
            block = remaining_N2(N_prime, L, available_mem)
            if N_prime < block:
                block = N_prime
        count += 1
    # free device memory
    del result_device
    del dev_upper
    # compute available memory
    # meminfo = cuda.mem_get_info()
    # print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
    return upper_tri
import pycuda.autoinit
from pycuda import gpuarray
import numpy as np
from skcuda import cublas

a = np.float32(10)
x = np.float32([1, 2, 3])
y = np.float32([-.345, 8.15, -15.867])
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)
cublas_context_h = cublas.cublasCreate()
cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1,
                   y_gpu.gpudata, 1)
cublas.cublasDestroy(cublas_context_h)
print("This is close to the NumPy approximation: {}".format(
    np.allclose(a * x + y, y_gpu.get())))
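# A float64 variant of the Saxpy example above (a sketch): for arrays of
# 64-bit real floating-point values, the 'D'-prefixed routine is used, with
# an otherwise identical call signature.
import pycuda.autoinit
from pycuda import gpuarray
import numpy as np
from skcuda import cublas

a = np.float64(2)
x = np.float64([1, 2, 3])
y = np.float64([4, 5, 6])
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)
h = cublas.cublasCreate()
cublas.cublasDaxpy(h, x_gpu.size, a, x_gpu.gpudata, 1, y_gpu.gpudata, 1)
cublas.cublasDestroy(h)
print(np.allclose(a * x + y, y_gpu.get()))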
def cor_mat_2(BOLD, upper_tri, N, L):
    # preprocessing fMRI data on the CPU
    start_time = time.time()
    BOLD = preprocessing(BOLD, N, L)
    stop_time = time.time()
    delta = stop_time - start_time
    print("Running time for preprocessing: ", delta, "\n")
    alpha = np.float32(1.0)
    beta = np.float32(0.0)
    # transfer to the device
    start_time = time.time()
    BOLD_device = gpuarray.to_gpu(BOLD)
    result = np.zeros((BOLD.shape[0], BOLD.shape[0]), np.float32)
    result_device = gpuarray.to_gpu(result)
    # print("BOLD_device shape:", BOLD_device.shape)
    # print("result_device shape:", result_device.shape)
    stop_time = time.time()
    delta = stop_time - start_time
    print("Running time matrices to device: ", delta, "\n")
    start_time = time.time()
    h = cublas.cublasCreate()
    cublas.cublasSgemm(h, 'T', 'n', N, N, L, alpha, BOLD_device.gpudata, L,
                       BOLD_device.gpudata, L, beta, result_device.gpudata, N)
    stop_time = time.time()
    delta = stop_time - start_time
    print("Running time core function: ", delta, "\n")
    start_time = time.time()
    threads_per_block = 1024
    blocks_per_grid = int(math.ceil(1 + ((N*N - 1) / threads_per_block)))
    mod = pycuda.compiler.SourceModule("""
    __global__ void ker(float * cormat, float * upper, int n1, int n)
    {
        long idx = blockDim.x*blockIdx.x + threadIdx.x;
        long i = idx%n1;
        long j = idx/n1;
        if(i<j && i<n1 && j<n)
        {
            long tmp=i;
            tmp*=(i+1);
            tmp/=2;
            long tmp_2=i;
            tmp_2*=n;
            tmp_2=tmp_2-tmp;
            tmp_2+=j;
            tmp_2-=i;
            upper[tmp_2-1]=cormat[j*n+i];
        }
    }
    """)
    result_device = result_device.reshape(-1)
    # print("result device shape:", result_device.shape)
    upper_tri_device = gpuarray.to_gpu(upper_tri)
    funct = mod.get_function("ker")
    funct(result_device, upper_tri_device, np.int32(N), np.int32(N),
          block=(threads_per_block, 1, 1), grid=(blocks_per_grid, 1))
    upper_tri = upper_tri_device.get()
    stop_time = time.time()
    delta = stop_time - start_time
    print("Running time to get upper tri: ", delta, "\n")
    cublas.cublasDestroy(h)
    return upper_tri
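# A CPU-side sanity sketch of the packing arithmetic used by the ker kernel
# above (an illustration, not part of the original): element (i, j) with
# i < j of an n x n matrix maps to flat index i*n - i*(i+1)//2 + (j - i) - 1
# in the packed upper triangle, which matches numpy's row-major triu order.
import numpy as np

n = 5
C = np.arange(n * n, dtype=np.float32).reshape(n, n)
packed = np.zeros(n * (n - 1) // 2, np.float32)
for i in range(n):
    for j in range(i + 1, n):
        packed[i * n - i * (i + 1) // 2 + (j - i) - 1] = C[i, j]
print(np.allclose(packed, C[np.triu_indices(n, k=1)]))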
def omnical(ggu_indices, gains, ubls, data, wgts,
            conv_crit=1e-10, maxiter=50, check_every=4, check_after=1,
            gain=0.3, nthreads=NTHREADS, precision=1, verbose=False):
    '''CPU-side function for organizing GPU-acceleration primitives into
    a cohesive omnical algorithm.

    Args:
        ggu_indices: (nbls,3) array of (i,j,k) indices denoting data order
            as gains[i] * gains[j].conj() * ubl[k]
        gains: (ndata, nants) array of estimated complex gains
        ubls: (ndata, nubls) array of estimated complex unique baselines
        data: (ndata, nbls) array of data to be calibrated
        wgts: (ndata, nbls) array of weights for each data
        conv_crit: maximum allowed relative change in solutions to be
            considered converged
        maxiter: maximum number of omnical iterations allowed before it
            gives up
        check_every: Compute convergence every Nth iteration (saves
            computation). Default 4.
        check_after: Start computing convergence only after N iterations.
            Default 1.
        gain: The fractional step made toward the new solution each
            iteration. Values in the range 0.1 to 0.5 are generally safe.
            Increasing values trade speed for stability. Default is 0.3.
        nthreads: Number of GPU threads to use. Default NTHREADS=1024
        precision: 1=float32, 2=double (float64). Default 1.
        verbose: If True, print things. Default False.

    Returns:
        info dictionary of {'gains':gains, 'ubls':ubls, 'chisq':chisq,
        'iters':iters, 'conv':conv}
    '''
    # Sanity check input array dimensions
    nbls = ggu_indices.shape[0]
    assert ggu_indices.shape == (nbls, 3)
    ndata = data.shape[0]
    assert data.shape == (ndata, nbls)
    assert wgts.shape == (ndata, nbls)
    nants = gains.shape[1]
    assert gains.shape == (ndata, nants)
    nubls = ubls.shape[1]
    assert ubls.shape == (ndata, nubls)
    assert precision in (1, 2)
    assert check_every > 1
    if verbose:
        print('PRECISION:', precision)
        print('NDATA:', ndata)
        print('NANTS:', nants)
        print('NUBLS:', nubls)
        print('NBLS:', nbls)
    # Choose between float/double primitives
    if precision == 1:
        real_dtype, complex_dtype = np.float32, np.complex64
        DTYPE, CDTYPE = 'float', 'cuFloatComplex'
        CMULT, CONJ, CSUB = 'cuCmulf', 'cuConjf', 'cuCsubf'
        COPY = cublasCcopy
    else:
        real_dtype, complex_dtype = np.float64, np.complex128
        DTYPE, CDTYPE = 'double', 'cuDoubleComplex'
        CMULT, CONJ, CSUB = 'cuCmul', 'cuConj', 'cuCsub'
        COPY = cublasZcopy
    # ensure data types
    ggu_indices = ggu_indices.astype(np.uint32)
    data = data.astype(complex_dtype)
    wgts = wgts.astype(real_dtype)
    gains = gains.astype(complex_dtype)
    ubls = ubls.astype(complex_dtype)
    # Profiling info used to determine optimal chunk size:
    # Model: 15.3 s + ndata/140 => px=8192 in 75 s on Quadro T2000
    # px=2000, nants=126, precision=2
    #chunk_size = min(nthreads // 2, ndata)   # 8.5 s
    #chunk_size = min(nthreads // 4, ndata)   # 6.7 s
    #chunk_size = min(nthreads // 8, ndata)   # 5.6 s
    #chunk_size = min(nthreads // 16, ndata)  # 5.2 s
    #chunk_size = min(nthreads // 32, ndata)  # 5.4 s
    #chunk_size = min(nthreads // 64, ndata)  # 5.9 s
    # px=2000, nants=54, precision=2
    #chunk_size = min(nthreads // 2, ndata)   # 1.5 s
    #chunk_size = min(nthreads // 4, ndata)   # 1.3 s
    #chunk_size = min(nthreads // 8, ndata)   # 1.3 s
    #chunk_size = min(nthreads // 16, ndata)  # 1.5 s
    #chunk_size = min(nthreads // 32, ndata)  # 1.9 s
    #chunk_size = min(nthreads // 64, ndata)  # 3.0 s
    # px=2000, nants=180, precision=2
    #chunk_size = min(nthreads // 2, ndata)   # 16.8 s
    #chunk_size = min(nthreads // 4, ndata)   # 13.8 s
    #chunk_size = min(nthreads // 8, ndata)   # 11.1 s
    #chunk_size = min(nthreads // 16, ndata)  # 9.9 s
    #chunk_size = min(nthreads // 32, ndata)  # 10.0 s
    #chunk_size = min(nthreads // 64, ndata)  # 10.14 s
    # px=2000, nants=180, precision=1
    #chunk_size = min(nthreads // 2, ndata)   # 7.6 s
    #chunk_size = min(nthreads // 4, ndata)   # 8.3 s
    #chunk_size = min(nthreads // 8, ndata)   # 7.4 s
    #chunk_size = min(nthreads // 16, ndata)  # 7.3 s
    #chunk_size = min(nthreads // 32, ndata)  # 7.4 s
    #chunk_size = min(nthreads // 64, ndata)  # 7.5 s
    # Upshot: nthreads // 16 seems to cover most cases, no need to tune.
    # Build the CUDA code
    gpu_code = GPU_TEMPLATE.format(**{
        'NBLS': nbls,
        'NUBLS': nubls,
        'NANTS': nants,
        'GAIN': gain,
        'CMULT': CMULT,
        'CONJ': CONJ,
        'CSUB': CSUB,
        'DTYPE': DTYPE,
        'CDTYPE': CDTYPE,
    })
    # Extract functions from CUDA, suffix _cuda indicates GPU operation
    gpu_module = compiler.SourceModule(gpu_code)
    gen_dmdl_cuda = gpu_module.get_function("gen_dmdl")
    calc_chisq_cuda = gpu_module.get_function("calc_chisq")
    calc_dwgts_cuda = gpu_module.get_function("calc_dwgts")
    calc_gu_wgt_cuda = gpu_module.get_function("calc_gu_wgt")
    calc_gu_buf_cuda = gpu_module.get_function("calc_gu_buf")
    clear_complex_cuda = gpu_module.get_function("clear_complex")
    set_val_real_cuda = gpu_module.get_function("set_val_real")
    set_val_uint_cuda = gpu_module.get_function("set_val_uint")
    update_gains_cuda = gpu_module.get_function("update_gains")
    update_ubls_cuda = gpu_module.get_function("update_ubls")
    calc_conv_cuda = gpu_module.get_function("calc_conv")
    update_active_cuda = gpu_module.get_function("update_active")
    h = cublasCreate()  # handle for managing cublas, used for buffer copies
    # define GPU buffers, suffix _g indicates GPU buffer
    chunk_size = min(nthreads // 16, ndata)
    data_chunks = 1
    ANT_SHAPE = (chunk_size, nants)
    UBL_SHAPE = (chunk_size, nubls)
    BLS_SHAPE = (chunk_size, nbls)
    block = (chunk_size, int(np.floor(nthreads / chunk_size)), 1)
    ant_grid = (data_chunks, int(np.ceil(nants / block[1])))
    ubl_grid = (data_chunks, int(np.ceil(nubls / block[1])))
    bls_grid = (data_chunks, int(np.ceil(nbls / block[1])))
    conv_grid = (data_chunks, int(np.ceil(max(nants, nubls) / block[1])))
    if verbose:
        print('GPU block:', block)
        print('ANT grid:', ant_grid)
        print('UBL grid:', ubl_grid)
        print('BLS grid:', bls_grid)
    ggu_indices_g = gpuarray.empty(shape=ggu_indices.shape, dtype=np.uint32)
    active_g = gpuarray.empty(shape=(chunk_size, ), dtype=np.uint32)
    iters_g = gpuarray.empty(shape=(chunk_size, ), dtype=np.uint32)
    gains_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype)
    new_gains_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype)
    gbuf_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype)
    gwgt_g = gpuarray.empty(shape=ANT_SHAPE, dtype=real_dtype)
    ubls_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype)
    new_ubls_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype)
    ubuf_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype)
    uwgt_g = gpuarray.empty(shape=UBL_SHAPE, dtype=real_dtype)
    data_g = gpuarray.empty(shape=BLS_SHAPE, dtype=complex_dtype)
    dmdl_g = gpuarray.empty(shape=BLS_SHAPE, dtype=complex_dtype)
    wgts_g = gpuarray.empty(shape=BLS_SHAPE, dtype=real_dtype)
    dwgts_g = gpuarray.empty(shape=BLS_SHAPE, dtype=real_dtype)
    chisq_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    new_chisq_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    conv_sum_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    conv_wgt_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    conv_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    # Define return buffers
    chisq = np.empty((ndata, ), dtype=real_dtype)
    conv = np.empty((ndata, ), dtype=real_dtype)
    iters = np.empty((ndata, ), dtype=np.uint32)
    active = np.empty((chunk_size, ), dtype=np.uint32)
    # upload data indices
    ggu_indices_g.set_async(ggu_indices)
    # initialize structures used to time code
    event_order = ('start', 'upload', 'dmdl', 'calc_chisq', 'loop_top',
                   'calc_gu_wgt', 'calc_gu_buf', 'copy_gains', 'copy_ubls',
                   'update_gains', 'update_ubls', 'dmdl2', 'chisq2',
                   'calc_conv', 'update_active', 'get_active', 'end')
    event_pairs = list((event_order[i], event_order[i + 1])
                       for i in range(len(event_order[:-1])))
    cum_time = {}
    t0 = time.time()
    # Loop over chunks of parallel omnical problems
    for px in range(0, ndata, chunk_size):
        events = {e: driver.Event() for e in event_order}
        events['start'].record()
        end = min(ndata, px + chunk_size)
        beg = end - chunk_size
        offset = px - beg
        gains_g.set_async(gains[beg:end])
        ubls_g.set_async(ubls[beg:end])
        data_g.set_async(data[beg:end])
        wgts_g.set_async(wgts[beg:end])
        active = np.ones((chunk_size, ), dtype=np.uint32)
        if offset > 0:
            active[:offset] = 0
        active_g.set_async(active)
        set_val_real_cuda(conv_sum_g, np.uint32(chunk_size), real_dtype(0),
                          block=(chunk_size, 1, 1), grid=(1, 1))
        set_val_real_cuda(conv_wgt_g, np.uint32(chunk_size), real_dtype(0),
                          block=(chunk_size, 1, 1), grid=(1, 1))
        events['upload'].record()
        gen_dmdl_cuda(ggu_indices_g, gains_g, ubls_g, dmdl_g, active_g,
                      block=block, grid=bls_grid)
        events['dmdl'].record()
        set_val_real_cuda(chisq_g, np.uint32(chunk_size), real_dtype(0),
                          block=(chunk_size, 1, 1), grid=(1, 1))
        calc_chisq_cuda(data_g, dmdl_g, wgts_g, chisq_g, active_g,
                        block=block, grid=bls_grid)
        events['calc_chisq'].record()
        if TIME_IT:
            events['calc_chisq'].synchronize()
            for (e1, e2) in event_pairs[:3]:
                cum_time[(e1, e2)] = cum_time.get((e1, e2), 0) + \
                    events[e2].time_since(events[e1])
        # Loop over iterations within an omnical problem
        for i in range(1, maxiter + 1):
            events['loop_top'].record()
            if (i % check_every) == 1:
                # Per standard omnical algorithm, only update gwgt/uwgt
                # every few iterations to save compute.
                calc_dwgts_cuda(dmdl_g, wgts_g, dwgts_g, active_g,
                                block=block, grid=bls_grid)
                set_val_real_cuda(gwgt_g, np.uint32(nants * chunk_size),
                                  real_dtype(0), block=(nthreads, 1, 1),
                                  grid=(int(np.ceil(nants * chunk_size / nthreads)), 1))
                set_val_real_cuda(uwgt_g, np.uint32(nubls * chunk_size),
                                  real_dtype(0), block=(nthreads, 1, 1),
                                  grid=(int(np.ceil(nubls * chunk_size / nthreads)), 1))
                calc_gu_wgt_cuda(ggu_indices_g, dmdl_g, dwgts_g, gwgt_g,
                                 uwgt_g, active_g, block=block, grid=bls_grid)
            events['calc_gu_wgt'].record()
            clear_complex_cuda(gbuf_g, np.uint32(nants * chunk_size),
                               block=(nthreads, 1, 1),
                               grid=(int(np.ceil(nants * chunk_size / nthreads)), 1))
            clear_complex_cuda(ubuf_g, np.uint32(nubls * chunk_size),
                               block=(nthreads, 1, 1),
                               grid=(int(np.ceil(nubls * chunk_size / nthreads)), 1))
            # This is 75% of the compute load
            calc_gu_buf_cuda(ggu_indices_g, data_g, dwgts_g, dmdl_g,
                             gbuf_g, ubuf_g, active_g, block=block,
                             grid=bls_grid)
            events['calc_gu_buf'].record()
            if (i < maxiter) and (i < check_after or (i % check_every != 0)):
                # Fast branch: don't check convergence/divergence
                events['copy_gains'].record()
                events['copy_ubls'].record()
                update_gains_cuda(gbuf_g, gwgt_g, gains_g, np.float32(gain),
                                  active_g, block=block, grid=ant_grid)
                events['update_gains'].record()
                update_ubls_cuda(ubuf_g, uwgt_g, ubls_g, np.float32(gain),
                                 active_g, block=block, grid=ubl_grid)
                events['update_ubls'].record()
                gen_dmdl_cuda(ggu_indices_g, gains_g, ubls_g, dmdl_g,
                              active_g, block=block, grid=bls_grid)
                events['dmdl2'].record()
                events['chisq2'].record()
                events['calc_conv'].record()
                events['update_active'].record()
                events['get_active'].record()
            else:
                # Slow branch: check convergence/divergence
                COPY(h, nants * chunk_size, gains_g.gpudata, 1,
                     new_gains_g.gpudata, 1)
                events['copy_gains'].record()
                COPY(h, nubls * chunk_size, ubls_g.gpudata, 1,
                     new_ubls_g.gpudata, 1)
                events['copy_ubls'].record()
                update_gains_cuda(gbuf_g, gwgt_g, new_gains_g,
                                  np.float32(gain), active_g,
                                  block=block, grid=ant_grid)
                events['update_gains'].record()
                update_ubls_cuda(ubuf_g, uwgt_g, new_ubls_g, np.float32(gain),
                                 active_g, block=block, grid=ubl_grid)
                events['update_ubls'].record()
                gen_dmdl_cuda(ggu_indices_g, new_gains_g, new_ubls_g, dmdl_g,
                              active_g, block=block, grid=bls_grid)
                events['dmdl2'].record()
                set_val_real_cuda(new_chisq_g, np.uint32(chunk_size),
                                  real_dtype(0), block=(chunk_size, 1, 1),
                                  grid=(1, 1))
                calc_chisq_cuda(data_g, dmdl_g, wgts_g, new_chisq_g, active_g,
                                block=block, grid=bls_grid)
                events['chisq2'].record()
                set_val_real_cuda(conv_sum_g, np.uint32(chunk_size),
                                  real_dtype(0), block=(chunk_size, 1, 1),
                                  grid=(1, 1))
                set_val_real_cuda(conv_wgt_g, np.uint32(chunk_size),
                                  real_dtype(0), block=(chunk_size, 1, 1),
                                  grid=(1, 1))
                calc_conv_cuda(new_gains_g, gains_g, new_ubls_g, ubls_g,
                               conv_sum_g, conv_wgt_g, active_g,
                               block=block, grid=conv_grid)
                events['calc_conv'].record()
                update_active_cuda(new_gains_g, gains_g, new_ubls_g, ubls_g,
                                   conv_sum_g, conv_wgt_g, conv_g,
                                   real_dtype(conv_crit), new_chisq_g,
                                   chisq_g, iters_g, np.uint32(i), active_g,
                                   block=block, grid=conv_grid)
                events['update_active'].record()
                active_g.get_async(ary=active)
                events['get_active'].record()
                if not np.any(active):
                    break
        events['end'].record()
        if TIME_IT:
            events['end'].synchronize()
            for (e1, e2) in event_pairs[4:]:
                cum_time[(e1, e2)] = cum_time.get((e1, e2), 0) + \
                    events[e2].time_since(events[e1])
        # Download final answers into buffers returned to user
        # use offset to trim off parts of chunk that were never active
        _chisq = np.empty((chunk_size, ), dtype=real_dtype)
        chisq_g.get_async(ary=_chisq)
        chisq[px:end] = _chisq[offset:]
        _iters = np.empty((chunk_size, ), dtype=np.uint32)
        iters_g.get_async(ary=_iters)
        iters[px:end] = _iters[offset:]
        _conv = np.empty((chunk_size, ), dtype=real_dtype)
        conv_g.get_async(ary=_conv)
        conv[px:end] = _conv[offset:]
        _gains = np.empty(ANT_SHAPE, dtype=complex_dtype)
        gains_g.get_async(ary=_gains)
        gains[px:end, :] = _gains[offset:, :]
        _ubls = np.empty(UBL_SHAPE, dtype=complex_dtype)
        ubls_g.get_async(ary=_ubls)
        ubls[px:end, :] = _ubls[offset:, :]
    t1 = time.time()
    if TIME_IT:
        print('Final, nthreads=%d' % nthreads)
        for (e1, e2) in event_pairs:
            try:
                print('%6.3f' % cum_time[(e1, e2)], e1, e2)
            except (KeyError):
                pass
        print(t1 - t0)
        print()
    # teardown GPU configuration
    cublasDestroy(h)
    return {
        'gains': gains,
        'ubls': ubls,
        'chisq': chisq,
        'iters': iters,
        'conv': conv
    }
y_pin = [
    cuda.register_host_memory(y[i * N / nStreams:(i + 1) * N / nStreams])
    for i in range(nStreams)
]
h = cublas.cublasCreate()
x_gpu = np.empty(nStreams, dtype=object)
y_gpu = np.empty(nStreams, dtype=object)
ans = np.empty(nStreams, dtype=object)
for i in range(nStreams):
    cublas.cublasSetStream(h, streams[i].handle)
    x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i])
    y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i])
    cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1,
                       y_gpu[i].gpudata, 1)
    ans[i] = y_gpu[i].get_async(stream=streams[i])
cublas.cublasDestroy(h)
# Uncomment to check for errors in the calculation
#y_gpu = np.array([yg.get() for yg in y_gpu])
#y_gpu = np.array(y_gpu).reshape(y.shape)
#print np.allclose(y_gpu, a*x + y)
e.record()
e.synchronize()
print s.time_till(e), " ms"
def tearDown(self):
    cublas.cublasDestroy(self.cublas_handle)
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas

A = np.array(([1, 2, 3], [4, 5, 6]), order='F').astype(np.float64)
B = np.array(([7, 8, 1, 5], [9, 10, 0, 9], [11, 12, 5, 5]),
             order='F').astype(np.float64)
A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)
m, k = A_gpu.shape
k, n = B_gpu.shape
C_gpu = gpuarray.empty((m, n), np.float64)
alpha = np.float64(1.0)
beta = np.float64(0.0)
cublas_handle = cublas.cublasCreate()
cublas.cublasDgemm(cublas_handle, 'n', 'n', m, n, k, alpha, A_gpu.gpudata,
                   m, B_gpu.gpudata, k, beta, C_gpu.gpudata, m)
cublas.cublasDestroy(cublas_handle)
C_gpu = C_gpu.reshape(C_gpu.shape, order='F')
print(np.dot(A, B))
print(C_gpu)
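# A companion sketch (not part of the original): the same product from
# C-ordered (row-major) inputs. Since cuBLAS is column-major, a row-major
# result C is obtained by computing C.T = B.T @ A.T, i.e. by swapping the
# operands and the m/n dimensions.
A_c = np.ascontiguousarray(A)
B_c = np.ascontiguousarray(B)
A_rm_gpu = gpuarray.to_gpu(A_c)
B_rm_gpu = gpuarray.to_gpu(B_c)
C_rm_gpu = gpuarray.empty((m, n), np.float64)  # row-major result
h = cublas.cublasCreate()
cublas.cublasDgemm(h, 'n', 'n', n, m, k, alpha, B_rm_gpu.gpudata, n,
                   A_rm_gpu.gpudata, k, beta, C_rm_gpu.gpudata, n)
cublas.cublasDestroy(h)
print(np.allclose(C_rm_gpu.get(), A.dot(B)))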