def gpu_sdot(a,b): assert a.size == b.size assert a.shape[0] == b.shape[1] cublas.cublasInit() cublas.cublasFree(0) d_X = Linear(a.shape).from_numpy(a) d_Y = Linear(b.shape).from_numpy(b) gpu_result = cublas.cublasSdot(a.shape[1], d_X.ref, 1, d_Y.ref, 1) cuda.cudaThreadSynchronize() cublas.cublasShutdown() return gpu_result
def gpu_sdot(a, b): assert a.size == b.size assert a.shape[0] == b.shape[1] cublas.cublasInit() cublas.cublasFree(0) d_X = Linear(a.shape).from_numpy(a) d_Y = Linear(b.shape).from_numpy(b) gpu_result = cublas.cublasSdot(a.shape[1], d_X.ref, 1, d_Y.ref, 1) cuda.cudaThreadSynchronize() cublas.cublasShutdown() return gpu_result
def gpu_saxpy(a, b, alpha): # init cublas lib cublasInit() # allocate device vectors from host d_X = Linear(a.shape).from_numpy(a) d_Y = Linear(b.shape).from_numpy(b) # execute cublasSaxpy and sync threads cublasSaxpy(a.shape[1], alpha, d_X.ref, 1, d_Y.ref, 1) cudaThreadSynchronize() return d_Y.to_numpy()
def gpu_saxpy(a,b,alpha): # init cublas lib cublasInit() # allocate device vectors from host d_X = Linear(a.shape).from_numpy(a) d_Y = Linear(b.shape).from_numpy(b) # execute cublasSaxpy and sync threads cublasSaxpy(a.shape[1],alpha,d_X.ref,1,d_Y.ref,1) cudaThreadSynchronize() return d_Y.to_numpy()
def gpu_sgemm(a,b, alpha=1): """ Single Precision Matrix Multiplication on GPU, expects two, two-dimensional numpy arrays as input. Arrays must be such that a.shape[1] == b.shape[0]. Optionally specify alpha for scalar multiplication""" # init cublas cublasInit() assert a.shape[1] == b.shape[0] c_shape = (a.shape[0], b.shape[1]) # allocate device matrices from host dA = Linear(a.shape, order='F').from_numpy(a) dB = Linear(b.shape, order='F').from_numpy(b) dC = Linear(c_shape, order='F') # transpose a/b ? t = yes, n = no transa = 'n' transb = 'n' # compute with CUBLAS cublasSgemm( transa, transb, a.shape[0], b.shape[1], a.shape[1], alpha, dA.ref, a.shape[0], dB.ref, b.shape[0], 0, dC.ref, a.shape[0] ) cudaThreadSynchronize() # shutdown cublasShutdown() return dC.to_numpy()
def gpu_sgemm(a, b, alpha=1): """ Single Precision Matrix Multiplication on GPU, expects two, two-dimensional numpy arrays as input. Arrays must be such that a.shape[1] == b.shape[0]. Optionally specify alpha for scalar multiplication""" # init cublas cublasInit() assert a.shape[1] == b.shape[0] c_shape = (a.shape[0], b.shape[1]) # allocate device matrices from host dA = Linear(a.shape, order='F').from_numpy(a) dB = Linear(b.shape, order='F').from_numpy(b) dC = Linear(c_shape, order='F') # transpose a/b ? t = yes, n = no transa = 'n' transb = 'n' # compute with CUBLAS cublasSgemm(transa, transb, a.shape[0], b.shape[1], a.shape[1], alpha, dA.ref, a.shape[0], dB.ref, b.shape[0], 0, dC.ref, a.shape[0]) cudaThreadSynchronize() # shutdown cublasShutdown() return dC.to_numpy()
def fftconvolve2d(data, kernel, test=False): s1 = numpy.array(data.shape) s2 = numpy.array(kernel.shape) dh, dw = data.shape h_Kernel = kernel h_Data = data # alias Complex type to float2 Complex = cuda.float2 # Kernel dimensions KERNEL_W = kernel.shape[0] KERNEL_H = kernel.shape[1] # Kernel center position KERNEL_X = KERNEL_W / 2 KERNEL_Y = KERNEL_H / 2 # Width and height of padding for "clamp to border" addressing mode PADDING_W = KERNEL_W - 1 PADDING_H = KERNEL_H - 1 # Input data dimension DATA_W = data.shape[0] DATA_H = data.shape[1] # Derive FFT size from data and kernel dimensions FFT_W = _calc_fft_size(DATA_W + PADDING_W) FFT_H = _calc_fft_size(DATA_H + PADDING_H) FFT_SIZE = FFT_W * FFT_H * ctypes.sizeof(Complex) KERNEL_SIZE = KERNEL_W * KERNEL_H * ctypes.sizeof(Complex) DATA_SIZE = DATA_W * DATA_H * ctypes.sizeof(Complex) e = ctypes.sizeof(ctypes.c_float) * 8 float2tex = cuda.cudaCreateChannelDesc(e, e, 0, 0, cuda.cudaChannelFormatKindFloat) log.debug("Input data size : %i x %i" % (DATA_W, DATA_H)) log.debug("Convolution kernel size : %i x %i" % (KERNEL_W, KERNEL_H)) log.debug("Padded image size : %i x %i" % (DATA_W + PADDING_W, DATA_H + PADDING_H)) log.debug("Aligned padded image size : %i x %i" % (FFT_W, FFT_H)) log.debug("Loading Kernels...") kernel_src = os.path.join(os.path.dirname(__file__), 'fftconvolve2d_kernel.cu') fftconvolve2d = SourceModule(open(kernel_src, 'r').read(), no_extern_c=True) log.debug("Extracting functions from Kernel...") log.debug("[*] Configuring Block/Grid dimensions...") # Block width should be a multiple of maximum coalesced write size # for coalesced memory writes in padKernel() and padData() threadBlock = cuda.dim3(16, 12, 1) kernelBlockGrid = cuda.dim3(_i_div_up(KERNEL_W, threadBlock.x), _i_div_up(KERNEL_H, threadBlock.y), 1) dataBlockGrid = cuda.dim3(_i_div_up(FFT_W, threadBlock.x), _i_div_up(FFT_H, threadBlock.y), 1) sixteen = cuda.dim3(16, 1, 1) onetwentyeight = cuda.dim3(128, 1, 1) # Extract kernel functions from SourceModule log.debug("[*] Loading padKernel...") padKernel = fftconvolve2d.padKernel(kernelBlockGrid, threadBlock) log.debug("[*] Loading padData...") padData = fftconvolve2d.padData(dataBlockGrid, threadBlock) log.debug("[*] Loading modulateAndNormalize...") modulateAndNormalize = fftconvolve2d.modulateAndNormalize( sixteen, onetwentyeight) log.debug("Allocating memory...") #log.debug("[*] Generating random input data...") #h_Kernel = numpy.random.uniform(0,1,(KERNEL_W,KERNEL_H)).astype(numpy.complex64) #h_Data = numpy.random.uniform(0,1,(DATA_W,DATA_H)).astype(numpy.complex64) log.debug("[*] Allocating host memory for results...") h_ResultGPU = numpy.zeros((FFT_W, FFT_H)).astype(numpy.complex64) log.debug("[*] Allocating linear device memory (Complex)...") d_PaddedKernel = fft._get_cufft_signal( numpy.zeros((FFT_W, FFT_H)).astype(numpy.complex64)) d_PaddedData = fft._get_cufft_signal( numpy.zeros((FFT_W, FFT_H)).astype(numpy.complex64)) log.debug("[*] Allocating cuda array device memory...") a_Kernel = _get_cuda_array(h_Kernel, float2tex) a_Data = _get_cuda_array(h_Data, float2tex) log.debug("[*] Binding textures...") texKernel = ctypes.cast(ctypes.c_void_p(), ctypes.POINTER(cuda.textureReference)) cuda_check_error(cuda.cudaGetTextureReference(texKernel, 'texKernel')) texData = ctypes.cast(ctypes.c_void_p(), ctypes.POINTER(cuda.textureReference)) cuda_check_error(cuda.cudaGetTextureReference(texData, 'texData')) fdesc = cuda.cudaChannelFormatDesc() cuda_check_error(cuda.cudaGetChannelDesc(fdesc, a_Kernel)) cuda_check_error(cuda.cudaBindTextureToArray(texKernel, a_Kernel, fdesc)) fdesc2 = cuda.cudaChannelFormatDesc() cuda_check_error(cuda.cudaGetChannelDesc(fdesc2, a_Data)) cuda_check_error(cuda.cudaBindTextureToArray(texData, a_Data, fdesc2)) log.debug('Calling kernels') log.debug("[*] Padding convolution kernel") padKernel(d_PaddedKernel, FFT_W, FFT_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y) log.debug("[*] Padding input data array") padData(d_PaddedData, FFT_W, FFT_H, DATA_W, DATA_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y) # Not including kernel transformation into time measurement, # since convolution kernel is not changed very frequently log.debug('Calling CUFFT') log.debug("[*] Transforming convolution kernel (CUFFT)...") FFTplan = fft._get_plan(h_ResultGPU.shape) cuda_check_error( cufft.cufftExecC2C(FFTplan, d_PaddedKernel, d_PaddedKernel, cufft.CUFFT_FORWARD)) log.debug("[*] Transforming data (CUFFT)...") cuda_check_error(cuda.cudaThreadSynchronize()) cuda_check_error( cufft.cufftExecC2C(FFTplan, d_PaddedData, d_PaddedData, cufft.CUFFT_FORWARD)) log.debug('Calling kernel') log.debug("[*] modulateAndNormalize()") modulateAndNormalize(d_PaddedData, d_PaddedKernel, FFT_W * FFT_H) log.debug('Calling CUFFT') log.debug("[*] Inverse transforming data (CUFFT)...") cuda_check_error( cufft.cufftExecC2C(FFTplan, d_PaddedData, d_PaddedData, cufft.CUFFT_INVERSE)) cuda_check_error(cuda.cudaThreadSynchronize()) log.debug("Copying results from GPU...") cuda_check_error( cuda.cudaMemcpy(h_ResultGPU.ctypes.data, d_PaddedData, FFT_SIZE, cuda.cudaMemcpyDeviceToHost)) h_ResultGPU = _centered(h_ResultGPU.real[0:dh, 0:dw], abs(s2 - s1) + 1) if test: log.info("Checking GPU results...") log.info("[*] running reference CPU convolution...") #conv_gold = get_convolution_cpu() #conv_gold(_get_float2_ptr(h_ResultCPU), _get_float2_ptr(h_Data), _get_float2_ptr(h_Kernel), DATA_W, DATA_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y) h_ResultCPU = scipy.signal.fftconvolve(h_Data.real, h_Kernel.real, mode='valid') log.info("[*] comparing the results...") check_results(h_ResultCPU, h_ResultGPU) log.debug("Shutting down...") log.debug("[*] Destroying FFT plans...") cuda_check_error(cufft.cufftDestroy(FFTplan)) log.debug("[*] Unbinding textures...") cuda_check_error(cuda.cudaUnbindTexture(texData)) cuda_check_error(cuda.cudaUnbindTexture(texKernel)) log.debug("[*] Freeing device memory...") cuda_check_error(cuda.cudaFree(d_PaddedData)) cuda_check_error(cuda.cudaFree(d_PaddedKernel)) cuda_check_error(cuda.cudaFreeArray(a_Data)) cuda_check_error(cuda.cudaFreeArray(a_Kernel)) log.debug("[*] CUDA Thread Exit") cuda.cudaThreadExit() return h_ResultGPU
def fftconvolve2d(data, kernel, test=False): s1 = numpy.array(data.shape) s2 = numpy.array(kernel.shape) dh, dw = data.shape h_Kernel = kernel h_Data = data # alias Complex type to float2 Complex = cuda.float2 # Kernel dimensions KERNEL_W = kernel.shape[0] KERNEL_H = kernel.shape[1] # Kernel center position KERNEL_X = KERNEL_W/2 KERNEL_Y = KERNEL_H/2 # Width and height of padding for "clamp to border" addressing mode PADDING_W = KERNEL_W - 1 PADDING_H = KERNEL_H - 1 # Input data dimension DATA_W = data.shape[0] DATA_H = data.shape[1] # Derive FFT size from data and kernel dimensions FFT_W = _calc_fft_size(DATA_W + PADDING_W) FFT_H = _calc_fft_size(DATA_H + PADDING_H) FFT_SIZE = FFT_W * FFT_H * ctypes.sizeof(Complex) KERNEL_SIZE = KERNEL_W * KERNEL_H * ctypes.sizeof(Complex) DATA_SIZE = DATA_W * DATA_H * ctypes.sizeof(Complex) e = ctypes.sizeof(ctypes.c_float) * 8 float2tex = cuda.cudaCreateChannelDesc(e, e, 0, 0, cuda.cudaChannelFormatKindFloat) log.debug("Input data size : %i x %i" % (DATA_W, DATA_H)) log.debug("Convolution kernel size : %i x %i" % (KERNEL_W, KERNEL_H)) log.debug("Padded image size : %i x %i" % (DATA_W + PADDING_W, DATA_H + PADDING_H)) log.debug("Aligned padded image size : %i x %i" % (FFT_W, FFT_H)) log.debug("Loading Kernels...") kernel_src = os.path.join(os.path.dirname(__file__), 'fftconvolve2d_kernel.cu') fftconvolve2d = SourceModule(open(kernel_src,'r').read(), no_extern_c=True) log.debug("Extracting functions from Kernel...") log.debug("[*] Configuring Block/Grid dimensions...") # Block width should be a multiple of maximum coalesced write size # for coalesced memory writes in padKernel() and padData() threadBlock = cuda.dim3(16, 12, 1) kernelBlockGrid = cuda.dim3(_i_div_up(KERNEL_W, threadBlock.x), _i_div_up(KERNEL_H, threadBlock.y),1) dataBlockGrid = cuda.dim3(_i_div_up(FFT_W, threadBlock.x),_i_div_up(FFT_H, threadBlock.y),1) sixteen = cuda.dim3(16,1,1) onetwentyeight = cuda.dim3(128,1,1) # Extract kernel functions from SourceModule log.debug("[*] Loading padKernel...") padKernel = fftconvolve2d.padKernel(kernelBlockGrid, threadBlock) log.debug("[*] Loading padData...") padData = fftconvolve2d.padData(dataBlockGrid, threadBlock) log.debug("[*] Loading modulateAndNormalize...") modulateAndNormalize = fftconvolve2d.modulateAndNormalize(sixteen, onetwentyeight) log.debug("Allocating memory...") #log.debug("[*] Generating random input data...") #h_Kernel = numpy.random.uniform(0,1,(KERNEL_W,KERNEL_H)).astype(numpy.complex64) #h_Data = numpy.random.uniform(0,1,(DATA_W,DATA_H)).astype(numpy.complex64) log.debug("[*] Allocating host memory for results...") h_ResultGPU = numpy.zeros((FFT_W,FFT_H)).astype(numpy.complex64) log.debug("[*] Allocating linear device memory (Complex)...") d_PaddedKernel = fft._get_cufft_signal(numpy.zeros((FFT_W,FFT_H)).astype(numpy.complex64)) d_PaddedData = fft._get_cufft_signal(numpy.zeros((FFT_W,FFT_H)).astype(numpy.complex64)) log.debug("[*] Allocating cuda array device memory...") a_Kernel = _get_cuda_array(h_Kernel,float2tex) a_Data = _get_cuda_array(h_Data, float2tex) log.debug("[*] Binding textures...") texKernel = ctypes.cast(ctypes.c_void_p(), ctypes.POINTER(cuda.textureReference)) cuda_check_error(cuda.cudaGetTextureReference(texKernel,'texKernel')) texData = ctypes.cast(ctypes.c_void_p(), ctypes.POINTER(cuda.textureReference)) cuda_check_error(cuda.cudaGetTextureReference(texData,'texData')) fdesc = cuda.cudaChannelFormatDesc() cuda_check_error(cuda.cudaGetChannelDesc(fdesc, a_Kernel)) cuda_check_error(cuda.cudaBindTextureToArray(texKernel, a_Kernel, fdesc)) fdesc2 = cuda.cudaChannelFormatDesc() cuda_check_error(cuda.cudaGetChannelDesc(fdesc2, a_Data)) cuda_check_error(cuda.cudaBindTextureToArray(texData, a_Data, fdesc2)) log.debug('Calling kernels') log.debug("[*] Padding convolution kernel") padKernel(d_PaddedKernel, FFT_W, FFT_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y) log.debug("[*] Padding input data array") padData(d_PaddedData, FFT_W, FFT_H, DATA_W, DATA_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y) # Not including kernel transformation into time measurement, # since convolution kernel is not changed very frequently log.debug('Calling CUFFT') log.debug("[*] Transforming convolution kernel (CUFFT)...") FFTplan = fft._get_plan(h_ResultGPU.shape) cuda_check_error(cufft.cufftExecC2C(FFTplan, d_PaddedKernel, d_PaddedKernel, cufft.CUFFT_FORWARD)) log.debug("[*] Transforming data (CUFFT)...") cuda_check_error(cuda.cudaThreadSynchronize()) cuda_check_error(cufft.cufftExecC2C(FFTplan, d_PaddedData, d_PaddedData, cufft.CUFFT_FORWARD)) log.debug('Calling kernel') log.debug("[*] modulateAndNormalize()") modulateAndNormalize(d_PaddedData, d_PaddedKernel, FFT_W * FFT_H) log.debug('Calling CUFFT') log.debug("[*] Inverse transforming data (CUFFT)...") cuda_check_error(cufft.cufftExecC2C(FFTplan, d_PaddedData, d_PaddedData, cufft.CUFFT_INVERSE)) cuda_check_error(cuda.cudaThreadSynchronize()) log.debug("Copying results from GPU...") cuda_check_error(cuda.cudaMemcpy(h_ResultGPU.ctypes.data, d_PaddedData, FFT_SIZE, cuda.cudaMemcpyDeviceToHost)) h_ResultGPU = _centered(h_ResultGPU.real[0:dh,0:dw], abs(s2-s1)+1) if test: log.info("Checking GPU results...") log.info("[*] running reference CPU convolution...") #conv_gold = get_convolution_cpu() #conv_gold(_get_float2_ptr(h_ResultCPU), _get_float2_ptr(h_Data), _get_float2_ptr(h_Kernel), DATA_W, DATA_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y) h_ResultCPU = scipy.signal.fftconvolve(h_Data.real, h_Kernel.real, mode='valid') log.info( "[*] comparing the results...") check_results(h_ResultCPU, h_ResultGPU) log.debug( "Shutting down...") log.debug( "[*] Destroying FFT plans...") cuda_check_error(cufft.cufftDestroy(FFTplan)) log.debug( "[*] Unbinding textures...") cuda_check_error(cuda.cudaUnbindTexture(texData)) cuda_check_error(cuda.cudaUnbindTexture(texKernel)) log.debug( "[*] Freeing device memory...") cuda_check_error(cuda.cudaFree(d_PaddedData)) cuda_check_error(cuda.cudaFree(d_PaddedKernel)) cuda_check_error(cuda.cudaFreeArray(a_Data)) cuda_check_error(cuda.cudaFreeArray(a_Kernel)) log.debug( "[*] CUDA Thread Exit") cuda.cudaThreadExit() return h_ResultGPU