Esempio n. 1
0
def gpu_sdot(a,b):
    assert a.size == b.size
    assert a.shape[0] == b.shape[1]
    cublas.cublasInit()
    cublas.cublasFree(0)
    d_X = Linear(a.shape).from_numpy(a)
    d_Y = Linear(b.shape).from_numpy(b)
    gpu_result = cublas.cublasSdot(a.shape[1], d_X.ref, 1, d_Y.ref, 1)
    cuda.cudaThreadSynchronize()
    cublas.cublasShutdown()
    return gpu_result
Esempio n. 2
0
def gpu_sdot(a, b):
    assert a.size == b.size
    assert a.shape[0] == b.shape[1]
    cublas.cublasInit()
    cublas.cublasFree(0)
    d_X = Linear(a.shape).from_numpy(a)
    d_Y = Linear(b.shape).from_numpy(b)
    gpu_result = cublas.cublasSdot(a.shape[1], d_X.ref, 1, d_Y.ref, 1)
    cuda.cudaThreadSynchronize()
    cublas.cublasShutdown()
    return gpu_result
Esempio n. 3
0
def gpu_saxpy(a, b, alpha):
    # init cublas lib
    cublasInit()

    # allocate device vectors from host
    d_X = Linear(a.shape).from_numpy(a)
    d_Y = Linear(b.shape).from_numpy(b)

    # execute cublasSaxpy and sync threads
    cublasSaxpy(a.shape[1], alpha, d_X.ref, 1, d_Y.ref, 1)
    cudaThreadSynchronize()

    return d_Y.to_numpy()
Esempio n. 4
0
def gpu_saxpy(a,b,alpha):
    # init cublas lib
    cublasInit()

    # allocate device vectors from host
    d_X = Linear(a.shape).from_numpy(a)
    d_Y = Linear(b.shape).from_numpy(b)

    # execute cublasSaxpy and sync threads
    cublasSaxpy(a.shape[1],alpha,d_X.ref,1,d_Y.ref,1)
    cudaThreadSynchronize()

    return d_Y.to_numpy()
Esempio n. 5
0
def gpu_sgemm(a,b, alpha=1):
    """ Single Precision Matrix Multiplication on GPU, expects two, two-dimensional numpy arrays as input. Arrays must be such that a.shape[1] == b.shape[0]. Optionally specify alpha for scalar multiplication"""
    # init cublas
    cublasInit()

    assert a.shape[1] == b.shape[0]

    c_shape = (a.shape[0], b.shape[1])
    # allocate device matrices from host
    dA = Linear(a.shape, order='F').from_numpy(a)
    dB = Linear(b.shape, order='F').from_numpy(b)
    dC = Linear(c_shape, order='F')

    # transpose a/b ? t = yes, n = no
    transa = 'n'
    transb = 'n'

    # compute with CUBLAS
    cublasSgemm( transa, transb, a.shape[0], b.shape[1], a.shape[1], alpha, dA.ref, a.shape[0], dB.ref, b.shape[0], 0, dC.ref, a.shape[0] )
    cudaThreadSynchronize()
    # shutdown
    cublasShutdown() 
    return dC.to_numpy()
Esempio n. 6
0
def gpu_sgemm(a, b, alpha=1):
    """ Single Precision Matrix Multiplication on GPU, expects two, two-dimensional numpy arrays as input. Arrays must be such that a.shape[1] == b.shape[0]. Optionally specify alpha for scalar multiplication"""
    # init cublas
    cublasInit()

    assert a.shape[1] == b.shape[0]

    c_shape = (a.shape[0], b.shape[1])
    # allocate device matrices from host
    dA = Linear(a.shape, order='F').from_numpy(a)
    dB = Linear(b.shape, order='F').from_numpy(b)
    dC = Linear(c_shape, order='F')

    # transpose a/b ? t = yes, n = no
    transa = 'n'
    transb = 'n'

    # compute with CUBLAS
    cublasSgemm(transa, transb, a.shape[0], b.shape[1], a.shape[1], alpha,
                dA.ref, a.shape[0], dB.ref, b.shape[0], 0, dC.ref, a.shape[0])
    cudaThreadSynchronize()
    # shutdown
    cublasShutdown()
    return dC.to_numpy()
Esempio n. 7
0
def fftconvolve2d(data, kernel, test=False):
    s1 = numpy.array(data.shape)
    s2 = numpy.array(kernel.shape)
    dh, dw = data.shape

    h_Kernel = kernel
    h_Data = data

    # alias Complex type to float2
    Complex = cuda.float2

    # Kernel dimensions
    KERNEL_W = kernel.shape[0]
    KERNEL_H = kernel.shape[1]

    # Kernel center position
    KERNEL_X = KERNEL_W / 2
    KERNEL_Y = KERNEL_H / 2

    # Width and height of padding for "clamp to border" addressing mode
    PADDING_W = KERNEL_W - 1
    PADDING_H = KERNEL_H - 1

    # Input data dimension
    DATA_W = data.shape[0]
    DATA_H = data.shape[1]

    # Derive FFT size from data and kernel dimensions
    FFT_W = _calc_fft_size(DATA_W + PADDING_W)
    FFT_H = _calc_fft_size(DATA_H + PADDING_H)
    FFT_SIZE = FFT_W * FFT_H * ctypes.sizeof(Complex)
    KERNEL_SIZE = KERNEL_W * KERNEL_H * ctypes.sizeof(Complex)
    DATA_SIZE = DATA_W * DATA_H * ctypes.sizeof(Complex)

    e = ctypes.sizeof(ctypes.c_float) * 8
    float2tex = cuda.cudaCreateChannelDesc(e, e, 0, 0,
                                           cuda.cudaChannelFormatKindFloat)

    log.debug("Input data size           : %i x %i" % (DATA_W, DATA_H))
    log.debug("Convolution kernel size   : %i x %i" % (KERNEL_W, KERNEL_H))
    log.debug("Padded image size         : %i x %i" %
              (DATA_W + PADDING_W, DATA_H + PADDING_H))
    log.debug("Aligned padded image size : %i x %i" % (FFT_W, FFT_H))

    log.debug("Loading Kernels...")
    kernel_src = os.path.join(os.path.dirname(__file__),
                              'fftconvolve2d_kernel.cu')
    fftconvolve2d = SourceModule(open(kernel_src, 'r').read(),
                                 no_extern_c=True)

    log.debug("Extracting functions from Kernel...")
    log.debug("[*] Configuring Block/Grid dimensions...")
    # Block width should be a multiple of maximum coalesced write size
    # for coalesced memory writes in padKernel() and padData()
    threadBlock = cuda.dim3(16, 12, 1)
    kernelBlockGrid = cuda.dim3(_i_div_up(KERNEL_W, threadBlock.x),
                                _i_div_up(KERNEL_H, threadBlock.y), 1)
    dataBlockGrid = cuda.dim3(_i_div_up(FFT_W, threadBlock.x),
                              _i_div_up(FFT_H, threadBlock.y), 1)
    sixteen = cuda.dim3(16, 1, 1)
    onetwentyeight = cuda.dim3(128, 1, 1)
    # Extract kernel functions from SourceModule
    log.debug("[*] Loading padKernel...")
    padKernel = fftconvolve2d.padKernel(kernelBlockGrid, threadBlock)
    log.debug("[*] Loading padData...")
    padData = fftconvolve2d.padData(dataBlockGrid, threadBlock)
    log.debug("[*] Loading modulateAndNormalize...")
    modulateAndNormalize = fftconvolve2d.modulateAndNormalize(
        sixteen, onetwentyeight)

    log.debug("Allocating memory...")

    #log.debug("[*] Generating random input data...")
    #h_Kernel = numpy.random.uniform(0,1,(KERNEL_W,KERNEL_H)).astype(numpy.complex64)
    #h_Data = numpy.random.uniform(0,1,(DATA_W,DATA_H)).astype(numpy.complex64)

    log.debug("[*] Allocating host memory for results...")
    h_ResultGPU = numpy.zeros((FFT_W, FFT_H)).astype(numpy.complex64)

    log.debug("[*] Allocating linear device memory (Complex)...")
    d_PaddedKernel = fft._get_cufft_signal(
        numpy.zeros((FFT_W, FFT_H)).astype(numpy.complex64))
    d_PaddedData = fft._get_cufft_signal(
        numpy.zeros((FFT_W, FFT_H)).astype(numpy.complex64))

    log.debug("[*] Allocating cuda array device memory...")
    a_Kernel = _get_cuda_array(h_Kernel, float2tex)
    a_Data = _get_cuda_array(h_Data, float2tex)

    log.debug("[*] Binding textures...")
    texKernel = ctypes.cast(ctypes.c_void_p(),
                            ctypes.POINTER(cuda.textureReference))
    cuda_check_error(cuda.cudaGetTextureReference(texKernel, 'texKernel'))
    texData = ctypes.cast(ctypes.c_void_p(),
                          ctypes.POINTER(cuda.textureReference))
    cuda_check_error(cuda.cudaGetTextureReference(texData, 'texData'))

    fdesc = cuda.cudaChannelFormatDesc()
    cuda_check_error(cuda.cudaGetChannelDesc(fdesc, a_Kernel))
    cuda_check_error(cuda.cudaBindTextureToArray(texKernel, a_Kernel, fdesc))

    fdesc2 = cuda.cudaChannelFormatDesc()
    cuda_check_error(cuda.cudaGetChannelDesc(fdesc2, a_Data))
    cuda_check_error(cuda.cudaBindTextureToArray(texData, a_Data, fdesc2))

    log.debug('Calling kernels')
    log.debug("[*] Padding convolution kernel")
    padKernel(d_PaddedKernel, FFT_W, FFT_H, KERNEL_W, KERNEL_H, KERNEL_X,
              KERNEL_Y)

    log.debug("[*] Padding input data array")
    padData(d_PaddedData, FFT_W, FFT_H, DATA_W, DATA_H, KERNEL_W, KERNEL_H,
            KERNEL_X, KERNEL_Y)

    # Not including kernel transformation into time measurement,
    # since convolution kernel is not changed very frequently
    log.debug('Calling CUFFT')
    log.debug("[*] Transforming convolution kernel (CUFFT)...")
    FFTplan = fft._get_plan(h_ResultGPU.shape)
    cuda_check_error(
        cufft.cufftExecC2C(FFTplan, d_PaddedKernel, d_PaddedKernel,
                           cufft.CUFFT_FORWARD))
    log.debug("[*] Transforming data (CUFFT)...")
    cuda_check_error(cuda.cudaThreadSynchronize())
    cuda_check_error(
        cufft.cufftExecC2C(FFTplan, d_PaddedData, d_PaddedData,
                           cufft.CUFFT_FORWARD))

    log.debug('Calling kernel')
    log.debug("[*] modulateAndNormalize()")
    modulateAndNormalize(d_PaddedData, d_PaddedKernel, FFT_W * FFT_H)
    log.debug('Calling CUFFT')
    log.debug("[*] Inverse transforming data (CUFFT)...")
    cuda_check_error(
        cufft.cufftExecC2C(FFTplan, d_PaddedData, d_PaddedData,
                           cufft.CUFFT_INVERSE))
    cuda_check_error(cuda.cudaThreadSynchronize())

    log.debug("Copying results from GPU...")
    cuda_check_error(
        cuda.cudaMemcpy(h_ResultGPU.ctypes.data, d_PaddedData, FFT_SIZE,
                        cuda.cudaMemcpyDeviceToHost))
    h_ResultGPU = _centered(h_ResultGPU.real[0:dh, 0:dw], abs(s2 - s1) + 1)

    if test:
        log.info("Checking GPU results...")
        log.info("[*] running reference CPU convolution...")
        #conv_gold = get_convolution_cpu()
        #conv_gold(_get_float2_ptr(h_ResultCPU), _get_float2_ptr(h_Data), _get_float2_ptr(h_Kernel), DATA_W, DATA_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y)
        h_ResultCPU = scipy.signal.fftconvolve(h_Data.real,
                                               h_Kernel.real,
                                               mode='valid')
        log.info("[*] comparing the results...")
        check_results(h_ResultCPU, h_ResultGPU)

    log.debug("Shutting down...")

    log.debug("[*] Destroying FFT plans...")
    cuda_check_error(cufft.cufftDestroy(FFTplan))

    log.debug("[*] Unbinding textures...")
    cuda_check_error(cuda.cudaUnbindTexture(texData))
    cuda_check_error(cuda.cudaUnbindTexture(texKernel))

    log.debug("[*] Freeing device memory...")
    cuda_check_error(cuda.cudaFree(d_PaddedData))
    cuda_check_error(cuda.cudaFree(d_PaddedKernel))
    cuda_check_error(cuda.cudaFreeArray(a_Data))
    cuda_check_error(cuda.cudaFreeArray(a_Kernel))

    log.debug("[*] CUDA Thread Exit")
    cuda.cudaThreadExit()

    return h_ResultGPU
Esempio n. 8
0
def fftconvolve2d(data, kernel, test=False):
    s1 = numpy.array(data.shape)
    s2 = numpy.array(kernel.shape)
    dh, dw = data.shape

    h_Kernel = kernel
    h_Data = data

    # alias Complex type to float2
    Complex = cuda.float2

    # Kernel dimensions
    KERNEL_W = kernel.shape[0]
    KERNEL_H = kernel.shape[1]

    # Kernel center position
    KERNEL_X = KERNEL_W/2
    KERNEL_Y = KERNEL_H/2

    # Width and height of padding for "clamp to border" addressing mode
    PADDING_W = KERNEL_W - 1
    PADDING_H = KERNEL_H - 1

    # Input data dimension
    DATA_W = data.shape[0] 
    DATA_H = data.shape[1]

    # Derive FFT size from data and kernel dimensions
    FFT_W = _calc_fft_size(DATA_W + PADDING_W)
    FFT_H = _calc_fft_size(DATA_H + PADDING_H)
    FFT_SIZE = FFT_W * FFT_H * ctypes.sizeof(Complex)
    KERNEL_SIZE = KERNEL_W * KERNEL_H * ctypes.sizeof(Complex)
    DATA_SIZE = DATA_W * DATA_H * ctypes.sizeof(Complex)

    e = ctypes.sizeof(ctypes.c_float) * 8
    float2tex = cuda.cudaCreateChannelDesc(e, e, 0, 0, cuda.cudaChannelFormatKindFloat)

    log.debug("Input data size           : %i x %i" % (DATA_W, DATA_H))
    log.debug("Convolution kernel size   : %i x %i" % (KERNEL_W, KERNEL_H))
    log.debug("Padded image size         : %i x %i" % (DATA_W + PADDING_W, DATA_H + PADDING_H))
    log.debug("Aligned padded image size : %i x %i" % (FFT_W, FFT_H))

    log.debug("Loading Kernels...")
    kernel_src = os.path.join(os.path.dirname(__file__), 'fftconvolve2d_kernel.cu')
    fftconvolve2d = SourceModule(open(kernel_src,'r').read(), no_extern_c=True)

    log.debug("Extracting functions from Kernel...")
    log.debug("[*] Configuring Block/Grid dimensions...")
    # Block width should be a multiple of maximum coalesced write size 
    # for coalesced memory writes in padKernel() and padData()
    threadBlock = cuda.dim3(16, 12, 1)
    kernelBlockGrid = cuda.dim3(_i_div_up(KERNEL_W, threadBlock.x), _i_div_up(KERNEL_H, threadBlock.y),1)
    dataBlockGrid = cuda.dim3(_i_div_up(FFT_W, threadBlock.x),_i_div_up(FFT_H, threadBlock.y),1)
    sixteen = cuda.dim3(16,1,1)
    onetwentyeight = cuda.dim3(128,1,1)
    # Extract kernel functions from SourceModule
    log.debug("[*] Loading padKernel...")
    padKernel = fftconvolve2d.padKernel(kernelBlockGrid, threadBlock)
    log.debug("[*] Loading padData...")
    padData = fftconvolve2d.padData(dataBlockGrid, threadBlock)
    log.debug("[*] Loading modulateAndNormalize...")
    modulateAndNormalize = fftconvolve2d.modulateAndNormalize(sixteen, onetwentyeight)

    log.debug("Allocating memory...")

    #log.debug("[*] Generating random input data...")
    #h_Kernel = numpy.random.uniform(0,1,(KERNEL_W,KERNEL_H)).astype(numpy.complex64)
    #h_Data = numpy.random.uniform(0,1,(DATA_W,DATA_H)).astype(numpy.complex64)

    log.debug("[*] Allocating host memory for results...")
    h_ResultGPU = numpy.zeros((FFT_W,FFT_H)).astype(numpy.complex64)

    log.debug("[*] Allocating linear device memory (Complex)...")
    d_PaddedKernel = fft._get_cufft_signal(numpy.zeros((FFT_W,FFT_H)).astype(numpy.complex64))
    d_PaddedData = fft._get_cufft_signal(numpy.zeros((FFT_W,FFT_H)).astype(numpy.complex64))

    log.debug("[*] Allocating cuda array device memory...")
    a_Kernel = _get_cuda_array(h_Kernel,float2tex)
    a_Data = _get_cuda_array(h_Data, float2tex)

    log.debug("[*] Binding textures...")
    texKernel = ctypes.cast(ctypes.c_void_p(), ctypes.POINTER(cuda.textureReference))
    cuda_check_error(cuda.cudaGetTextureReference(texKernel,'texKernel')) 
    texData = ctypes.cast(ctypes.c_void_p(), ctypes.POINTER(cuda.textureReference))
    cuda_check_error(cuda.cudaGetTextureReference(texData,'texData'))

    fdesc = cuda.cudaChannelFormatDesc()
    cuda_check_error(cuda.cudaGetChannelDesc(fdesc, a_Kernel))
    cuda_check_error(cuda.cudaBindTextureToArray(texKernel, a_Kernel, fdesc))

    fdesc2 = cuda.cudaChannelFormatDesc()
    cuda_check_error(cuda.cudaGetChannelDesc(fdesc2, a_Data))
    cuda_check_error(cuda.cudaBindTextureToArray(texData, a_Data, fdesc2))

    log.debug('Calling kernels')
    log.debug("[*] Padding convolution kernel")
    padKernel(d_PaddedKernel, FFT_W, FFT_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y)

    log.debug("[*] Padding input data array")
    padData(d_PaddedData, FFT_W, FFT_H, DATA_W, DATA_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y)

    # Not including kernel transformation into time measurement,
    # since convolution kernel is not changed very frequently
    log.debug('Calling CUFFT')
    log.debug("[*] Transforming convolution kernel (CUFFT)...")
    FFTplan = fft._get_plan(h_ResultGPU.shape)
    cuda_check_error(cufft.cufftExecC2C(FFTplan, d_PaddedKernel, d_PaddedKernel, cufft.CUFFT_FORWARD))
    log.debug("[*] Transforming data (CUFFT)...")
    cuda_check_error(cuda.cudaThreadSynchronize())
    cuda_check_error(cufft.cufftExecC2C(FFTplan, d_PaddedData, d_PaddedData, cufft.CUFFT_FORWARD))

    log.debug('Calling kernel')
    log.debug("[*] modulateAndNormalize()")
    modulateAndNormalize(d_PaddedData, d_PaddedKernel, FFT_W * FFT_H)
    log.debug('Calling CUFFT')
    log.debug("[*] Inverse transforming data (CUFFT)...")
    cuda_check_error(cufft.cufftExecC2C(FFTplan, d_PaddedData, d_PaddedData, cufft.CUFFT_INVERSE))
    cuda_check_error(cuda.cudaThreadSynchronize())

    log.debug("Copying results from GPU...")
    cuda_check_error(cuda.cudaMemcpy(h_ResultGPU.ctypes.data, d_PaddedData, FFT_SIZE, cuda.cudaMemcpyDeviceToHost))
    h_ResultGPU = _centered(h_ResultGPU.real[0:dh,0:dw], abs(s2-s1)+1)

    if test:
        log.info("Checking GPU results...")
        log.info("[*] running reference CPU convolution...")
        #conv_gold = get_convolution_cpu() 
        #conv_gold(_get_float2_ptr(h_ResultCPU), _get_float2_ptr(h_Data), _get_float2_ptr(h_Kernel), DATA_W, DATA_H, KERNEL_W, KERNEL_H, KERNEL_X, KERNEL_Y)
        h_ResultCPU = scipy.signal.fftconvolve(h_Data.real, h_Kernel.real, mode='valid')
        log.info( "[*] comparing the results...")
        check_results(h_ResultCPU, h_ResultGPU)

    log.debug( "Shutting down...")

    log.debug( "[*] Destroying FFT plans...")
    cuda_check_error(cufft.cufftDestroy(FFTplan))

    log.debug( "[*] Unbinding textures...")
    cuda_check_error(cuda.cudaUnbindTexture(texData))
    cuda_check_error(cuda.cudaUnbindTexture(texKernel))

    log.debug( "[*] Freeing device memory...")
    cuda_check_error(cuda.cudaFree(d_PaddedData))
    cuda_check_error(cuda.cudaFree(d_PaddedKernel))
    cuda_check_error(cuda.cudaFreeArray(a_Data))
    cuda_check_error(cuda.cudaFreeArray(a_Kernel))

    log.debug( "[*] CUDA Thread Exit")
    cuda.cudaThreadExit()

    return h_ResultGPU