Example #1
def ola_GPU_test(xs_gpu, csf, sw, nhop, offset=(0,0)):
    
    sxs = xs_gpu.shape

    sx = np.array(nhop*csf+sw-nhop)
    sx = (int(sx[0]), int(sx[1]))

    block_size = (16,16,1)
    grid_size = (int(np.ceil(float(sx[1])/block_size[1])),
                 int(np.ceil(float(sx[0])/block_size[0])))

    if xs_gpu.dtype == np.float32:
        mod = cu.module_from_buffer(cubin)
        ola_Kernel = mod.get_function("ola_Kernel_test")
    elif xs_gpu.dtype == np.complex64:
        mod = cu.module_from_buffer(cubin)
        ola_Kernel = mod.get_function("ola_ComplexKernel_test")
    else:
        raise ValueError("ola_GPU_test expects a float32 or complex64 array")

    x_gpu = cua.zeros(sx, np.float32)
    ola_Kernel(x_gpu.gpudata, xs_gpu.gpudata,
               np.uint32(sx[0]), np.uint32(sx[1]),
               np.uint32(sxs[1]), np.uint32(sxs[2]),
               np.uint32(sw[0]), np.uint32(sw[1]),
               np.uint32(offset[0]), np.uint32(offset[1]),
               np.uint32(csf[0]), np.uint32(csf[1]),
               np.uint32(nhop[0]), np.uint32(nhop[1]),
               block=block_size, grid=grid_size)

    return x_gpu
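All of these examples share the same three-step pattern: obtain device code as bytes (compiled on the fly or loaded from disk), JIT-load it with module_from_buffer, and fetch a kernel by name with get_function. A minimal, self-contained sketch of that pattern follows; the kernel, its name double_kernel, and the array sizes are illustrative only, not taken from the example above.

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
import pycuda.driver as cu
from pycuda.compiler import compile

# compile CUDA C to a cubin (bytes), then JIT-load it into the current context
cubin = compile("""
__global__ void double_kernel(float *x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0f;
}
""")
mod = cu.module_from_buffer(cubin)
double_kernel = mod.get_function("double_kernel")

x = np.arange(16, dtype=np.float32)
double_kernel(cu.InOut(x), np.int32(x.size), block=(16, 1, 1), grid=(1, 1))
print(x)  # [0., 2., 4., ..., 30.]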
Example #2
def create_function(ptxcode, iomap, arg_nametypes):
    m = drv.module_from_buffer(ptxcode)
    #print(ptxcode)
    stub_function = m.get_function('stub')

    iofun = {
        IOTracker.nio: drv.In,
        IOTracker.i: drv.In,
        IOTracker.o: drv.Out,
        IOTracker.io: drv.InOut
    }

    def param_wrapper(bsz, gsz):
        def stub_wrapper(*args):

            assert len(args) == len(
                arg_nametypes), 'error: invalid number of arguments'

            wrapped_args = []
            for i, arg in enumerate(args):
                arg_name, arg_type = arg_nametypes[i]

                if isinstance(arg_type, nvtype.pointer):
                    wrapped_args.append(iofun[iomap[arg_name]](arg))
                else:
                    wrapped_args.append(arg)

            stub_function(*tuple(wrapped_args), block=bsz, grid=gsz)

            return

        return stub_wrapper

    return param_wrapper
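The drv.In, drv.Out and drv.InOut wrappers selected through iofun above copy NumPy arrays to and from the device around the launch. A stand-alone sketch of what they do, with an illustrative kernel that is not part of the project above:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.compiler
import pycuda.driver as drv

mod = drv.module_from_buffer(pycuda.compiler.compile(
    "__global__ void copy_k(float *dst, const float *src)"
    "{ dst[threadIdx.x] = src[threadIdx.x]; }"))
copy_k = mod.get_function("copy_k")
src = np.arange(32, dtype=np.float32)
dst = np.zeros_like(src)
# drv.In copies src to the device before the launch; drv.Out copies dst back after
copy_k(drv.Out(dst), drv.In(src), block=(32, 1, 1), grid=(1, 1))
assert (dst == src).all()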
Example #3
def get_CUDA_function(device_id, function_name):
    """
        Returns the compiled kernel for the given device
        and kernel key.
    """
    global KERNELS
    data = KERNELS.get(function_name)
    if data is None:
        from xpra.platform.paths import default_get_app_dir
        from xpra.os_util import load_binary_file
        cubin_file = os.path.join(default_get_app_dir(), "cuda",
                                  "%s.fatbin" % function_name)
        log("get_CUDA_function(%s, %s) cubin file=%s", device_id,
            function_name, cubin_file)
        data = load_binary_file(cubin_file)
        if not data:
            log.error("failed to load CUDA bin file %s", cubin_file)
            return None
        log(" loaded %s bytes", len(data))
        KERNELS[function_name] = data
    #now load from cubin:
    start = time.time()
    mod = driver.module_from_buffer(data)
    log("get_CUDA_function(%s, %s) module=%s", device_id, function_name, mod)
    try:
        CUDA_function = mod.get_function(function_name)
    except driver.LogicError as e:
        raise Exception("failed to load '%s' from %s: %s" %
                        (function_name, mod, e))
    end = time.time()
    log("loading function %s from pre-compiled cubin took %.1fms",
        function_name, 1000.0 * (end - start))
    return CUDA_function
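For context, module_from_buffer accepts fatbin data read straight from disk, which is what the KERNELS cache above holds. A minimal sketch of that load-from-disk pattern; "kernel.fatbin" and "kernel_name" are placeholders for a fatbin built ahead of time (e.g. with nvcc --fatbin kernel.cu):

import pycuda.autoinit  # noqa: F401
import pycuda.driver as driver

with open("kernel.fatbin", "rb") as f:  # placeholder path
    data = f.read()
mod = driver.module_from_buffer(data)   # accepts cubin, fatbin or PTX bytes
fn = mod.get_function("kernel_name")    # placeholder kernel name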
Example #4
    def __init__(
        self,
        source,
        nvcc="nvcc",
        options=None,
        keep=False,
        no_extern_c=False,
        arch=None,
        code=None,
        cache_dir=None,
        include_dirs=[],
    ):
        self._check_arch(arch)

        cubin = compile(
            source,
            nvcc,
            options,
            keep,
            no_extern_c,
            arch,
            code,
            cache_dir,
            include_dirs,
        )

        from pycuda.driver import module_from_buffer

        self.module = module_from_buffer(cubin)

        self._bind_module()
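This constructor appears to be pycuda.compiler.SourceModule itself: compile() produces a cubin and module_from_buffer() JIT-loads it, so the two routes in this sketch reach an equivalent kernel handle:

import pycuda.autoinit  # noqa: F401
from pycuda.compiler import SourceModule, compile
from pycuda.driver import module_from_buffer

src = "__global__ void noop() {}"
fn_a = SourceModule(src).get_function("noop")                  # high-level route
fn_b = module_from_buffer(compile(src)).get_function("noop")   # manual route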
Example #5
def comp_ola_sdeconv(gx_gpu, gy_gpu, xx_gpu, xy_gpu, Ftpy_gpu, f_gpu, L_gpu, alpha, beta, gamma=0):
    """
    Computes the division in Fourier space needed for sparse deconvolution
    """
    
    sfft = xx_gpu.shape
    block_size = (16,16,1)   
    grid_size = (int(np.ceil(np.float32(sfft[0]*sfft[1])/block_size[0])),
                 int(np.ceil(np.float32(sfft[2])/block_size[1])))

    mod = cu.module_from_buffer(cubin)
    comp_ola_sdeconv_Kernel = mod.get_function("comp_ola_sdeconv_Kernel")

    z_gpu = cua.zeros(sfft, np.complex64)

    comp_ola_sdeconv_Kernel(z_gpu.gpudata,
                            np.int32(sfft[0]), np.int32(sfft[1]), np.int32(sfft[2]),
                            gx_gpu.gpudata, gy_gpu.gpudata,
                            xx_gpu.gpudata, xy_gpu.gpudata, 
                            Ftpy_gpu.gpudata, f_gpu.gpudata, L_gpu.gpudata,
                            np.float32(alpha), np.float32(beta),
                            np.float32(gamma),
                            block=block_size, grid=grid_size)

    return z_gpu
Example #6
def get_CUDA_function(device_id, function_name):
    """
        Returns the compiled kernel for the given device
        and kernel key.
    """
    global KERNELS
    data = KERNELS.get(function_name)
    if data is None:
        from xpra.platform.paths import get_resources_dir
        cubin_file = os.path.join(get_resources_dir(), "cuda", "%s.fatbin" % function_name)
        log("get_CUDA_function(%s, %s) cubin file=%s", device_id, function_name, cubin_file)
        data = load_binary_file(cubin_file)
        if not data:
            log.error("Error: failed to load CUDA bin file '%s'", cubin_file)
            return None
        log(" loaded %s bytes", len(data))
        KERNELS[function_name] = data
    #now load from cubin:
    start = monotonic_time()
    try:
        mod = driver.module_from_buffer(data)
    except Exception as e:
        log("module_from_buffer(%s)", data, exc_info=True)
        log.error("Error: failed to load module from buffer for '%s'", function_name)
        log.error(" %s", e)
        return None
    log("get_CUDA_function(%s, %s) module=%s", device_id, function_name, mod)
    try:
        fn = function_name
        CUDA_function = mod.get_function(fn)
    except driver.LogicError as e:
        raise Exception("failed to load '%s' from %s: %s" % (function_name, mod, e)) from None
    end = monotonic_time()
    log("loading function %s from pre-compiled cubin took %.1fms", function_name, 1000.0*(end-start))
    return CUDA_function
Example #7
def get_CUDA_function(device_id, function_name, kernel_source):
    """
        Returns the compiled kernel for the given device
        and kernel key.
        Kernels may be pre-compiled with compile_all.
    """
    global KERNEL_cubins
    cubin = KERNEL_cubins.get((device_id, function_name))
    if cubin is None:
        start = time.time()
        log("compiling for device %s: %s=%s", device_id, function_name,
            kernel_source)
        cubin = compile(kernel_source)
        KERNEL_cubins[(device_id, function_name)] = cubin
        end = time.time()
        log("compilation of %s took %.1fms", function_name,
            1000.0 * (end - start))
    #now load from cubin:
    start = time.time()
    mod = driver.module_from_buffer(cubin)
    CUDA_function = mod.get_function(function_name)
    end = time.time()
    log("loading function %s from pre-compiled cubin took %.1fms",
        function_name, 1000.0 * (end - start))
    return CUDA_function
Example #8
def cropGPU(gpuArray, size, offset=(0, 0), block_size=(32, 32, 1)):
    """
    Crop an image array that is on the GPU to a new size.

    :param gpuArray: image array to be cropped
    :type gpuArray: GPU array
    :param size: size to which the array is cropped (y, x)
    :type size: tuple
    :param offset: offset (y, x) applied to the crop origin
    :type offset: tuple
    :param block_size: CUDA block_size
    :type block_size: tuple

    :return: cropped array
    :rtype: GPU array
    """

    sfft = gpuArray.shape

    grid_size = (int(np.ceil(float(sfft[1]) / block_size[1])),
                 int(np.ceil(float(sfft[0]) / block_size[0])))

    if gpuArray.dtype == np.float32:
        mod = cuda.module_from_buffer(cubin)
        cropKernel = mod.get_function("crop_Kernel")

    elif gpuArray.dtype == np.complex64:
        mod = cuda.module_from_buffer(cubin)
        cropKernel = mod.get_function("crop_ComplexKernel")
    else:
        print('Incorrect data type in cropGPU')
        return None

    x_cropped_gpu = cua.empty((int(size[0]), int(size[1])), np.float32)

    cropKernel(x_cropped_gpu.gpudata,
               np.int32(size[0]),
               np.int32(size[1]),
               gpuArray.gpudata,
               np.int32(sfft[0]),
               np.int32(sfft[1]),
               np.int32(offset[0]),
               np.int32(offset[1]),
               block=block_size,
               grid=grid_size)

    return x_cropped_gpu
Example #9
 def load(self, cubin):
     if cubin in self._modrefs:
         return self._modrefs[cubin]
     # JIT-load the requested cubin and cache the module handle
     mod = cuda.module_from_buffer(cubin)
     if len(self._modrefs) > self.MAX_MODREFS:
         self._modrefs.clear()
     self._modrefs[cubin] = mod
     return mod
Example #10
def chop_pad_GPU_test(x, csf, sw, nhop, sz=None, offset=(0,0), dtype='real'):
    
    sx = x.shape

    if sz is None:
        sz = 2**np.ceil(np.log2(sw))

    block_size = (32,32,1)
    grid_size = (int(np.ceil(np.float32(sz[0]*sz[1])/block_size[0])),
                 int(np.ceil(np.float32(np.prod(csf))/block_size[1])))

    #print block_size
    #print grid_size
    #print csf

    sxp = np.array(nhop*csf+sw-nhop)
    sxp = (int(sxp[0]), int(sxp[1]))

    x_gpu = pad_cpu2gpu(x, sxp, dtype='real')
    
    if dtype == 'real':
        mod = cu.module_from_buffer(cubin)
        chop_pad_Kernel = mod.get_function("chop_pad_Kernel_test")
        xs_gpu = cua.empty((int(np.prod(csf)), int(sz[0]), int(sz[1])),
                           np.float32)
    elif dtype == 'complex':
        mod = cu.module_from_buffer(cubin)
        chop_pad_Kernel = mod.get_function("chop_pad_ComplexKernel_test")
        xs_gpu = cua.empty((int(np.prod(csf)), int(sz[0]), int(sz[1])),
                           np.complex64)
    else:
        raise ValueError("dtype must be 'real' or 'complex'")
        
    sz = xs_gpu.shape        
    chop_pad_Kernel(xs_gpu.gpudata, np.int32(sz[1]),
                               np.int32(sz[2]), np.int32(sz[0]), x_gpu.gpudata,
                               np.int32(sxp[0]), np.int32(sxp[1]),
                               np.int32(sw[0]), np.int32(sw[1]),
                               np.int32(offset[0]), np.int32(offset[1]),
                               np.int32(csf[0]), np.int32(csf[1]),
                               np.int32(nhop[0]), np.int32(nhop[1]),
                               block=block_size, grid=grid_size)

    return xs_gpu
Example #11
def crop_stack_GPU(x, sz, offset=(0,0), dtype='real'):
    
    if isinstance(x, np.ndarray):
        x = np.array(x).astype(np.float32)
        x_gpu = cua.to_gpu(x)
    elif isinstance(x, cua.GPUArray):
        x_gpu = x

    sx = x_gpu.shape
    block_size = (16,16,1)   
    grid_size = (int(np.ceil(np.float32(sx[0]*sz[0])/block_size[0])),
                 int(np.ceil(np.float32(sz[1])/block_size[1])))

    sx_before = np.array([sx[1], sx[2]])
    sx_after = np.array(sz)
    if any(sx_before - sx_after < offset):
        raise IOError('Size mismatch: size before - size after < offset')
    

    if dtype == 'real':

        if x_gpu.dtype != np.float32:
            x_gpu = x_gpu.real

        mod = cu.module_from_buffer(cubin)
        crop_stack_Kernel = mod.get_function("crop_stack_Kernel")

        xc_gpu = cua.zeros(tuple((int(sx[0]), int(sz[0]), int(sz[1]))), np.float32)

    elif dtype == 'complex':
     
        mod = cu.module_from_buffer(cubin)
        crop_stack_Kernel = mod.get_function("crop_stack_ComplexKernel")
        xc_gpu = cua.empty(tuple((int(sx[0]), int(sz[0]), int(sz[1]))), np.complex64)
        
    crop_stack_Kernel(xc_gpu.gpudata, np.int32(sx[0]),
                                      np.int32(sz[0]),     np.int32(sz[1]),
                       x_gpu.gpudata, np.int32(sx[1]),     np.int32(sx[2]),
                                      np.int32(offset[0]), np.int32(offset[1]),
                                      block=block_size, grid=grid_size)
        
    return xc_gpu
Example #12
def pad_stack_GPU(x, sz, offset=(0,0), dtype='real'):

    if isinstance(x, np.ndarray):
        x = np.array(x).astype(np.float32)
        x_gpu = cua.to_gpu(x)
    elif isinstance(x, cua.GPUArray):
        x_gpu = x

    sx = x_gpu.shape

    block_size = (16,16,1)   
    grid_size = (int(np.ceil(np.float32(sx[0]*sz[0])/block_size[0])),
                 int(np.ceil(np.float32(sz[1])/block_size[1])))

    if dtype == 'real':

        if x_gpu.dtype != np.float32:
            x_gpu = x_gpu.real

        mod = cu.module_from_buffer(cubin)
        pad_stack_Kernel = mod.get_function("pad_stack_Kernel")

        xp_gpu = cua.empty(tuple((int(sx[0]), int(sz[0]), int(sz[1]))), np.float32)
        pad_stack_Kernel(x_gpu.gpudata, np.int32(sx[0]),
                                        np.int32(sx[1]),     np.int32(sx[2]),
                        xp_gpu.gpudata, np.int32(sz[0]),     np.int32(sz[1]),
                                        np.int32(offset[0]), np.int32(offset[1]),
                                        block=block_size, grid=grid_size)

    elif dtype == 'complex':

        mod = cu.module_from_buffer(cubin)
        pad_stack_Kernel = mod.get_function("pad_stack_ComplexKernel")

        xp_gpu = cua.empty(tuple((int(sx[0]), int(sz[0]), int(sz[1]))), np.complex64)    
        pad_stack_Kernel(x_gpu.gpudata, np.int32(sx[0]),
                                        np.int32(sx[1]),     np.int32(sx[2]),
                        xp_gpu.gpudata, np.int32(sz[0]),     np.int32(sz[1]),
                                        np.int32(offset[0]), np.int32(offset[1]),
                                        block=block_size, grid=grid_size)
        
    return xp_gpu
Example #13
def pad_cpu2gpu(x, sz, offset=(0,0), dtype='real'):

    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(np.float32(sz[1])/block_size[1])),
                 int(np.ceil(np.float32(sz[0])/block_size[0])))

    sx = x.shape

    if isinstance(x, np.ndarray):
        x = np.array(x).astype(np.float32)
        x_gpu = cua.to_gpu(x)
    elif isinstance(x, cua.GPUArray):
        x_gpu = x

    if dtype == 'real':

        mod = cu.module_from_buffer(cubin)
        zeroPadKernel = mod.get_function("zeroPadKernel")

        x_padded_gpu = cua.zeros(tuple((int(sz[0]),int(sz[1]))), np.float32)
        
        zeroPadKernel(x_padded_gpu.gpudata, np.int32(sz[0]),     np.int32(sz[1]),
                             x_gpu.gpudata, np.int32(sx[0]),     np.int32(sx[1]),
                                            np.int32(offset[0]), np.int32(offset[1]),
                                            block=block_size, grid=grid_size)
    elif dtype == 'complex':

        mod = cu.module_from_buffer(cubin)
        #mod = SourceModule(open('gputools.cu').read(), keep=True)
        zeroPadComplexKernel = mod.get_function("zeroPadComplexKernel")

        x_padded_gpu = cua.zeros(tuple((int(sz[0]),int(sz[1]))), np.complex64)
        
        zeroPadComplexKernel(x_padded_gpu.gpudata, np.int32(sz[0]),     np.int32(sz[1]),
                                    x_gpu.gpudata, np.int32(sx[0]),     np.int32(sx[1]),
                                                   np.int32(offset[0]), np.int32(offset[1]),
                                                   block=block_size, grid=grid_size)

    return x_padded_gpu
Example #14
def crop_gpu2cpu(x_gpu, sz, offset=(0,0)):

    sfft = x_gpu.shape

    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(np.float32(sfft[1])/block_size[1])),
                 int(np.ceil(np.float32(sfft[0])/block_size[0])))

    if x_gpu.dtype == np.float32:
        mod = cu.module_from_buffer(cubin)
        cropKernel = mod.get_function("crop_Kernel")

    elif x_gpu.dtype == np.complex64:
        mod = cu.module_from_buffer(cubin)
        cropKernel = mod.get_function("crop_ComplexKernel")

    x_cropped_gpu = cua.empty((int(sz[0]), int(sz[1])), np.float32)
        
    cropKernel(x_cropped_gpu.gpudata, np.int32(sz[0]), np.int32(sz[1]),
               x_gpu.gpudata, np.int32(sfft[0]), np.int32(sfft[1]),
               np.int32(offset[0]), np.int32(offset[1]),
               block=block_size, grid=grid_size)

    return x_cropped_gpu
Example #15
    def __init__(self, source, nvcc="nvcc", options=None, keep=False,
            no_extern_c=False, arch=None, code=None, cache_dir=None,
            include_dirs=[]):
        self._check_arch(arch)

        cubin = compile(source, nvcc, options, keep, no_extern_c,
                arch, code, cache_dir, include_dirs)

        from pycuda.driver import module_from_buffer
        self.module = module_from_buffer(cubin)

        self.get_global = self.module.get_global
        self.get_texref = self.module.get_texref
        if hasattr(self.module, "get_surfref"):
            self.get_surfref = self.module.get_surfref
Example #16
 def init_mod(cls):
     if cls.__dict__.get('mod') is None:
         cls.radix_size = 1 << cls.radix_bits
         code = _CODE.substitute(group_size=cls.group_size,
                 radix_bits=cls.radix_bits, radix_size=cls.radix_size)
         cubin = pycuda.compiler.compile(code)
         cls.mod = cuda.module_from_buffer(cubin)
         with open('/tmp/sort_kern.cubin', 'wb') as fp:
             fp.write(cubin)
         for name in ['prefix_scan', 'prefix_sum_condense',
                      'prefix_sum_inner', 'prefix_sum_distribute',
                      'binary_search', 'prefix_scan_repair']:
             f = cls.mod.get_function(name)
             setattr(cls, name, f)
             f.set_cache_config(cuda.func_cache.PREFER_L1)
         cls.calc_local_pfxs = cls.mod.get_function('calc_local_pfxs')
         cls.radix_sort = cls.mod.get_function('radix_sort')
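set_cache_config above asks the driver to favor L1 cache over shared memory for each sort kernel. The call in isolation, on a trivial illustrative kernel:

import pycuda.autoinit  # noqa: F401
import pycuda.compiler
import pycuda.driver as cuda

mod = cuda.module_from_buffer(pycuda.compiler.compile("__global__ void k() {}"))
f = mod.get_function("k")
f.set_cache_config(cuda.func_cache.PREFER_L1)  # prefer L1 over shared memory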
Example #17
def get_CUDA_kernel(device_id, src_format, dst_format):
    init_module()
    start = time.time()
    k = KERNELS_MAP.get((src_format, dst_format))
    assert k is not None, "no kernel found for %s to %s" % (src_format, dst_format)
    function_name, ksrc = k
    global KERNEL_cubins
    cubin = KERNEL_cubins.get((device_id, function_name))
    if cubin is None:
        debug("compiling for device %s: %s=%s", device_id, function_name, ksrc)
        cubin = compile(ksrc)
        KERNEL_cubins[(device_id, function_name)] = cubin
    #now load from cubin:
    mod = driver.module_from_buffer(cubin)
    CUDA_function = mod.get_function(function_name)
    end = time.time()
    debug("compilation of %s took %.1fms", function_name, 1000.0*(end-start))
    return function_name, CUDA_function
Example #18
def ola_GPU(xs_gpu, sy, csf, hop):

    y_gpu = cua.empty(sy, np.float32)

    block_size = (16,16,1)   
    grid_size = (int(np.ceil(np.float32(sx[0]*sz[0])/block_size[1])),
                 int(np.ceil(np.float32(sz[1])/block_size[0])))

    mod = cu.module_from_buffer(cubin)
    copy_Kernel = mod.get_function("copy_Kernel")

    for i in range(csf[0]):
        for j in range(csf[1]):
            copy_Kernel(y_gpu,  np.uint32(sy[0]), np.uint32(sy[0]),
                        xs_gpu, np.uint32(sx[0]), np.uint32(sx[1]), np.uint32(sx[2]),
                        np.uint32(offset[0]), np.uint32(offset[1]), np.uint32(startrow), 
                        block=block_size, grid=grid_size)

    return np.real(y_gpu.get())
Example #19
def _prepared_gfunc_from_llvm_kernel(llvm_kernel, capability=(1,1),
                                     cuda_module_options=[]):
    from pycuda.driver import module_from_buffer
    cpu = 'sm_%d%d' % capability
    ptxtm = le.TargetMachine.lookup(arch='nvptx64', cpu=cpu)
    pm = lp.build_pass_managers(ptxtm, opt=3, fpm=False).pm
    pm.run(llvm_kernel.module)
    asm = ptxtm.emit_assembly(llvm_kernel.module)

    #XXX: Hack. llvm 3.2 doesn't set map_f64_to_f32 for cpu < sm_13 as it
    # should
    if capability < (1, 3):
        target_str = '.target ' + cpu
        asm = asm.replace(target_str, target_str + ', map_f64_to_f32')

    mod = module_from_buffer(asm, options=cuda_module_options)
    gfunc = mod.get_function(llvm_kernel.name)
    gfunc.prepare('P'*(len(llvm_kernel.args)-1) + 'i')
    return gfunc
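The prepare('P'*(len(args)-1) + 'i') call above fixes the kernel's argument types once so later launches can pass raw device pointers cheaply. A minimal sketch of the prepare()/prepared_call() pattern with an illustrative CUDA-C kernel:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.compiler
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

mod = cuda.module_from_buffer(pycuda.compiler.compile(
    "__global__ void fill(float *out, int n)"
    "{ int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) out[i] = i; }"))
fill = mod.get_function("fill")
fill.prepare("Pi")  # 'P' = pointer argument, 'i' = 32-bit int
out = gpuarray.zeros(256, np.float32)
fill.prepared_call((1, 1), (256, 1, 1), out.gpudata, np.int32(256))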
Example #20
    def FloydWarshall(self, switches, links):
        adj_graph = dict()
        for switch1 in switches:
            adj_graph[switch1.dp.id] = dict()
            adj_graph[switch1.dp.id][switch1.dp.id] = 0
            for link in links:
                if link.src.dpid == switch1.dp.id:
                    adj_graph[switch1.dp.id][link.dst.dpid] = float(link.delay)

        N = max(adj_graph) + 1
        adj_array = numpy.full(N*N, float("inf")).astype(numpy.float32)
        for key1, row in adj_graph.items():
            for key2, value in row.items():
                adj_array[key1 * N + key2] = value

        adj_gpu = cuda.mem_alloc(adj_array.size * adj_array.dtype.itemsize)
        cuda.memcpy_htod(adj_gpu, adj_array)

        next_array = [ i % N for i in range(N*N) ]
        next_np = numpy.array(next_array).astype(numpy.int32)
        next_gpu = cuda.mem_alloc(next_np.size * next_np.dtype.itemsize)
        cuda.memcpy_htod(next_gpu, next_np)

        mod = cuda.module_from_buffer(self.result_data)
        func = mod.get_function("fw")
        for k in range(1,N):
            func(adj_gpu, next_gpu, numpy.int32(k), numpy.int32(N), block=(N, N, 1), grid=(1, 1), shared=0)

        cuda.memcpy_dtoh(next_np, next_gpu)
        #cuda.memcpy_dtoh(adj_array, adj_gpu)

        next_gpu.free()
        adj_gpu.free()

        autoinit.patch_finish()

        #self.logger.info("%s", adj_array)
        #self.logger.info("%s", next_np)
        return next_np
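FloydWarshall manages device memory by hand rather than through GPUArray. The raw allocate/copy/free calls it relies on, in minimal form:

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

a = numpy.arange(9, dtype=numpy.float32)
a_gpu = cuda.mem_alloc(a.nbytes)  # raw device allocation
cuda.memcpy_htod(a_gpu, a)        # host -> device
cuda.memcpy_dtoh(a, a_gpu)        # device -> host
a_gpu.free()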
Example #21
def remove_empty_anchor(view, anchors, limit):
    # input:
    # anchors: (N, 4) 4->(y1, x1, y2, x2) (x > y)
    # view: (W, H, C)

    mod = cuda.module_from_buffer(module_buff)
    func = mod.get_function('_Z12remove_emptyPfPiS_S0_S0_')

    anchors_shape = np.array(anchors.shape).astype(np.int32)
    view_shape = np.array(view.shape).astype(np.int32)
    index = np.zeros((anchors.shape[0], view_shape[2])).astype(np.float32)
    func(
        cuda.InOut(index), 
        cuda.In(anchors), 
        cuda.In(view), 
        cuda.In(anchors_shape), 
        cuda.In(view_shape), 
        block=(int(view_shape[2]), 1, 1),  # a thread <-> a value in a specific 2d pos(need to sum the channel)
        grid=(int(anchors_shape[0]), 50, 1)  # a grid <-> an anchor and a line(x)
        # 50 must > anchors width
    )
    index = np.sum(index, axis=1)
    return np.where(index > limit)[0]
Example #22
    def get_module(self, kernel_filename,
                   include_dirs=[],
                   defines={},
                   compile_args={'no_extern_c': True}, jit_compile_args={}):
        """
        Helper function to print compilation output
        """
        def cuda_compile_message_handler(compile_success_bool, info_str,
                                         error_str):
            self.logger.debug("Compilation returned %s",
                              str(compile_success_bool))
            if info_str:
                self.logger.debug("Info: %s", info_str)
            if error_str:
                self.logger.debug("Error: %s", error_str)

        kernel_filename = os.path.normpath(kernel_filename)
        kernel_path = os.path.abspath(
            os.path.join(self.module_path, kernel_filename))
        #self.logger.debug("Getting %s", kernel_filename)

        # Create a hash of the kernel options
        options_hasher = hashlib.md5()
        options_hasher.update(
            str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
        options_hash = options_hasher.hexdigest()

        # Create hash of kernel source
        source_hash = CudaContext.hash_kernel( \
                    kernel_path, \
                    include_dirs=[self.module_path] + include_dirs)

        # Create final hash
        root, ext = os.path.splitext(kernel_filename)
        kernel_hash = root \
                + "_" + source_hash \
                + "_" + options_hash \
                + ext
        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

        # If we have the kernel in our hashmap, return it
        if (kernel_hash in self.modules.keys()):
            self.logger.debug("Found kernel %s cached in hashmap (%s)",
                              kernel_filename, kernel_hash)
            return self.modules[kernel_hash]

        # If we have it on disk, return it
        elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
            self.logger.debug("Found kernel %s cached on disk (%s)",
                              kernel_filename, kernel_hash)

            with io.open(cached_kernel_filename, "rb") as file:
                file_str = file.read()
                module = cuda.module_from_buffer(
                    file_str,
                    message_handler=cuda_compile_message_handler,
                    **jit_compile_args)

            self.modules[kernel_hash] = module
            return module

        # Otherwise, compile it from source
        else:
            self.logger.debug("Compiling %s (%s)", kernel_filename,
                              kernel_hash)

            #Create kernel string
            kernel_string = ""
            for key, value in defines.items():
                kernel_string += "#define {:s} {:s}\n".format(
                    str(key), str(value))
            kernel_string += '#include "{:s}"'.format(
                os.path.join(self.module_path, kernel_filename))
            if (self.use_cache):
                cached_kernel_dir = os.path.dirname(cached_kernel_filename)
                if not os.path.isdir(cached_kernel_dir):
                    os.mkdir(cached_kernel_dir)
                with io.open(cached_kernel_filename + ".txt", "w") as file:
                    file.write(kernel_string)

            with Common.Timer("compiler") as timer:
                import warnings
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message=
                        "The CUDA compiler succeeded, but said the following:\nkernel.cu",
                        category=UserWarning)
                    cubin = cuda_compiler.compile(kernel_string,
                                                  include_dirs=include_dirs,
                                                  cache_dir=False,
                                                  **compile_args)
                module = cuda.module_from_buffer(
                    cubin,
                    message_handler=cuda_compile_message_handler,
                    **jit_compile_args)
                if (self.use_cache):
                    with io.open(cached_kernel_filename, "wb") as file:
                        file.write(cubin)

            self.modules[kernel_hash] = module
            return module
Example #23
def basic_add_performance_2():
    """Measures memory latency for certain operations."""

    base_src = Template("""
    .entry $FNAME ( .param .u32 out )
    {
        .reg .u32 base, off, clka, clkb, clkoa, clkob, clks, tmp, iter;
        .reg .pred p;

        mov.u32         iter,   $RUNS;
        mov.u32         clks,   0;
        mov.u32         tmp,    0;

        ld.const.u32    base,   [scratch];
        $MULT
        mov.u32         lcg_state,  scratch;

    warmup:
        mov.u32         clka,   %clock;
        $OPER
        sub.u32         iter,   iter,   1;
        setp.ne.u32     p,      iter,   0;
    @p  bra.uni         warmup;

        mov.u32         clkoa,  %clock;
        mov.u32         iter,   $RUNS;
    loop:
        //call.uni        (tmp),  lcg_rounds, (100);
        $LCGROUNDS
        mov.u32         clka,   %clock;
        $OPER
        xor.b32         clka,   clka,   tmp;
        mov.u32         clkb,   %clock;
        xor.b32         clka,   clka,   tmp;
        sub.u32         clka,   clkb,   clka;
        add.u32         clks,   clks,   clka;
        sub.u32         iter,   iter,   1;
        setp.ne.u32     p,      iter,   0;
    @p  bra.uni         loop;
        mov.u32         clkob,  %clock;
        sub.u32         clkoa,  clkob,  clkoa;

        mov.u32         iter,   $RUNS;
    cooldown:
        $OPER
        sub.u32         iter,   iter,   1;
        setp.ne.u32     p,      iter,   0;
    @p  bra.uni         cooldown;

        ld.param.u32    base,   [out];
        call.uni        (off),  get_gtid,   ();
        shr.u32         off,    off,    5;
        mad24.lo.u32    base,   off,    8,  base;
        call.uni        (tmp),  lcg_rounds, (1);
        st.volatile.global.b32  [base], tmp;
        st.volatile.global.b32  [base], clks;

        add.u32         base,   base,   4;
        st.global.b32   [base], clkoa;
    }
    """)

    addrtypes = {
            'single': {'label': "all conflicts",  'ADDRTYPE': "single",
                       'MULT': "mov.u32 off, %smid;" +
                               "mad24.lo.u32 base, off, 128, base;"},
            'uncoa':  {'label': "uncoalesced",    'ADDRTYPE': "uncoa",
                       'MULT': "call.uni        (off),  get_gtid,   ();" +
                               "mad24.lo.u32 base, off, 128, base;"},
            'coa':    {'label': "coalesced",      'ADDRTYPE': "coa",
                       'MULT': "call.uni        (off),  get_gtid,   ();" +
                               "mad24.lo.u32 base, off, 4, base;"},
            }

    # Evil, I know, DRY and all
    addrtypesorder = ['single', 'uncoa', 'coa']

    opertypes = {
            'atomic':       "atom.global.add.u32 tmp, [base], tmp;",
            'red':          "red.global.add.u32     [base], clks;",
            'store':        "st.global.u32 [base], clks;",
            'load':         "ld.global.u32 tmp, [base];",
            'load_store': """
                ld.global.u32 tmp, [base];
                add.u32 tmp, tmp, clks;
                st.global.u32 [base], tmp;
                """
            }

    opertypesorder = ['load', 'store', 'load_store', 'red', 'atomic']

    lcgtext = "mad.lo.u32  lcg_state,  lcg_state, 1664525, 1013904223;\n"*50

    order = []
    for va in addrtypesorder:
        for k in sorted(opertypes.keys()):
            order.append((va, k))

    runs = 512
    rounds = 4
    mod = stdlib + "\n.const .u32 scratch;"
    for (addr, oper) in order:
        c = dict(addrtypes[addr])
        c['otype'] = oper
        c['OPER'] = opertypes[oper]
        c['RUNS'] = runs
        c['FNAME'] = "%s_%s" % (addr, oper)
        c['LCGROUNDS'] = lcgtext
        mod += base_src.substitute(c)
    for i in enumerate(mod.split('\n')):
        print "%3d %s" % i
    disassemble(mod)
    mod = cuda.module_from_buffer(mod)
    figs = []
    barwidth = 0.3

    scratch = cuda.mem_alloc(1024*16*30*128)
    scratchptr = mod.get_global('scratch')
    cuda.memset_d32(scratchptr[0], int(scratch), 1)

    def plot(title, names, vals, errs):
        N = len(vals[0])
        bw = 2 * .9 / len(names)
        fig = plt.figure()
        ax = fig.add_subplot(111, title=title)
        ax.set_ylabel('Clocks')
        ax.set_xlabel('Warps/SM')
        ax.set_xticks(range(N))
        ax.set_xticklabels([1<<i for i in range(N)])
        for idx, (name,val,err) in enumerate(zip(names, vals, errs)):
            ax.bar([i+bw*(idx/2)-.45 for i in range(N)], val, bw, yerr=err,
                     color=colors[idx], label=name, zorder=-idx)
        ax.axis(ymin=0)
        ax.legend(loc=0)
        return fig

    for addr in addrtypesorder:
        addrlbl = addrtypes[addr]['label']
        print "Access pattern:", addrlbl
        interms, interes, totalms, totales = [], [], [], []
        for operidx, oper in enumerate(opertypesorder):
            interm, intere, totalm, totale = [], [], [], []
            for dim in ((1, 1), (2, 1), (4, 1), (8, 1), (8, 2), (8, 4)):
                vals = numpy.zeros( (dim[0] * dim[1] * 30, 2) )
                fn = mod.get_function('%s_%s' % (addr, oper))
                for round in range(rounds+1):
                    a = numpy.zeros_like(vals).astype(numpy.int32)
                    fn(cuda.InOut(a), block=(32 * dim[0], 1, 1),
                                      grid=(30 * dim[1], 1))
                    if round != 0: vals += a
                    time.sleep(.005)
                means = scipy.mean(vals, axis=0) / (runs*rounds)
                stds = scipy.std(vals, axis=0) / (runs*rounds)
                # this is just gross
                interm.append(means[0])
                totalm.append(means[1])
                intere.append(stds[0])
                totale.append(stds[1])
                print "%16s: %1.7f±%1.6f" % (oper, means[0], stds[0])
                print "%16s: %1.7f±%1.6f" % (oper+' total', means[1], stds[1])
            interms.append(interm)
            interes.append(intere)
            interms.append(totalm)
            interes.append(totale)

        names = []
        for i in opertypesorder:
            names.append(i)
            names.append(i + ' total')

        fig1 = plot('Compute memory latency, %s access pattern' % addrlbl,
                    names, interms, interes)
        figs.append((addr, fig1))

    return figs
Example #24
def consecutive_clocks():
    """Measures a few rounds of sampling consecutive clocks."""

    ptx = stdlib + """
    .entry consecutive_clocks ( .param .u32 out )
    {

        .reg .u32 base, off, clka, clkb, clks, iter;
        .reg .pred p;

        mov.u32         iter,   256;
        mov.u32         clks,   0;

    loop:
        mov.u32         clka,   %clock;
        mov.u32         clkb,   %clock;
        sub.u32         clka,   clkb,   clka;
        add.u32         clks,   clks,   clka;
        sub.u32         iter,   iter,   1;
        setp.ne.u32     p,      iter,   0;
    @p  bra.uni         loop;

        ld.param.u32    base,   [out];
        call.uni        (off),  get_gtid,   ();
        mad24.lo.u32    base,   off,    4,  base;

        st.global.b32   [base], clks;

    }
    """

    fn = get_func(cuda.module_from_buffer(ptx), 'consecutive_clocks')

    fig = plt.figure()
    ax = fig.add_subplot(111, title='Clocks from consecutive operations, 256 iterations/thread')
    ax.set_ylabel('Clocks')
    ax.set_xlabel('Block width')
    ax.set_xticks(range(10))
    ax.set_xticklabels([str(1 << i) for i in range(10)])

    for grid in range(5):
        gridw = 1 << grid
        allres = []
        allerr = []
        for width in range(10):
            widthw = 1 << width
            if widthw * gridw > 1024: continue
            all_calc = numpy.zeros( (gridw * 30 * widthw,) ).astype(numpy.int32)

            for run in range(5):
                a = numpy.empty( (gridw * 30 * widthw,) ).astype(numpy.int32)
                fn(cuda.InOut(a), block=(widthw, 1, 1), grid=(gridw * 30, 1))
                all_calc += a

            print "%dx%d: %f ± %f" % (gridw, widthw, scipy.mean(all_calc),
                                    scipy.std(all_calc))

            allres.append(scipy.mean(all_calc)/256/5)
            allerr.append(scipy.std(all_calc)/(256*5))

        #ax.plot(range(len(allres)), allres, keys[grid], label=str(gridw))
        ax.errorbar(range(len(allres)), allres, yerr=allerr, fmt=keys[grid],
                    label=str(gridw))
    ax.legend(loc=0, title="Blocks/SM")
    return fig
Example #25
def zeropadToGPU(array,
                 size,
                 offset=(0, 0),
                 dtype='real',
                 block_size=(32, 32, 1)):
    """
    Zero pad the input array and transfer it to the GPU memory if not there yet

    :param array: input array to be zeropadded and transferred
    :type array: ndarray
    :param size: size of the array (y, x)
    :type size: tuple
    :param offset: offset (y, x) applied when padding
    :type offset: tuple
    :param dtype: data type, either real or complex
    :type dtype: str
    :param block_size: CUDA block_size
    :type block_size: tuple

    :return: zero padded array that resides in the GPU memory
    :rtype: GPUarray
    """
    grid_size = (int(np.ceil(float(size[1]) / block_size[1])),
                 int(np.ceil(float(size[0]) / block_size[0])))

    ay, ax = array.shape
    ay = np.int32(ay)
    ax = np.int32(ax)

    offsetx = np.int32(offset[0])
    offsety = np.int32(offset[1])

    sy = np.int32(size[0])
    sx = np.int32(size[1])

    if isinstance(array, np.ndarray):
        #array = np.array(array).astype(np.float32)
        array_gpu = cua.to_gpu(array)
        #array_gpu = cua.to_gpu_async(array)
    elif isinstance(array, cua.GPUArray):
        array_gpu = array
    else:
        print('ERROR: array is neither a NumPy ndarray nor a GPUArray')
        return None

    if dtype == 'real':
        mod = cuda.module_from_buffer(cubin)
        zeroPadKernel = mod.get_function("zeroPadKernel")

        output = cua.zeros(size, np.float32)

        zeroPadKernel(output.gpudata,
                      sy,
                      sx,
                      array_gpu.gpudata,
                      ay,
                      ax,
                      offsetx,
                      offsety,
                      block=block_size,
                      grid=grid_size)
    elif dtype == 'complex':
        mod = cuda.module_from_buffer(cubin)
        zeroPadComplexKernel = mod.get_function("zeroPadComplexKernel")

        output = cua.zeros(size, np.complex64)

        zeroPadComplexKernel(output.gpudata,
                             sy,
                             sx,
                             array_gpu.gpudata,
                             ay,
                             ax,
                             offsetx,
                             offsety,
                             block=block_size,
                             grid=grid_size)
    else:
        print('Incorrect data type in zeropadToGPU')
        return None

    return output
Exemple #42
0
def lidar_to_top_cuda(lidar):
    # input:
    # lidar: (N, 4) 4->(x,y,z,i) in lidar coordinate
    lidar = np.copy(lidar)
    mod = cuda.module_from_buffer(module_buff)
    func = mod.get_function('_Z12lidar_to_topPfPiS0_S0_S_S_S0_')
    func_density = mod.get_function('_Z20lidar_to_top_densityPfPiS0_S0_S0_')
    # crop points to the configured range
    idx = np.where(lidar[:, 0] > TOP_X_MIN)
    lidar = lidar[idx]
    idx = np.where(lidar[:, 0] < TOP_X_MAX)
    lidar = lidar[idx]

    idx = np.where(lidar[:, 1] > TOP_Y_MIN)
    lidar = lidar[idx]
    idx = np.where(lidar[:, 1] < TOP_Y_MAX)
    lidar = lidar[idx]

    idx = np.where(lidar[:, 2] > TOP_Z_MIN)
    lidar = lidar[idx]
    idx = np.where(lidar[:, 2] < TOP_Z_MAX)
    lidar = lidar[idx]
    # shape
    X0, Xn = 0, int((TOP_X_MAX - TOP_X_MIN) // TOP_X_DIVISION) + 1
    Y0, Yn = 0, int((TOP_Y_MAX - TOP_Y_MIN) // TOP_Y_DIVISION) + 1
    Z0, Zn = 0, int((TOP_Z_MAX - TOP_Z_MIN) / TOP_Z_DIVISION)
    height = Xn - X0
    width = Yn - Y0
    channel = Zn - Z0 + 2
    # intensity and density channels are not calculated separately in the kernel function
    top = np.zeros(shape=(height, width, channel), dtype=np.float32)
    top_density = np.zeros(shape=(height, width, 1), dtype=np.float32)
    top_shape = np.array(top.shape).astype(np.int32)
    lidar_shape = np.array(lidar.shape).astype(np.int32)

    # voxelize lidar
    lidar[:, 0] = ((lidar[:, 0] - TOP_X_MIN) // TOP_X_DIVISION).astype(np.int32)
    lidar[:, 1] = ((lidar[:, 1] - TOP_Y_MIN) // TOP_Y_DIVISION).astype(np.int32)
    lidar[:, 2] = (lidar[:, 2] - TOP_Z_MIN) / TOP_Z_DIVISION

    lidar = lidar[np.lexsort((lidar[:, 2], lidar[:, 1], lidar[:, 0])), :]
    lidar_x = np.ascontiguousarray(lidar[:, 0].astype(np.int32))
    lidar_y = np.ascontiguousarray(lidar[:, 1].astype(np.int32))
    lidar_z = np.ascontiguousarray(lidar[:, 2])
    lidar_i = np.ascontiguousarray(lidar[:, 3])

    func(
        cuda.InOut(top),
        cuda.In(top_shape),
        cuda.In(lidar_x),
        cuda.In(lidar_y),
        cuda.In(lidar_z),
        cuda.In(lidar_i),
        cuda.In(lidar_shape),
        #intensity and density channels are not calculated separately
        block=(channel, 1, 1),  # a thread <-> a channel
        grid=(int(lidar_shape[0]), 1, 1)  # a grid <-> a point in the laser scan
    )
    func_density(cuda.InOut(top_density),
                 cuda.In(lidar_x),
                 cuda.In(lidar_y),
                 cuda.In(lidar_shape),
                 cuda.In(top_shape),
                 block=(1, 1, 1),
                 grid=(1, 1, 1))
    top_density = (np.log(top_density.astype(np.int32) + 1) /
                   math.log(32)).clip(max=1).astype(np.float32)
    return np.dstack([top[:, :, :-1], top_density])
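For reference, the log-scaled density normalization applied after func_density is easy to check with plain NumPy: a cell saturates at 31 points, since log(31 + 1)/log(32) = 1. A minimal sketch:

import math
import numpy as np

counts = np.array([0, 1, 7, 31, 100], dtype=np.int32)
density = (np.log(counts + 1) / math.log(32)).clip(max=1)
print(density)  # [0.  0.2 0.6 1.  1. ] -> 0 points gives 0, 31+ points saturate at 1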
Exemple #43
0
def lidar_to_front_cuda(lidar):
    # input:
    # lidar: (N, 4) 4->(x,y,z,i) in lidar coordinate

    mod = cuda.module_from_buffer(module_buff)
    func_add_points = mod.get_function('_Z25lidar_to_front_add_pointsPiS_S_S_')
    func_fill_front = mod.get_function(
        '_Z25lidar_to_front_fill_frontPfS_PiS0_')

    def cal_height(points):
        return np.clip(points[:, 2] + cfg.VELODYNE_HEIGHT, a_min=0,
                       a_max=None).astype(np.float32).reshape((-1, 1))

    def cal_distance(points):
        return np.sqrt(np.sum(points**2, axis=1)).astype(np.float32).reshape(
            (-1, 1))

    def cal_intensity(points):
        return points[:, 3].astype(np.float32).reshape((-1, 1))

    def to_front(points):
        return np.array([
            np.arctan2(points[:, 1], points[:, 0])/cfg.VELODYNE_ANGULAR_RESOLUTION,
            np.arctan2(points[:, 2], np.sqrt(points[:, 0]**2 + points[:, 1]**2)) \
                /cfg.VELODYNE_VERTICAL_RESOLUTION
        ], dtype=np.int32).T

    # using the same crop method as top view
    idx = np.where(lidar[:, 0] > TOP_X_MIN)
    lidar = lidar[idx]
    idx = np.where(lidar[:, 0] < TOP_X_MAX)
    lidar = lidar[idx]

    idx = np.where(lidar[:, 1] > TOP_Y_MIN)
    lidar = lidar[idx]
    idx = np.where(lidar[:, 1] < TOP_Y_MAX)
    lidar = lidar[idx]

    idx = np.where(lidar[:, 2] > TOP_Z_MIN)
    lidar = lidar[idx]
    idx = np.where(lidar[:, 2] < TOP_Z_MAX)
    lidar = lidar[idx]

    points = to_front(lidar)
    ind = np.where(cfg.FRONT_C_MIN < points[:, 0])
    points, lidar = points[ind], lidar[ind]
    ind = np.where(points[:, 0] < cfg.FRONT_C_MAX)
    points, lidar = points[ind], lidar[ind]
    ind = np.where(cfg.FRONT_R_MIN < points[:, 1])
    points, lidar = points[ind], lidar[ind]
    ind = np.where(points[:, 1] < cfg.FRONT_R_MAX)
    points, lidar = points[ind], lidar[ind]

    points[:, 0] += int(cfg.FRONT_C_OFFSET)
    points[:, 1] += int(cfg.FRONT_R_OFFSET)
    #points //= 2

    ind = np.where(0 <= points[:, 0])
    points, lidar = points[ind], lidar[ind]
    ind = np.where(points[:, 0] < cfg.FRONT_WIDTH)
    points, lidar = points[ind], lidar[ind]
    ind = np.where(0 <= points[:, 1])
    points, lidar = points[ind], lidar[ind]
    ind = np.where(points[:, 1] < cfg.FRONT_HEIGHT)
    points, lidar = points[ind], lidar[ind]

    # sort so that points falling in the same cell are adjacent (memory friendly)
    idx = np.lexsort((points[:, 1], points[:, 0]))
    points = points[idx, :]
    lidar = lidar[idx, :]

    channel = 3  # height, distance, intensity
    front = np.zeros((cfg.FRONT_WIDTH, cfg.FRONT_HEIGHT, channel),
                     dtype=np.float32)
    weight_mask = np.zeros_like(front[:, :, 0]).astype(np.int32)
    # CPU reference for the two kernels called below:
    # def _add(x):
    #     weight_mask[int(x[0]), int(x[1])] += 1
    # def _fill(x):
    #     front[int(x[0]), int(x[1]), :] += x[2:]
    # np.apply_along_axis(_add, 1, points)
    buf = np.hstack((points, cal_height(lidar), cal_distance(lidar),
                     cal_intensity(lidar))).astype(np.float32)
    # np.apply_along_axis(_fill, 1, buf)

    func_add_points(
        cuda.InOut(weight_mask),
        cuda.In(points),
        cuda.In(np.array(weight_mask.shape).astype(np.int32)),
        cuda.In(np.array(points.shape).astype(np.int32)),
        block=(1, 1, 1),
        grid=(1, 1, 1),  # points
    )
    weight_mask[weight_mask == 0] = 1  # avoid division by zero; empty cells stay zero after dividing by 1
    func_fill_front(
        cuda.InOut(front),
        cuda.In(buf),
        cuda.In(np.array(front.shape).astype(np.int32)),
        cuda.In(np.array(buf.shape).astype(np.int32)),
        block=(3, 1, 1),  # channel 
        grid=(1, 1, 1)  # points 
    )

    front /= weight_mask[:, :, np.newaxis]
    return front
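The cylindrical projection in to_front maps each point to a (column, row) cell from its azimuth and elevation angles. A single-point sketch, with hypothetical stand-ins for the resolution constants, which live in cfg and are not part of this listing:

import numpy as np

# Hypothetical stand-ins for cfg.VELODYNE_ANGULAR_RESOLUTION and
# cfg.VELODYNE_VERTICAL_RESOLUTION (radians)
ANGULAR_RES = 0.0015
VERTICAL_RES = 0.0070

x, y, z = 10.0, 2.0, -1.5
c = int(np.arctan2(y, x) / ANGULAR_RES)                      # column: azimuth bin
r = int(np.arctan2(z, np.sqrt(x**2 + y**2)) / VERTICAL_RES)  # row: elevation bin
print(c, r)  # cell indices before the FRONT_*_OFFSET shift and range checks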
Exemple #44
0
 def get_kernel(self, kernel_filename, include_dirs=[], no_extern_c=True, defines={}):
     """
     Helper function to print compilation output
     """
     def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
         self.logger.debug("Compilation returned %s", str(compile_success_bool))
         if info_str:
             self.logger.debug("Info: %s", info_str)
         if error_str:
             self.logger.debug("Error: %s", error_str)
     
     self.logger.debug("Getting %s", kernel_filename)
         
     # Create a hash of the kernel (and its includes)
     defines_hasher = hashlib.md5()
     defines_hasher.update(str(defines).encode('utf-8'))
     defines_hash = defines_hasher.hexdigest()
     defines_hasher = None
     root, ext = os.path.splitext(kernel_filename)
     kernel_path = os.path.abspath(os.path.join(self.module_path, "gpu_kernels", kernel_filename))
     kernel_hash = root \
             + "_" + CUDAContext.hash_kernel( \
                 kernel_path, \
                 include_dirs=[os.path.join(self.module_path, "../kernels")] + include_dirs) \
             + "_" + defines_hash \
             + ext
     cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
     
     # If we have the kernel in our hashmap, return it
     if (kernel_hash in self.kernels):
         self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
         return self.kernels[kernel_hash]
     
     # If we have it on disk, return it
     elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
         self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
             
         with io.open(cached_kernel_filename, "rb") as file:
             file_str = file.read()
             module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
             
         self.kernels[kernel_hash] = module
         return self.kernels[kernel_hash]
         
     # Otherwise, compile it from source
     else:
         self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
             
         #Create kernel string
         kernel_string = ""
         for key, value in defines.items():
             kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
         kernel_string += '#include "{:s}"'.format(str(kernel_path))
         if (self.use_cache):
             with io.open(cached_kernel_filename + ".txt", "w") as file:
                 #Why is kernel_string a bytes object in Python 3.5.2?
                 #Bugfix here
                 if isinstance(kernel_string, bytes):
                     kernel_string = bytes.decode(kernel_string)
                 file.write(kernel_string)
             
         
         with Timer("compiler") as timer:
             cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
             module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
             if (self.use_cache):
                 with io.open(cached_kernel_filename, "wb") as file:
                     file.write(cubin)
             
         self.kernels[kernel_hash] = module
         
         return self.kernels[kernel_hash]
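A sketch of how get_kernel might be called — the context instance, kernel file, defines, and entry-point name below are hypothetical, since the surrounding class and kernel sources are not part of this listing:

# Hypothetical usage: 'ctx' is an instance of the class defining get_kernel,
# 'wave.cu' an assumed file under gpu_kernels/, 'waveKernel' an assumed entry point
module = ctx.get_kernel('wave.cu', defines={'BLOCK_WIDTH': 16, 'BLOCK_HEIGHT': 16})
kernel = module.get_function('waveKernel')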
Exemple #45
0
 def get_kernel(self, kernel_filename, include_dirs=[], defines={}, compile_args={'no_extern_c': True}, jit_compile_args={}):
     """
     Helper function to print compilation output
     """
     def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
         self.logger.debug("Compilation returned %s", str(compile_success_bool))
         if info_str:
             self.logger.debug("Info: %s", info_str)
         if error_str:
             self.logger.debug("Error: %s", error_str)
     
     self.logger.debug("Getting %s", kernel_filename)
         
     # Create a hash of the kernel (and its includes)
     options_hasher = hashlib.md5()
     options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
     options_hash = options_hasher.hexdigest()
     options_hasher = None
     root, ext = os.path.splitext(kernel_filename)
     kernel_path = os.path.abspath(os.path.join(self.module_path, "gpu_kernels", kernel_filename))
     kernel_hash = root \
             + "_" + CUDAContext.hash_kernel( \
                 kernel_path, \
                 include_dirs=[os.path.join(self.module_path, "../kernels")] + include_dirs) \
             + "_" + options_hash \
             + ext
     cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
     
     # If we have the kernel in our hashmap, return it
     if (kernel_hash in self.kernels):
         self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
         return self.kernels[kernel_hash]
     
     # If we have it on disk, return it
     elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
         self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
             
         with io.open(cached_kernel_filename, "rb") as file:
             file_str = file.read()
             module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args)
             
         self.kernels[kernel_hash] = module
         return self.kernels[kernel_hash]
         
     # Otherwise, compile it from source
     else:
         self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
             
         #Create kernel string
         kernel_string = ""
         for key, value in defines.items():
             kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
         kernel_string += '#include "{:s}"'.format(str(kernel_path))
         if (self.use_cache):
             with io.open(cached_kernel_filename + ".txt", "w") as file:
                 #Why is kernel_string a bytes object in Python 3.5.2?
                 #Bugfix here
                 if isinstance(kernel_string, bytes):
                     kernel_string = bytes.decode(kernel_string)
                 file.write(kernel_string)
             
         
         with Timer("compiler") as timer:
             cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
             module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)
             if (self.use_cache):
                 with io.open(cached_kernel_filename, "wb") as file:
                     file.write(cubin)
             
         self.kernels[kernel_hash] = module
         
         return self.kernels[kernel_hash]
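The difference from the previous example is that compiler options and JIT options are passed through as dictionaries, and the compile options feed the cache hash, so (for instance) fast-math and debug builds cache separately. A hypothetical call, reusing the assumed names from above:

module = ctx.get_kernel('wave.cu',
                        defines={'BLOCK_WIDTH': 16},
                        compile_args={'no_extern_c': True,
                                      'options': ['--use_fast_math']},  # nvcc flags
                        jit_compile_args={})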
Exemple #46
0
 def load(cls, name=None):
     if cls.mod is None:
         if name is None:
             name = cls.__name__.lower()
         cubin = compile(name, assemble_code(cls.lib))
         cls.mod = cuda.module_from_buffer(cubin)
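This is lazy, class-level caching: the first load() compiles the assembled source and stores the module on the class, and later calls return immediately because cls.mod is set. A hypothetical subclass, assuming compile, assemble_code, and the defining class (called KernelBase here) come from the surrounding project:

class Scale(KernelBase):  # KernelBase: assumed name of the class defining load()
    mod = None            # per-class cache slot
    lib = scale_lib       # assumed code object consumed by assemble_code()

Scale.load()                          # first call compiles "scale" and caches the module
fn = Scale.mod.get_function('scale')  # assumed kernel entry point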
Exemple #48
0
 def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
                 prepared_call_args, \
                 include_dirs=[], no_extern_c=True, 
                 **kwargs):
     """
     Helper function to print compilation output
     """
     def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
         self.logger.debug("Compilation returned %s", str(compile_success_bool))
         if info_str:
             self.logger.debug("Info: %s", info_str)
         if error_str:
             self.logger.debug("Error: %s", error_str)
     
     kernel_filename = os.path.normpath(kernel_filename)
     #self.logger.debug("Getting %s", kernel_filename)
         
     # Create a hash of the kernel (and its includes)
     kwargs_hasher = hashlib.md5()
     kwargs_hasher.update(str(kwargs).encode('utf-8'))
     kwargs_hash = kwargs_hasher.hexdigest()
     kwargs_hasher = None
     root, ext = os.path.splitext(kernel_filename)
     kernel_hash = root \
             + "_" + CudaContext.hash_kernel( \
                 os.path.join(self.module_path, kernel_filename), \
                 include_dirs=[self.module_path] + include_dirs) \
             + "_" + kwargs_hash \
             + ext
     cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
     
     # If we have the kernel in our hashmap, return it
     if (kernel_hash in self.kernels):
         self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
         return self.kernels[kernel_hash]
     
     # If we have it on disk, return it
     elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
         self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
             
         with io.open(cached_kernel_filename, "rb") as file:
             file_str = file.read()
             module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
             
         kernel = module.get_function(kernel_function_name)
         kernel.prepare(prepared_call_args)
         self.kernels[kernel_hash] = kernel
         return kernel
         
     # Otherwise, compile it from source
     else:
         self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
             
         #Create kernel string
         kernel_string = ""
         for key, value in kwargs.items():
             kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
         kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
         if (self.use_cache):
             cached_kernel_dir = os.path.dirname(cached_kernel_filename)
             if not os.path.isdir(cached_kernel_dir):
                 os.mkdir(cached_kernel_dir)
             with io.open(cached_kernel_filename + ".txt", "w") as file:
                 file.write(kernel_string)
             
         
         with Common.Timer("compiler") as timer:
             cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
             module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
             if (self.use_cache):
                 with io.open(cached_kernel_filename, "wb") as file:
                     file.write(cubin)
             
         kernel = module.get_function(kernel_function_name)
         kernel.prepare(prepared_call_args)
         self.kernels[kernel_hash] = kernel
         
         
         return kernel
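Because the returned function has been prepare()d with the given argument format, it is launched via prepared_call, which takes the grid and block sizes followed by the arguments themselves. A hypothetical call — the kernel file, entry point, 'PPi' format string, and the array and grid-size variables are all placeholders:

import numpy as np

# Hypothetical usage: 'ctx' owns get_prepared_kernel; 'PPi' = two pointers + one int32
kernel = ctx.get_prepared_kernel('wave.cu', 'waveKernel', 'PPi',
                                 BLOCK_WIDTH=16, BLOCK_HEIGHT=16)
# prepared_call(grid, block, *args) replaces the keyword-style launch
kernel.prepared_call((grid_x, grid_y, 1), (16, 16, 1),
                     u0_gpu.gpudata, u1_gpu.gpudata, np.int32(nx))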