Example #1
def get_CUDA_function(device_id, function_name, kernel_source):
    """
        Returns the compiled kernel for the given device
        and kernel key.
        Kernels may be pre-compiled with compile_all.
    """
    global KERNEL_cubins
    cubin = KERNEL_cubins.get((device_id, function_name))
    if cubin is None:
        start = time.time()
        log("compiling for device %s: %s=%s", device_id, function_name,
            kernel_source)
        cubin = compile(kernel_source)
        KERNEL_cubins[(device_id, function_name)] = cubin
        end = time.time()
        log("compilation of %s took %.1fms", function_name,
            1000.0 * (end - start))
    #now load from cubin:
    start = time.time()
    mod = driver.module_from_buffer(cubin)
    CUDA_function = mod.get_function(function_name)
    end = time.time()
    log("loading function %s from pre-compiled cubin took %.1fms",
        function_name, 1000.0 * (end - start))
    return CUDA_function
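For context, a minimal sketch of the module-level scaffolding this snippet assumes; the names mirror the example (`KERNEL_cubins`, `driver`, `compile`, `log`), but the definitions themselves are assumptions:

import time
import pycuda.driver as driver
from pycuda.compiler import compile

# Cache of compiled kernels, keyed by (device_id, function_name)
KERNEL_cubins = {}

def log(msg, *args):
    # Stand-in for the example's logging helper
    print(msg % args)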
Example #2
def compile_all(function_name, kernel_src, device_ids=None):
    """
        Pre-compiles kernel source on the given devices,
        so we can then call get_CUDA_function quickly
        to get the function to call.
    """
    global KERNEL_cubins
    if device_ids is None:
        device_ids = init_all_devices()
    cf = driver.ctx_flags
    for device_id in device_ids:
        device = None
        context = None
        try:
            device = driver.Device(device_id)
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            cubin = KERNEL_cubins.get((device_id, function_name))
            if cubin is None:
                start = time.time()
                log("compiling for device %s: %s=%s", device_id, function_name, kernel_src)
                cubin = compile(kernel_src)
                end = time.time()
                log("compilation of %s took %.1fms", function_name, 1000.0*(end-start))
                KERNEL_cubins[(device_id, function_name)] = cubin
        finally:
            if context:
                context.pop()
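A hedged usage sketch combining compile_all with get_CUDA_function from Example #1; the kernel source, the device buffer, and the launch configuration are all illustrative:

import numpy
# Hypothetical kernel source used for illustration
KERNEL_SRC = """
__global__ void scale(float *data, float factor) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    data[i] *= factor;
}
"""
compile_all("scale", KERNEL_SRC)            # pre-compile on all devices
# Later, with an active context on device 0:
fn = get_CUDA_function(0, "scale", KERNEL_SRC)
fn(device_buffer, numpy.float32(2.0),       # device_buffer: an assumed DeviceAllocation
   block=(256, 1, 1), grid=(64, 1))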
Example #3
def compile_all(function_name, kernel_src, device_ids=None):
    """
        Pre-compiles kernel source on the given devices,
        so we can then call get_CUDA_function quickly
        to get the function to call.
    """
    global KERNEL_cubins
    if device_ids is None:
        device_ids = init_all_devices()
    cf = driver.ctx_flags
    for device_id in device_ids:
        device = None
        context = None
        try:
            device = driver.Device(device_id)
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            cubin = KERNEL_cubins.get((device_id, function_name))
            if cubin is None:
                start = time.time()
                log("compiling for device %s: %s=%s", device_id, function_name,
                    kernel_src)
                cubin = compile(kernel_src)
                end = time.time()
                log("compilation of %s took %.1fms", function_name,
                    1000.0 * (end - start))
                KERNEL_cubins[(device_id, function_name)] = cubin
        finally:
            if context:
                context.pop()
Example #4
def load_cuda():
    md5 = hashlib.md5()
    md5.update(mod.encode("utf-8"))
    filename = md5.hexdigest() + ".cubin"
    path = pathlib.Path(__file__).resolve().parent / filename
    if not path.exists():
        try:
            cubin = compiler.compile(mod, no_extern_c=True)
            with open(str(path), "wb") as handle:
                handle.write(cubin)
        except cuda.CompileError as ce:
            # Report and re-raise: module_from_file below would otherwise
            # fail on a missing .cubin
            print(ce)
            raise

    return cuda.module_from_file(str(path))
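A brief usage sketch, assuming the global `mod` holds the kernel source and defines a kernel named `my_kernel` (a hypothetical name):

module = load_cuda()                      # compiles once, then reuses the cached .cubin
kernel = module.get_function("my_kernel")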
Example #5
def check_and_build_cuda(dpath, src_dir, **kwargs):
    code_type = "cu"
    build_dict = load_build_yaml(dpath)
    build_dpath = join(dpath, "cuda", "build")

    obj_suffix = "cubin"
    env = build_dict[code_type]
    flags = "" if env["flags"] == None else env["flags"]

    #
    # CUDA environment
    #
    from pycuda.compiler import compile

    #
    # Build
    #
    for target in sorted(build_dict["target"]):
        joined_code = get_joined_code(target, build_dict, code_type, dpath)

        #
        # Check if the source has been revised
        #
        obj_fpath = join(build_dpath, target + "." + obj_suffix)
        joined_target_fpath = join(build_dpath, target + "." + code_type)

        new_compile = True
        if exists(obj_fpath) and exists(joined_target_fpath):
            with open(joined_target_fpath, "r") as f:
                if joined_code == f.read():
                    new_compile = False

        if new_compile:
            with open(joined_target_fpath, "w") as f:
                f.write(joined_code)

        #
        # Compile
        #
        if new_compile:
            print("[compile] {} using the PyCUDA build".format(basename(joined_target_fpath)))
            cubin = compile(joined_code)
            with open(obj_fpath, "wb") as f:
                f.write(cubin)

        else:
            print("{} is up to date.".format(joined_target_fpath))
Example #6
def load_cuda_code_individual():
    global _update_individuals_fn

    md5 = hashlib.md5()
    md5.update(mod.encode("utf-8"))
    filename = md5.hexdigest() + ".cubin"
    path = pathlib.Path(__file__).resolve().parent / filename
    if not path.exists():
        try:
            cubin = compiler.compile(mod, no_extern_c=True)
            with open(str(path), "wb") as handle:
                handle.write(cubin)
        except cuda.CompileError as ce:
            # Report and re-raise: module_from_file below would otherwise
            # fail on a missing .cubin
            print(ce)
            raise

    _cuda_module = cuda.module_from_file(str(path))

    _update_individuals_fn = _cuda_module.get_function("update_individuals")
Example #7
def get_CUDA_kernel(device_id, src_format, dst_format):
    init_module()
    start = time.time()
    k = KERNELS_MAP.get((src_format, dst_format))
    assert k is not None, "no kernel found for %s to %s" % (src_format, dst_format)
    function_name, ksrc = k
    global KERNEL_cubins
    cubin = KERNEL_cubins.get((device_id, function_name))
    if cubin is None:
        debug("compiling for device %s: %s=%s", device_id, function_name, ksrc)
        cubin = compile(ksrc)
        KERNEL_cubins[(device_id, function_name)] = cubin
    #now load from cubin:
    mod = driver.module_from_buffer(cubin)
    CUDA_function = mod.get_function(function_name)
    end = time.time()
    debug("compilation of %s took %.1fms", function_name, 1000.0*(end-start))
    return function_name, CUDA_function
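A hedged call sketch; the format keys are illustrative and must match entries in KERNELS_MAP:

# Fetch the conversion kernel for device 0 (format names are assumptions)
function_name, fn = get_CUDA_kernel(0, "YUV420P", "RGB")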
Example #8
def get_CUDA_function(device_id, function_name, kernel_source):
    """
        Returns the compiled kernel for the given device
        and kernel key.
        Kernels may be pre-compiled with compile_all.
    """
    global KERNEL_cubins
    cubin = KERNEL_cubins.get((device_id, function_name))
    if cubin is None:
        start = time.time()
        log("compiling for device %s: %s=%s", device_id, function_name, kernel_source)
        cubin = compile(kernel_source)
        KERNEL_cubins[(device_id, function_name)] = cubin
        end = time.time()
        log("compilation of %s took %.1fms", function_name, 1000.0*(end-start))
    #now load from cubin:
    start = time.time()
    mod = driver.module_from_buffer(cubin)
    CUDA_function = mod.get_function(function_name)
    end = time.time()
    log("loading function %s from pre-compiled cubin took %.1fms", function_name, 1000.0*(end-start))
    return CUDA_function
Example #9
test_kernel = compile("""
    const int BLOCKSIZE = 32;
    
    float sigmoid(float in){
    return 1.0 / (1 + exp(-1 * in));
    }
    
    //Tiled version of matrix multiply
    __global__ void MatrixMultiplyKernel(float *devA, float *devB, float *devC, int rows, int cols, int k, float alpha, float beta)
    {
    //Get the thread's x and y locations for its run
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int idy = threadIdx.y + blockIdx.y * blockDim.y;
    
    //Allocate shared memory to hold parts of A and B
    __shared__ float tileA[BLOCKSIZE][BLOCKSIZE];
    __shared__ float tileB[BLOCKSIZE][BLOCKSIZE];
    
    //Use sum to get the result for a specific element
    float sum = 0.0;
    
    //Use iter to see if the loop should be run again
    int iter = 0;
    
    do{
    //Check if the x thread falls within bounds of the matrices
    if ((idy < rows) && (threadIdx.x + BLOCKSIZE*iter < k)){
    tileA[threadIdx.y][threadIdx.x] = devA[threadIdx.x + idy*k + BLOCKSIZE*iter];
    }
    else {
    tileA[threadIdx.y][threadIdx.x] = 0.0;
    }
    
    //Check if the y thread falls within bounds of the matrices
    if ((threadIdx.y + BLOCKSIZE*iter < k) && (idx < cols)){
    tileB[threadIdx.y][threadIdx.x] = devB[idx + (threadIdx.y + BLOCKSIZE*iter)*cols];
    }
    else {
    tileB[threadIdx.y][threadIdx.x] = 0.0;
    }
    
    //Sync to ensure that all of the data has been grabbed for the tiles in this warp
    __syncthreads();
    
    //Sum the elements related to the element in C corresponding to idx and idy
    for (int i = 0; i < BLOCKSIZE; i++){
    sum += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];
    }
    
    //Iterate the number done
    iter++;
    
    //Sync the threads again to ensure they have all done their work before going through the loop to get data
    __syncthreads();
    
    //Check if the tiles have covered all of C
    } while (BLOCKSIZE*iter < k);
    
    //If the thread falls within the matrix C, fill in its element, scaled by alpha and beta
    if ((idy < rows) && (idx < cols)){
    devC[idx + idy*cols] = sum * alpha + devC[idx + idy*cols] * beta;
    }
    }
    
    __global__ void distKernel(float *devA, float *devB, float *devC, int K)
    {
    int idy = threadIdx.y + blockIdx.y * blockDim.y;
    
    if ((idy < K)){
    devC[idy] = (devA[idy] - devB[idy])*(devA[idy] - devB[idy]);
    }
    }
    
    //Element wise subtraction of matrix A and B, stored in matrix C
    __global__ void sub_sigKernel(float *A, float *B, float *C, int rows)
    {
    int i = threadIdx.y + blockIdx.y * blockDim.y;
    
    //Ensure the thread is in bounds
    if (i < rows){
    C[i] = (1.0 / (1 + exp(-1 * B[i])));
    C[i] = A[i] - C[i];
    }
    }
    """)
Example #10
    def get_module(self, kernel_filename,
                   include_dirs=[],
                   defines={},
                   compile_args={'no_extern_c': True}, jit_compile_args={}):
        """
        Helper function to print compilation output
        """
        def cuda_compile_message_handler(compile_success_bool, info_str,
                                         error_str):
            self.logger.debug("Compilation returned %s",
                              str(compile_success_bool))
            if info_str:
                self.logger.debug("Info: %s", info_str)
            if error_str:
                self.logger.debug("Error: %s", error_str)

        kernel_filename = os.path.normpath(kernel_filename)
        kernel_path = os.path.abspath(
            os.path.join(self.module_path, kernel_filename))
        #self.logger.debug("Getting %s", kernel_filename)

        # Create a hash of the kernel options
        options_hasher = hashlib.md5()
        options_hasher.update(
            str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
        options_hash = options_hasher.hexdigest()

        # Create hash of kernel source
        source_hash = CudaContext.hash_kernel(
            kernel_path,
            include_dirs=[self.module_path] + include_dirs)

        # Create final hash
        root, ext = os.path.splitext(kernel_filename)
        kernel_hash = root \
                + "_" + source_hash \
                + "_" + options_hash \
                + ext
        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

        # If we have the kernel in our hashmap, return it
        if (kernel_hash in self.modules.keys()):
            self.logger.debug("Found kernel %s cached in hashmap (%s)",
                              kernel_filename, kernel_hash)
            return self.modules[kernel_hash]

        # If we have it on disk, return it
        elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
            self.logger.debug("Found kernel %s cached on disk (%s)",
                              kernel_filename, kernel_hash)

            with io.open(cached_kernel_filename, "rb") as file:
                file_str = file.read()
                module = cuda.module_from_buffer(
                    file_str,
                    message_handler=cuda_compile_message_handler,
                    **jit_compile_args)

            self.modules[kernel_hash] = module
            return module

        # Otherwise, compile it from source
        else:
            self.logger.debug("Compiling %s (%s)", kernel_filename,
                              kernel_hash)

            #Create kernel string
            kernel_string = ""
            for key, value in defines.items():
                kernel_string += "#define {:s} {:s}\n".format(
                    str(key), str(value))
            kernel_string += '#include "{:s}"'.format(
                os.path.join(self.module_path, kernel_filename))
            if (self.use_cache):
                cached_kernel_dir = os.path.dirname(cached_kernel_filename)
                if not os.path.isdir(cached_kernel_dir):
                    os.mkdir(cached_kernel_dir)
                with io.open(cached_kernel_filename + ".txt", "w") as file:
                    file.write(kernel_string)

            with Common.Timer("compiler") as timer:
                import warnings
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message=
                        "The CUDA compiler succeeded, but said the following:\nkernel.cu",
                        category=UserWarning)
                    cubin = cuda_compiler.compile(kernel_string,
                                                  include_dirs=include_dirs,
                                                  cache_dir=False,
                                                  **compile_args)
                module = cuda.module_from_buffer(
                    cubin,
                    message_handler=cuda_compile_message_handler,
                    **jit_compile_args)
                if (self.use_cache):
                    with io.open(cached_kernel_filename, "wb") as file:
                        file.write(cubin)

            self.modules[kernel_hash] = module
            return module
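A hypothetical call, assuming `ctx` is an instance of the surrounding class and that `kernels.cu` defines a kernel named `my_kernel`:

module = ctx.get_module("kernels.cu", defines={"BLOCK_SIZE": 16})
my_kernel = module.get_function("my_kernel")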
Example #11
:version: 0.2

:author: Sami-Matias Niemi
:contact: [email protected]
"""
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as cua
from pycuda.compiler import compile
import pyfft.cuda as cufft
import numpy as np

#currently hardcoded...
cubin = compile(
    open('/Users/sammy/EUCLID/vissim-python/support/gputools.cu').read(),
    keep=True)


def convolve(image, kernel, mode='same', saveMemory=False):
    """
    Convolves the input image with a given kernel.
    Current forces the image and kernel to np.float32.

    :param image: image to be convolved
    :type image: 2D ndarray, float32
    :param kernel: kernel to be used in the convolution
    :type kernel: 2D ndarray, float32
    :param mode: output array, either valid, same, or full [same]
    :param saveMemory: if mode is not full memory can be saved by making smaller zero padding
    :type saveMemory: bool
Example #12
import numpy as np
import pycuda.elementwise as cuelement
from pycuda.compiler import compile
from pycuda.compiler import SourceModule
import pylab
import scipy.io
import scipy.misc
import scipy.ndimage.interpolation
import scipy.signal
import scipy.sparse
import time

import gputools
import imagetools
import optGPU

cubin = compile(open('gputools.cu').read(), keep=True)
edgetaper_code = open('edgetaper.cu').read()
kernel_code = open('kernel.cu').read()
resize_code = open('resize.cu').read()
projection_code = open('projection.cu').read()

def _generate_preproc(dtype, shape=None):
  
  if dtype == np.float32:
    preproc = '#define real float\n'
  elif dtype == np.float64:
    preproc = '#define real double\n'

  if shape is not None:
    preproc += '#define ROWS %d\n' % shape[0]
    preproc += '#define COLS %d\n' % shape[1]
Example #13
test_kernel = compile("""
   #include <thrust/extrema.h>
   #include <thrust/device_ptr.h>
   #include <thrust/execution_policy.h>
   #include <thrust/sort.h>
const int BLOCKSIZE = 32;

extern "C"{   
__global__ void distKernel(float *devA, float *devB, float *devC, int rows, int cols, int K)
{
int idy = threadIdx.y + blockIdx.y * blockDim.y;
int idx = threadIdx.x + blockIdx.x * blockDim.x;

__shared__ float tileA[BLOCKSIZE][BLOCKSIZE];
__shared__ float tileB[BLOCKSIZE];

//Use sum to get the result for a specific element
float sum = 0.0;

//Use iter to see if the loop should be run again
int iter = 0;

do{
//Check if the x thread falls within bounds of the matrices
if ((idy < rows) && (threadIdx.x + BLOCKSIZE*iter < K)){
tileA[threadIdx.y][threadIdx.x] = devA[threadIdx.x + idy*K + BLOCKSIZE*iter];
}
else {
tileA[threadIdx.y][threadIdx.x] = 0.0;
}

//Check if the y thread falls within bounds of the matrices
if ((threadIdx.y + BLOCKSIZE*iter < K)){
tileB[threadIdx.y] = devB[(threadIdx.y + BLOCKSIZE*iter)*cols];
}
else {
tileB[threadIdx.y] = 0.0;
}

//Sync to ensure that all of the data has been grabbed for the tiles in this warp
__syncthreads();

//Sum the squared distance between the terms
for (int i = 0; i < BLOCKSIZE; i++){
sum += (tileA[threadIdx.y][i] - tileB[i])*(tileA[threadIdx.y][i] - tileB[i]);
}

//Iterate the number done
iter++;

//Sync the threads again to ensure they have all done their work before going through the loop to get data
__syncthreads();

//Check if the tiles have covered all of C
} while (BLOCKSIZE*iter < K);

//If the thread falls within the matrix C, fill in its element, scaled by alpha and beta
if ((idy < rows) && (idx < cols)){
devC[idx + idy*cols] = sum;
}
} 

__global__ void getKLabels(float* dist, int* labels, int* kToReturn, int npoints, int k){  
    thrust::device_ptr<float> A(dist); 
    thrust::device_ptr<float> offset; float bigNum = 10000000000000.0;

    for (int i = 0; i < k; i++){
        offset = thrust::min_element(thrust::device, A, A + npoints);
        *(dist+(offset-A)) = bigNum;
	kToReturn[i] = *(labels+(offset-A));
    }
}
__global__ void sort(float* dist, int* labels, int* kLabels,  int npoints, int k){  
    thrust::device_ptr<float> A(dist); thrust::device_ptr<int> B(labels);
    //thrust::sort_by_key(thrust::device, A, A + npoints, B);
    thrust::sort(thrust::seq, dist, dist + npoints);

    for (int i = 0; i < k; i++){
        kLabels[i] = labels[i];
    }
}

 }
    """,
                      no_extern_c=True)
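A hedged load sketch for the module above; because the source wraps its kernels in extern "C" and is compiled with no_extern_c=True, the names below are unmangled:

import pycuda.autoinit
import pycuda.driver as driver

mod = driver.module_from_buffer(test_kernel)
dist_kernel = mod.get_function("distKernel")
get_k_labels = mod.get_function("getKLabels")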
Example #14
 def get_prepared_kernel(self, kernel_filename, kernel_function_name,
                 prepared_call_args,
                 include_dirs=[], no_extern_c=True,
                 **kwargs):
     """
     Helper function to print compilation output
     """
     def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
         self.logger.debug("Compilation returned %s", str(compile_success_bool))
         if info_str:
             self.logger.debug("Info: %s", info_str)
         if error_str:
             self.logger.debug("Error: %s", error_str)
     
     kernel_filename = os.path.normpath(kernel_filename)
     #self.logger.debug("Getting %s", kernel_filename)
         
     # Create a hash of the kernel (and its includes)
     kwargs_hasher = hashlib.md5()
     kwargs_hasher.update(str(kwargs).encode('utf-8'))
     kwargs_hash = kwargs_hasher.hexdigest()
     kwargs_hasher = None
     root, ext = os.path.splitext(kernel_filename)
     kernel_hash = root \
             + "_" + CudaContext.hash_kernel( \
                 os.path.join(self.module_path, kernel_filename), \
                 include_dirs=[self.module_path] + include_dirs) \
             + "_" + kwargs_hash \
             + ext
     cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
     
     # If we have the kernel in our hashmap, return it
     if (kernel_hash in self.kernels.keys()):
         self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
         return self.kernels[kernel_hash]
     
     # If we have it on disk, return it
     elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
         self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
             
         with io.open(cached_kernel_filename, "rb") as file:
             file_str = file.read()
             module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
             
         kernel = module.get_function(kernel_function_name)
         kernel.prepare(prepared_call_args)
         self.kernels[kernel_hash] = kernel
         return kernel
         
     # Otherwise, compile it from source
     else:
         self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
             
         #Create kernel string
         kernel_string = ""
         for key, value in kwargs.items():
             kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
         kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
         if (self.use_cache):
             cached_kernel_dir = os.path.dirname(cached_kernel_filename)
             if not os.path.isdir(cached_kernel_dir):
                 os.mkdir(cached_kernel_dir)
             with io.open(cached_kernel_filename + ".txt", "w") as file:
                 file.write(kernel_string)
             
         
         with Common.Timer("compiler") as timer:
             cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
             module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
             if (self.use_cache):
                 with io.open(cached_kernel_filename, "wb") as file:
                     file.write(cubin)
             
         kernel = module.get_function(kernel_function_name)
         kernel.prepare(prepared_call_args)
         self.kernels[kernel_hash] = kernel
         
         
         return kernel
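A hypothetical prepared-call usage; the filename, kernel name, argument-type string, define, and launch parameters are all illustrative:

import numpy as np

kernel = ctx.get_prepared_kernel("kernels.cu", "my_kernel", "iPP",
                                 BLOCK_SIZE=16)
# Prepared calls skip per-launch argument parsing:
kernel.prepared_call(grid, block,
                     np.int32(n), x_gpu.gpudata, y_gpu.gpudata)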
Example #15
import pycuda.autoinit
import pycuda.driver as cuda
from numpy.testing import assert_array_equal as a_equal
from numpy.testing import assert_array_almost_equal as aa_equal


# kernel compile and import
kernel = '''
__global__ void daxpy(int nx, double a, double *x, double *y) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < nx) y[idx] = a*x[idx] + y[idx];
}
'''
from pycuda.compiler import SourceModule, compile
#mod = SourceModule(kernel, cache_dir='./')

cubin = compile(kernel, cache_dir='./')
#mod = cuda.module_from_buffer(cubin)

with open('daxpy.cubin', 'wb') as f:
    f.write(cubin)
mod = cuda.module_from_file('daxpy.cubin')
daxpy = mod.get_function('daxpy')

dev = pycuda.autoinit.device
print(dev.compute_capability())
#cuda.device_attribute['COMPUTE_CAPABILITY_MAJOR']
#cuda.device_attribute['COMPUTE_CAPABILITY_MINOR']


# setup
nx = 2**20
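The snippet is truncated here; a hedged completion of the setup, launch, and check under the same names:

import numpy as np

a = np.float64(1.5)
x = np.random.rand(nx)
y = np.random.rand(nx)
ref = a * x + y                        # host reference result

x_gpu = cuda.to_device(x)
y_gpu = cuda.to_device(y)

daxpy(np.int32(nx), a, x_gpu, y_gpu,
      block=(256, 1, 1), grid=((nx + 255) // 256, 1))

aa_equal(ref, cuda.from_device(y_gpu, y.shape, y.dtype))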
Example #18
 def get_kernel(self, kernel_filename, include_dirs=[], no_extern_c=True, defines={}):
     """
     Helper function to print compilation output
     """
     def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
         self.logger.debug("Compilation returned %s", str(compile_success_bool))
         if info_str:
             self.logger.debug("Info: %s", info_str)
         if error_str:
             self.logger.debug("Error: %s", error_str)
     
     self.logger.debug("Getting %s", kernel_filename)
         
     # Create a hash of the kernel (and its includes)
     defines_hasher = hashlib.md5()
     defines_hasher.update(str(defines).encode('utf-8'))
     defines_hash = defines_hasher.hexdigest()
     defines_hasher = None
     root, ext = os.path.splitext(kernel_filename)
     kernel_path = os.path.abspath(os.path.join(self.module_path, "gpu_kernels", kernel_filename))
     kernel_hash = root \
             + "_" + CUDAContext.hash_kernel( \
                 kernel_path, \
                 include_dirs=[os.path.join(self.module_path, "../kernels")] + include_dirs) \
             + "_" + defines_hash \
             + ext
     cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
     
     # If we have the kernel in our hashmap, return it
     if (kernel_hash in self.kernels.keys()):
         self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
         return self.kernels[kernel_hash]
     
     # If we have it on disk, return it
     elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
         self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
             
         with io.open(cached_kernel_filename, "rb") as file:
             file_str = file.read()
             module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
             
         self.kernels[kernel_hash] = module
         return self.kernels[kernel_hash]
         
     # Otherwise, compile it from source
     else:
         self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
             
         #Create kernel string
         kernel_string = ""
         for key, value in defines.items():
             kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
         kernel_string += '#include "{:s}"'.format(str(kernel_path))
         if (self.use_cache):
             with io.open(cached_kernel_filename + ".txt", "w") as file:
                 #Why is kernel_string a bytes object in Python 3.5.2?
                 #Bugfix here
                 if isinstance(kernel_string, bytes):
                     kernel_string = bytes.decode(kernel_string)
                 file.write(kernel_string)
             
         
         with Timer("compiler") as timer:
             cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
             module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
             if (self.use_cache):
                 with io.open(cached_kernel_filename, "wb") as file:
                     file.write(cubin)
             
         self.kernels[kernel_hash] = module
         
         return self.kernels[kernel_hash]
Example #19
 def get_kernel(self, kernel_filename, include_dirs=[], defines={}, compile_args={'no_extern_c': True}, jit_compile_args={}):
     """
     Helper function to print compilation output
     """
     def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
         self.logger.debug("Compilation returned %s", str(compile_success_bool))
         if info_str:
             self.logger.debug("Info: %s", info_str)
         if error_str:
             self.logger.debug("Error: %s", error_str)
     
     self.logger.debug("Getting %s", kernel_filename)
         
     # Create a hash of the kernel (and its includes)
     options_hasher = hashlib.md5()
     options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
     options_hash = options_hasher.hexdigest()
     options_hasher = None
     root, ext = os.path.splitext(kernel_filename)
     kernel_path = os.path.abspath(os.path.join(self.module_path, "gpu_kernels", kernel_filename))
     kernel_hash = root \
             + "_" + CUDAContext.hash_kernel( \
                 kernel_path, \
                 include_dirs=[os.path.join(self.module_path, "../kernels")] + include_dirs) \
             + "_" + options_hash \
             + ext
     cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
     
     # If we have the kernel in our hashmap, return it
     if (kernel_hash in self.kernels.keys()):
         self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
         return self.kernels[kernel_hash]
     
     # If we have it on disk, return it
     elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
         self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
             
         with io.open(cached_kernel_filename, "rb") as file:
             file_str = file.read()
             module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args)
             
         self.kernels[kernel_hash] = module
         return self.kernels[kernel_hash]
         
     # Otherwise, compile it from source
     else:
         self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
             
         #Create kernel string
         kernel_string = ""
         for key, value in defines.items():
             kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
         kernel_string += '#include "{:s}"'.format(str(kernel_path))
         if (self.use_cache):
             with io.open(cached_kernel_filename + ".txt", "w") as file:
                 #Why is kernel_string a bytes object in Python 3.5.2?
                 #Bugfix here
                 if isinstance(kernel_string, bytes):
                     kernel_string = bytes.decode(kernel_string)
                 file.write(kernel_string)
             
         
         with Timer("compiler") as timer:
             cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
             module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)
             if (self.use_cache):
                 with io.open(cached_kernel_filename, "wb") as file:
                     file.write(cubin)
             
         self.kernels[kernel_hash] = module
         
         return self.kernels[kernel_hash]