def get_CUDA_function(device_id, function_name, kernel_source):
    """
    Returns the compiled kernel for the given device and kernel key.
    Kernels may be pre-compiled with compile_all.
    """
    global KERNEL_cubins
    cubin = KERNEL_cubins.get((device_id, function_name))
    if cubin is None:
        start = time.time()
        log("compiling for device %s: %s=%s", device_id, function_name, kernel_source)
        cubin = compile(kernel_source)
        KERNEL_cubins[(device_id, function_name)] = cubin
        end = time.time()
        log("compilation of %s took %.1fms", function_name, 1000.0 * (end - start))
    #now load from cubin:
    start = time.time()
    mod = driver.module_from_buffer(cubin)
    CUDA_function = mod.get_function(function_name)
    end = time.time()
    log("loading function %s from pre-compiled cubin took %.1fms", function_name, 1000.0 * (end - start))
    return CUDA_function
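# Minimal calling sketch for get_CUDA_function, assuming a CUDA context is
# already current on device 0 and that the module-level KERNEL_cubins dict,
# log(), driver and compile() used above are in scope. The kernel name and
# source below are illustrative placeholders, not from the original module.
SCALE_SRC = """
__global__ void scale(float *buf, float f, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) buf[i] *= f;
}
"""
scale_fn = get_CUDA_function(0, "scale", SCALE_SRC)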
def compile_all(function_name, kernel_src, device_ids=None):
    """
    Pre-compiles kernel source on the given devices,
    so we can then call get_CUDA_function quickly
    to get the function to call.
    """
    global KERNEL_cubins
    if device_ids is None:
        device_ids = init_all_devices()
    cf = driver.ctx_flags
    for device_id in device_ids:
        device = None
        context = None
        try:
            device = driver.Device(device_id)
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            cubin = KERNEL_cubins.get((device_id, function_name))
            if cubin is None:
                start = time.time()
                log("compiling for device %s: %s=%s", device_id, function_name, kernel_src)
                cubin = compile(kernel_src)
                end = time.time()
                log("compilation of %s took %.1fms", function_name, 1000.0 * (end - start))
                KERNEL_cubins[(device_id, function_name)] = cubin
        finally:
            if context:
                context.pop()
def load_cuda():
    md5 = hashlib.md5()
    md5.update(mod.encode("utf-8"))
    filename = md5.hexdigest() + ".cubin"
    path = pathlib.Path(__file__).resolve().parent / filename
    if not path.exists():
        try:
            cubin = compiler.compile(mod, no_extern_c=True)
            with open(str(path), "wb") as handle:
                handle.write(cubin)
        except cuda.CompileError as ce:
            print(f"{ce}")
    return cuda.module_from_file(str(path))
def check_and_build_cuda(dpath, src_dir, **kwargs):
    code_type = "cu"
    build_dict = load_build_yaml(dpath)
    build_dpath = join(dpath, "cuda", "build")
    obj_suffix = "cubin"
    env = build_dict[code_type]
    flags = "" if env["flags"] is None else env["flags"]

    #
    # CUDA environment
    #
    from pycuda.compiler import compile

    #
    # Build
    #
    for target in sorted(build_dict["target"]):
        joined_code = get_joined_code(target, build_dict, code_type, dpath)

        #
        # Check if revised
        #
        obj_fpath = join(build_dpath, target + "." + obj_suffix)
        joined_target_fpath = join(build_dpath, target + "." + code_type)
        new_compile = True
        if exists(obj_fpath) and exists(joined_target_fpath):
            with open(joined_target_fpath, "r") as f:
                if joined_code == f.read():
                    new_compile = False
        if new_compile:
            with open(joined_target_fpath, "w") as f:
                f.write(joined_code)

        #
        # Compile
        #
        if new_compile:
            print("[compile] {} using the PyCUDA build".format(basename(joined_target_fpath)))
            cubin = compile(joined_code)
            with open(obj_fpath, "wb") as f:
                f.write(cubin)
        else:
            print("{} is up to date.".format(joined_target_fpath))
def load_cuda_code_individual():
    global _update_individuals_fn
    md5 = hashlib.md5()
    md5.update(mod.encode("utf-8"))
    filename = md5.hexdigest() + ".cubin"
    path = pathlib.Path(__file__).resolve().parent / filename
    if not path.exists():
        try:
            cubin = compiler.compile(mod, no_extern_c=True)
            with open(str(path), "wb") as handle:
                handle.write(cubin)
        except cuda.CompileError as ce:
            print(f"{ce}")
    _cuda_module = cuda.module_from_file(str(path))
    _update_individuals_fn = _cuda_module.get_function("update_individuals")
def get_CUDA_kernel(device_id, src_format, dst_format):
    init_module()
    start = time.time()
    k = KERNELS_MAP.get((src_format, dst_format))
    assert k is not None, "no kernel found for %s to %s" % (src_format, dst_format)
    function_name, ksrc = k
    global KERNEL_cubins
    cubin = KERNEL_cubins.get((device_id, function_name))
    if cubin is None:
        debug("compiling for device %s: %s=%s", device_id, function_name, ksrc)
        cubin = compile(ksrc)
        KERNEL_cubins[(device_id, function_name)] = cubin
    #now load from cubin:
    mod = driver.module_from_buffer(cubin)
    CUDA_function = mod.get_function(function_name)
    end = time.time()
    debug("compilation of %s took %.1fms", function_name, 1000.0 * (end - start))
    return function_name, CUDA_function
test_kernel = compile(""" const int BLOCKSIZE = 32; float sigmoid(float in){ return 1.0 / (1 + exp(-1 * in)); } //Tiled version of matrix multiply __global__ void MatrixMultiplyKernel(float *devA, float *devB, float *devC, int rows, int cols, int k, float alpha, float beta) { //Get the thread's x and y locations for its run int idx = threadIdx.x + blockIdx.x * blockDim.x; int idy = threadIdx.y + blockIdx.y * blockDim.y; //Allocate shared memory to hold parts of A and B __shared__ float tileA[BLOCKSIZE][BLOCKSIZE]; __shared__ float tileB[BLOCKSIZE][BLOCKSIZE]; //Use sum to get the result for a specific element float sum = 0.0; //Use iter to see if the loop should be run again int iter = 0; do{ //Check if the x thread falls within bounds of the matrices if ((idy < rows) && (threadIdx.x + BLOCKSIZE*iter < k)){ tileA[threadIdx.y][threadIdx.x] = devA[threadIdx.x + idy*k + BLOCKSIZE*iter]; } else { tileA[threadIdx.y][threadIdx.x] = 0.0; } //Check if the y thread falls within bounds of the matrices if ((threadIdx.y + BLOCKSIZE*iter < k) && (idx < cols)){ tileB[threadIdx.y][threadIdx.x] = devB[idx + (threadIdx.y + BLOCKSIZE*iter)*cols]; } else { tileB[threadIdx.y][threadIdx.x] = 0.0; } //Sync to ensure that all of the data has been grabbed for the tiles in this warp __syncthreads(); //Sum the elements related to the element in C corresponding to idx and idy for (int i = 0; i < BLOCKSIZE; i++){ sum += tileA[threadIdx.y][i] * tileB[i][threadIdx.x]; } //Iterate the number done iter++; //Sync the threads again to ensure they have all done their work before going through the loop to get data __syncthreads(); //Check if the tiles have covered all of C } while (BLOCKSIZE*iter < k); //If the thread falls within the matrix C, fill in its element, scaled by alpha and beta if ((idy < rows) && (idx < cols)){ devC[idx + idy*cols] = sum * alpha + devC[idx + idy*cols] * beta; } } __global__ void distKernel(float *devA, float *devB, float *devC, int K) { int idy = threadIdx.y + blockIdx.y * blockDim.y; if ((idy < K)){ devC[idy] = (devA[idy] - devB[idy])*(devA[idy] - devB[idy]); } } //Element wise subtraction of matrix A and B, stored in matrix C __global__ void sub_sigKernel(float *A, float *B, float *C, int rows) { int i = threadIdx.y + blockIdx.y * blockDim.y; //Ensure the thread is in bounds if (i < rows){ C[i] = (1.0 / (1 + exp(-1 * B[i]))); C[i] = A[i] - C[i]; } } """)
def get_module(self, kernel_filename,
               include_dirs=[],
               defines={},
               compile_args={'no_extern_c': True},
               jit_compile_args={}):
    """
    Helper function to print compilation output
    """
    def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
        self.logger.debug("Compilation returned %s", str(compile_success_bool))
        if info_str:
            self.logger.debug("Info: %s", info_str)
        if error_str:
            self.logger.debug("Error: %s", error_str)

    kernel_filename = os.path.normpath(kernel_filename)
    kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename))
    #self.logger.debug("Getting %s", kernel_filename)

    # Create a hash of the kernel options
    options_hasher = hashlib.md5()
    options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
    options_hash = options_hasher.hexdigest()

    # Create hash of kernel source
    source_hash = CudaContext.hash_kernel( \
        kernel_path, \
        include_dirs=[self.module_path] + include_dirs)

    # Create final hash
    root, ext = os.path.splitext(kernel_filename)
    kernel_hash = root \
            + "_" + source_hash \
            + "_" + options_hash \
            + ext
    cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

    # If we have the kernel in our hashmap, return it
    if (kernel_hash in self.modules.keys()):
        self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
        return self.modules[kernel_hash]

    # If we have it on disk, return it
    elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
        self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)

        with io.open(cached_kernel_filename, "rb") as file:
            file_str = file.read()
            module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args)

        self.modules[kernel_hash] = module
        return module

    # Otherwise, compile it from source
    else:
        self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)

        #Create kernel string
        kernel_string = ""
        for key, value in defines.items():
            kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
        kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))

        if (self.use_cache):
            cached_kernel_dir = os.path.dirname(cached_kernel_filename)
            if not os.path.isdir(cached_kernel_dir):
                os.mkdir(cached_kernel_dir)
            with io.open(cached_kernel_filename + ".txt", "w") as file:
                file.write(kernel_string)

        with Common.Timer("compiler") as timer:
            import warnings
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="The CUDA compiler succeeded, but said the following:\nkernel.cu",
                    category=UserWarning)
                cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
            module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)

        if (self.use_cache):
            with io.open(cached_kernel_filename, "wb") as file:
                file.write(cubin)

        self.modules[kernel_hash] = module
        return module
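# Hypothetical call pattern for get_module above: `ctx` stands in for an
# instance of the surrounding context class, and the filename, define, and
# entry-point name are assumed for illustration only.
module = ctx.get_module("reduction.cu", defines={"BLOCK_SIZE": 256})
reduce_fn = module.get_function("reduce")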
:requires: NumPy
:version: 0.2
:author: Sami-Matias Niemi
:contact: [email protected]
"""
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as cua
from pycuda.compiler import compile
import pyfft.cuda as cufft
import numpy as np

#currently hardcoded...
cubin = compile(open('/Users/sammy/EUCLID/vissim-python/support/gputools.cu').read(),
                keep=True)


def convolve(image, kernel, mode='same', saveMemory=False):
    """
    Convolves the input image with a given kernel.

    Currently forces the image and kernel to np.float32.

    :param image: image to be convolved
    :type image: 2D ndarray, float32
    :param kernel: kernel to be used in the convolution
    :type kernel: 2D ndarray, float32
    :param mode: output array, either valid, same, or full [same]
    :param saveMemory: if mode is not full, memory can be saved by making smaller zero padding
    :type saveMemory: bool
import numpy as np
import pycuda.elementwise as cuelement
from pycuda.compiler import compile
from pycuda.compiler import SourceModule
import pylab
import scipy.io
import scipy.misc
import scipy.ndimage.interpolation
import scipy.signal
import scipy.sparse
import time

import gputools
import imagetools
import optGPU

cubin = compile(open('gputools.cu').read(), keep=True)
edgetaper_code = open('edgetaper.cu').read()
kernel_code = open('kernel.cu').read()
resize_code = open('resize.cu').read()
projection_code = open('projection.cu').read()


def _generate_preproc(dtype, shape=None):
    if dtype == np.float32:
        preproc = '#define real float\n'
    elif dtype == np.float64:
        preproc = '#define real double\n'
    if shape is not None:
        preproc += '#define ROWS %d\n' % shape[0]
        preproc += '#define COLS %d\n' % shape[1]
test_kernel = compile(""" #include <thrust/extrema.h> #include <thrust/device_ptr.h> #include <thrust/execution_policy.h> #include <thrust/sort.h> const int BLOCKSIZE = 32; extern "C"{ __global__ void distKernel(float *devA, float *devB, float *devC, int rows, int cols, int K) { int idy = threadIdx.y + blockIdx.y * blockDim.y; int idx = threadIdx.x + blockIdx.x * blockDim.x; __shared__ float tileA[BLOCKSIZE][BLOCKSIZE]; __shared__ float tileB[BLOCKSIZE]; //Use sum to get the result for a specific element float sum = 0.0; //Use iter to see if the loop should be run again int iter = 0; do{ //Check if the x thread falls within bounds of the matrices if ((idy < rows) && (threadIdx.x + BLOCKSIZE*iter < K)){ tileA[threadIdx.y][threadIdx.x] = devA[threadIdx.x + idy*K + BLOCKSIZE*iter]; } else { tileA[threadIdx.y][threadIdx.x] = 0.0; } //Check if the y thread falls within bounds of the matrices if ((threadIdx.y + BLOCKSIZE*iter < K)){ tileB[threadIdx.y] = devB[(threadIdx.y + BLOCKSIZE*iter)*cols]; } else { tileB[threadIdx.y] = 0.0; } //Sync to ensure that all of the data has been grabbed for the tiles in this warp __syncthreads(); //Sum the squared distance between the terms for (int i = 0; i < BLOCKSIZE; i++){ sum += (tileA[threadIdx.y][i] - tileB[i])*(tileA[threadIdx.y][i] - tileB[i]); } //Iterate the number done iter++; //Sync the threads again to ensure they have all done their work before going through the loop to get data __syncthreads(); //Check if the tiles have covered all of C } while (BLOCKSIZE*iter < K); //If the thread falls within the matrix C, fill in its element, scaled by alpha and beta if ((idy < rows) && (idx < cols)){ devC[idx + idy*cols] = sum; } } __global__ void getKLabels(float* dist, int* labels, int* kToReturn, int npoints, int k){ thrust::device_ptr<float> A(dist); thrust::device_ptr<float> offset; float bigNum = 10000000000000.0; for (int i = 0; i < k; i++){ offset = thrust::min_element(thrust::device, A, A + npoints); *(dist+(offset-A)) = bigNum; kToReturn[i] = *(labels+(offset-A)); } } __global__ void sort(float* dist, int* labels, int* kLabels, int npoints, int k){ thrust::device_ptr<float> A(dist); thrust::device_ptr<int> B(labels); //thrust::sort_by_key(thrust::device, A, A + npoints, B); thrust::sort(thrust::seq, dist, dist + npoints); for (int i = 0; i < k; i++){ kLabels[i] = labels[i]; } } } """, no_extern_c=True)
def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
                        prepared_call_args, \
                        include_dirs=[], no_extern_c=True, **kwargs):
    """
    Helper function to print compilation output
    """
    def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
        self.logger.debug("Compilation returned %s", str(compile_success_bool))
        if info_str:
            self.logger.debug("Info: %s", info_str)
        if error_str:
            self.logger.debug("Error: %s", error_str)

    kernel_filename = os.path.normpath(kernel_filename)
    #self.logger.debug("Getting %s", kernel_filename)

    # Create a hash of the kernel (and its includes)
    kwargs_hasher = hashlib.md5()
    kwargs_hasher.update(str(kwargs).encode('utf-8'))
    kwargs_hash = kwargs_hasher.hexdigest()
    kwargs_hasher = None
    root, ext = os.path.splitext(kernel_filename)
    kernel_hash = root \
            + "_" + CudaContext.hash_kernel( \
                os.path.join(self.module_path, kernel_filename), \
                include_dirs=[self.module_path] + include_dirs) \
            + "_" + kwargs_hash \
            + ext
    cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

    # If we have the kernel in our hashmap, return it
    if (kernel_hash in self.kernels.keys()):
        self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
        return self.kernels[kernel_hash]

    # If we have it on disk, return it
    elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
        self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)

        with io.open(cached_kernel_filename, "rb") as file:
            file_str = file.read()
            module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)

        kernel = module.get_function(kernel_function_name)
        kernel.prepare(prepared_call_args)
        self.kernels[kernel_hash] = kernel
        return kernel

    # Otherwise, compile it from source
    else:
        self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)

        #Create kernel string
        kernel_string = ""
        for key, value in kwargs.items():
            kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
        kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))

        if (self.use_cache):
            cached_kernel_dir = os.path.dirname(cached_kernel_filename)
            if not os.path.isdir(cached_kernel_dir):
                os.mkdir(cached_kernel_dir)
            with io.open(cached_kernel_filename + ".txt", "w") as file:
                file.write(kernel_string)

        with Common.Timer("compiler") as timer:
            cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
            module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)

        if (self.use_cache):
            with io.open(cached_kernel_filename, "wb") as file:
                file.write(cubin)

        kernel = module.get_function(kernel_function_name)
        kernel.prepare(prepared_call_args)
        self.kernels[kernel_hash] = kernel
        return kernel
import pycuda.autoinit
import pycuda.driver as cuda
from numpy.testing import assert_array_equal as a_equal
from numpy.testing import assert_array_almost_equal as aa_equal


# kernel compile and import
kernel = '''
__global__ void daxpy(int nx, double a, double *x, double *y) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < nx) y[idx] = a*x[idx] + y[idx];
}
'''

from pycuda.compiler import SourceModule, compile
#mod = SourceModule(kernel, cache_dir='./')
cubin = compile(kernel, cache_dir='./')
#mod = cuda.module_from_buffer(cubin)
with open('daxpy.cubin', 'wb') as f:
    f.write(cubin)
mod = cuda.module_from_file('daxpy.cubin')
daxpy = mod.get_function('daxpy')

dev = pycuda.autoinit.device
print(dev.compute_capability())
#cuda.device_attribute['COMPUTE_CAPABILITY_MAJOR']
#cuda.device_attribute['COMPUTE_CAPABILITY_MINOR']


# setup
nx = 2**20
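# A sketch of how the truncated setup and launch might continue; the array
# contents, block size, and tolerance below are assumptions, not from the
# original snippet.
import numpy as np

a = np.float64(1.5)
x = np.random.rand(nx)
y = np.random.rand(nx)
y_ref = a * x + y               # CPU reference for verification

tpb = 256                       # threads per block (assumed)
bpg = (nx + tpb - 1) // tpb     # blocks per grid
daxpy(np.int32(nx), a, cuda.In(x), cuda.InOut(y),
      block=(tpb, 1, 1), grid=(bpg, 1))
aa_equal(y_ref, y, 14)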
def get_kernel(self, kernel_filename, include_dirs=[], no_extern_c=True, defines={}):
    """
    Helper function to print compilation output
    """
    def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
        self.logger.debug("Compilation returned %s", str(compile_success_bool))
        if info_str:
            self.logger.debug("Info: %s", info_str)
        if error_str:
            self.logger.debug("Error: %s", error_str)

    self.logger.debug("Getting %s", kernel_filename)

    # Create a hash of the kernel (and its includes)
    defines_hasher = hashlib.md5()
    defines_hasher.update(str(defines).encode('utf-8'))
    defines_hash = defines_hasher.hexdigest()
    defines_hasher = None
    root, ext = os.path.splitext(kernel_filename)
    kernel_path = os.path.abspath(os.path.join(self.module_path, "gpu_kernels", kernel_filename))
    kernel_hash = root \
            + "_" + CUDAContext.hash_kernel( \
                kernel_path, \
                include_dirs=[os.path.join(self.module_path, "../kernels")] + include_dirs) \
            + "_" + defines_hash \
            + ext
    cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

    # If we have the kernel in our hashmap, return it
    if (kernel_hash in self.kernels.keys()):
        self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
        return self.kernels[kernel_hash]

    # If we have it on disk, return it
    elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
        self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)

        with io.open(cached_kernel_filename, "rb") as file:
            file_str = file.read()
            module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)

        self.kernels[kernel_hash] = module
        return self.kernels[kernel_hash]

    # Otherwise, compile it from source
    else:
        self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)

        #Create kernel string
        kernel_string = ""
        for key, value in defines.items():
            kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
        kernel_string += '#include "{:s}"'.format(str(kernel_path))

        if (self.use_cache):
            with io.open(cached_kernel_filename + ".txt", "w") as file:
                #Why is kernel_string a bytes object in Python 3.5.2?
                #Bugfix here
                if isinstance(kernel_string, bytes):
                    kernel_string = bytes.decode(kernel_string)
                file.write(kernel_string)

        with Timer("compiler") as timer:
            cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
            module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)

        if (self.use_cache):
            with io.open(cached_kernel_filename, "wb") as file:
                file.write(cubin)

        self.kernels[kernel_hash] = module
        return self.kernels[kernel_hash]
def get_kernel(self, kernel_filename, include_dirs=[], defines={},
               compile_args={'no_extern_c': True}, jit_compile_args={}):
    """
    Helper function to print compilation output
    """
    def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
        self.logger.debug("Compilation returned %s", str(compile_success_bool))
        if info_str:
            self.logger.debug("Info: %s", info_str)
        if error_str:
            self.logger.debug("Error: %s", error_str)

    self.logger.debug("Getting %s", kernel_filename)

    # Create a hash of the kernel (and its includes)
    options_hasher = hashlib.md5()
    options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
    options_hash = options_hasher.hexdigest()
    options_hasher = None
    root, ext = os.path.splitext(kernel_filename)
    kernel_path = os.path.abspath(os.path.join(self.module_path, "gpu_kernels", kernel_filename))
    kernel_hash = root \
            + "_" + CUDAContext.hash_kernel( \
                kernel_path, \
                include_dirs=[os.path.join(self.module_path, "../kernels")] + include_dirs) \
            + "_" + options_hash \
            + ext
    cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

    # If we have the kernel in our hashmap, return it
    if (kernel_hash in self.kernels.keys()):
        self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
        return self.kernels[kernel_hash]

    # If we have it on disk, return it
    elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
        self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)

        with io.open(cached_kernel_filename, "rb") as file:
            file_str = file.read()
            module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args)

        self.kernels[kernel_hash] = module
        return self.kernels[kernel_hash]

    # Otherwise, compile it from source
    else:
        self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)

        #Create kernel string
        kernel_string = ""
        for key, value in defines.items():
            kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
        kernel_string += '#include "{:s}"'.format(str(kernel_path))

        if (self.use_cache):
            with io.open(cached_kernel_filename + ".txt", "w") as file:
                #Why is kernel_string a bytes object in Python 3.5.2?
                #Bugfix here
                if isinstance(kernel_string, bytes):
                    kernel_string = bytes.decode(kernel_string)
                file.write(kernel_string)

        with Timer("compiler") as timer:
            cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
            module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)

        if (self.use_cache):
            with io.open(cached_kernel_filename, "wb") as file:
                file.write(cubin)

        self.kernels[kernel_hash] = module
        return self.kernels[kernel_hash]
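# Hypothetical usage of the get_kernel variants above, assuming a context
# object `ctx` of the surrounding class; the kernel filename, defines, and
# entry-point name are placeholders for illustration only.
module = ctx.get_kernel("step_kernel.cu", defines={"BLOCK_W": 16, "BLOCK_H": 16})
step = module.get_function("step")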