def ola_GPU_test(xs_gpu, csf, sw, nhop, offset=(0,0)):
    """
    Run the ``ola_Kernel_test`` / ``ola_ComplexKernel_test`` CUDA kernel on
    a stack of patches and return the float32 result array.

    The output shape is the first two entries of ``nhop*csf + sw - nhop``.
    Judging by the names this is an overlap-add of the patches; the kernel
    source is not visible here, so confirm against it.

    :param xs_gpu: GPU array with the patch stack; its dtype (float32 or
        complex64) selects the kernel and ``shape[1:3]`` is passed along
    :param csf: pair used for the output shape and forwarded to the kernel
    :param sw: pair used for the output shape and forwarded to the kernel
    :param nhop: pair used for the output shape and forwarded to the kernel
    :param offset: pair forwarded to the kernel
    :return: float32 GPU array

    ``csf``, ``sw`` and ``nhop`` must support elementwise arithmetic
    (``nhop*csf + sw - nhop``), e.g. NumPy arrays.
    """
    patch_shape = xs_gpu.shape
    extent = np.array(nhop*csf+sw-nhop)
    rows, cols = int(extent[0]), int(extent[1])
    out_shape = (rows, cols)

    threads = (16, 16, 1)
    blocks = (int(np.ceil(float(cols)/threads[1])),
              int(np.ceil(float(rows)/threads[0])))

    # NOTE(review): any other dtype leaves `kernel` unbound and the launch
    # below fails with UnboundLocalError.
    if xs_gpu.dtype == np.float32:
        kernel = cu.module_from_buffer(cubin).get_function("ola_Kernel_test")
    elif xs_gpu.dtype == np.complex64:
        kernel = cu.module_from_buffer(cubin).get_function("ola_ComplexKernel_test")

    # The result buffer is float32 for both kernels.
    x_gpu = cua.zeros(out_shape, np.float32)
    kernel(x_gpu.gpudata, xs_gpu.gpudata,
           np.uint32(rows), np.uint32(cols),
           np.uint32(patch_shape[1]), np.uint32(patch_shape[2]),
           np.uint32(sw[0]), np.uint32(sw[1]),
           np.uint32(offset[0]), np.uint32(offset[1]),
           np.uint32(csf[0]), np.uint32(csf[1]),
           np.uint32(nhop[0]), np.uint32(nhop[1]),
           block=threads, grid=blocks)
    return x_gpu
def create_function(ptxcode, iomap, arg_nametypes):
    """
    Load ``ptxcode`` as a CUDA module and build a launcher factory for its
    ``stub`` function.

    :param ptxcode: module image handed to ``drv.module_from_buffer``
    :param iomap: maps an argument name to an ``IOTracker`` direction
    :param arg_nametypes: sequence of ``(name, type)`` pairs, one per
        kernel argument, in call order
    :return: ``param_wrapper(bsz, gsz)``, which returns a callable that
        launches ``stub`` with the given block and grid sizes
    """
    stub = drv.module_from_buffer(ptxcode).get_function('stub')

    # Transfer wrapper to use for each IOTracker direction.
    transfer_for = {
        IOTracker.nio: drv.In,
        IOTracker.i: drv.In,
        IOTracker.o: drv.Out,
        IOTracker.io: drv.InOut,
    }

    def _marshal(position, value):
        # Pointer-typed arguments are wrapped for host<->device transfer;
        # everything else is passed through untouched.
        name, declared_type = arg_nametypes[position]
        if not isinstance(declared_type, nvtype.pointer):
            return value
        return transfer_for[iomap[name]](value)

    def param_wrapper(bsz, gsz):
        def stub_wrapper(*args):
            assert len(args) == len(
                arg_nametypes), 'error : invalid number argument'
            launch_args = [_marshal(pos, value)
                           for pos, value in enumerate(args)]
            stub(*tuple(launch_args), block=bsz, grid=gsz)

        return stub_wrapper

    return param_wrapper
def get_CUDA_function(device_id, function_name):
    """
    Return the CUDA function ``function_name`` from its pre-built fatbin.

    The file ``<app dir>/cuda/<function_name>.fatbin`` is read on first use
    and its bytes are kept in ``KERNELS``; a module is built from those
    bytes on every call. ``device_id`` is only used in log messages.

    Returns None when the file cannot be loaded, and raises ``Exception``
    when the module does not contain ``function_name``.
    """
    kernel_bytes = KERNELS.get(function_name)
    if kernel_bytes is None:
        from xpra.platform.paths import default_get_app_dir
        from xpra.os_util import load_binary_file
        cubin_file = os.path.join(default_get_app_dir(), "cuda", "%s.fatbin" % function_name)
        log("get_CUDA_function(%s, %s) cubin file=%s", device_id, function_name, cubin_file)
        kernel_bytes = load_binary_file(cubin_file)
        if not kernel_bytes:
            log.error("failed to load CUDA bin file %s", cubin_file)
            return None
        log(" loaded %s bytes", len(kernel_bytes))
        KERNELS[function_name] = kernel_bytes

    # Build the module and look the function up, timing both steps.
    t0 = time.time()
    module = driver.module_from_buffer(kernel_bytes)
    log("get_CUDA_function(%s, %s) module=%s", device_id, function_name, module)
    try:
        cuda_function = module.get_function(function_name)
    except driver.LogicError as e:
        raise Exception("failed to load '%s' from %s: %s" % (function_name, module, e))
    t1 = time.time()
    elapsed_ms = 1000.0 * (t1 - t0)
    log("loading function %s from pre-compiled cubin took %.1fms", function_name, elapsed_ms)
    return cuda_function
def __init__(
        self,
        source,
        nvcc="nvcc",
        options=None,
        keep=False,
        no_extern_c=False,
        arch=None,
        code=None,
        cache_dir=None,
        include_dirs=[],
):
    """
    Compile ``source`` and load the result as a CUDA module.

    ``arch`` is validated with ``self._check_arch`` first. All arguments
    are forwarded positionally to ``compile``; the loaded module is stored
    on ``self.module`` and ``self._bind_module()`` is called afterwards.
    """
    self._check_arch(arch)

    # Forwarded positionally, in the order `compile` is called upstream.
    compile_args = (
        source, nvcc, options, keep, no_extern_c,
        arch, code, cache_dir, include_dirs,
    )
    binary = compile(*compile_args)

    # Imported here, at call time, rather than at module import.
    import pycuda.driver
    self.module = pycuda.driver.module_from_buffer(binary)
    self._bind_module()
def comp_ola_sdeconv(gx_gpu, gy_gpu, xx_gpu, xy_gpu, Ftpy_gpu, f_gpu, L_gpu, alpha, beta, gamma=0):
    """
    Computes the division in Fourier space needed for sparse deconvolution.

    Launches ``comp_ola_sdeconv_Kernel`` and returns its output, a
    complex64 GPU array with the same shape as ``xx_gpu`` (which must be
    three-dimensional). The seven ``*_gpu`` arguments are passed to the
    kernel as device pointers; ``alpha``, ``beta`` and ``gamma`` are passed
    as float32 scalars.
    """
    shape = xx_gpu.shape
    depth, rows, cols = shape[0], shape[1], shape[2]

    threads = (16, 16, 1)
    # The first grid axis covers shape[0]*shape[1], the second shape[2].
    blocks = (int(np.ceil(np.float32(depth*rows)/threads[0])),
              int(np.ceil(np.float32(cols)/threads[1])))

    kernel = cu.module_from_buffer(cubin).get_function("comp_ola_sdeconv_Kernel")

    z_gpu = cua.zeros(shape, np.complex64)
    kernel(z_gpu.gpudata,
           np.int32(depth), np.int32(rows), np.int32(cols),
           gx_gpu.gpudata, gy_gpu.gpudata,
           xx_gpu.gpudata, xy_gpu.gpudata,
           Ftpy_gpu.gpudata, f_gpu.gpudata, L_gpu.gpudata,
           np.float32(alpha), np.float32(beta), np.float32(gamma),
           block=threads, grid=blocks)
    return z_gpu
def get_CUDA_function(device_id, function_name):
    """
    Return the CUDA function ``function_name`` from its pre-built fatbin.

    The file ``<resources dir>/cuda/<function_name>.fatbin`` is read on
    first use and its bytes are kept in ``KERNELS``; a module is built from
    those bytes on every call. ``device_id`` is only used in log messages.

    Returns None when the file cannot be read or the module cannot be
    built from it; raises ``Exception`` when the module does not contain
    ``function_name``.
    """
    kernel_bytes = KERNELS.get(function_name)
    if kernel_bytes is None:
        from xpra.platform.paths import get_resources_dir
        cubin_file = os.path.join(get_resources_dir(), "cuda", "%s.fatbin" % function_name)
        log("get_CUDA_function(%s, %s) cubin file=%s", device_id, function_name, cubin_file)
        kernel_bytes = load_binary_file(cubin_file)
        if not kernel_bytes:
            log.error("Error: failed to load CUDA bin file '%s'", cubin_file)
            return None
        log(" loaded %s bytes", len(kernel_bytes))
        KERNELS[function_name] = kernel_bytes

    # Build the module and look the function up, timing both steps.
    t0 = monotonic_time()
    try:
        module = driver.module_from_buffer(kernel_bytes)
    except Exception as e:
        log("module_from_buffer(%s)", kernel_bytes, exc_info=True)
        log.error("Error: failed to load module from buffer for '%s'", function_name)
        log.error(" %s", e)
        return None
    log("get_CUDA_function(%s, %s) module=%s", device_id, function_name, module)
    try:
        cuda_function = module.get_function(function_name)
    except driver.LogicError as e:
        raise Exception("failed to load '%s' from %s: %s" % (function_name, module, e)) from None
    elapsed_ms = 1000.0*(monotonic_time()-t0)
    log("loading function %s from pre-compiled cubin took %.1fms", function_name, elapsed_ms)
    return cuda_function
def get_CUDA_function(device_id, function_name):
    """
    Return the CUDA function ``function_name`` from its pre-built fatbin.

    The file ``<app dir>/cuda/<function_name>.fatbin`` is read on first use
    and its bytes are kept in ``KERNELS``; a module is built from those
    bytes on every call. ``device_id`` is only used in log messages.

    Returns None when the file cannot be loaded, and raises ``Exception``
    when the module does not contain ``function_name``.
    """
    fatbin = KERNELS.get(function_name)
    if fatbin is None:
        from xpra.platform.paths import default_get_app_dir
        from xpra.os_util import load_binary_file
        cubin_file = os.path.join(default_get_app_dir(), "cuda", "%s.fatbin" % function_name)
        log("get_CUDA_function(%s, %s) cubin file=%s", device_id, function_name, cubin_file)
        fatbin = load_binary_file(cubin_file)
        if not fatbin:
            log.error("failed to load CUDA bin file %s", cubin_file)
            return None
        log(" loaded %s bytes", len(fatbin))
        KERNELS[function_name] = fatbin

    # Build the module and look the function up, timing both steps.
    began = time.time()
    module = driver.module_from_buffer(fatbin)
    log("get_CUDA_function(%s, %s) module=%s", device_id, function_name, module)
    try:
        cuda_function = module.get_function(function_name)
    except driver.LogicError as e:
        raise Exception("failed to load '%s' from %s: %s" % (function_name, module, e))
    finished = time.time()
    log("loading function %s from pre-compiled cubin took %.1fms", function_name, 1000.0*(finished-began))
    return cuda_function
def get_CUDA_function(device_id, function_name, kernel_source):
    """
    Return the CUDA function ``function_name``, compiling ``kernel_source``
    if no binary is cached for this ``(device_id, function_name)`` pair.

    Compiled binaries are kept in ``KERNEL_cubins``; a module is built from
    the cached binary on every call. Kernels may be pre-compiled with
    compile_all.
    """
    cache_key = (device_id, function_name)
    binary = KERNEL_cubins.get(cache_key)
    if binary is None:
        compile_began = time.time()
        log("compiling for device %s: %s=%s", device_id, function_name, kernel_source)
        binary = compile(kernel_source)
        KERNEL_cubins[cache_key] = binary
        compile_done = time.time()
        log("compilation of %s took %.1fms", function_name, 1000.0 * (compile_done - compile_began))

    # Build the module from the (possibly cached) binary.
    load_began = time.time()
    cuda_function = driver.module_from_buffer(binary).get_function(function_name)
    load_done = time.time()
    log("loading function %s from pre-compiled cubin took %.1fms", function_name, 1000.0 * (load_done - load_began))
    return cuda_function
def cropGPU(gpuArray, size, offset=(0, 0), block_size=(32, 32, 1)):
    """
    Crop an image array that is on the GPU to a new size.

    :param gpuArray: image array to be cropped (float32 or complex64)
    :type gpuArray: GPU array
    :param size: size to which the array is cropped (y, x)
    :type size: tuple
    :param offset: offset pair forwarded to the crop kernel
    :type offset: tuple
    :param block_size: CUDA block size
    :type block_size: tuple
    :return: cropped float32 array, or None for an unsupported dtype
    :rtype: GPU array
    """
    src_shape = gpuArray.shape
    # The launch grid covers the *source* array.
    blocks = (int(np.ceil(float(src_shape[1]) / block_size[1])),
              int(np.ceil(float(src_shape[0]) / block_size[0])))

    if gpuArray.dtype == np.float32:
        kernel_name = "crop_Kernel"
    elif gpuArray.dtype == np.complex64:
        kernel_name = "crop_ComplexKernel"
    else:
        print('Incorrect data type in cropGPU')
        return None
    cropKernel = cuda.module_from_buffer(cubin).get_function(kernel_name)

    # The output buffer is float32 for both kernels.
    out_rows, out_cols = int(size[0]), int(size[1])
    x_cropped_gpu = cua.empty((out_rows, out_cols), np.float32)

    cropKernel(x_cropped_gpu.gpudata,
               np.int32(size[0]), np.int32(size[1]),
               gpuArray.gpudata,
               np.int32(src_shape[0]), np.int32(src_shape[1]),
               np.int32(offset[0]), np.int32(offset[1]),
               block=block_size, grid=blocks)
    return x_cropped_gpu
def load(self, cubin):
    """
    Return the CUDA module for ``cubin``, building it at most once.

    Modules are cached in ``self._modrefs`` keyed by ``cubin``. When the
    cache already holds more than ``self.MAX_MODREFS`` entries it is
    emptied before the new module is stored.

    :param cubin: module image; also used as the cache key, so it must be
        hashable
    :return: the module built from ``cubin``
    """
    if cubin in self._modrefs:
        return self._modrefs[cubin]

    # Build from the image that was asked for and that keys the cache. The
    # previous code built from ``self.cubin`` regardless of the argument,
    # so every miss cached the same module under a different key.
    mod = cuda.module_from_buffer(cubin)

    if len(self._modrefs) > self.MAX_MODREFS:
        self._modrefs.clear()
    self._modrefs[cubin] = mod
    return mod
def chop_pad_GPU_test(x, csf, sw, nhop, sz=None, offset=(0,0), dtype='real'):
    """
    Pad ``x`` on the GPU and run the ``chop_pad_Kernel_test`` /
    ``chop_pad_ComplexKernel_test`` kernel on it.

    :param x: input accepted by ``pad_cpu2gpu``
    :param csf: pair; ``prod(csf)`` is the first dimension of the result
    :param sw: pair forwarded to the kernel; also the basis of the default
        ``sz``
    :param nhop: pair forwarded to the kernel
    :param sz: pair giving the last two dimensions of the result; defaults
        to ``sw`` rounded up to the next power of two
    :param offset: pair forwarded to the kernel
    :param dtype: ``'real'`` (float32 result) or ``'complex'`` (complex64)
    :return: GPU array of shape ``(prod(csf), sz[0], sz[1])``
    :raises ValueError: if ``dtype`` is neither ``'real'`` nor ``'complex'``

    ``csf``, ``sw`` and ``nhop`` must support elementwise arithmetic
    (``nhop*csf + sw - nhop``), e.g. NumPy arrays.
    """
    # `sz == None` compares elementwise when sz is an array, which makes the
    # `if` ambiguous; identity is the correct test.
    if sz is None:
        sz = 2**np.ceil(np.log2(sw))

    block_size = (32,32,1)
    grid_size = (int(np.ceil(np.float32(sz[0]*sz[1])/block_size[0])),
                 int(np.ceil(np.float32(np.prod(csf))/block_size[1])))

    # The input is padded to nhop*csf + sw - nhop before the kernel runs.
    sxp = np.array(nhop*csf+sw-nhop)
    sxp = (int(sxp[0]), int(sxp[1]))
    # NOTE(review): padding is always requested as 'real', even when
    # dtype == 'complex' -- confirm this is what the complex kernel expects.
    x_gpu = pad_cpu2gpu(x, sxp, dtype='real')

    out_shape = (int(np.prod(csf)), int(sz[0]), int(sz[1]))
    if dtype == 'real':
        mod = cu.module_from_buffer(cubin)
        chop_pad_Kernel = mod.get_function("chop_pad_Kernel_test")
        xs_gpu = cua.empty(out_shape, np.float32)
    elif dtype == 'complex':
        mod = cu.module_from_buffer(cubin)
        chop_pad_Kernel = mod.get_function("chop_pad_ComplexKernel_test")
        xs_gpu = cua.empty(out_shape, np.complex64)
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError("dtype must be 'real' or 'complex', got %r" % (dtype,))

    sz = xs_gpu.shape
    chop_pad_Kernel(xs_gpu.gpudata,
                    np.int32(sz[1]), np.int32(sz[2]), np.int32(sz[0]),
                    x_gpu.gpudata,
                    np.int32(sxp[0]), np.int32(sxp[1]),
                    np.int32(sw[0]), np.int32(sw[1]),
                    np.int32(offset[0]), np.int32(offset[1]),
                    np.int32(csf[0]), np.int32(csf[1]),
                    np.int32(nhop[0]), np.int32(nhop[1]),
                    block=block_size, grid=grid_size)
    return xs_gpu
def crop_stack_GPU(x, sz, offset=(0,0), dtype='real'):
    """
    Crop the last two dimensions of a 3-D stack on the GPU.

    :param x: NumPy array (converted to float32 and uploaded) or GPU array
    :param sz: pair giving the last two dimensions of the result
    :param offset: pair forwarded to the kernel; must not exceed the
        difference between the input and output sizes
    :param dtype: ``'real'`` (float32 result, ``crop_stack_Kernel``) or
        ``'complex'`` (complex64 result, ``crop_stack_ComplexKernel``)
    :return: GPU array of shape ``(x.shape[0], sz[0], sz[1])``
    :raises TypeError: if ``x`` is neither a NumPy array nor a GPU array
    :raises IOError: if the size difference is smaller than ``offset``
    :raises ValueError: if ``dtype`` is neither ``'real'`` nor ``'complex'``
    """
    if isinstance(x, np.ndarray):
        x = np.array(x).astype(np.float32)
        x_gpu = cua.to_gpu(x)
    elif isinstance(x, cua.GPUArray):
        x_gpu = x
    else:
        # Previously this fell through to an UnboundLocalError.
        raise TypeError("x must be a numpy.ndarray or a GPUArray, got %s"
                        % type(x).__name__)

    sx = x_gpu.shape
    block_size = (16,16,1)
    grid_size = (int(np.ceil(np.float32(sx[0]*sz[0])/block_size[0])),
                 int(np.ceil(np.float32(sz[1])/block_size[1])))

    # The room left after cropping must be at least `offset` on both axes.
    sx_before = np.array([sx[1], sx[2]])
    sx_after = np.array(sz)
    if any(sx_before - sx_after < offset):
        raise IOError('Size missmatch: Size after - size before < offset')

    out_shape = (int(sx[0]), int(sz[0]), int(sz[1]))
    if dtype == 'real':
        if x_gpu.dtype != np.float32:
            x_gpu = x_gpu.real
        mod = cu.module_from_buffer(cubin)
        crop_stack_Kernel = mod.get_function("crop_stack_Kernel")
        xc_gpu = cua.zeros(out_shape, np.float32)
    elif dtype == 'complex':
        mod = cu.module_from_buffer(cubin)
        crop_stack_Kernel = mod.get_function("crop_stack_ComplexKernel")
        xc_gpu = cua.empty(out_shape, np.complex64)
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError("dtype must be 'real' or 'complex', got %r" % (dtype,))

    crop_stack_Kernel(xc_gpu.gpudata,
                      np.int32(sx[0]), np.int32(sz[0]), np.int32(sz[1]),
                      x_gpu.gpudata,
                      np.int32(sx[1]), np.int32(sx[2]),
                      np.int32(offset[0]), np.int32(offset[1]),
                      block=block_size, grid=grid_size)
    return xc_gpu
def pad_stack_GPU(x, sz, offset=(0,0), dtype='real'):
    """
    Pad the last two dimensions of a 3-D stack on the GPU.

    :param x: NumPy array (converted to float32 and uploaded) or GPU array
    :param sz: pair giving the last two dimensions of the result
    :param offset: pair forwarded to the kernel
    :param dtype: ``'real'`` (float32 result, ``pad_stack_Kernel``) or
        ``'complex'`` (complex64 result, ``pad_stack_ComplexKernel``)
    :return: GPU array of shape ``(x.shape[0], sz[0], sz[1])``
    :raises TypeError: if ``x`` is neither a NumPy array nor a GPU array
    :raises ValueError: if ``dtype`` is neither ``'real'`` nor ``'complex'``
    """
    if isinstance(x, np.ndarray):
        x = np.array(x).astype(np.float32)
        x_gpu = cua.to_gpu(x)
    elif isinstance(x, cua.GPUArray):
        x_gpu = x
    else:
        # Previously this fell through to an UnboundLocalError.
        raise TypeError("x must be a numpy.ndarray or a GPUArray, got %s"
                        % type(x).__name__)

    sx = x_gpu.shape
    block_size = (16,16,1)
    grid_size = (int(np.ceil(np.float32(sx[0]*sz[0])/block_size[0])),
                 int(np.ceil(np.float32(sz[1])/block_size[1])))

    # Only the kernel name and output dtype differ between the two modes;
    # the launch itself is shared below.
    if dtype == 'real':
        if x_gpu.dtype != np.float32:
            x_gpu = x_gpu.real
        kernel_name, out_dtype = "pad_stack_Kernel", np.float32
    elif dtype == 'complex':
        kernel_name, out_dtype = "pad_stack_ComplexKernel", np.complex64
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError("dtype must be 'real' or 'complex', got %r" % (dtype,))

    mod = cu.module_from_buffer(cubin)
    pad_stack_Kernel = mod.get_function(kernel_name)
    # NOTE(review): the output is allocated uninitialised (cua.empty), so
    # the kernel presumably writes every element -- confirm.
    xp_gpu = cua.empty((int(sx[0]), int(sz[0]), int(sz[1])), out_dtype)
    pad_stack_Kernel(x_gpu.gpudata,
                     np.int32(sx[0]), np.int32(sx[1]), np.int32(sx[2]),
                     xp_gpu.gpudata,
                     np.int32(sz[0]), np.int32(sz[1]),
                     np.int32(offset[0]), np.int32(offset[1]),
                     block=block_size, grid=grid_size)
    return xp_gpu
def cropGPU(gpuArray, size, offset=(0, 0), block_size=(32, 32, 1)):
    """
    Crop an image array that is on the GPU to a new size.

    :param gpuArray: image array to be cropped (float32 or complex64)
    :type gpuArray: GPU array
    :param size: size to which the array is cropped (y, x)
    :type size: tuple
    :param offset: offset pair forwarded to the crop kernel
    :type offset: tuple
    :param block_size: CUDA block size
    :type block_size: tuple
    :return: cropped float32 array, or None for an unsupported dtype
    :rtype: GPU array
    """
    in_shape = gpuArray.shape
    # The launch grid covers the *source* array.
    grid = (int(np.ceil(float(in_shape[1])/block_size[1])),
            int(np.ceil(float(in_shape[0])/block_size[0])))

    if gpuArray.dtype == np.float32:
        kernel_name = "crop_Kernel"
    elif gpuArray.dtype == np.complex64:
        kernel_name = "crop_ComplexKernel"
    else:
        print('Incorrect data type in cropGPU')
        return None
    cropKernel = cuda.module_from_buffer(cubin).get_function(kernel_name)

    # The output buffer is float32 for both kernels.
    target_shape = (int(size[0]), int(size[1]))
    x_cropped_gpu = cua.empty(target_shape, np.float32)

    cropKernel(x_cropped_gpu.gpudata,
               np.int32(size[0]), np.int32(size[1]),
               gpuArray.gpudata,
               np.int32(in_shape[0]), np.int32(in_shape[1]),
               np.int32(offset[0]), np.int32(offset[1]),
               block=block_size, grid=grid)
    return x_cropped_gpu
def __init__(self, source, nvcc="nvcc", options=None, keep=False,
             no_extern_c=False, arch=None, code=None, cache_dir=None,
             include_dirs=[]):
    """
    Compile ``source`` and load the result as a CUDA module.

    ``arch`` is validated with ``self._check_arch`` first. All arguments
    are forwarded positionally to ``compile``; the loaded module is stored
    on ``self.module`` and ``self._bind_module()`` is called afterwards.
    """
    self._check_arch(arch)

    # Forwarded positionally, in the order `compile` is called upstream.
    forwarded = (source, nvcc, options, keep, no_extern_c,
                 arch, code, cache_dir, include_dirs)
    compiled = compile(*forwarded)

    # Imported here, at call time, rather than at module import.
    import pycuda.driver
    self.module = pycuda.driver.module_from_buffer(compiled)
    self._bind_module()
def pad_cpu2gpu(x, sz, offset=(0,0), dtype='real'):
    """
    Zero-pad a 2-D array to size ``sz`` on the GPU.

    :param x: NumPy array (converted to float32 and uploaded) or GPU array
    :param sz: pair giving the shape of the padded result
    :param offset: pair forwarded to the kernel
    :param dtype: ``'real'`` (float32 result, ``zeroPadKernel``) or
        ``'complex'`` (complex64 result, ``zeroPadComplexKernel``)
    :return: zero-initialised GPU array of shape ``sz`` filled by the kernel
    :raises TypeError: if ``x`` is neither a NumPy array nor a GPU array
    :raises ValueError: if ``dtype`` is neither ``'real'`` nor ``'complex'``
    """
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(np.float32(sz[1])/block_size[1])),
                 int(np.ceil(np.float32(sz[0])/block_size[0])))

    sx = x.shape
    if isinstance(x, np.ndarray):
        # NOTE(review): host input is cast to float32 even for
        # dtype='complex'; confirm the complex kernel reads a real source.
        x = np.array(x).astype(np.float32)
        x_gpu = cua.to_gpu(x)
    elif isinstance(x, cua.GPUArray):
        x_gpu = x
    else:
        # Previously this fell through to an UnboundLocalError.
        raise TypeError("x must be a numpy.ndarray or a GPUArray, got %s"
                        % type(x).__name__)

    # Only the kernel name and output dtype differ between the two modes;
    # the launch itself is shared below.
    if dtype == 'real':
        kernel_name, out_dtype = "zeroPadKernel", np.float32
    elif dtype == 'complex':
        kernel_name, out_dtype = "zeroPadComplexKernel", np.complex64
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError("dtype must be 'real' or 'complex', got %r" % (dtype,))

    mod = cu.module_from_buffer(cubin)
    pad_kernel = mod.get_function(kernel_name)
    x_padded_gpu = cua.zeros((int(sz[0]), int(sz[1])), out_dtype)
    pad_kernel(x_padded_gpu.gpudata,
               np.int32(sz[0]), np.int32(sz[1]),
               x_gpu.gpudata,
               np.int32(sx[0]), np.int32(sx[1]),
               np.int32(offset[0]), np.int32(offset[1]),
               block=block_size, grid=grid_size)
    return x_padded_gpu
def crop_gpu2cpu(x_gpu, sz, offset=(0,0)):
    """
    Crop a 2-D GPU array to size ``sz`` with ``crop_Kernel`` (float32
    input) or ``crop_ComplexKernel`` (complex64 input).

    Despite the name, the result is returned as a GPU array; no copy to
    the host is made here.

    :param x_gpu: GPU array to crop
    :param sz: pair giving the shape of the result
    :param offset: pair forwarded to the kernel
    :return: float32 GPU array of shape ``sz``
    """
    src_shape = x_gpu.shape
    threads = (16, 16, 1)
    # The launch grid covers the *source* array.
    blocks = (int(np.ceil(np.float32(src_shape[1])/threads[1])),
              int(np.ceil(np.float32(src_shape[0])/threads[0])))

    # NOTE(review): any other dtype leaves `cropKernel` unbound and the
    # launch below fails with UnboundLocalError.
    if x_gpu.dtype == np.float32:
        cropKernel = cu.module_from_buffer(cubin).get_function("crop_Kernel")
    elif x_gpu.dtype == np.complex64:
        cropKernel = cu.module_from_buffer(cubin).get_function("crop_ComplexKernel")

    # The output buffer is float32 for both kernels.
    out_shape = (int(sz[0]), int(sz[1]))
    x_cropped_gpu = cua.empty(out_shape, np.float32)
    cropKernel(x_cropped_gpu.gpudata,
               np.int32(sz[0]), np.int32(sz[1]),
               x_gpu.gpudata,
               np.int32(src_shape[0]), np.int32(src_shape[1]),
               np.int32(offset[0]), np.int32(offset[1]),
               block=threads, grid=blocks)
    return x_cropped_gpu
def __init__(self, source, nvcc="nvcc", options=None, keep=False,
             no_extern_c=False, arch=None, code=None, cache_dir=None,
             include_dirs=[]):
    """
    Compile ``source``, load the result as a CUDA module and expose the
    module's lookup methods on this object.

    ``arch`` is validated with ``self._check_arch`` first. All arguments
    are forwarded positionally to ``compile``. ``get_global`` and
    ``get_texref`` are always re-exported from the module; ``get_surfref``
    only when the module provides it.
    """
    self._check_arch(arch)

    forwarded = (source, nvcc, options, keep, no_extern_c,
                 arch, code, cache_dir, include_dirs)
    compiled = compile(*forwarded)

    # Imported here, at call time, rather than at module import.
    import pycuda.driver
    loaded = pycuda.driver.module_from_buffer(compiled)
    self.module = loaded

    self.get_global = loaded.get_global
    self.get_texref = loaded.get_texref
    if hasattr(loaded, "get_surfref"):
        self.get_surfref = loaded.get_surfref
def init_mod(cls):
    """
    Compile the sort kernels once per class and bind them as attributes.

    Does nothing when ``cls`` already has its own ``mod`` attribute (the
    check uses ``cls.__dict__``, so an inherited ``mod`` does not count).
    Otherwise sets ``radix_size``, compiles ``_CODE``, stores the module in
    ``cls.mod`` and binds every kernel function to ``cls`` under its own
    name.
    """
    if cls.__dict__.get('mod') is not None:
        return

    cls.radix_size = 1 << cls.radix_bits
    source = _CODE.substitute(group_size=cls.group_size,
                              radix_bits=cls.radix_bits,
                              radix_size=cls.radix_size)
    binary = pycuda.compiler.compile(source)
    cls.mod = cuda.module_from_buffer(binary)

    # NOTE(review): the compiled binary is also dumped to a fixed path
    # under /tmp -- looks like a debugging aid; confirm it is still wanted.
    with open('/tmp/sort_kern.cubin', 'wb') as dump:
        dump.write(binary)

    # These kernels are bound and switched to the PREFER_L1 cache config.
    l1_kernels = ('prefix_scan', 'prefix_sum_condense', 'prefix_sum_inner',
                  'prefix_sum_distribute', 'binary_search',
                  'prefix_scan_repair')
    for kernel_name in l1_kernels:
        kernel = cls.mod.get_function(kernel_name)
        setattr(cls, kernel_name, kernel)
        kernel.set_cache_config(cuda.func_cache.PREFER_L1)

    # These two keep their default cache configuration.
    cls.calc_local_pfxs = cls.mod.get_function('calc_local_pfxs')
    cls.radix_sort = cls.mod.get_function('radix_sort')
def get_CUDA_kernel(device_id, src_format, dst_format):
    """
    Return ``(function_name, CUDA_function)`` for converting ``src_format``
    to ``dst_format``.

    The kernel is looked up in ``KERNELS_MAP``; its source is compiled the
    first time it is needed for ``device_id`` and the binary is kept in
    ``KERNEL_cubins``. A module is built from the binary on every call.
    Fails an assertion when no kernel is registered for the format pair.
    """
    init_module()
    began = time.time()

    entry = KERNELS_MAP.get((src_format, dst_format))
    assert entry is not None, "no kernel found for %s to %s" % (src_format, dst_format)
    function_name, kernel_source = entry

    cache_key = (device_id, function_name)
    binary = KERNEL_cubins.get(cache_key)
    if binary is None:
        debug("compiling for device %s: %s=%s", device_id, function_name, kernel_source)
        binary = compile(kernel_source)
        KERNEL_cubins[cache_key] = binary

    # Build the module from the (possibly cached) binary.
    cuda_function = driver.module_from_buffer(binary).get_function(function_name)

    # The reported time covers lookup, optional compilation and loading.
    finished = time.time()
    debug("compilation of %s took %.1fms", function_name, 1000.0*(finished-began))
    return function_name, cuda_function
def ola_GPU(xs_gpu, sy, csf, hop):
    """
    Launch ``copy_Kernel`` once per ``csf[0] x csf[1]`` cell, writing into
    a float32 GPU array of shape ``sy``, and return its real part as a
    host array.

    NOTE(review): this function cannot run as written. It reads ``sx``,
    ``sz``, ``offset`` and ``startrow``, none of which is a parameter or a
    local, so unless they exist as module globals it fails with NameError
    while computing the grid. ``hop`` and the loop indices are unused, and
    ``sy[0]`` is passed twice where ``sy[0], sy[1]`` may have been meant.
    Confirm the intended behaviour before relying on it.
    """
    y_gpu = cua.empty(sy, np.float32)

    threads = (16, 16, 1)
    blocks = (int(np.ceil(np.float32(sx[0]*sz[0])/threads[1])),
              int(np.ceil(np.float32(sz[1])/threads[0])))

    copy_Kernel = cu.module_from_buffer(cubin).get_function("copy_Kernel")

    # Every launch receives identical arguments; the indices are unused.
    for _row in range(csf[0]):
        for _col in range(csf[1]):
            copy_Kernel(y_gpu,
                        np.uint32(sy[0]), np.uint32(sy[0]),
                        xs_gpu,
                        np.uint32(sx[0]), np.uint32(sx[1]), np.uint32(sx[2]),
                        np.uint32(offset[0]), np.uint32(offset[1]),
                        np.uint32(startrow),
                        block=threads, grid=blocks)

    return np.real(y_gpu.get())
def _prepared_gfunc_from_llvm_kernel(llvm_kernel, capability=(1,1), cuda_module_options=[]):
    """
    Lower an LLVM kernel to PTX, load it with PyCUDA and return the
    prepared device function.

    :param llvm_kernel: object providing ``module``, ``name`` and ``args``
    :param capability: ``(major, minor)`` compute capability; selects the
        ``sm_XY`` target
    :param cuda_module_options: passed as ``options`` to
        ``module_from_buffer``
    :return: the function named ``llvm_kernel.name``, prepared with one
        ``'P'`` per argument except the last, which is prepared as ``'i'``
    """
    from pycuda.driver import module_from_buffer

    target_cpu = 'sm_%d%d' % capability
    machine = le.TargetMachine.lookup(arch='nvptx64', cpu=target_cpu)

    # Run the optimisation passes, then emit PTX assembly.
    pass_manager = lp.build_pass_managers(machine, opt=3, fpm=False).pm
    pass_manager.run(llvm_kernel.module)
    ptx = machine.emit_assembly(llvm_kernel.module)

    #XXX: Hack. llvm 3.2 doesn't set map_f64_to_f32 for cpu < sm_13 as it
    # should
    if capability < (1, 3):
        directive = '.target ' + target_cpu
        ptx = ptx.replace(directive, directive + ', map_f64_to_f32')

    module = module_from_buffer(ptx, options=cuda_module_options)
    gfunc = module.get_function(llvm_kernel.name)

    pointer_count = len(llvm_kernel.args) - 1
    gfunc.prepare('P'*pointer_count + 'i')
    return gfunc
def init_mod(cls):
    """
    Compile the sort kernels once per class and bind them as attributes.

    Does nothing when ``cls`` already has its own ``mod`` attribute (the
    check uses ``cls.__dict__``, so an inherited ``mod`` does not count).
    Otherwise sets ``radix_size``, compiles ``_CODE``, stores the module in
    ``cls.mod`` and binds every kernel function to ``cls`` under its own
    name.
    """
    already_built = cls.__dict__.get('mod') is not None
    if already_built:
        return

    cls.radix_size = 1 << cls.radix_bits
    kernel_source = _CODE.substitute(group_size=cls.group_size,
                                     radix_bits=cls.radix_bits,
                                     radix_size=cls.radix_size)
    compiled = pycuda.compiler.compile(kernel_source)
    cls.mod = cuda.module_from_buffer(compiled)

    # NOTE(review): the compiled binary is also dumped to a fixed path
    # under /tmp -- looks like a debugging aid; confirm it is still wanted.
    with open('/tmp/sort_kern.cubin', 'wb') as out:
        out.write(compiled)

    # These kernels are bound and switched to the PREFER_L1 cache config.
    for fn_name in ('prefix_scan', 'prefix_sum_condense',
                    'prefix_sum_inner', 'prefix_sum_distribute',
                    'binary_search', 'prefix_scan_repair'):
        fn = cls.mod.get_function(fn_name)
        setattr(cls, fn_name, fn)
        fn.set_cache_config(cuda.func_cache.PREFER_L1)

    # These two keep their default cache configuration.
    cls.calc_local_pfxs = cls.mod.get_function('calc_local_pfxs')
    cls.radix_sort = cls.mod.get_function('radix_sort')
def get_CUDA_function(device_id, function_name, kernel_source):
    """
    Return the CUDA function ``function_name``, compiling ``kernel_source``
    if no binary is cached for this ``(device_id, function_name)`` pair.

    Compiled binaries are kept in ``KERNEL_cubins``; a module is built from
    the cached binary on every call. Kernels may be pre-compiled with
    compile_all.
    """
    key = (device_id, function_name)
    cubin = KERNEL_cubins.get(key)
    if cubin is None:
        t_compile = time.time()
        log("compiling for device %s: %s=%s", device_id, function_name, kernel_source)
        cubin = compile(kernel_source)
        KERNEL_cubins[key] = cubin
        log("compilation of %s took %.1fms", function_name, 1000.0*(time.time()-t_compile))

    # Build the module from the (possibly cached) binary.
    t_load = time.time()
    module = driver.module_from_buffer(cubin)
    cuda_function = module.get_function(function_name)
    log("loading function %s from pre-compiled cubin took %.1fms", function_name, 1000.0*(time.time()-t_load))
    return cuda_function
def FloydWarshall(self, switches, links):
    """
    Run the ``fw`` CUDA kernel over the switch graph and return the
    resulting next-hop table.

    An ``N x N`` float32 delay matrix (``inf`` where there is no link, 0 on
    the diagonal) and an int32 ``next`` table are uploaded, where ``N`` is
    the largest datapath id plus one. The kernel is launched once per
    intermediate node ``k`` and the ``next`` table is copied back.

    :param switches: iterable of objects exposing ``dp.id``
    :param links: iterable of objects exposing ``src.dpid``, ``dst.dpid``
        and ``delay``
    :return: flat int32 array of length ``N*N``; entry ``i*N + j`` starts
        out as ``j`` and is updated by the kernel
    """
    # Adjacency as dict-of-dicts: adj_graph[src][dst] = delay.
    adj_graph = dict()
    for switch1 in switches:
        src_id = switch1.dp.id
        row = adj_graph[src_id] = {src_id: 0}
        for link in links:
            if link.src.dpid == src_id:
                row[link.dst.dpid] = float(link.delay)

    # NOTE(review): N only looks at source ids; a link whose destination id
    # is >= N would index outside its row -- confirm this cannot happen.
    N = max(adj_graph) + 1

    adj_array = numpy.full(N*N, float("inf")).astype(numpy.float32)
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    for key1, row in adj_graph.items():
        for key2, value in row.items():
            adj_array[key1 * N + key2] = value

    # next[i*N + j] starts as j, i.e. "go straight to the destination".
    next_np = numpy.array([i % N for i in range(N*N)]).astype(numpy.int32)

    mod = cuda.module_from_buffer(self.result_data)
    func = mod.get_function("fw")

    # Device buffers are released even if a copy or launch raises.
    adj_gpu = cuda.mem_alloc(adj_array.size * adj_array.dtype.itemsize)
    try:
        cuda.memcpy_htod(adj_gpu, adj_array)
        next_gpu = cuda.mem_alloc(next_np.size * next_np.dtype.itemsize)
        try:
            cuda.memcpy_htod(next_gpu, next_np)
            # NOTE(review): k starts at 1, so node 0 is never used as an
            # intermediate hop -- presumably datapath ids start at 1.
            # The whole matrix is handled by a single N x N thread block,
            # so N*N has to fit in one CUDA block.
            for k in range(1, N):
                func(adj_gpu, next_gpu, numpy.int32(k), numpy.int32(N),
                     block=(N, N, 1), grid=(1, 1), shared=0)
            cuda.memcpy_dtoh(next_np, next_gpu)
        finally:
            next_gpu.free()
    finally:
        adj_gpu.free()

    autoinit.patch_finish()
    return next_np
def remove_empty_anchor(view, anchors, limit):
    """
    Return the indices of the anchors whose kernel score exceeds ``limit``.

    input:
        anchors: (N, 4) 4->(y1, x1, y2, x2) (x > y)
        view: (W, H, C)

    The ``remove_empty`` kernel fills an ``(N, C)`` float32 score table,
    which is summed over the channel axis on the host and compared with
    ``limit``.

    NOTE(review): the mangled kernel name decodes to
    ``remove_empty(float*, int*, float*, int*, int*)``, so ``anchors`` is
    presumably expected as int32 and ``view`` as float32 -- confirm with
    the callers.
    """
    kernel = cuda.module_from_buffer(module_buff).get_function(
        '_Z12remove_emptyPfPiS_S0_S0_')

    anchors_shape = np.array(anchors.shape).astype(np.int32)
    view_shape = np.array(view.shape).astype(np.int32)
    anchor_count = int(anchors_shape[0])
    channel_count = int(view_shape[2])

    scores = np.zeros((anchors.shape[0], view_shape[2])).astype(np.float32)
    kernel(
        cuda.InOut(scores),
        cuda.In(anchors),
        cuda.In(view),
        cuda.In(anchors_shape),
        cuda.In(view_shape),
        # a thread <-> a value in a specific 2d pos (need to sum the channel)
        block=(channel_count, 1, 1),
        # a grid <-> an anchor and a line(x); 50 must be > anchors width
        grid=(anchor_count, 50, 1),
    )

    per_anchor = np.sum(scores, axis=1)
    return np.where(per_anchor > limit)[0]
def get_module(self, kernel_filename, include_dirs=[],
               defines={},
               compile_args={'no_extern_c': True}, jit_compile_args={}):
    """
    Return the CUDA module for ``kernel_filename``, compiling it if needed.

    Modules are identified by a hash of the kernel source (including
    ``include_dirs``), ``defines`` and ``compile_args``. Lookup order:

    1. the in-memory table ``self.modules``;
    2. the binary on disk under ``self.cache_path`` (only when
       ``self.use_cache`` is set);
    3. compilation from source, after which the binary is written to the
       disk cache when ``self.use_cache`` is set.

    :param kernel_filename: kernel file, relative to ``self.module_path``
    :param include_dirs: extra include directories for hashing and
        compilation
    :param defines: mapping turned into ``#define KEY VALUE`` lines placed
        before the ``#include`` of the kernel file
    :param compile_args: keyword arguments for ``cuda_compiler.compile``
    :param jit_compile_args: keyword arguments for
        ``cuda.module_from_buffer``
    :return: the loaded module

    The default for ``compile_args`` used to be the *set*
    ``{'no_extern_c', True}``, which cannot be expanded with ``**`` and
    has no stable string form for hashing; it is now the dict it was meant
    to be.
    """
    def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
        # Helper function to print compilation output
        self.logger.debug("Compilation returned %s", str(compile_success_bool))
        if info_str:
            self.logger.debug("Info: %s", info_str)
        if error_str:
            self.logger.debug("Error: %s", error_str)

    kernel_filename = os.path.normpath(kernel_filename)
    kernel_path = os.path.abspath(
        os.path.join(self.module_path, kernel_filename))

    # Create a hash of the kernel options
    options_hasher = hashlib.md5()
    options_hasher.update(
        str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
    options_hash = options_hasher.hexdigest()

    # Create hash of kernel source
    source_hash = CudaContext.hash_kernel(
        kernel_path,
        include_dirs=[self.module_path] + include_dirs)

    # Create final hash
    root, ext = os.path.splitext(kernel_filename)
    kernel_hash = root + "_" + source_hash + "_" + options_hash + ext
    cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

    # If we have the kernel in our hashmap, return it
    if kernel_hash in self.modules:
        self.logger.debug("Found kernel %s cached in hashmap (%s)",
                          kernel_filename, kernel_hash)
        return self.modules[kernel_hash]

    # If we have it on disk, return it
    if self.use_cache and os.path.isfile(cached_kernel_filename):
        self.logger.debug("Found kernel %s cached on disk (%s)",
                          kernel_filename, kernel_hash)
        with io.open(cached_kernel_filename, "rb") as cached:
            binary = cached.read()
        module = cuda.module_from_buffer(
            binary,
            message_handler=cuda_compile_message_handler,
            **jit_compile_args)
        self.modules[kernel_hash] = module
        return module

    # Otherwise, compile it from source
    self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)

    # Create kernel string: the defines followed by an include of the file
    kernel_string = ""
    for key, value in defines.items():
        kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
    kernel_string += '#include "{:s}"'.format(
        os.path.join(self.module_path, kernel_filename))

    if self.use_cache:
        cached_kernel_dir = os.path.dirname(cached_kernel_filename)
        if not os.path.isdir(cached_kernel_dir):
            os.mkdir(cached_kernel_dir)
        # Keep the generated source next to the binary for inspection.
        with io.open(cached_kernel_filename + ".txt", "w") as source_dump:
            source_dump.write(kernel_string)

    with Common.Timer("compiler"):
        import warnings
        with warnings.catch_warnings():
            # Silence only the "compiler succeeded, but said ..." warning.
            warnings.filterwarnings(
                "ignore",
                message="The CUDA compiler succeeded, but said the following:\nkernel.cu",
                category=UserWarning)
            cubin = cuda_compiler.compile(kernel_string,
                                          include_dirs=include_dirs,
                                          cache_dir=False,
                                          **compile_args)
        module = cuda.module_from_buffer(
            cubin,
            message_handler=cuda_compile_message_handler,
            **jit_compile_args)
        if self.use_cache:
            with io.open(cached_kernel_filename, "wb") as binary_out:
                binary_out.write(cubin)

    self.modules[kernel_hash] = module
    return module
def basic_add_performance_2():
    """Measures memory latency for certain operations.

    One PTX entry point is generated per (access pattern, operation) pair,
    all are loaded as a single module, and each is run at several
    block/grid sizes. Returns a list of ``(access pattern key, figure)``
    tuples, one figure per access pattern.
    """
    # PTX template for one benchmark kernel. It runs $OPER $RUNS times in
    # each of three loops (warmup, loop, cooldown). In the middle loop the
    # clock is sampled around every $OPER and the differences accumulate in
    # `clks`; `clkoa` holds the clock difference across that whole loop.
    # Both values are stored to `out`, 8 bytes per 32 threads.
    # NOTE(review): the template is a single line here, so the `//` comment
    # before $LCGROUNDS would swallow the rest of the kernel -- it looks as
    # if the line breaks were lost; confirm against the original file.
    base_src = Template(""" .entry $FNAME ( .param .u32 out ) { .reg .u32 base, off, clka, clkb, clkoa, clkob, clks, tmp, iter; .reg .pred p; mov.u32 iter, $RUNS; mov.u32 clks, 0; mov.u32 tmp, 0; ld.const.u32 base, [scratch]; $MULT mov.u32 lcg_state, scratch; warmup: mov.u32 clka, %clock; $OPER sub.u32 iter, iter, 1; setp.ne.u32 p, iter, 0; @p bra.uni warmup; mov.u32 clkoa, %clock; mov.u32 iter, $RUNS; loop: //call.uni (tmp), lcg_rounds, (100); $LCGROUNDS mov.u32 clka, %clock; $OPER xor.b32 clka, clka, tmp; mov.u32 clkb, %clock; xor.b32 clka, clka, tmp; sub.u32 clka, clkb, clka; add.u32 clks, clks, clka; sub.u32 iter, iter, 1; setp.ne.u32 p, iter, 0; @p bra.uni loop; mov.u32 clkob, %clock; sub.u32 clkoa, clkob, clkoa; mov.u32 iter, $RUNS; cooldown: $OPER sub.u32 iter, iter, 1; setp.ne.u32 p, iter, 0; @p bra.uni cooldown; ld.param.u32 base, [out]; call.uni (off), get_gtid, (); shr.u32 off, off, 5; mad24.lo.u32 base, off, 8, base; call.uni (tmp), lcg_rounds, (1); st.volatile.global.b32 [base], tmp; st.volatile.global.b32 [base], clks; add.u32 base, base, 4; st.global.b32 [base], clkoa; } """)

    # Access patterns: each supplies the $MULT snippet that derives the
    # address used by $OPER, plus a label for the plot title.
    addrtypes = {
        'single': {'label': "all conflicts", 'ADDRTYPE': "single",
                   'MULT': "mov.u32 off, %smid;" +
                           "mad24.lo.u32 base, off, 128, base;"},
        'uncoa': {'label': "uncoalesced", 'ADDRTYPE': "uncoa",
                  'MULT': "call.uni (off), get_gtid, ();" +
                          "mad24.lo.u32 base, off, 128, base;"},
        'coa': {'label': "coalesced", 'ADDRTYPE': "coa",
                'MULT': "call.uni (off), get_gtid, ();" +
                        "mad24.lo.u32 base, off, 4, base;"},
    }
    # Evil, I know, DRY and all
    # (dicts do not guarantee an order here, so it is spelled out)
    addrtypesorder = ['single', 'uncoa', 'coa']

    # Memory operations under test: the $OPER snippet for each.
    opertypes = {
        'atomic': "atom.global.add.u32 tmp, [base], tmp;",
        'red': "red.global.add.u32 [base], clks;",
        'store': "st.global.u32 [base], clks;",
        'load': "ld.global.u32 tmp, [base];",
        'load_store': """ ld.global.u32 tmp, [base]; add.u32 tmp, tmp, clks; st.global.u32 [base], tmp; """
    }
    # Order in which the operations are measured and plotted.
    opertypesorder = ['load', 'store', 'load_store', 'red', 'atomic']

    # 50 copies of the same instruction, substituted for $LCGROUNDS.
    lcgtext = "mad.lo.u32 lcg_state, lcg_state, 1664525, 1013904223;\n"*50

    # Every (access pattern, operation) pair gets its own entry point.
    order = []
    for va in addrtypesorder:
        for k in sorted(opertypes.keys()):
            order.append((va, k))

    runs = 512      # substituted for $RUNS: iterations of each PTX loop
    rounds = 4      # measured launches per configuration (plus one discarded)

    # Assemble the module source: shared prelude, the `scratch` constant,
    # then one instantiated template per pair.
    mod = stdlib + "\n.const .u32 scratch;"
    for (addr, oper) in order:
        c = dict(addrtypes[addr])
        c['otype'] = oper
        c['OPER'] = opertypes[oper]
        c['RUNS'] = runs
        c['FNAME'] = "%s_%s" % (addr, oper)
        c['LCGROUNDS'] = lcgtext
        mod += base_src.substitute(c)

    # Print the generated source with line numbers.
    for i in enumerate(mod.split('\n')):
        print "%3d %s" % i
    disassemble(mod)
    # From here on `mod` is the loaded module, no longer the source text.
    mod = cuda.module_from_buffer(mod)

    figs = []
    barwidth = 0.3  # NOTE(review): never read below

    # Allocate the scratch buffer and store its address in the module's
    # `scratch` constant.
    scratch = cuda.mem_alloc(1024*16*30*128)
    scratchptr = mod.get_global('scratch')
    cuda.memset_d32(scratchptr[0], int(scratch), 1)

    def plot(title, names, vals, errs):
        # Grouped bar chart: one group per launch size, one bar per series.
        N=len(vals[0])
        bw=2*.9/len(names)
        fig = plt.figure()
        ax = fig.add_subplot(111, title=title)
        ax.set_ylabel('Clocks')
        ax.set_xlabel('Warps/SM')
        ax.set_xticks(range(N))
        ax.set_xticklabels([1<<i for i in range(N)])
        for idx, (name,val,err) in enumerate(zip(names, vals, errs)):
            # NOTE(review): `idx/2` is integer division on Python 2, which
            # places each series and its ' total' series at the same x
            # offset -- presumably intended; confirm before porting.
            ax.bar([i+bw*(idx/2)-.45 for i in range(N)], val, bw, yerr=err,
                   color=colors[idx], label=name, zorder=-idx)
        ax.axis(ymin=0)
        ax.legend(loc=0)
        return fig

    for addr in addrtypesorder:
        addrlbl = addrtypes[addr]['label']
        print "Access pattern:", addrlbl
        # NOTE(review): totalms/totales are never filled; the totals are
        # appended to interms/interes below.
        interms, interes, totalms, totales = [], [], [], []
        for operidx, oper in enumerate(opertypesorder):
            interm, intere, totalm, totale = [], [], [], []
            # (block multiplier, grid multiplier): the launch below uses
            # 32*dim[0] threads per block and 30*dim[1] blocks.
            # NOTE(review): 30 is presumably the SM count of the target
            # device (the x axis is labelled 'Warps/SM') -- confirm.
            for dim in ((1, 1), (2, 1), (4, 1), (8, 1), (8, 2), (8, 4)):
                # One row per warp; column 0 = clks, column 1 = clkoa.
                vals = numpy.zeros( (dim[0] * dim[1] * 30, 2) )
                fn = mod.get_function('%s_%s' % (addr, oper))
                for round in range(rounds+1):
                    a = numpy.zeros_like(vals).astype(numpy.int32)
                    fn(cuda.InOut(a), block=(32 * dim[0], 1, 1),
                       grid=(30 * dim[1], 1))
                    # Round 0 is discarded; only later rounds accumulate.
                    if round != 0:
                        vals += a
                    time.sleep(.005)
                # Normalise to clocks per iteration.
                means = scipy.mean(vals, axis=0) / (runs*rounds)
                stds = scipy.std(vals, axis=0) / (runs*rounds)
                # this is just gross
                interm.append(means[0])
                totalm.append(means[1])
                intere.append(stds[0])
                totale.append(stds[1])
                print "%16s: %1.7f±%1.6f" % (oper, means[0], stds[0])
                print "%16s: %1.7f±%1.6f" % (oper+' total', means[1], stds[1])
            # Series are interleaved: <oper>, <oper> total, <oper>, ...
            interms.append(interm)
            interes.append(intere)
            interms.append(totalm)
            interes.append(totale)
        # Series names in the same interleaved order as interms/interes.
        names = []
        for i in opertypesorder:
            names.append(i)
            names.append(i + ' total')
        fig1 = plot('Compute memory latency, %s access pattern' % addrlbl,
                    names, interms, interes)
        figs.append((addr, fig1))
    return figs
def consecutive_clocks():
    """Measures a few rounds of sampling consecutive clocks.

    The kernel reads the clock register twice in a row, 256 times per
    thread, and stores the summed differences. It is launched five times
    for each combination of grid size (30 * 2**grid blocks) and block width
    (2**width threads) whose product of multipliers stays within 1024; the
    per-sample mean and spread are plotted per grid size. Returns the
    figure.
    """
    # NOTE(review): the PTX below is a single line here; it looks as if the
    # original line breaks were lost -- confirm against the original file.
    ptx = stdlib + """ .entry consecutive_clocks ( .param .u32 out ) { .reg .u32 base, off, clka, clkb, clks, iter; .reg .pred p; mov.u32 iter, 256; mov.u32 clks, 0; loop: mov.u32 clka, %clock; mov.u32 clkb, %clock; sub.u32 clka, clkb, clka; add.u32 clks, clks, clka; sub.u32 iter, iter, 1; setp.ne.u32 p, iter, 0; @p bra.uni loop; ld.param.u32 base, [out]; call.uni (off), get_gtid, (); mad24.lo.u32 base, off, 4, base; st.global.b32 [base], clks; } """
    kernel = get_func(cuda.module_from_buffer(ptx), 'consecutive_clocks')

    fig = plt.figure()
    ax = fig.add_subplot(111, title='Clocks from consecutive operations, 256 iterations/thread')
    ax.set_ylabel('Clocks')
    ax.set_xlabel('Block width')
    ax.set_xticks(range(10))
    ax.set_xticklabels([str(1 << i) for i in range(10)])

    for grid in range(5):
        grid_mult = 1 << grid
        means = []
        spreads = []
        for width in range(10):
            block_width = 1 << width
            if block_width * grid_mult <= 1024:
                thread_count = grid_mult * 30 * block_width
                # Sum of the five launches, one slot per thread.
                totals = numpy.zeros( (thread_count,) ).astype(numpy.int32)
                for run in range(5):
                    sample = numpy.empty( (thread_count,) ).astype(numpy.int32)
                    kernel(cuda.InOut(sample), block=(block_width, 1, 1),
                           grid=(grid_mult * 30, 1))
                    totals += sample
                mean = scipy.mean(totals)
                spread = scipy.std(totals)
                print("%dx%d: %f ± %f" % (grid_mult, block_width, mean, spread))
                # Normalise by 256 iterations and 5 launches.
                means.append(mean/256/5)
                spreads.append(spread/(256*5))
        ax.errorbar(range(len(means)), means, yerr=spreads, fmt=keys[grid],
                    label=str(grid_mult))

    ax.legend(loc=0, title="Blocks/SM")
    return fig
def zeropadToGPU(array, size, offset=(0, 0), dtype='real', block_size=(32, 32, 1)):
    """
    Zero pad the input array and transfer it to the GPU memory if not
    there yet.

    :param array: 2-D input array to be zero padded; a NumPy array is
        uploaded as is, a GPU array is used directly
    :type array: ndarray or GPUArray
    :param size: size of the padded result (y, x)
    :type size: tuple
    :param offset: offset pair forwarded to the padding kernel
    :type offset: tuple
    :param dtype: data type of the result, either 'real' or 'complex'
    :type dtype: str
    :param block_size: CUDA block size
    :type block_size: tuple
    :return: zero padded array that resides in the GPU memory, or None
        when the array type or ``dtype`` is not supported
    :rtype: GPUArray
    """
    # The launch grid covers the padded output.
    blocks = (int(np.ceil(float(size[1])/block_size[1])),
              int(np.ceil(float(size[0])/block_size[0])))

    src_rows, src_cols = array.shape
    src_rows = np.int32(src_rows)
    src_cols = np.int32(src_cols)
    off_first = np.int32(offset[0])
    off_second = np.int32(offset[1])
    out_rows = np.int32(size[0])
    out_cols = np.int32(size[1])

    source_type = array.__class__
    if source_type == np.ndarray:
        array_gpu = cua.to_gpu(array)
    elif source_type == cua.GPUArray:
        array_gpu = array
    else:
        print('ERROR: Array type neither NumPy or GPUArray')
        return None

    # Only the kernel name and output dtype differ between the two modes.
    if dtype == 'real':
        kernel_name, out_dtype = "zeroPadKernel", np.float32
    elif dtype == 'complex':
        kernel_name, out_dtype = "zeroPadComplexKernel", np.complex64
    else:
        print('Incorrect data type in zeropadToGPU')
        return None

    pad_kernel = cuda.module_from_buffer(cubin).get_function(kernel_name)
    output = cua.zeros(size, out_dtype)
    pad_kernel(output.gpudata, out_rows, out_cols,
               array_gpu.gpudata, src_rows, src_cols,
               off_first, off_second,
               block=block_size, grid=blocks)
    return output
def zeropadToGPU(array, size, offset=(0, 0), dtype='real',
                 block_size=(32, 32, 1)):
    """
    Zero pad the input array and transfer it to the GPU memory if not there yet

    :param array: 2-D input array to be zeropadded and transferred
    :type array: ndarray or GPUArray
    :param size: size of the padded output array (y, x)
    :type size: tuple
    :param offset: offset handed to the kernel; ``offset[0]`` is passed as
        the x offset and ``offset[1]`` as the y offset
    :type offset: tuple
    :param dtype: data type, either 'real' or 'complex'
    :type dtype: str
    :param block_size: CUDA block_size
    :type block_size: tuple
    :return: zero padded array that resides in the GPU memory, or None if
        the array type or ``dtype`` is not supported (a message is printed)
    :rtype: GPUArray
    """
    # Check the container type first (subclasses are accepted), without
    # uploading anything yet.
    is_numpy = isinstance(array, np.ndarray)
    is_gpu = isinstance(array, cua.GPUArray)
    if not (is_numpy or is_gpu):
        print('ERROR: Array type neither NumPy or GPUArray')
        return None

    # Pick the kernel and the output element type for the requested dtype.
    if dtype == 'real':
        kernel_name = "zeroPadKernel"
        output_dtype = np.float32
    elif dtype == 'complex':
        kernel_name = "zeroPadComplexKernel"
        output_dtype = np.complex64
    else:
        print('Incorrect data type in zeropadToGPU')
        return None

    # NOTE(review): size[1] is divided by block_size[1] for the first grid
    # entry and size[0] by block_size[0] for the second; for non-square
    # blocks verify that this matches the kernel's x/y mapping.
    grid_size = (int(np.ceil(float(size[1]) / block_size[1])),
                 int(np.ceil(float(size[0]) / block_size[0])))

    # Scalars are wrapped as 32-bit integers for the kernel call.
    ay, ax = (np.int32(extent) for extent in array.shape)
    offsetx = np.int32(offset[0])
    offsety = np.int32(offset[1])
    sy = np.int32(size[0])
    sx = np.int32(size[1])

    # Only host arrays need a transfer; GPU arrays are used in place.
    array_gpu = array if is_gpu else cua.to_gpu(array)

    mod = cuda.module_from_buffer(cubin)
    zero_pad = mod.get_function(kernel_name)
    output = cua.zeros(size, output_dtype)
    zero_pad(output.gpudata, sy, sx,
             array_gpu.gpudata, ay, ax, offsetx, offsety,
             block=block_size, grid=grid_size)
    return output
def lidar_to_top_cuda(lidar):
    """Build the top-view feature map of a lidar scan with two CUDA kernels.

    The scan is cropped to the open box (TOP_X_MIN, TOP_X_MAX) x
    (TOP_Y_MIN, TOP_Y_MAX) x (TOP_Z_MIN, TOP_Z_MAX), its x/y coordinates are
    quantised to cell indices, and the points are sorted by (x, y, z) before
    being handed to the ``lidar_to_top`` and ``lidar_to_top_density``
    kernels.  The caller's array is not modified.

    :param lidar: (N, 4) array, columns are (x, y, z, i) in lidar coordinates
    :return: float32 array of shape (height, width, channel); the last
        channel holds ``min(log(count + 1) / log(32), 1)`` computed from the
        density kernel's output, the remaining channels come from the
        ``lidar_to_top`` kernel (its own last channel is discarded)
    """
    mod = cuda.module_from_buffer(module_buff)
    func = mod.get_function('_Z12lidar_to_topPfPiS0_S0_S_S_S0_')
    func_density = mod.get_function('_Z20lidar_to_top_densityPfPiS0_S0_S0_')

    # trunc: one boolean mask instead of six successive passes.  Boolean
    # indexing returns a copy, so the in-place voxelisation below never
    # touches the caller's array.
    keep = ((lidar[:, 0] > TOP_X_MIN) & (lidar[:, 0] < TOP_X_MAX) &
            (lidar[:, 1] > TOP_Y_MIN) & (lidar[:, 1] < TOP_Y_MAX) &
            (lidar[:, 2] > TOP_Z_MIN) & (lidar[:, 2] < TOP_Z_MAX))
    lidar = lidar[keep]

    # shape
    X0, Xn = 0, int((TOP_X_MAX - TOP_X_MIN) // TOP_X_DIVISION) + 1
    Y0, Yn = 0, int((TOP_Y_MAX - TOP_Y_MIN) // TOP_Y_DIVISION) + 1
    Z0, Zn = 0, int((TOP_Z_MAX - TOP_Z_MIN) / TOP_Z_DIVISION)
    height = Xn - X0
    width = Yn - Y0
    # z slices plus two extra channels; intensity and density are not
    # calculated separately in the kernel function
    channel = Zn - Z0 + 2

    if lidar.shape[0] == 0:
        # Nothing survived the crop.  Launching with a grid dimension of 0
        # is invalid, and an all-zero map is what the arithmetic below
        # yields for zero points (log(0 + 1) == 0).
        return np.zeros((height, width, channel), dtype=np.float32)

    top = np.zeros(shape=(height, width, channel), dtype=np.float32)
    top_density = np.zeros(shape=(height, width, 1), dtype=np.float32)
    top_shape = np.array(top.shape).astype(np.int32)
    lidar_shape = np.array(lidar.shape).astype(np.int32)

    # voxelize lidar: x and y become integer cell indices, z stays fractional
    lidar[:, 0] = ((lidar[:, 0] - TOP_X_MIN) // TOP_X_DIVISION).astype(np.int32)
    lidar[:, 1] = ((lidar[:, 1] - TOP_Y_MIN) // TOP_Y_DIVISION).astype(np.int32)
    lidar[:, 2] = (lidar[:, 2] - TOP_Z_MIN) / TOP_Z_DIVISION

    # sort by x, then y, then z (np.lexsort uses its last key as primary)
    lidar = lidar[np.lexsort((lidar[:, 2], lidar[:, 1], lidar[:, 0])), :]

    lidar_x = np.ascontiguousarray(lidar[:, 0].astype(np.int32))
    lidar_y = np.ascontiguousarray(lidar[:, 1].astype(np.int32))
    # The mangled kernel name encodes float* for these two parameters, so
    # force float32 in case the scan arrives as float64.
    lidar_z = np.ascontiguousarray(lidar[:, 2], dtype=np.float32)
    lidar_i = np.ascontiguousarray(lidar[:, 3], dtype=np.float32)

    func(
        cuda.InOut(top),
        cuda.In(top_shape),
        cuda.In(lidar_x),
        cuda.In(lidar_y),
        cuda.In(lidar_z),
        cuda.In(lidar_i),
        cuda.In(lidar_shape),
        # intensity and density channel are not calculated separately
        block=(channel, 1, 1),  # a thread <-> a channel
        grid=(int(lidar_shape[0]), 1, 1)  # a block <-> a point in laser scan
    )
    # The density kernel is launched with a single thread.
    func_density(
        cuda.InOut(top_density),
        cuda.In(lidar_x),
        cuda.In(lidar_y),
        cuda.In(lidar_shape),
        cuda.In(top_shape),
        block=(1, 1, 1),
        grid=(1, 1, 1)
    )

    # log-compress the per-cell counts and saturate at 1
    top_density = (np.log(top_density.astype(np.int32) + 1) /
                   math.log(32)).clip(max=1).astype(np.float32)
    return np.dstack([top[:, :, :-1], top_density])
def lidar_to_front_cuda(lidar):
    """Build the front-view feature map of a lidar scan with CUDA kernels.

    The scan is cropped with the same box as the top view, every point is
    projected to an integer (column, row) cell via its azimuth and
    elevation angles, and points outside the configured front-view window
    are dropped.  One kernel counts the points per cell, a second one
    accumulates (height, distance, intensity) per cell, and the sums are
    divided by the counts.

    :param lidar: (N, 4) array, columns are (x, y, z, i) in lidar coordinates
    :return: float32 array of shape (cfg.FRONT_WIDTH, cfg.FRONT_HEIGHT, 3)
        with the per-cell averages of height, distance and intensity
    """
    mod = cuda.module_from_buffer(module_buff)
    func_add_points = mod.get_function('_Z25lidar_to_front_add_pointsPiS_S_S_')
    func_fill_front = mod.get_function(
        '_Z25lidar_to_front_fill_frontPfS_PiS0_')

    def cal_height(points):
        # z shifted by the sensor height, floored at 0
        return np.clip(points[:, 2] + cfg.VELODYNE_HEIGHT,
                       a_min=0, a_max=None).astype(np.float32).reshape((-1, 1))

    def cal_distance(points):
        # Euclidean distance over x, y, z only.  The previous code summed
        # the squares of all four columns and therefore mixed the intensity
        # value into the distance.
        return np.sqrt(np.sum(points[:, :3]**2, axis=1)).astype(
            np.float32).reshape((-1, 1))

    def cal_intensity(points):
        return points[:, 3].astype(np.float32).reshape((-1, 1))

    def to_front(points):
        # (column, row) = (azimuth, elevation) divided by the angular
        # resolutions, truncated to int32
        return np.array([
            np.arctan2(points[:, 1], points[:, 0]) /
            cfg.VELODYNE_ANGULAR_RESOLUTION,
            np.arctan2(points[:, 2],
                       np.sqrt(points[:, 0]**2 + points[:, 1]**2)) /
            cfg.VELODYNE_VERTICAL_RESOLUTION
        ], dtype=np.int32).T

    # using the same crop method as top view (single mask instead of six
    # passes; boolean indexing copies, the caller's array is untouched)
    keep = ((lidar[:, 0] > TOP_X_MIN) & (lidar[:, 0] < TOP_X_MAX) &
            (lidar[:, 1] > TOP_Y_MIN) & (lidar[:, 1] < TOP_Y_MAX) &
            (lidar[:, 2] > TOP_Z_MIN) & (lidar[:, 2] < TOP_Z_MAX))
    lidar = lidar[keep]

    points = to_front(lidar)

    # keep cells strictly inside the configured angular window
    keep = ((cfg.FRONT_C_MIN < points[:, 0]) & (points[:, 0] < cfg.FRONT_C_MAX) &
            (cfg.FRONT_R_MIN < points[:, 1]) & (points[:, 1] < cfg.FRONT_R_MAX))
    points, lidar = points[keep], lidar[keep]

    points[:, 0] += int(cfg.FRONT_C_OFFSET)
    points[:, 1] += int(cfg.FRONT_R_OFFSET)

    # keep cells that land inside the output image after the offset
    keep = ((0 <= points[:, 0]) & (points[:, 0] < cfg.FRONT_WIDTH) &
            (0 <= points[:, 1]) & (points[:, 1] < cfg.FRONT_HEIGHT))
    points, lidar = points[keep], lidar[keep]

    channel = 3  # height, distance, intensity
    front = np.zeros((cfg.FRONT_WIDTH, cfg.FRONT_HEIGHT, channel),
                     dtype=np.float32)

    if points.shape[0] == 0:
        # No point falls into the view: the averages are all zero, and
        # zero-size host buffers must not be handed to the kernels.
        return front

    # sort for mem friendly access: by column, then row
    idx = np.lexsort((points[:, 1], points[:, 0]))
    # `points` originates from a transposed array, so force a row-major
    # buffer before it is copied to the device.
    points = np.ascontiguousarray(points[idx, :])
    lidar = lidar[idx, :]

    weight_mask = np.zeros(front.shape[:2], dtype=np.int32)

    # one row per point: (column, row, height, distance, intensity)
    buf = np.hstack((points, cal_height(lidar), cal_distance(lidar),
                     cal_intensity(lidar))).astype(np.float32)

    # count the points per cell (single-thread launch)
    func_add_points(
        cuda.InOut(weight_mask),
        cuda.In(points),
        cuda.In(np.array(weight_mask.shape).astype(np.int32)),
        cuda.In(np.array(points.shape).astype(np.int32)),
        block=(1, 1, 1),
        grid=(1, 1, 1),  # points
    )
    weight_mask[weight_mask == 0] = 1  # 0 and 1 are both 1

    # accumulate the three features per cell, one thread per channel
    func_fill_front(
        cuda.InOut(front),
        cuda.In(buf),
        cuda.In(np.array(front.shape).astype(np.int32)),
        cuda.In(np.array(buf.shape).astype(np.int32)),
        block=(3, 1, 1),  # channel
        grid=(1, 1, 1)  # points
    )

    # sums -> averages
    front /= weight_mask[:, :, np.newaxis]
    return front
def get_kernel(self, kernel_filename, include_dirs=[], no_extern_c=True, defines={}):
    """
    Return the CUDA module for a kernel file, using the caches when possible.

    The cache key combines the kernel file name, the hash returned by
    CUDAContext.hash_kernel for the kernel and its include directories,
    and an MD5 of ``str(defines)``.  Lookup order: in-memory dictionary,
    on-disk binary (only when self.use_cache is set), fresh compilation.

    :param kernel_filename: file name relative to <module_path>/gpu_kernels
    :param include_dirs: extra include directories for hashing and compiling
    :param no_extern_c: forwarded to the compiler
    :param defines: mapping written as ``#define key value`` lines ahead of
        the ``#include`` of the kernel file
    :return: the loaded module
    """

    def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
        # Forward the compilation/loading output to the debug log
        self.logger.debug("Compilation returned %s", str(compile_success_bool))
        if info_str:
            self.logger.debug("Info: %s", info_str)
        if error_str:
            self.logger.debug("Error: %s", error_str)

    self.logger.debug("Getting %s", kernel_filename)

    # Fingerprint of the defines
    defines_hash = hashlib.md5(str(defines).encode('utf-8')).hexdigest()

    # Fingerprint of the kernel source and everything it may include
    stem, suffix = os.path.splitext(kernel_filename)
    kernel_path = os.path.abspath(
        os.path.join(self.module_path, "gpu_kernels", kernel_filename))
    source_hash = CUDAContext.hash_kernel(
        kernel_path,
        include_dirs=[os.path.join(self.module_path, "../kernels")] + include_dirs)
    kernel_hash = stem + "_" + source_hash + "_" + defines_hash + suffix
    cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

    # 1) Already loaded in this context
    if kernel_hash in self.kernels:
        self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
        return self.kernels[kernel_hash]

    # 2) Binary left on disk by an earlier run
    if self.use_cache and os.path.isfile(cached_kernel_filename):
        self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
        with io.open(cached_kernel_filename, "rb") as cache_file:
            binary = cache_file.read()
        module = cuda.module_from_buffer(
            binary, message_handler=cuda_compile_message_handler)
        self.kernels[kernel_hash] = module
        return module

    # 3) Compile from source
    self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)

    # The translation unit is the defines followed by one #include
    define_lines = ["#define {:s} {:s}\n".format(str(key), str(value))
                    for key, value in defines.items()]
    kernel_string = "".join(define_lines) + '#include "{:s}"'.format(str(kernel_path))

    if self.use_cache:
        # Keep the generated source next to the binary
        with io.open(cached_kernel_filename + ".txt", "w") as source_file:
            # A text-mode io file needs text; decode if the string is bytes
            if isinstance(kernel_string, bytes):
                kernel_string = bytes.decode(kernel_string)
            source_file.write(kernel_string)

    with Timer("compiler"):
        cubin = cuda_compiler.compile(
            kernel_string,
            include_dirs=include_dirs,
            no_extern_c=no_extern_c,
            cache_dir=False)
        module = cuda.module_from_buffer(
            cubin, message_handler=cuda_compile_message_handler)
        if self.use_cache:
            with io.open(cached_kernel_filename, "wb") as cache_file:
                cache_file.write(cubin)

    self.kernels[kernel_hash] = module
    return module
def get_kernel(self, kernel_filename, include_dirs=[], defines={}, compile_args={'no_extern_c': True}, jit_compile_args={}):
    """
    Return the CUDA module for a kernel file, using the caches when possible.

    The cache key combines the kernel file name, the hash returned by
    CUDAContext.hash_kernel for the kernel and its include directories,
    and an MD5 of ``str(defines)`` followed by ``str(compile_args)``.
    ``jit_compile_args`` is not part of the key.

    :param kernel_filename: file name relative to <module_path>/gpu_kernels
    :param include_dirs: extra include directories for hashing and compiling
    :param defines: mapping written as ``#define key value`` lines ahead of
        the ``#include`` of the kernel file
    :param compile_args: extra keyword arguments for the compiler
    :param jit_compile_args: extra keyword arguments for loading the module
    :return: the loaded module
    """

    def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
        # Forward the compilation/loading output to the debug log
        self.logger.debug("Compilation returned %s", str(compile_success_bool))
        if info_str:
            self.logger.debug("Info: %s", info_str)
        if error_str:
            self.logger.debug("Error: %s", error_str)

    def load_module(binary):
        # Load a binary image with the caller's JIT options
        return cuda.module_from_buffer(
            binary,
            message_handler=cuda_compile_message_handler,
            **jit_compile_args)

    self.logger.debug("Getting %s", kernel_filename)

    # Fingerprint of everything that changes the compiled output
    option_bytes = str(defines).encode('utf-8') + str(compile_args).encode('utf-8')
    options_hash = hashlib.md5(option_bytes).hexdigest()

    name_root, name_ext = os.path.splitext(kernel_filename)
    kernel_path = os.path.abspath(
        os.path.join(self.module_path, "gpu_kernels", kernel_filename))
    hash_include_dirs = [os.path.join(self.module_path, "../kernels")] + include_dirs
    kernel_hash = "_".join([
        name_root,
        CUDAContext.hash_kernel(kernel_path, include_dirs=hash_include_dirs),
        options_hash,
    ]) + name_ext
    cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

    # In-memory hit
    if kernel_hash in self.kernels:
        self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
        return self.kernels[kernel_hash]

    # On-disk hit
    if self.use_cache and os.path.isfile(cached_kernel_filename):
        self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
        with io.open(cached_kernel_filename, "rb") as binary_file:
            file_str = binary_file.read()
        self.kernels[kernel_hash] = load_module(file_str)
        return self.kernels[kernel_hash]

    # Miss: compile from source
    self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)

    # Defines first, then a single #include of the kernel file
    pieces = []
    for key, value in defines.items():
        pieces.append("#define {:s} {:s}\n".format(str(key), str(value)))
    pieces.append('#include "{:s}"'.format(str(kernel_path)))
    kernel_string = "".join(pieces)

    if self.use_cache:
        # Store the generated source alongside the binary
        with io.open(cached_kernel_filename + ".txt", "w") as text_file:
            # A text-mode io file needs text; decode if the string is bytes
            if isinstance(kernel_string, bytes):
                kernel_string = bytes.decode(kernel_string)
            text_file.write(kernel_string)

    with Timer("compiler"):
        cubin = cuda_compiler.compile(
            kernel_string,
            include_dirs=include_dirs,
            cache_dir=False,
            **compile_args)
        module = load_module(cubin)
        if self.use_cache:
            with io.open(cached_kernel_filename, "wb") as binary_file:
                binary_file.write(cubin)

    self.kernels[kernel_hash] = module
    return self.kernels[kernel_hash]
def load(cls, name=None):
    """Compile and load the class's CUDA module once.

    Does nothing when ``cls.mod`` is already set.  Otherwise the code
    assembled from ``cls.lib`` is compiled under ``name`` (the lower-cased
    class name when ``name`` is None) and the resulting module is stored
    in ``cls.mod``.
    """
    if cls.mod is not None:
        return
    module_name = cls.__name__.lower() if name is None else name
    # NOTE(review): `compile` is called with two arguments, so it is
    # presumably a project helper rather than the builtin - confirm.
    binary = compile(module_name, assemble_code(cls.lib))
    cls.mod = cuda.module_from_buffer(binary)
def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
                        prepared_call_args, \
                        include_dirs=[], no_extern_c=True,
                        **kwargs):
    """
    Return a prepared kernel function, compiling or loading it on demand.

    The binary is identified by the kernel file name, the hash returned by
    CudaContext.hash_kernel for the file and its include directories, and
    an MD5 of ``str(kwargs)``.  Lookup order: in-memory dictionary, on-disk
    binary (only when self.use_cache is set), fresh compilation.

    :param kernel_filename: kernel file, relative to self.module_path
    :param kernel_function_name: name of the function inside the module
    :param prepared_call_args: argument description given to ``prepare``
    :param include_dirs: extra include directories for hashing and compiling
    :param no_extern_c: forwarded to the compiler
    :param kwargs: written as ``#define key value`` lines ahead of the
        ``#include`` of the kernel file
    :return: the function object after ``prepare`` has been called on it
    """

    def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
        # Forward the compilation/loading output to the debug log
        self.logger.debug("Compilation returned %s", str(compile_success_bool))
        if info_str:
            self.logger.debug("Info: %s", info_str)
        if error_str:
            self.logger.debug("Error: %s", error_str)

    kernel_filename = os.path.normpath(kernel_filename)

    # Create a hash of the kernel (and its includes)
    kwargs_hash = hashlib.md5(str(kwargs).encode('utf-8')).hexdigest()
    root, ext = os.path.splitext(kernel_filename)
    kernel_hash = root \
        + "_" + CudaContext.hash_kernel( \
            os.path.join(self.module_path, kernel_filename), \
            include_dirs=[self.module_path] + include_dirs) \
        + "_" + kwargs_hash \
        + ext
    cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

    # The binary on disk is shared by every function of the module, but the
    # in-memory entry is one prepared function.  Its key therefore has to
    # include the function name and the prepared signature; otherwise a
    # second function requested from the same file would get the first one.
    memory_key = "{:s}:{:s}:{:s}".format(
        kernel_hash, str(kernel_function_name), str(prepared_call_args))

    def prepare_and_store(module):
        # Look up the function, prepare it and remember it for later calls
        kernel = module.get_function(kernel_function_name)
        kernel.prepare(prepared_call_args)
        self.kernels[memory_key] = kernel
        return kernel

    # If we have the kernel in our hashmap, return it
    if memory_key in self.kernels:
        self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
        return self.kernels[memory_key]

    # If we have it on disk, return it
    if self.use_cache and os.path.isfile(cached_kernel_filename):
        self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
        with io.open(cached_kernel_filename, "rb") as cache_file:
            file_str = cache_file.read()
        module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
        return prepare_and_store(module)

    # Otherwise, compile it from source
    self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)

    # Create kernel string: the defines followed by a single #include
    kernel_string = "".join(
        "#define {:s} {:s}\n".format(str(key), str(value))
        for key, value in kwargs.items())
    kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))

    if self.use_cache:
        # kernel_filename may contain sub-directories, possibly several
        # levels deep, so create the whole chain rather than one level.
        cached_kernel_dir = os.path.dirname(cached_kernel_filename)
        if cached_kernel_dir and not os.path.isdir(cached_kernel_dir):
            os.makedirs(cached_kernel_dir)
        with io.open(cached_kernel_filename + ".txt", "w") as source_file:
            source_file.write(kernel_string)

    with Common.Timer("compiler"):
        cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
        module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
        if self.use_cache:
            with io.open(cached_kernel_filename, "wb") as cache_file:
                cache_file.write(cubin)

    return prepare_and_store(module)