from pycuda.driver import Context


def init_the_device_if_needed(do_it_anyway=False):
    if do_it_anyway:
        print('import pycuda.autoinit')
        import pycuda.autoinit
        return
    try:
        Context.get_device()
    except:
        # Presumably, the line above failed because of something like this:
        # "LogicError: cuCtxGetDevice failed: not initialized"
        # -- initialize the device
        print('import pycuda.autoinit')
        import pycuda.autoinit
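# A minimal usage sketch for the helper above (an addition, not part of the
# original source). Note that a second call with do_it_anyway=True is
# effectively a no-op: Python caches pycuda.autoinit after the first import.
init_the_device_if_needed()                    # initializes only if no context is active
init_the_device_if_needed(do_it_anyway=True)   # forces the pycuda.autoinit import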
def mem_alloc(nbytes):
    """Allocates device memory of the given size from the memory pool.

    This function chooses the memory pool corresponding to the current
    device.

    Args:
        nbytes (int): The size of memory in bytes.

    Returns:
        pycuda.tools.PooledDeviceAllocation: Allocated memory with an
        additional ``device`` attribute. This attribute is used to
        determine on which GPU the memory resides.

    """
    global _pools

    device = Context.get_device()
    pool = _pools.get(device, None)
    if pool is None:
        pool = drv.DeviceMemoryPool()
        _pools[device] = pool

    allocation = pool.allocate(nbytes)
    setattr(allocation, 'device', device)
    return allocation
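# Hedged usage sketch (assumes init() shown later in this listing has already
# been called so that the per-device pool dict exists):
buf = mem_alloc(1024 * 1024)   # 1 MB from the current device's pool
print(buf.device)              # the pycuda.driver.Device the allocation lives on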
def compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], target="cubin"): assert target in ["cubin", "ptx", "fatbin"] if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] if arch is None: from pycuda.driver import Error try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except Error: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if cache_dir is None: from os.path import join import appdirs cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1") from os import makedirs try: makedirs(cache_dir) except OSError as e: from errno import EEXIST if e.errno != EEXIST: raise if arch is not None: options.extend(["-arch", arch]) if code is not None: options.extend(["-code", code]) if 'darwin' in sys.platform and sys.maxint == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 2147483647: options.append('-m32') include_dirs = include_dirs + [_find_pycuda_include_path()] for i in include_dirs: options.append("-I"+i) return compile_plain(source, options, keep, nvcc, cache_dir, target)
def compile(source, nvcc="nvcc", options=[], keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source options = options[:] if arch is None: try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except RuntimeError: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if cache_dir is None: from os.path import join from tempfile import gettempdir cache_dir = join(gettempdir(), "pycuda-compiler-cache-v1-%s" % _get_per_user_string()) from os import mkdir try: mkdir(cache_dir) except OSError, e: from errno import EEXIST if e.errno != EEXIST: raise
def __init__(self, nvcc='nvcc', link_options=None, keep=False,
             no_extern_c=False, arch=None, code=None, cache_dir=None,
             include_dirs=[], message_handler=None, log_verbose=False,
             cuda_libdir=None):
    from pycuda.driver import Context
    compute_capability = Context.get_device().compute_capability()
    if compute_capability < (3, 5):
        raise Exception(
            'Minimum compute capability for dynamic parallelism is 3.5 '
            '(found: %u.%u)!'
            % (compute_capability[0], compute_capability[1]))
    else:
        from pycuda.driver import Linker
        self.linker = Linker(message_handler, link_options, log_verbose)
    self._check_arch(arch)
    self.nvcc = nvcc
    self.keep = keep
    self.no_extern_c = no_extern_c
    self.arch = arch
    self.code = code
    self.cache_dir = cache_dir
    self.include_dirs = include_dirs
    self.cuda_libdir = cuda_libdir
    self.libdir, self.libptn = None, None
    self.module = None
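# This __init__ belongs to pycuda's DynamicSourceModule, which is used like
# SourceModule but links against the device runtime so kernels can launch
# child kernels. A hedged sketch; kernel_src and "parent_kernel" are
# hypothetical names:
from pycuda.compiler import DynamicSourceModule

mod = DynamicSourceModule(kernel_src)        # requires compute capability >= 3.5
parent = mod.get_function("parent_kernel")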
def get_device(arg=None):
    """Gets the device from an ID ``arg`` or from a chainer
    :class:`~pycuda.gpuarray.GPUArray`.

    Args:
        arg: Value to specify a GPU device.

    Returns:
        Device object specified by the given ``arg``.

        The rule of device selection is as follows.

        ==================================== =====================================
        Type of ``arg``                      Return value
        ==================================== =====================================
        ``None``                             Current device
        ``int``                              Device of ID ``arg``
        :class:`~pycuda.driver.Device`       ``arg``
        :class:`~pycuda.gpuarray.GPUArray`   Device given array was allocated on
        :class:`~numpy.ndarray`              ``None``
        ==================================== =====================================

    """
    if arg is None:
        return Context.get_device()
    elif isinstance(arg, Device):
        return arg
    elif isinstance(arg, numpy.ndarray):
        return None
    elif isinstance(arg, GPUArray):
        while not hasattr(arg.gpudata, 'device'):
            arg = arg.base
        return arg.gpudata.device
    return drv.Device(arg)
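# A short sketch of the dispatch rules from the table above (requires an
# initialized context; values are illustrative):
import numpy

get_device()                  # -> the current pycuda.driver.Device
get_device(0)                 # -> Device with ID 0
get_device(numpy.empty(3))    # -> None, since the array lives on the CPU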
def init(device=None):
    """Initializes CUDA global state.

    Chainer maintains a CUDA context, a CUBLAS context, a random number
    generator and a device memory pool for each GPU device and for each
    process (the main process or a process forked by
    :mod:`multiprocessing`) as global states. When called for the first
    time on the process, this function initializes these global states.

    .. warning::

       This function also initializes PyCUDA and scikits.cuda. Since these
       packages do not support forking after initialization, do not call
       this function before forking the process.

    This function also registers :func:`shutdown` to the :mod:`atexit` slot.

    It also initializes the random number generator. Users can set a fixed
    seed with the ``CHAINER_SEED`` environment variable.

    Args:
        device (``int`` or :class:`~pycuda.driver.Device` or ``None``):
            Device ID to initialize on.

    """
    global _contexts, _cublas_handles, _generators, _pid, _pools

    if not available:
        global _import_error
        raise RuntimeError(
            'CUDA environment is not correctly set up. ' +
            'The original import error said: ' + str(_import_error))

    pid = os.getpid()
    if _pid == pid:  # already initialized
        return

    drv.init()

    if device is None:  # use default device
        context = cutools.make_default_context()
        device = Context.get_device()
    else:
        device = Device(device)
        context = device.make_context()
    _contexts = {device: context}
    _generators = {}
    _pools = {}
    _cublas_handles = {}
    cumisc.init(mem_alloc)

    seed(os.environ.get('CHAINER_SEED'))

    _pid = pid  # mark as initialized
    atexit.register(shutdown)
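# Typical hedged usage: one call before any GPU work; the seed must be set in
# the environment before init() for it to take effect.
import os

os.environ['CHAINER_SEED'] = '0'   # optional: fix the RNG seed
init()                             # default device; init(1) would pin GPU 1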
def _check_arch(self, arch):
    if arch is None:
        return
    try:
        from pycuda.driver import Context
        capability = Context.get_device().compute_capability()
        if tuple(map(int, tuple(arch.split("_")[1]))) > capability:
            from warnings import warn
            warn("trying to compile for a compute capability "
                 "higher than selected GPU")
    except:
        pass
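# Worked example of the arch-string parsing above: for arch "sm_35",
# arch.split("_")[1] is "35", and mapping int over its characters yields the
# tuple that is compared against compute_capability():
assert tuple(map(int, tuple("sm_35".split("_")[1]))) == (3, 5)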
def get_cublas_handle():
    """Gets the CUBLAS handle for the current device.

    Returns:
        CUBLAS handle.

    """
    global _cublas_handles

    device = Context.get_device()
    if device in _cublas_handles:
        return _cublas_handles[device]

    handle = cublas.cublasCreate()
    _cublas_handles[device] = handle
    return handle
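# The handle is created lazily and cached per device, so repeated calls are
# cheap; a sketch grounded in the code above:
h1 = get_cublas_handle()
h2 = get_cublas_handle()
assert h1 == h2   # same cached handle for the same device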
def compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] if arch is None: try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except RuntimeError: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if cache_dir is None: from os.path import join import appdirs cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1") from os import makedirs try: makedirs(cache_dir) except OSError, e: from errno import EEXIST if e.errno != EEXIST: raise
def has_double_support():
    from pycuda.driver import Context
    return Context.get_device().compute_capability() >= (1, 3)
def has_stack():
    from pycuda.driver import Context
    return Context.get_device().compute_capability() >= (2, 0)
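# These predicates typically gate feature-dependent code paths; a hedged
# sketch choosing a dtype based on device support:
import numpy

dtype = numpy.float64 if has_double_support() else numpy.float32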
def compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], target="cubin"): assert target in ["cubin", "ptx", "fatbin"] if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] if arch is None: from pycuda.driver import Error try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except Error: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if "PYCUDA_CACHE_DIR" in os.environ and cache_dir is None: cache_dir = os.environ["PYCUDA_CACHE_DIR"] if "PYCUDA_DISABLE_CACHE" in os.environ: cache_dir = False if cache_dir is None: from os.path import join import appdirs cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1") from os import makedirs try: makedirs(cache_dir) except OSError as e: from errno import EEXIST if e.errno != EEXIST: raise if arch is not None: options.extend(["-arch", arch]) if code is not None: options.extend(["-code", code]) if 'darwin' in sys.platform and sys.maxsize == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 2147483647: options.append('-m32') include_dirs = include_dirs + [_find_pycuda_include_path()] for i in include_dirs: options.append("-I" + i) return compile_plain(source, options, keep, nvcc, cache_dir, target)
            Ts[i_20] += An[8]/my_factorial;
            Ts[i_21] += An[9]/my_factorial;
            Ts[i_22] += An[10]/my_factorial;
            Ts[i_23] += An[11]/my_factorial;
        }
    }
}
"""

try:
    Context.get_device()
except:
    import pycuda.autoinit

mod = SourceModule(krnl, no_extern_c=True)
_gpu_expm = mod.get_function("expm")


def gpu_expm(As, Ts_vectorized, p=12):
    N = len(As)
    if Ts_vectorized.ndim != 2 or Ts_vectorized.shape[1] != 12:
        raise ValueError(Ts_vectorized.shape)
    # threadsPerBlock = 1024
    # Regardless of the value of N, for some reason this gives errors
    # (only) on the machines with the good graphics
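# A hedged sketch of the launch-configuration arithmetic the trailing comment
# alludes to; 512 is an assumption, chosen below the 1024 the comment reports
# as problematic:
threads_per_block = 512
num_blocks = (N + threads_per_block - 1) // threads_per_block   # ceil(N / tpb)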
def compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] if arch is None: try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except RuntimeError: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if cache_dir is None: from os.path import join from tempfile import gettempdir cache_dir = join( gettempdir(), "pycuda-compiler-cache-v1-%s" % _get_per_user_string()) from os import mkdir try: mkdir(cache_dir) except OSError as e: from errno import EEXIST if e.errno != EEXIST: raise if arch is not None: options.extend(["-arch", arch]) if code is not None: options.extend(["-code", code]) if 'darwin' in sys.platform and sys.maxint == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 2147483647: options.append('-m32') include_dirs = include_dirs + [_find_pycuda_include_path()] for i in include_dirs: options.append("-I" + i) return compile_plain(source, options, keep, nvcc, cache_dir)
#!/usr/bin/env python
"""
Created on Wed Sep 3 11:08:37 2014

Author: Oren Freifeld
Email: [email protected]
"""
from pycuda.compiler import SourceModule
from pycuda.driver import Context

try:
    Context.get_device()
except:
    import pycuda.autoinit


class KernelThinWrapper(object):
    def __init__(self, gpu_kernel, include_dirs=[]):
        self._gpu_kernel = gpu_kernel
        self._src_module = SourceModule(gpu_kernel, include_dirs=include_dirs)

    def _get_function_from_src_module(self, func_name):
        self.__dict__["_gpu_" + func_name] = (
            self._src_module.get_function(func_name))

    def __call__(self, *args, **kwargs):
        msg = """
        You need to customize this method in the derived class.
        The customized method will usually have 3 parts:
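# A hedged sketch of how a derived class would use the wrapper; the kernel
# source MY_KERNEL_SRC and the function name "my_func" are hypothetical:
class MyOp(KernelThinWrapper):
    def __init__(self):
        super(MyOp, self).__init__(gpu_kernel=MY_KERNEL_SRC)
        # binds the compiled kernel as self._gpu_my_func
        self._get_function_from_src_module('my_func')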