def test_LRU_cache6(self): # test if each device has a separate cache cache0 = self.caches[0] cache1 = self.caches[1] # ensure a fresh state assert cache0.get_curr_size() == 0 <= cache0.get_size() assert cache1.get_curr_size() == 0 <= cache1.get_size() # do some computation on GPU 0 with device.Device(0): a = testing.shaped_random((10, ), cupy, cupy.float32) cupy.fft.fft(a) assert cache0.get_curr_size() == 1 <= cache0.get_size() assert cache1.get_curr_size() == 0 <= cache1.get_size() # do some computation on GPU 1 with device.Device(1): c = testing.shaped_random((16, ), cupy, cupy.float64) cupy.fft.fft(c) assert cache0.get_curr_size() == 1 <= cache0.get_size() assert cache1.get_curr_size() == 1 <= cache1.get_size() # reset device 0 cache0.clear() assert cache0.get_curr_size() == 0 <= cache0.get_size() assert cache1.get_curr_size() == 1 <= cache1.get_size() # reset device 1 cache1.clear() assert cache0.get_curr_size() == 0 <= cache0.get_size() assert cache1.get_curr_size() == 0 <= cache1.get_size()
def check(self, device_id): if (cupy.cuda.runtime.is_hip and self.allocator is memory.malloc_managed): raise unittest.SkipTest('HIP does not support managed memory') size = 24 shape = (2, 3) dtype = cupy.float32 with device.Device(device_id): src_mem_ptr = self.allocator(size) src_ptr = src_mem_ptr.ptr args = (src_ptr, size, src_mem_ptr) kwargs = {} if self.specify_device_id: kwargs = {'device_id': device_id} unowned_mem = memory.UnownedMemory(*args, **kwargs) assert unowned_mem.size == size assert unowned_mem.ptr == src_ptr assert unowned_mem.device_id == device_id arr = cupy.ndarray(shape, dtype, memory.MemoryPointer(unowned_mem, 0)) # Delete the source object del src_mem_ptr with device.Device(device_id): arr[:] = 2 assert (arr == 2).all()
def check(self, device_id): if cupy.cuda.runtime.is_hip: if self.allocator is memory.malloc_managed: raise unittest.SkipTest('HIP does not support managed memory') if self.allocator is memory.malloc_async: raise unittest.SkipTest('HIP does not support async mempool') elif cupy.cuda.driver.get_build_version() < 11020: raise unittest.SkipTest('malloc_async is supported since ' 'CUDA 11.2') size = 24 shape = (2, 3) dtype = cupy.float32 with device.Device(device_id): src_mem_ptr = self.allocator(size) src_ptr = src_mem_ptr.ptr args = (src_ptr, size, src_mem_ptr) kwargs = {} if self.specify_device_id: kwargs = {'device_id': device_id} unowned_mem = memory.UnownedMemory(*args, **kwargs) assert unowned_mem.size == size assert unowned_mem.ptr == src_ptr assert unowned_mem.device_id == device_id arr = cupy.ndarray(shape, dtype, memory.MemoryPointer(unowned_mem, 0)) # Delete the source object del src_mem_ptr with device.Device(device_id): arr[:] = 2 assert (arr == 2).all()
def check(self, device_id): size = 24 shape = (2, 3) dtype = cupy.float32 with device.Device(device_id): src_mem_ptr = self.allocator(size) src_ptr = src_mem_ptr.ptr args = (src_ptr, size, src_mem_ptr) kwargs = {} if self.specify_device_id: kwargs = {'device_id': device_id} unowned_mem = memory.UnownedMemory(*args, **kwargs) assert unowned_mem.size == size assert unowned_mem.ptr == src_ptr assert unowned_mem.device_id == device_id arr = cupy.ndarray(shape, dtype, memory.MemoryPointer(unowned_mem, 0)) # Delete the source object del src_mem_ptr with device.Device(device_id): arr[:] = 2 assert (arr == 2).all()
def check(self, device_id): if cupy.cuda.runtime.is_hip: if self.allocator is memory.malloc_managed: if cupy.cuda.driver.get_build_version() < 40300000: raise unittest.SkipTest( 'Managed memory requires ROCm 4.3+') else: raise unittest.SkipTest( 'hipPointerGetAttributes does not support managed ' 'memory') if self.allocator is memory.malloc_async: raise unittest.SkipTest('HIP does not support async mempool') else: if self.allocator is memory.malloc_async: if cupy.cuda.driver._is_cuda_python(): version = cupy.cuda.runtime.runtimeGetVersion() else: version = cupy.cuda.driver.get_build_version() if version < 11020: raise unittest.SkipTest('malloc_async is supported since ' 'CUDA 11.2') elif runtime.deviceGetAttribute( runtime.cudaDevAttrMemoryPoolsSupported, 0) == 0: raise unittest.SkipTest( 'malloc_async is not supported on device 0') size = 24 shape = (2, 3) dtype = cupy.float32 with device.Device(device_id): src_mem_ptr = self.allocator(size) src_ptr = src_mem_ptr.ptr args = (src_ptr, size, src_mem_ptr) kwargs = {} if self.specify_device_id: kwargs = {'device_id': device_id} if cupy.cuda.runtime.is_hip and self.allocator is memory._malloc: # In ROCm, it seems that `hipPointerGetAttributes()`, which is # called in `UnownedMemory()`, requires an unmanaged device pointer # that the current device must be the one on which the memory # referred to by the pointer physically resides. with device.Device(device_id): unowned_mem = memory.UnownedMemory(*args, **kwargs) else: unowned_mem = memory.UnownedMemory(*args, **kwargs) assert unowned_mem.size == size assert unowned_mem.ptr == src_ptr assert unowned_mem.device_id == device_id arr = cupy.ndarray(shape, dtype, memory.MemoryPointer(unowned_mem, 0)) # Delete the source object del src_mem_ptr with device.Device(device_id): arr[:] = 2 assert (arr == 2).all()
def __init__(self, size): self.size = size self.ptr = ctypes.c_void_p() self._device = None if size > 0: self._device = device.Device() self.ptr = runtime.malloc(size)
def test_device_cache(self): @jit.rawkernel() def f(x, y): tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x y[tid] = x[tid] with device.Device(0): x = testing.shaped_random((30, ), dtype=numpy.int32, seed=0) y = testing.shaped_random((30, ), dtype=numpy.int32, seed=1) f((5, ), (6, ), (x, y)) assert bool((x == y).all()) with device.Device(1): x = testing.shaped_random((30, ), dtype=numpy.int32, seed=2) y = testing.shaped_random((30, ), dtype=numpy.int32, seed=3) f((5, ), (6, ), (x, y)) assert bool((x == y).all())
def tearDown(self): for i in range(n_devices): with device.Device(i): cache = config.get_plan_cache() cache.clear() cache.set_size(self.old_sizes[i]) cache.set_memsize(-1)
def _get_arch(): # See Supported Compile Options section of NVRTC User Guide for # the maximum value allowed for `--gpu-architecture`. nvrtc_max_compute_capability = _get_max_compute_capability() arch = device.Device().compute_capability if arch in _tegra_archs: return arch else: return min(arch, nvrtc_max_compute_capability)
def setUp(self): self.caches = [] self.old_sizes = [] for i in range(n_devices): with device.Device(i): cache = config.get_plan_cache() self.old_sizes.append(cache.get_size()) cache.clear() cache.set_memsize(-1) cache.set_size(2) self.caches.append(cache)
def _syevd(a, UPLO, with_eigen_vector): if UPLO not in ('L', 'U'): raise ValueError('UPLO argument must be \'L\' or \'U\'') # reject_float16=False for backward compatibility dtype, v_dtype = _util.linalg_common_type(a, reject_float16=False) real_dtype = dtype.char.lower() w_dtype = v_dtype.char.lower() # Note that cuSolver assumes fortran array v = a.astype(dtype, order='F', copy=True) m, lda = a.shape w = cupy.empty(m, real_dtype) dev_info = cupy.empty((), numpy.int32) handle = device.Device().cusolver_handle if with_eigen_vector: jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR else: jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR if UPLO == 'L': uplo = cublas.CUBLAS_FILL_MODE_LOWER else: # UPLO == 'U' uplo = cublas.CUBLAS_FILL_MODE_UPPER if dtype == 'f': buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize syevd = cupy.cuda.cusolver.ssyevd elif dtype == 'd': buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize syevd = cupy.cuda.cusolver.dsyevd elif dtype == 'F': buffer_size = cupy.cuda.cusolver.cheevd_bufferSize syevd = cupy.cuda.cusolver.cheevd elif dtype == 'D': buffer_size = cupy.cuda.cusolver.zheevd_bufferSize syevd = cupy.cuda.cusolver.zheevd else: raise RuntimeError('Only float and double and cuComplex and ' + 'cuDoubleComplex are supported') work_size = buffer_size( handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr) work = cupy.empty(work_size, dtype) syevd( handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr, work_size, dev_info.data.ptr) cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed( syevd, dev_info) return w.astype(w_dtype, copy=False), v.astype(v_dtype, copy=False)
def _get_arch(): global _nvrtc_max_compute_capability if _nvrtc_max_compute_capability is None: # See Supported Compile Options section of NVRTC User Guide for # the maximum value allowed for `--gpu-architecture`. major, minor = _get_nvrtc_version() if major < 9: # CUDA 7.0 / 7.5 / 8.0 _nvrtc_max_compute_capability = '50' else: # CUDA 9.0 / 9.1 _nvrtc_max_compute_capability = '70' cc = min(device.Device().compute_capability, _nvrtc_max_compute_capability) return 'compute_%s' % cc
def malloc(self, size): """Allocates the memory, from the pool if possible. This method can be used as a CuPy memory allocator. The simplest way to use a memory pool as the default allocator is the following code:: set_allocator(MemoryPool().malloc) Args: size (int): Size of the memory buffer to allocate in bytes. Returns: ~cupy.cuda.MemoryPointer: Pointer to the allocated buffer. """ dev = device.Device().id return self._pools[dev].malloc(size)
def _syevd(a, UPLO, with_eigen_vector): if UPLO not in ('L', 'U'): raise ValueError("UPLO argument must be 'L' or 'U'") if a.dtype == 'f' or a.dtype == 'e': dtype = 'f' ret_type = a.dtype else: # NumPy uses float64 when an input is not floating point number. dtype = 'd' ret_type = 'd' # Note that cuSolver assumes fortran array v = a.astype(dtype, order='F', copy=True) m, lda = a.shape w = cupy.empty(m, dtype) dev_info = cupy.empty((), 'i') handle = device.Device().cusolver_handle if with_eigen_vector: jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR else: jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR if UPLO == 'L': uplo = cublas.CUBLAS_FILL_MODE_LOWER else: # UPLO == 'U' uplo = cublas.CUBLAS_FILL_MODE_UPPER if dtype == 'f': buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize syevd = cupy.cuda.cusolver.ssyevd elif dtype == 'd': buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize syevd = cupy.cuda.cusolver.dsyevd else: raise RuntimeError('Only float and double are supported') work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr) work = cupy.empty(work_size, dtype) syevd(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr, work_size, dev_info.data.ptr) return w.astype(ret_type, copy=False), v.astype(ret_type, copy=False)
def _get_arch(): # See Supported Compile Options section of NVRTC User Guide for # the maximum value allowed for `--gpu-architecture`. major, minor = _get_nvrtc_version() if major < 10 or (major == 10 and minor == 0): # CUDA 9.x / 10.0 _nvrtc_max_compute_capability = '70' elif major < 11: # CUDA 10.1 / 10.2 _nvrtc_max_compute_capability = '75' else: # CUDA 11.0 / 11.1 _nvrtc_max_compute_capability = '80' arch = device.Device().compute_capability if arch in _tegra_archs: return arch else: return min(arch, _nvrtc_max_compute_capability)
def syevj(a, UPLO='L', with_eigen_vector=True): """Eigenvalue decomposition of symmetric matrix using cusolverDn<t>syevj(). Computes eigenvalues ``w`` and (optionally) eigenvectors ``v`` of a complex Hermitian or a real symmetric matrix. Args: a (cupy.ndarray): A symmetric 2-D square matrix ``(M, M)`` or a batch of symmetric 2-D square matrices ``(..., M, M)``. UPLO (str): Select from ``'L'`` or ``'U'``. It specifies which part of ``a`` is used. ``'L'`` uses the lower triangular part of ``a``, and ``'U'`` uses the upper triangular part of ``a``. with_eigen_vector (bool): Indicates whether or not eigenvectors are computed. Returns: tuple of :class:`~cupy.ndarray`: Returns a tuple ``(w, v)``. ``w`` contains eigenvalues and ``v`` contains eigenvectors. ``v[:, i]`` is an eigenvector corresponding to an eigenvalue ``w[i]``. For batch input, ``v[k, :, i]`` is an eigenvector corresponding to an eigenvalue ``w[k, i]`` of ``a[k]``. """ if not check_availability('syevj'): raise RuntimeError('syevj is not available.') if UPLO not in ('L', 'U'): raise ValueError('UPLO argument must be \'L\' or \'U\'') if a.ndim > 2: return _syevj_batched(a, UPLO, with_eigen_vector) assert a.ndim == 2 if a.dtype == 'f' or a.dtype == 'e': dtype = 'f' inp_w_dtype = 'f' inp_v_dtype = 'f' ret_w_dtype = a.dtype ret_v_dtype = a.dtype elif a.dtype == 'd': dtype = 'd' inp_w_dtype = 'd' inp_v_dtype = 'd' ret_w_dtype = 'd' ret_v_dtype = 'd' elif a.dtype == 'F': dtype = 'F' inp_w_dtype = 'f' inp_v_dtype = 'F' ret_w_dtype = 'f' ret_v_dtype = 'F' elif a.dtype == 'D': dtype = 'D' inp_w_dtype = 'd' inp_v_dtype = 'D' ret_w_dtype = 'd' ret_v_dtype = 'D' else: # NumPy uses float64 when an input is not floating point number. dtype = 'd' inp_w_dtype = 'd' inp_v_dtype = 'd' ret_w_dtype = 'd' ret_v_dtype = 'd' # Note that cuSolver assumes fortran array v = a.astype(inp_v_dtype, order='F', copy=True) m, lda = a.shape w = cupy.empty(m, inp_w_dtype) dev_info = cupy.empty((), numpy.int32) handle = device.Device().cusolver_handle if with_eigen_vector: jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR else: jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR if UPLO == 'L': uplo = cublas.CUBLAS_FILL_MODE_LOWER else: # UPLO == 'U' uplo = cublas.CUBLAS_FILL_MODE_UPPER if dtype == 'f': buffer_size = cusolver.ssyevj_bufferSize syevj = cusolver.ssyevj elif dtype == 'd': buffer_size = cusolver.dsyevj_bufferSize syevj = cusolver.dsyevj elif dtype == 'F': buffer_size = cusolver.cheevj_bufferSize syevj = cusolver.cheevj elif dtype == 'D': buffer_size = cusolver.zheevj_bufferSize syevj = cusolver.zheevj else: raise RuntimeError('Only float and double and cuComplex and ' + 'cuDoubleComplex are supported') params = cusolver.createSyevjInfo() work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, params) work = cupy.empty(work_size, inp_v_dtype) syevj(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr, work_size, dev_info.data.ptr, params) cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed( syevj, dev_info) cusolver.destroySyevjInfo(params) w = w.astype(ret_w_dtype, copy=False) if not with_eigen_vector: return w v = v.astype(ret_v_dtype, copy=False) return w, v
def get_compute_arch_arg(self, device_id): return "-arch=compute_{0}".format( device.Device(device_id).compute_capability\ ).encode()
def _compile_with_cache_hip(source, options, arch, cache_dir, extra_source, backend='hiprtc', name_expressions=None, log_stream=None, cache_in_memory=False, use_converter=True): global _empty_file_preprocess_cache # TODO(leofang): this might be possible but is currently undocumented if _is_cudadevrt_needed(options): raise ValueError('separate compilation is not supported in HIP') # HIP's equivalent of -ftz=true, see ROCm-Developer-Tools/HIP#2252 # Notes: # - For hipcc, this should just work, as invalid options would cause errors # See https://clang.llvm.org/docs/ClangCommandLineReference.html. # - For hiprtc, this is a no-op until the compiler options like -D and -I # are accepted, see ROCm-Developer-Tools/HIP#2182 and # ROCm-Developer-Tools/HIP#2248 options += ('-fcuda-flush-denormals-to-zero',) # Workaround ROCm 4.3 LLVM_PATH issue in hipRTC #5689 rocm_build_version = driver.get_build_version() if rocm_build_version >= 40300000 and rocm_build_version < 40500000: options += ( '-I' + get_rocm_path() + '/llvm/lib/clang/13.0.0/include/',) if cache_dir is None: cache_dir = get_cache_dir() # As of ROCm 3.5.0 hiprtc/hipcc can automatically pick up the # right arch without setting HCC_AMDGPU_TARGET, so we don't need # to tell the compiler which arch we are targeting. But, we still # need to know arch as part of the cache key: if arch is None: # On HIP, gcnArch is computed from "compute capability": # https://github.com/ROCm-Developer-Tools/HIP/blob/rocm-4.0.0/rocclr/hip_device.cpp#L202 arch = device.Device().compute_capability if use_converter: source = _convert_to_hip_source(source, extra_source, is_hiprtc=(backend == 'hiprtc')) env = (arch, options, _get_nvrtc_version(), backend) base = _empty_file_preprocess_cache.get(env, None) if base is None: # This is for checking HIPRTC/HIPCC compiler internal version if backend == 'hiprtc': base = _preprocess_hiprtc('', options) else: base = _preprocess_hipcc('', options) _empty_file_preprocess_cache[env] = base key_src = '%s %s %s %s' % (env, base, source, extra_source) key_src = key_src.encode('utf-8') name = _hash_hexdigest(key_src) + '.hsaco' mod = function.Module() if not cache_in_memory: # Read from disk cache if not os.path.isdir(cache_dir): os.makedirs(cache_dir, exist_ok=True) # To handle conflicts in concurrent situation, we adopt lock-free # method to avoid performance degradation. # We force recompiling to retrieve C++ mangled names if so desired. path = os.path.join(cache_dir, name) if os.path.exists(path) and not name_expressions: with open(path, 'rb') as f: data = f.read() if len(data) >= _hash_length: hash_value = data[:_hash_length] binary = data[_hash_length:] binary_hash = _hash_hexdigest(binary).encode('ascii') if hash_value == binary_hash: mod.load(binary) return mod else: # Enforce compiling -- the resulting kernel will be cached elsewhere, # so we do nothing pass if backend == 'hiprtc': # compile_using_nvrtc calls hiprtc for hip builds binary, mapping = compile_using_nvrtc( source, options, arch, name + '.cu', name_expressions, log_stream, cache_in_memory) mod._set_mapping(mapping) else: binary = compile_using_hipcc(source, options, arch, log_stream) if not cache_in_memory: # Write to disk cache binary_hash = _hash_hexdigest(binary).encode('ascii') # shutil.move is not atomic operation, so it could result in a # corrupted file. We detect it by appending a hash at the beginning # of each cache file. If the file is corrupted, it will be ignored # next time it is read. with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf: tf.write(binary_hash) tf.write(binary) temp_path = tf.name shutil.move(temp_path, path) # Save .cu source file along with .hsaco if _get_bool_env_variable('CUPY_CACHE_SAVE_CUDA_SOURCE', False): with open(path + '.cpp', 'w') as f: f.write(source) else: # we don't do any disk I/O pass mod.load(binary) return mod
def test_LRU_cache7(self): # test accessing a multi-GPU plan cache0 = self.caches[0] cache1 = self.caches[1] # ensure a fresh state assert cache0.get_curr_size() == 0 <= cache0.get_size() assert cache1.get_curr_size() == 0 <= cache1.get_size() # do some computation on GPU 0 with device.Device(0): a = testing.shaped_random((10, ), cupy, cupy.float32) cupy.fft.fft(a) assert cache0.get_curr_size() == 1 <= cache0.get_size() assert cache1.get_curr_size() == 0 <= cache1.get_size() # do a multi-GPU FFT config.use_multi_gpus = True config.set_cufft_gpus([0, 1]) c = testing.shaped_random((128, ), cupy, cupy.complex64) cupy.fft.fft(c) assert cache0.get_curr_size() == 2 <= cache0.get_size() assert cache1.get_curr_size() == 1 <= cache1.get_size() # check both devices' caches see the same multi-GPU plan plan0 = next(iter(cache0))[1].plan plan1 = next(iter(cache1))[1].plan assert plan0 is plan1 # reset config.use_multi_gpus = False config._device = None # do some computation on GPU 1 with device.Device(1): e = testing.shaped_random((20, ), cupy, cupy.complex128) cupy.fft.fft(e) assert cache0.get_curr_size() == 2 <= cache0.get_size() assert cache1.get_curr_size() == 2 <= cache1.get_size() # by this time, the multi-GPU plan remains the most recently # used one on GPU 0, but not on GPU 1 assert plan0 is next(iter(cache0))[1].plan assert plan1 is not next(iter(cache1))[1].plan # now use it again to make it the most recent config.use_multi_gpus = True config.set_cufft_gpus([0, 1]) c = testing.shaped_random((128, ), cupy, cupy.complex64) cupy.fft.fft(c) assert cache0.get_curr_size() == 2 <= cache0.get_size() assert cache1.get_curr_size() == 2 <= cache1.get_size() assert plan0 is next(iter(cache0))[1].plan assert plan1 is next(iter(cache1))[1].plan # reset config.use_multi_gpus = False config._device = None # Do 2 more different FFTs on one of the devices, and the # multi-GPU plan would be discarded from both caches with device.Device(1): x = testing.shaped_random((30, ), cupy, cupy.complex128) cupy.fft.fft(x) y = testing.shaped_random((40, 40), cupy, cupy.complex64) cupy.fft.fftn(y) for _, node in cache0: assert plan0 is not node.plan for _, node in cache1: assert plan1 is not node.plan assert cache0.get_curr_size() == 1 <= cache0.get_size() assert cache1.get_curr_size() == 2 <= cache1.get_size()
def _get_arch(): arch = device.Device().compute_capability return arch
def init_caches(gpus): for i in gpus: with device.Device(i): config.get_plan_cache()
def device(self): """Device whose memory the pointer refers to.""" if self._device is None: return device.Device() else: return self._device
def get_compute_arch(): return "compute_{0}".format(device.Device().compute_capability)
def get_compute_arch(t): return 'compute_%s' % device.Device().compute_capability
def _compile_with_cache_hip(source, options, arch, cache_dir, extra_source, backend='hiprtc', name_expressions=None, log_stream=None, cache_in_memory=False, use_converter=True): global _empty_file_preprocess_cache # TODO(leofang): this might be possible but is currently undocumented if _is_cudadevrt_needed(options): raise ValueError('separate compilation is not supported in HIP') if cache_dir is None: cache_dir = get_cache_dir() # As of ROCm 3.5.0 hiprtc/hipcc can automatically pick up the # right arch without setting HCC_AMDGPU_TARGET, so we don't need # to tell the compiler which arch we are targeting. But, we still # need to know arch as part of the cache key: if arch is None: # On HIP, gcnArch is computed from "compute capability": # https://github.com/ROCm-Developer-Tools/HIP/blob/rocm-4.0.0/rocclr/hip_device.cpp#L202 arch = device.Device().compute_capability if use_converter: source = _convert_to_hip_source(source, extra_source, is_hiprtc=(backend == 'hiprtc')) env = (arch, options, _get_nvrtc_version(), backend) base = _empty_file_preprocess_cache.get(env, None) if base is None: # This is for checking HIPRTC/HIPCC compiler internal version if backend == 'hiprtc': base = _preprocess_hiprtc('', options) else: base = _preprocess_hipcc('', options) _empty_file_preprocess_cache[env] = base key_src = '%s %s %s %s' % (env, base, source, extra_source) key_src = key_src.encode('utf-8') name = '%s.hsaco' % hashlib.md5(key_src).hexdigest() mod = function.Module() if not cache_in_memory: # Read from disk cache if not os.path.isdir(cache_dir): os.makedirs(cache_dir, exist_ok=True) # To handle conflicts in concurrent situation, we adopt lock-free # method to avoid performance degradation. # We force recompiling to retrieve C++ mangled names if so desired. path = os.path.join(cache_dir, name) if os.path.exists(path) and not name_expressions: with open(path, 'rb') as f: data = f.read() if len(data) >= 32: hash_value = data[:32] binary = data[32:] binary_hash = hashlib.md5(binary).hexdigest().encode('ascii') if hash_value == binary_hash: mod.load(binary) return mod else: # Enforce compiling -- the resulting kernel will be cached elsewhere, # so we do nothing pass if backend == 'hiprtc': # compile_using_nvrtc calls hiprtc for hip builds binary, mapping = compile_using_nvrtc( source, options, arch, name + '.cu', name_expressions, log_stream, cache_in_memory) mod._set_mapping(mapping) else: binary = compile_using_hipcc(source, options, arch, log_stream) if not cache_in_memory: # Write to disk cache binary_hash = hashlib.md5(binary).hexdigest().encode('ascii') # shutil.move is not atomic operation, so it could result in a # corrupted file. We detect it by appending md5 hash at the beginning # of each cache file. If the file is corrupted, it will be ignored # next time it is read. with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf: tf.write(binary_hash) tf.write(binary) temp_path = tf.name shutil.move(temp_path, path) # Save .cu source file along with .hsaco if _get_bool_env_variable('CUPY_CACHE_SAVE_CUDA_SOURCE', False): with open(path + '.cpp', 'w') as f: f.write(source) else: # we don't do any disk I/O pass mod.load(binary) return mod
def _get_arch(): cc = device.Device().compute_capability return 'sm_%s' % cc
def _syevd(a, UPLO, with_eigen_vector, overwrite_a=False): if UPLO not in ('L', 'U'): raise ValueError('UPLO argument must be \'L\' or \'U\'') # reject_float16=False for backward compatibility dtype, v_dtype = _util.linalg_common_type(a, reject_float16=False) real_dtype = dtype.char.lower() w_dtype = v_dtype.char.lower() # Note that cuSolver assumes fortran array v = a.astype(dtype, order='F', copy=not overwrite_a) m, lda = a.shape w = cupy.empty(m, real_dtype) dev_info = cupy.empty((), numpy.int32) handle = device.Device().cusolver_handle if with_eigen_vector: jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR else: jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR if UPLO == 'L': uplo = cublas.CUBLAS_FILL_MODE_LOWER else: # UPLO == 'U' uplo = cublas.CUBLAS_FILL_MODE_UPPER global _cuda_runtime_version if _cuda_runtime_version < 0: _cuda_runtime_version = runtime.runtimeGetVersion() if not runtime.is_hip and _cuda_runtime_version >= 11010: if dtype.char not in 'fdFD': raise RuntimeError('Only float32, float64, complex64, and ' 'complex128 are supported') type_v = _dtype.to_cuda_dtype(dtype) type_w = _dtype.to_cuda_dtype(real_dtype) params = cusolver.createParams() try: work_device_size, work_host_sizse = cusolver.xsyevd_bufferSize( handle, params, jobz, uplo, m, type_v, v.data.ptr, lda, type_w, w.data.ptr, type_v) work_device = cupy.empty(work_device_size, 'b') work_host = numpy.empty(work_host_sizse, 'b') cusolver.xsyevd(handle, params, jobz, uplo, m, type_v, v.data.ptr, lda, type_w, w.data.ptr, type_v, work_device.data.ptr, work_device_size, work_host.ctypes.data, work_host_sizse, dev_info.data.ptr) finally: cusolver.destroyParams(params) cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed( cusolver.xsyevd, dev_info) else: if dtype == 'f': buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize syevd = cupy.cuda.cusolver.ssyevd elif dtype == 'd': buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize syevd = cupy.cuda.cusolver.dsyevd elif dtype == 'F': buffer_size = cupy.cuda.cusolver.cheevd_bufferSize syevd = cupy.cuda.cusolver.cheevd elif dtype == 'D': buffer_size = cupy.cuda.cusolver.zheevd_bufferSize syevd = cupy.cuda.cusolver.zheevd else: raise RuntimeError('Only float32, float64, complex64, and ' 'complex128 are supported') work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr) work = cupy.empty(work_size, dtype) syevd(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr, work_size, dev_info.data.ptr) cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed( syevd, dev_info) return w.astype(w_dtype, copy=False), v.astype(v_dtype, copy=False)
def _syevd(a, UPLO, with_eigen_vector): if UPLO not in ('L', 'U'): raise ValueError('UPLO argument must be \'L\' or \'U\'') if a.dtype == 'f' or a.dtype == 'e': dtype = 'f' inp_w_dtype = 'f' inp_v_dtype = 'f' ret_w_dtype = a.dtype ret_v_dtype = a.dtype elif a.dtype == 'd': dtype = 'd' inp_w_dtype = 'd' inp_v_dtype = 'd' ret_w_dtype = 'd' ret_v_dtype = 'd' elif a.dtype == 'F': dtype = 'F' inp_w_dtype = 'f' inp_v_dtype = 'F' ret_w_dtype = 'f' ret_v_dtype = 'F' elif a.dtype == 'D': dtype = 'D' inp_w_dtype = 'd' inp_v_dtype = 'D' ret_w_dtype = 'd' ret_v_dtype = 'D' else: # NumPy uses float64 when an input is not floating point number. dtype = 'd' inp_w_dtype = 'd' inp_v_dtype = 'd' ret_w_dtype = 'd' ret_v_dtype = 'd' # Note that cuSolver assumes fortran array v = a.astype(inp_v_dtype, order='F', copy=True) m, lda = a.shape w = cupy.empty(m, inp_w_dtype) dev_info = cupy.empty((), numpy.int32) handle = device.Device().cusolver_handle if with_eigen_vector: jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR else: jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR if UPLO == 'L': uplo = cublas.CUBLAS_FILL_MODE_LOWER else: # UPLO == 'U' uplo = cublas.CUBLAS_FILL_MODE_UPPER if dtype == 'f': buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize syevd = cupy.cuda.cusolver.ssyevd elif dtype == 'd': buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize syevd = cupy.cuda.cusolver.dsyevd elif dtype == 'F': buffer_size = cupy.cuda.cusolver.cheevd_bufferSize syevd = cupy.cuda.cusolver.cheevd elif dtype == 'D': buffer_size = cupy.cuda.cusolver.zheevd_bufferSize syevd = cupy.cuda.cusolver.zheevd else: raise RuntimeError('Only float and double and cuComplex and ' + 'cuDoubleComplex are supported') work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr) work = cupy.empty(work_size, inp_v_dtype) syevd(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr, work_size, dev_info.data.ptr) cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed( syevd, dev_info) return w.astype(ret_w_dtype, copy=False), v.astype(ret_v_dtype, copy=False)
def det_Hermitian(x, upper_or_lower=True): """eigen value decomposition for Hermitian matrix Args: x (cupy.ndarray): The regular matrix upper_or_lower: boolean "eigh" function is only for Hermitian matrix. So, to caculate eigen value, only upper or lower part of the matrix is necessary. When the input is the upper part of the matrix, this value is True Returns: cupy.ndarray: eigen values """ if not cuda.cusolver_enabled: raise RuntimeError('Error : cusolver_enabled == False') if x.shape[-2] != x.shape[-1]: raise ValueError # to prevent `a` to be overwritten shape_array = x.shape a = x.reshape(-1, shape_array[-2], shape_array[-1]).copy() with_eigen_vector = False n = a.shape[1] batchSize = len(a) info = cupy.empty(batchSize, dtype=numpy.int32) cusolver_handle = device.Device().cusolver_handle params = my_cusolver.DnCreateSyevjInfo(cusolver_handle) if a.dtype.char == 'f' or a.dtype.char == 'd' or a.dtype.char == 'F' or a.dtype.char == 'D': dtype = a.dtype.char else: # dtype = numpy.find_common_type((a.dtype.char, 'f'), ()).char print("Error: input dtype is not appropriate") raise ValueError if dtype == 'f': eigh = my_cusolver.DnSsyevjBatched eigh_bufferSize = my_cusolver.DnSsyevjBatched_bufferSize dtype_eig_val = 'f' elif dtype == 'd': eigh = my_cusolver.DnDsyevjBatched eigh_bufferSize = my_cusolver.DnDsyevjBatched_bufferSize dtype_eig_val = 'd' elif dtype == 'F': eigh = my_cusolver.DnCheevjBatched eigh_bufferSize = my_cusolver.DnCheevjBatched_bufferSize dtype_eig_val = 'f' elif dtype == 'D': eigh = my_cusolver.DnZheevjBatched eigh_bufferSize = my_cusolver.DnZheevjBatched_bufferSize dtype_eig_val = 'd' jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR if upper_or_lower: # Hermitian行列だから,右上(upper)か左下(lower)のみ見れば良い.どちらを見るか uplo = cublas.CUBLAS_FILL_MODE_LOWER else: uplo = cublas.CUBLAS_FILL_MODE_UPPER eig_val = cupy.empty((batchSize, n), dtype=dtype_eig_val) # for eigen vector buffersize = eigh_bufferSize(cusolver_handle, jobz, uplo, n, a.data.ptr, n, eig_val.data.ptr, params, batchSize) workspace = cupy.empty(buffersize, dtype=dtype) # LU factorization eigh(cusolver_handle, jobz, uplo, n, a.data.ptr, n, eig_val.data.ptr, workspace.data.ptr, buffersize, info.data.ptr, params, batchSize) if batchSize == 1: return eig_val[0].prod() else: eig_val = eig_val.prod(axis=1) return eig_val.reshape(shape_array[:-2])
def _syevj_batched(a, UPLO, with_eigen_vector): if a.dtype == 'f' or a.dtype == 'e': dtype = 'f' inp_w_dtype = 'f' inp_v_dtype = 'f' ret_w_dtype = a.dtype ret_v_dtype = a.dtype elif a.dtype == 'd': dtype = 'd' inp_w_dtype = 'd' inp_v_dtype = 'd' ret_w_dtype = 'd' ret_v_dtype = 'd' elif a.dtype == 'F': dtype = 'F' inp_w_dtype = 'f' inp_v_dtype = 'F' ret_w_dtype = 'f' ret_v_dtype = 'F' elif a.dtype == 'D': dtype = 'D' inp_w_dtype = 'd' inp_v_dtype = 'D' ret_w_dtype = 'd' ret_v_dtype = 'D' else: # NumPy uses float64 when an input is not floating point number. dtype = 'd' inp_w_dtype = 'd' inp_v_dtype = 'd' ret_w_dtype = 'd' ret_v_dtype = 'd' *batch_shape, m, lda = a.shape batch_size = numpy.prod(batch_shape) a = a.reshape(batch_size, m, lda) v = cupy.array(a.swapaxes(-2, -1), order='C', copy=True, dtype=inp_v_dtype) w = cupy.empty((batch_size, m), inp_w_dtype).swapaxes(-2, 1) dev_info = cupy.empty((), numpy.int32) handle = device.Device().cusolver_handle if with_eigen_vector: jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR else: jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR if UPLO == 'L': uplo = cublas.CUBLAS_FILL_MODE_LOWER else: # UPLO == 'U' uplo = cublas.CUBLAS_FILL_MODE_UPPER if dtype == 'f': buffer_size = cusolver.ssyevjBatched_bufferSize syevjBatched = cusolver.ssyevjBatched elif dtype == 'd': buffer_size = cusolver.dsyevjBatched_bufferSize syevjBatched = cusolver.dsyevjBatched elif dtype == 'F': buffer_size = cusolver.cheevjBatched_bufferSize syevjBatched = cusolver.cheevjBatched elif dtype == 'D': buffer_size = cusolver.zheevjBatched_bufferSize syevjBatched = cusolver.zheevjBatched else: raise RuntimeError('Only float and double and cuComplex and ' + 'cuDoubleComplex are supported') params = cusolver.createSyevjInfo() work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, params, batch_size) work = cupy.empty(work_size, inp_v_dtype) syevjBatched(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr, work_size, dev_info.data.ptr, params, batch_size) cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed( syevjBatched, dev_info) cusolver.destroySyevjInfo(params) w = w.astype(ret_w_dtype, copy=False) w = w.swapaxes(-2, -1).reshape(*batch_shape, m) if not with_eigen_vector: return w v = v.astype(ret_v_dtype, copy=False) v = v.swapaxes(-2, -1).reshape(*batch_shape, m, m) return w, v