Ejemplo n.º 1
0
    def test_LRU_cache6(self):
        # test if each device has a separate cache
        cache0 = self.caches[0]
        cache1 = self.caches[1]

        # ensure a fresh state
        assert cache0.get_curr_size() == 0 <= cache0.get_size()
        assert cache1.get_curr_size() == 0 <= cache1.get_size()

        # do some computation on GPU 0
        with device.Device(0):
            a = testing.shaped_random((10, ), cupy, cupy.float32)
            cupy.fft.fft(a)
        assert cache0.get_curr_size() == 1 <= cache0.get_size()
        assert cache1.get_curr_size() == 0 <= cache1.get_size()

        # do some computation on GPU 1
        with device.Device(1):
            c = testing.shaped_random((16, ), cupy, cupy.float64)
            cupy.fft.fft(c)
        assert cache0.get_curr_size() == 1 <= cache0.get_size()
        assert cache1.get_curr_size() == 1 <= cache1.get_size()

        # reset device 0
        cache0.clear()
        assert cache0.get_curr_size() == 0 <= cache0.get_size()
        assert cache1.get_curr_size() == 1 <= cache1.get_size()

        # reset device 1
        cache1.clear()
        assert cache0.get_curr_size() == 0 <= cache0.get_size()
        assert cache1.get_curr_size() == 0 <= cache1.get_size()
Ejemplo n.º 2
0
    def check(self, device_id):
        if (cupy.cuda.runtime.is_hip
                and self.allocator is memory.malloc_managed):
            raise unittest.SkipTest('HIP does not support managed memory')

        size = 24
        shape = (2, 3)
        dtype = cupy.float32
        with device.Device(device_id):
            src_mem_ptr = self.allocator(size)
        src_ptr = src_mem_ptr.ptr

        args = (src_ptr, size, src_mem_ptr)
        kwargs = {}
        if self.specify_device_id:
            kwargs = {'device_id': device_id}

        unowned_mem = memory.UnownedMemory(*args, **kwargs)
        assert unowned_mem.size == size
        assert unowned_mem.ptr == src_ptr
        assert unowned_mem.device_id == device_id

        arr = cupy.ndarray(shape, dtype, memory.MemoryPointer(unowned_mem, 0))

        # Delete the source object
        del src_mem_ptr

        with device.Device(device_id):
            arr[:] = 2
            assert (arr == 2).all()
Ejemplo n.º 3
0
    def check(self, device_id):
        if cupy.cuda.runtime.is_hip:
            if self.allocator is memory.malloc_managed:
                raise unittest.SkipTest('HIP does not support managed memory')
            if self.allocator is memory.malloc_async:
                raise unittest.SkipTest('HIP does not support async mempool')
        elif cupy.cuda.driver.get_build_version() < 11020:
            raise unittest.SkipTest('malloc_async is supported since '
                                    'CUDA 11.2')

        size = 24
        shape = (2, 3)
        dtype = cupy.float32
        with device.Device(device_id):
            src_mem_ptr = self.allocator(size)
        src_ptr = src_mem_ptr.ptr

        args = (src_ptr, size, src_mem_ptr)
        kwargs = {}
        if self.specify_device_id:
            kwargs = {'device_id': device_id}

        unowned_mem = memory.UnownedMemory(*args, **kwargs)
        assert unowned_mem.size == size
        assert unowned_mem.ptr == src_ptr
        assert unowned_mem.device_id == device_id

        arr = cupy.ndarray(shape, dtype, memory.MemoryPointer(unowned_mem, 0))

        # Delete the source object
        del src_mem_ptr

        with device.Device(device_id):
            arr[:] = 2
            assert (arr == 2).all()
Ejemplo n.º 4
0
    def check(self, device_id):
        size = 24
        shape = (2, 3)
        dtype = cupy.float32
        with device.Device(device_id):
            src_mem_ptr = self.allocator(size)
        src_ptr = src_mem_ptr.ptr

        args = (src_ptr, size, src_mem_ptr)
        kwargs = {}
        if self.specify_device_id:
            kwargs = {'device_id': device_id}

        unowned_mem = memory.UnownedMemory(*args, **kwargs)
        assert unowned_mem.size == size
        assert unowned_mem.ptr == src_ptr
        assert unowned_mem.device_id == device_id

        arr = cupy.ndarray(shape, dtype, memory.MemoryPointer(unowned_mem, 0))

        # Delete the source object
        del src_mem_ptr

        with device.Device(device_id):
            arr[:] = 2
            assert (arr == 2).all()
Ejemplo n.º 5
0
    def check(self, device_id):
        if cupy.cuda.runtime.is_hip:
            if self.allocator is memory.malloc_managed:
                if cupy.cuda.driver.get_build_version() < 40300000:
                    raise unittest.SkipTest(
                        'Managed memory requires ROCm 4.3+')
                else:
                    raise unittest.SkipTest(
                        'hipPointerGetAttributes does not support managed '
                        'memory')
            if self.allocator is memory.malloc_async:
                raise unittest.SkipTest('HIP does not support async mempool')
        else:
            if self.allocator is memory.malloc_async:
                if cupy.cuda.driver._is_cuda_python():
                    version = cupy.cuda.runtime.runtimeGetVersion()
                else:
                    version = cupy.cuda.driver.get_build_version()
                if version < 11020:
                    raise unittest.SkipTest('malloc_async is supported since '
                                            'CUDA 11.2')
                elif runtime.deviceGetAttribute(
                        runtime.cudaDevAttrMemoryPoolsSupported, 0) == 0:
                    raise unittest.SkipTest(
                        'malloc_async is not supported on device 0')

        size = 24
        shape = (2, 3)
        dtype = cupy.float32
        with device.Device(device_id):
            src_mem_ptr = self.allocator(size)
        src_ptr = src_mem_ptr.ptr

        args = (src_ptr, size, src_mem_ptr)
        kwargs = {}
        if self.specify_device_id:
            kwargs = {'device_id': device_id}

        if cupy.cuda.runtime.is_hip and self.allocator is memory._malloc:
            # In ROCm, it seems that `hipPointerGetAttributes()`, which is
            # called in `UnownedMemory()`, requires an unmanaged device pointer
            # that the current device must be the one on which the memory
            # referred to by the pointer physically resides.
            with device.Device(device_id):
                unowned_mem = memory.UnownedMemory(*args, **kwargs)
        else:
            unowned_mem = memory.UnownedMemory(*args, **kwargs)
        assert unowned_mem.size == size
        assert unowned_mem.ptr == src_ptr
        assert unowned_mem.device_id == device_id

        arr = cupy.ndarray(shape, dtype, memory.MemoryPointer(unowned_mem, 0))

        # Delete the source object
        del src_mem_ptr

        with device.Device(device_id):
            arr[:] = 2
            assert (arr == 2).all()
Ejemplo n.º 6
0
 def __init__(self, size):
     self.size = size
     self.ptr = ctypes.c_void_p()
     self._device = None
     if size > 0:
         self._device = device.Device()
         self.ptr = runtime.malloc(size)
Ejemplo n.º 7
0
    def test_device_cache(self):
        @jit.rawkernel()
        def f(x, y):
            tid = jit.threadIdx.x + jit.blockDim.x * jit.blockIdx.x
            y[tid] = x[tid]

        with device.Device(0):
            x = testing.shaped_random((30, ), dtype=numpy.int32, seed=0)
            y = testing.shaped_random((30, ), dtype=numpy.int32, seed=1)
            f((5, ), (6, ), (x, y))
            assert bool((x == y).all())
        with device.Device(1):
            x = testing.shaped_random((30, ), dtype=numpy.int32, seed=2)
            y = testing.shaped_random((30, ), dtype=numpy.int32, seed=3)
            f((5, ), (6, ), (x, y))
            assert bool((x == y).all())
Ejemplo n.º 8
0
 def tearDown(self):
     for i in range(n_devices):
         with device.Device(i):
             cache = config.get_plan_cache()
             cache.clear()
             cache.set_size(self.old_sizes[i])
             cache.set_memsize(-1)
Ejemplo n.º 9
0
def _get_arch():
    # See Supported Compile Options section of NVRTC User Guide for
    # the maximum value allowed for `--gpu-architecture`.
    nvrtc_max_compute_capability = _get_max_compute_capability()

    arch = device.Device().compute_capability
    if arch in _tegra_archs:
        return arch
    else:
        return min(arch, nvrtc_max_compute_capability)
Ejemplo n.º 10
0
 def setUp(self):
     self.caches = []
     self.old_sizes = []
     for i in range(n_devices):
         with device.Device(i):
             cache = config.get_plan_cache()
             self.old_sizes.append(cache.get_size())
             cache.clear()
             cache.set_memsize(-1)
             cache.set_size(2)
         self.caches.append(cache)
Ejemplo n.º 11
0
def _syevd(a, UPLO, with_eigen_vector):
    if UPLO not in ('L', 'U'):
        raise ValueError('UPLO argument must be \'L\' or \'U\'')

    # reject_float16=False for backward compatibility
    dtype, v_dtype = _util.linalg_common_type(a, reject_float16=False)
    real_dtype = dtype.char.lower()
    w_dtype = v_dtype.char.lower()

    # Note that cuSolver assumes fortran array
    v = a.astype(dtype, order='F', copy=True)

    m, lda = a.shape
    w = cupy.empty(m, real_dtype)
    dev_info = cupy.empty((), numpy.int32)
    handle = device.Device().cusolver_handle

    if with_eigen_vector:
        jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR
    else:
        jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR

    if UPLO == 'L':
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:  # UPLO == 'U'
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    if dtype == 'f':
        buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize
        syevd = cupy.cuda.cusolver.ssyevd
    elif dtype == 'd':
        buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize
        syevd = cupy.cuda.cusolver.dsyevd
    elif dtype == 'F':
        buffer_size = cupy.cuda.cusolver.cheevd_bufferSize
        syevd = cupy.cuda.cusolver.cheevd
    elif dtype == 'D':
        buffer_size = cupy.cuda.cusolver.zheevd_bufferSize
        syevd = cupy.cuda.cusolver.zheevd
    else:
        raise RuntimeError('Only float and double and cuComplex and '
                           + 'cuDoubleComplex are supported')

    work_size = buffer_size(
        handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr)
    work = cupy.empty(work_size, dtype)
    syevd(
        handle, jobz, uplo, m, v.data.ptr, lda,
        w.data.ptr, work.data.ptr, work_size, dev_info.data.ptr)
    cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        syevd, dev_info)

    return w.astype(w_dtype, copy=False), v.astype(v_dtype, copy=False)
Ejemplo n.º 12
0
def _get_arch():
    global _nvrtc_max_compute_capability
    if _nvrtc_max_compute_capability is None:
        # See Supported Compile Options section of NVRTC User Guide for
        # the maximum value allowed for `--gpu-architecture`.
        major, minor = _get_nvrtc_version()
        if major < 9:
            # CUDA 7.0 / 7.5 / 8.0
            _nvrtc_max_compute_capability = '50'
        else:
            # CUDA 9.0 / 9.1
            _nvrtc_max_compute_capability = '70'
    cc = min(device.Device().compute_capability, _nvrtc_max_compute_capability)
    return 'compute_%s' % cc
Ejemplo n.º 13
0
    def malloc(self, size):
        """Allocates the memory, from the pool if possible.

        This method can be used as a CuPy memory allocator. The simplest way to
        use a memory pool as the default allocator is the following code::

           set_allocator(MemoryPool().malloc)

        Args:
            size (int): Size of the memory buffer to allocate in bytes.

        Returns:
            ~cupy.cuda.MemoryPointer: Pointer to the allocated buffer.

        """
        dev = device.Device().id
        return self._pools[dev].malloc(size)
Ejemplo n.º 14
0
def _syevd(a, UPLO, with_eigen_vector):
    if UPLO not in ('L', 'U'):
        raise ValueError("UPLO argument must be 'L' or 'U'")

    if a.dtype == 'f' or a.dtype == 'e':
        dtype = 'f'
        ret_type = a.dtype
    else:
        # NumPy uses float64 when an input is not floating point number.
        dtype = 'd'
        ret_type = 'd'

    # Note that cuSolver assumes fortran array
    v = a.astype(dtype, order='F', copy=True)

    m, lda = a.shape
    w = cupy.empty(m, dtype)
    dev_info = cupy.empty((), 'i')
    handle = device.Device().cusolver_handle

    if with_eigen_vector:
        jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR
    else:
        jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR

    if UPLO == 'L':
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:  # UPLO == 'U'
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    if dtype == 'f':
        buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize
        syevd = cupy.cuda.cusolver.ssyevd
    elif dtype == 'd':
        buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize
        syevd = cupy.cuda.cusolver.dsyevd
    else:
        raise RuntimeError('Only float and double are supported')

    work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr)
    work = cupy.empty(work_size, dtype)
    syevd(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr,
          work_size, dev_info.data.ptr)

    return w.astype(ret_type, copy=False), v.astype(ret_type, copy=False)
Ejemplo n.º 15
0
def _get_arch():
    # See Supported Compile Options section of NVRTC User Guide for
    # the maximum value allowed for `--gpu-architecture`.
    major, minor = _get_nvrtc_version()
    if major < 10 or (major == 10 and minor == 0):
        # CUDA 9.x / 10.0
        _nvrtc_max_compute_capability = '70'
    elif major < 11:
        # CUDA 10.1 / 10.2
        _nvrtc_max_compute_capability = '75'
    else:
        # CUDA 11.0 / 11.1
        _nvrtc_max_compute_capability = '80'

    arch = device.Device().compute_capability
    if arch in _tegra_archs:
        return arch
    else:
        return min(arch, _nvrtc_max_compute_capability)
Ejemplo n.º 16
0
def syevj(a, UPLO='L', with_eigen_vector=True):
    """Eigenvalue decomposition of symmetric matrix using cusolverDn<t>syevj().

    Computes eigenvalues ``w`` and (optionally) eigenvectors ``v`` of a complex
    Hermitian or a real symmetric matrix.

    Args:
        a (cupy.ndarray): A symmetric 2-D square matrix ``(M, M)`` or a batch
            of symmetric 2-D square matrices ``(..., M, M)``.
        UPLO (str): Select from ``'L'`` or ``'U'``. It specifies which
            part of ``a`` is used. ``'L'`` uses the lower triangular part of
            ``a``, and ``'U'`` uses the upper triangular part of ``a``.
        with_eigen_vector (bool): Indicates whether or not eigenvectors
            are computed.

    Returns:
        tuple of :class:`~cupy.ndarray`:
            Returns a tuple ``(w, v)``. ``w`` contains eigenvalues and
            ``v`` contains eigenvectors. ``v[:, i]`` is an eigenvector
            corresponding to an eigenvalue ``w[i]``. For batch input,
            ``v[k, :, i]`` is an eigenvector corresponding to an eigenvalue
            ``w[k, i]`` of ``a[k]``.
    """
    if not check_availability('syevj'):
        raise RuntimeError('syevj is not available.')

    if UPLO not in ('L', 'U'):
        raise ValueError('UPLO argument must be \'L\' or \'U\'')

    if a.ndim > 2:
        return _syevj_batched(a, UPLO, with_eigen_vector)

    assert a.ndim == 2

    if a.dtype == 'f' or a.dtype == 'e':
        dtype = 'f'
        inp_w_dtype = 'f'
        inp_v_dtype = 'f'
        ret_w_dtype = a.dtype
        ret_v_dtype = a.dtype
    elif a.dtype == 'd':
        dtype = 'd'
        inp_w_dtype = 'd'
        inp_v_dtype = 'd'
        ret_w_dtype = 'd'
        ret_v_dtype = 'd'
    elif a.dtype == 'F':
        dtype = 'F'
        inp_w_dtype = 'f'
        inp_v_dtype = 'F'
        ret_w_dtype = 'f'
        ret_v_dtype = 'F'
    elif a.dtype == 'D':
        dtype = 'D'
        inp_w_dtype = 'd'
        inp_v_dtype = 'D'
        ret_w_dtype = 'd'
        ret_v_dtype = 'D'
    else:
        # NumPy uses float64 when an input is not floating point number.
        dtype = 'd'
        inp_w_dtype = 'd'
        inp_v_dtype = 'd'
        ret_w_dtype = 'd'
        ret_v_dtype = 'd'

    # Note that cuSolver assumes fortran array
    v = a.astype(inp_v_dtype, order='F', copy=True)

    m, lda = a.shape
    w = cupy.empty(m, inp_w_dtype)
    dev_info = cupy.empty((), numpy.int32)
    handle = device.Device().cusolver_handle

    if with_eigen_vector:
        jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR
    else:
        jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR

    if UPLO == 'L':
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:  # UPLO == 'U'
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    if dtype == 'f':
        buffer_size = cusolver.ssyevj_bufferSize
        syevj = cusolver.ssyevj
    elif dtype == 'd':
        buffer_size = cusolver.dsyevj_bufferSize
        syevj = cusolver.dsyevj
    elif dtype == 'F':
        buffer_size = cusolver.cheevj_bufferSize
        syevj = cusolver.cheevj
    elif dtype == 'D':
        buffer_size = cusolver.zheevj_bufferSize
        syevj = cusolver.zheevj
    else:
        raise RuntimeError('Only float and double and cuComplex and ' +
                           'cuDoubleComplex are supported')

    params = cusolver.createSyevjInfo()
    work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr,
                            params)
    work = cupy.empty(work_size, inp_v_dtype)
    syevj(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr,
          work_size, dev_info.data.ptr, params)
    cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed(
        syevj, dev_info)

    cusolver.destroySyevjInfo(params)

    w = w.astype(ret_w_dtype, copy=False)
    if not with_eigen_vector:
        return w
    v = v.astype(ret_v_dtype, copy=False)
    return w, v
Ejemplo n.º 17
0
 def get_compute_arch_arg(self, device_id):
   return "-arch=compute_{0}".format(
     device.Device(device_id).compute_capability\
   ).encode()
Ejemplo n.º 18
0
def _compile_with_cache_hip(source, options, arch, cache_dir, extra_source,
                            backend='hiprtc', name_expressions=None,
                            log_stream=None, cache_in_memory=False,
                            use_converter=True):
    global _empty_file_preprocess_cache

    # TODO(leofang): this might be possible but is currently undocumented
    if _is_cudadevrt_needed(options):
        raise ValueError('separate compilation is not supported in HIP')

    # HIP's equivalent of -ftz=true, see ROCm-Developer-Tools/HIP#2252
    # Notes:
    # - For hipcc, this should just work, as invalid options would cause errors
    #   See https://clang.llvm.org/docs/ClangCommandLineReference.html.
    # - For hiprtc, this is a no-op until the compiler options like -D and -I
    #   are accepted, see ROCm-Developer-Tools/HIP#2182 and
    #   ROCm-Developer-Tools/HIP#2248
    options += ('-fcuda-flush-denormals-to-zero',)

    # Workaround ROCm 4.3 LLVM_PATH issue in hipRTC #5689
    rocm_build_version = driver.get_build_version()
    if rocm_build_version >= 40300000 and rocm_build_version < 40500000:
        options += (
            '-I' + get_rocm_path() + '/llvm/lib/clang/13.0.0/include/',)

    if cache_dir is None:
        cache_dir = get_cache_dir()
    # As of ROCm 3.5.0 hiprtc/hipcc can automatically pick up the
    # right arch without setting HCC_AMDGPU_TARGET, so we don't need
    # to tell the compiler which arch we are targeting. But, we still
    # need to know arch as part of the cache key:
    if arch is None:
        # On HIP, gcnArch is computed from "compute capability":
        # https://github.com/ROCm-Developer-Tools/HIP/blob/rocm-4.0.0/rocclr/hip_device.cpp#L202
        arch = device.Device().compute_capability
    if use_converter:
        source = _convert_to_hip_source(source, extra_source,
                                        is_hiprtc=(backend == 'hiprtc'))

    env = (arch, options, _get_nvrtc_version(), backend)
    base = _empty_file_preprocess_cache.get(env, None)
    if base is None:
        # This is for checking HIPRTC/HIPCC compiler internal version
        if backend == 'hiprtc':
            base = _preprocess_hiprtc('', options)
        else:
            base = _preprocess_hipcc('', options)
        _empty_file_preprocess_cache[env] = base

    key_src = '%s %s %s %s' % (env, base, source, extra_source)
    key_src = key_src.encode('utf-8')
    name = _hash_hexdigest(key_src) + '.hsaco'

    mod = function.Module()

    if not cache_in_memory:
        # Read from disk cache
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir, exist_ok=True)

        # To handle conflicts in concurrent situation, we adopt lock-free
        # method to avoid performance degradation.
        # We force recompiling to retrieve C++ mangled names if so desired.
        path = os.path.join(cache_dir, name)
        if os.path.exists(path) and not name_expressions:
            with open(path, 'rb') as f:
                data = f.read()
            if len(data) >= _hash_length:
                hash_value = data[:_hash_length]
                binary = data[_hash_length:]
                binary_hash = _hash_hexdigest(binary).encode('ascii')
                if hash_value == binary_hash:
                    mod.load(binary)
                    return mod
    else:
        # Enforce compiling -- the resulting kernel will be cached elsewhere,
        # so we do nothing
        pass

    if backend == 'hiprtc':
        # compile_using_nvrtc calls hiprtc for hip builds
        binary, mapping = compile_using_nvrtc(
            source, options, arch, name + '.cu', name_expressions,
            log_stream, cache_in_memory)
        mod._set_mapping(mapping)
    else:
        binary = compile_using_hipcc(source, options, arch, log_stream)

    if not cache_in_memory:
        # Write to disk cache
        binary_hash = _hash_hexdigest(binary).encode('ascii')

        # shutil.move is not atomic operation, so it could result in a
        # corrupted file. We detect it by appending a hash at the beginning
        # of each cache file. If the file is corrupted, it will be ignored
        # next time it is read.
        with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf:
            tf.write(binary_hash)
            tf.write(binary)
            temp_path = tf.name
        shutil.move(temp_path, path)

        # Save .cu source file along with .hsaco
        if _get_bool_env_variable('CUPY_CACHE_SAVE_CUDA_SOURCE', False):
            with open(path + '.cpp', 'w') as f:
                f.write(source)
    else:
        # we don't do any disk I/O
        pass

    mod.load(binary)
    return mod
Ejemplo n.º 19
0
    def test_LRU_cache7(self):
        # test accessing a multi-GPU plan
        cache0 = self.caches[0]
        cache1 = self.caches[1]

        # ensure a fresh state
        assert cache0.get_curr_size() == 0 <= cache0.get_size()
        assert cache1.get_curr_size() == 0 <= cache1.get_size()

        # do some computation on GPU 0
        with device.Device(0):
            a = testing.shaped_random((10, ), cupy, cupy.float32)
            cupy.fft.fft(a)
        assert cache0.get_curr_size() == 1 <= cache0.get_size()
        assert cache1.get_curr_size() == 0 <= cache1.get_size()

        # do a multi-GPU FFT
        config.use_multi_gpus = True
        config.set_cufft_gpus([0, 1])
        c = testing.shaped_random((128, ), cupy, cupy.complex64)
        cupy.fft.fft(c)
        assert cache0.get_curr_size() == 2 <= cache0.get_size()
        assert cache1.get_curr_size() == 1 <= cache1.get_size()

        # check both devices' caches see the same multi-GPU plan
        plan0 = next(iter(cache0))[1].plan
        plan1 = next(iter(cache1))[1].plan
        assert plan0 is plan1

        # reset
        config.use_multi_gpus = False
        config._device = None

        # do some computation on GPU 1
        with device.Device(1):
            e = testing.shaped_random((20, ), cupy, cupy.complex128)
            cupy.fft.fft(e)
        assert cache0.get_curr_size() == 2 <= cache0.get_size()
        assert cache1.get_curr_size() == 2 <= cache1.get_size()

        # by this time, the multi-GPU plan remains the most recently
        # used one on GPU 0, but not on GPU 1
        assert plan0 is next(iter(cache0))[1].plan
        assert plan1 is not next(iter(cache1))[1].plan

        # now use it again to make it the most recent
        config.use_multi_gpus = True
        config.set_cufft_gpus([0, 1])
        c = testing.shaped_random((128, ), cupy, cupy.complex64)
        cupy.fft.fft(c)
        assert cache0.get_curr_size() == 2 <= cache0.get_size()
        assert cache1.get_curr_size() == 2 <= cache1.get_size()
        assert plan0 is next(iter(cache0))[1].plan
        assert plan1 is next(iter(cache1))[1].plan
        # reset
        config.use_multi_gpus = False
        config._device = None

        # Do 2 more different FFTs on one of the devices, and the
        # multi-GPU plan would be discarded from both caches
        with device.Device(1):
            x = testing.shaped_random((30, ), cupy, cupy.complex128)
            cupy.fft.fft(x)
            y = testing.shaped_random((40, 40), cupy, cupy.complex64)
            cupy.fft.fftn(y)
        for _, node in cache0:
            assert plan0 is not node.plan
        for _, node in cache1:
            assert plan1 is not node.plan
        assert cache0.get_curr_size() == 1 <= cache0.get_size()
        assert cache1.get_curr_size() == 2 <= cache1.get_size()
Ejemplo n.º 20
0
def _get_arch():
    arch = device.Device().compute_capability
    return arch
Ejemplo n.º 21
0
 def init_caches(gpus):
     for i in gpus:
         with device.Device(i):
             config.get_plan_cache()
Ejemplo n.º 22
0
 def device(self):
     """Device whose memory the pointer refers to."""
     if self._device is None:
         return device.Device()
     else:
         return self._device
Ejemplo n.º 23
0
 def get_compute_arch():
     return "compute_{0}".format(device.Device().compute_capability)
Ejemplo n.º 24
0
def get_compute_arch(t):
    return 'compute_%s' % device.Device().compute_capability
Ejemplo n.º 25
0
def _compile_with_cache_hip(source, options, arch, cache_dir, extra_source,
                            backend='hiprtc', name_expressions=None,
                            log_stream=None, cache_in_memory=False,
                            use_converter=True):
    global _empty_file_preprocess_cache

    # TODO(leofang): this might be possible but is currently undocumented
    if _is_cudadevrt_needed(options):
        raise ValueError('separate compilation is not supported in HIP')

    if cache_dir is None:
        cache_dir = get_cache_dir()
    # As of ROCm 3.5.0 hiprtc/hipcc can automatically pick up the
    # right arch without setting HCC_AMDGPU_TARGET, so we don't need
    # to tell the compiler which arch we are targeting. But, we still
    # need to know arch as part of the cache key:
    if arch is None:
        # On HIP, gcnArch is computed from "compute capability":
        # https://github.com/ROCm-Developer-Tools/HIP/blob/rocm-4.0.0/rocclr/hip_device.cpp#L202
        arch = device.Device().compute_capability
    if use_converter:
        source = _convert_to_hip_source(source, extra_source,
                                        is_hiprtc=(backend == 'hiprtc'))

    env = (arch, options, _get_nvrtc_version(), backend)
    base = _empty_file_preprocess_cache.get(env, None)
    if base is None:
        # This is for checking HIPRTC/HIPCC compiler internal version
        if backend == 'hiprtc':
            base = _preprocess_hiprtc('', options)
        else:
            base = _preprocess_hipcc('', options)
        _empty_file_preprocess_cache[env] = base

    key_src = '%s %s %s %s' % (env, base, source, extra_source)
    key_src = key_src.encode('utf-8')
    name = '%s.hsaco' % hashlib.md5(key_src).hexdigest()

    mod = function.Module()

    if not cache_in_memory:
        # Read from disk cache
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir, exist_ok=True)

        # To handle conflicts in concurrent situation, we adopt lock-free
        # method to avoid performance degradation.
        # We force recompiling to retrieve C++ mangled names if so desired.
        path = os.path.join(cache_dir, name)
        if os.path.exists(path) and not name_expressions:
            with open(path, 'rb') as f:
                data = f.read()
            if len(data) >= 32:
                hash_value = data[:32]
                binary = data[32:]
                binary_hash = hashlib.md5(binary).hexdigest().encode('ascii')
                if hash_value == binary_hash:
                    mod.load(binary)
                    return mod
    else:
        # Enforce compiling -- the resulting kernel will be cached elsewhere,
        # so we do nothing
        pass

    if backend == 'hiprtc':
        # compile_using_nvrtc calls hiprtc for hip builds
        binary, mapping = compile_using_nvrtc(
            source, options, arch, name + '.cu', name_expressions,
            log_stream, cache_in_memory)
        mod._set_mapping(mapping)
    else:
        binary = compile_using_hipcc(source, options, arch, log_stream)

    if not cache_in_memory:
        # Write to disk cache
        binary_hash = hashlib.md5(binary).hexdigest().encode('ascii')

        # shutil.move is not atomic operation, so it could result in a
        # corrupted file. We detect it by appending md5 hash at the beginning
        # of each cache file. If the file is corrupted, it will be ignored
        # next time it is read.
        with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf:
            tf.write(binary_hash)
            tf.write(binary)
            temp_path = tf.name
        shutil.move(temp_path, path)

        # Save .cu source file along with .hsaco
        if _get_bool_env_variable('CUPY_CACHE_SAVE_CUDA_SOURCE', False):
            with open(path + '.cpp', 'w') as f:
                f.write(source)
    else:
        # we don't do any disk I/O
        pass

    mod.load(binary)
    return mod
Ejemplo n.º 26
0
def _get_arch():
    cc = device.Device().compute_capability
    return 'sm_%s' % cc
Ejemplo n.º 27
0
def _syevd(a, UPLO, with_eigen_vector, overwrite_a=False):
    if UPLO not in ('L', 'U'):
        raise ValueError('UPLO argument must be \'L\' or \'U\'')

    # reject_float16=False for backward compatibility
    dtype, v_dtype = _util.linalg_common_type(a, reject_float16=False)
    real_dtype = dtype.char.lower()
    w_dtype = v_dtype.char.lower()

    # Note that cuSolver assumes fortran array
    v = a.astype(dtype, order='F', copy=not overwrite_a)

    m, lda = a.shape
    w = cupy.empty(m, real_dtype)
    dev_info = cupy.empty((), numpy.int32)
    handle = device.Device().cusolver_handle

    if with_eigen_vector:
        jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR
    else:
        jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR

    if UPLO == 'L':
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:  # UPLO == 'U'
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    global _cuda_runtime_version
    if _cuda_runtime_version < 0:
        _cuda_runtime_version = runtime.runtimeGetVersion()

    if not runtime.is_hip and _cuda_runtime_version >= 11010:
        if dtype.char not in 'fdFD':
            raise RuntimeError('Only float32, float64, complex64, and '
                               'complex128 are supported')
        type_v = _dtype.to_cuda_dtype(dtype)
        type_w = _dtype.to_cuda_dtype(real_dtype)
        params = cusolver.createParams()
        try:
            work_device_size, work_host_sizse = cusolver.xsyevd_bufferSize(
                handle, params, jobz, uplo, m, type_v, v.data.ptr, lda, type_w,
                w.data.ptr, type_v)
            work_device = cupy.empty(work_device_size, 'b')
            work_host = numpy.empty(work_host_sizse, 'b')
            cusolver.xsyevd(handle, params, jobz, uplo, m, type_v, v.data.ptr,
                            lda, type_w, w.data.ptr, type_v,
                            work_device.data.ptr, work_device_size,
                            work_host.ctypes.data, work_host_sizse,
                            dev_info.data.ptr)
        finally:
            cusolver.destroyParams(params)
        cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
            cusolver.xsyevd, dev_info)
    else:
        if dtype == 'f':
            buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize
            syevd = cupy.cuda.cusolver.ssyevd
        elif dtype == 'd':
            buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize
            syevd = cupy.cuda.cusolver.dsyevd
        elif dtype == 'F':
            buffer_size = cupy.cuda.cusolver.cheevd_bufferSize
            syevd = cupy.cuda.cusolver.cheevd
        elif dtype == 'D':
            buffer_size = cupy.cuda.cusolver.zheevd_bufferSize
            syevd = cupy.cuda.cusolver.zheevd
        else:
            raise RuntimeError('Only float32, float64, complex64, and '
                               'complex128 are supported')

        work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda,
                                w.data.ptr)
        work = cupy.empty(work_size, dtype)
        syevd(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr,
              work.data.ptr, work_size, dev_info.data.ptr)
        cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
            syevd, dev_info)

    return w.astype(w_dtype, copy=False), v.astype(v_dtype, copy=False)
Ejemplo n.º 28
0
def _syevd(a, UPLO, with_eigen_vector):
    if UPLO not in ('L', 'U'):
        raise ValueError('UPLO argument must be \'L\' or \'U\'')

    if a.dtype == 'f' or a.dtype == 'e':
        dtype = 'f'
        inp_w_dtype = 'f'
        inp_v_dtype = 'f'
        ret_w_dtype = a.dtype
        ret_v_dtype = a.dtype
    elif a.dtype == 'd':
        dtype = 'd'
        inp_w_dtype = 'd'
        inp_v_dtype = 'd'
        ret_w_dtype = 'd'
        ret_v_dtype = 'd'
    elif a.dtype == 'F':
        dtype = 'F'
        inp_w_dtype = 'f'
        inp_v_dtype = 'F'
        ret_w_dtype = 'f'
        ret_v_dtype = 'F'
    elif a.dtype == 'D':
        dtype = 'D'
        inp_w_dtype = 'd'
        inp_v_dtype = 'D'
        ret_w_dtype = 'd'
        ret_v_dtype = 'D'
    else:
        # NumPy uses float64 when an input is not floating point number.
        dtype = 'd'
        inp_w_dtype = 'd'
        inp_v_dtype = 'd'
        ret_w_dtype = 'd'
        ret_v_dtype = 'd'

    # Note that cuSolver assumes fortran array
    v = a.astype(inp_v_dtype, order='F', copy=True)

    m, lda = a.shape
    w = cupy.empty(m, inp_w_dtype)
    dev_info = cupy.empty((), numpy.int32)
    handle = device.Device().cusolver_handle

    if with_eigen_vector:
        jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR
    else:
        jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR

    if UPLO == 'L':
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:  # UPLO == 'U'
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    if dtype == 'f':
        buffer_size = cupy.cuda.cusolver.ssyevd_bufferSize
        syevd = cupy.cuda.cusolver.ssyevd
    elif dtype == 'd':
        buffer_size = cupy.cuda.cusolver.dsyevd_bufferSize
        syevd = cupy.cuda.cusolver.dsyevd
    elif dtype == 'F':
        buffer_size = cupy.cuda.cusolver.cheevd_bufferSize
        syevd = cupy.cuda.cusolver.cheevd
    elif dtype == 'D':
        buffer_size = cupy.cuda.cusolver.zheevd_bufferSize
        syevd = cupy.cuda.cusolver.zheevd
    else:
        raise RuntimeError('Only float and double and cuComplex and ' +
                           'cuDoubleComplex are supported')

    work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr)
    work = cupy.empty(work_size, inp_v_dtype)
    syevd(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr, work.data.ptr,
          work_size, dev_info.data.ptr)
    cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        syevd, dev_info)

    return w.astype(ret_w_dtype, copy=False), v.astype(ret_v_dtype, copy=False)
Ejemplo n.º 29
0
def det_Hermitian(x, upper_or_lower=True):
    """eigen value decomposition for Hermitian matrix

    Args:
        x (cupy.ndarray): The regular matrix
        upper_or_lower: boolean
            "eigh" function is only for Hermitian matrix.
            So, to caculate eigen value, only upper or lower part of the matrix is necessary.
            When the input is the upper part of the matrix, this value is True
    Returns:
        cupy.ndarray: eigen values
    """
    if not cuda.cusolver_enabled:
        raise RuntimeError('Error : cusolver_enabled == False')
    if x.shape[-2] != x.shape[-1]:
        raise ValueError
    # to prevent `a` to be overwritten
    shape_array = x.shape
    a = x.reshape(-1, shape_array[-2], shape_array[-1]).copy()

    with_eigen_vector = False

    n = a.shape[1]
    batchSize = len(a)
    info = cupy.empty(batchSize, dtype=numpy.int32)
    cusolver_handle = device.Device().cusolver_handle

    params = my_cusolver.DnCreateSyevjInfo(cusolver_handle)

    if a.dtype.char == 'f' or a.dtype.char == 'd' or a.dtype.char == 'F' or a.dtype.char == 'D':
        dtype = a.dtype.char
    else:
        # dtype = numpy.find_common_type((a.dtype.char, 'f'), ()).char
        print("Error: input dtype is not appropriate")
        raise ValueError

    if dtype == 'f':
        eigh = my_cusolver.DnSsyevjBatched
        eigh_bufferSize = my_cusolver.DnSsyevjBatched_bufferSize
        dtype_eig_val = 'f'
    elif dtype == 'd':
        eigh = my_cusolver.DnDsyevjBatched
        eigh_bufferSize = my_cusolver.DnDsyevjBatched_bufferSize
        dtype_eig_val = 'd'
    elif dtype == 'F':
        eigh = my_cusolver.DnCheevjBatched
        eigh_bufferSize = my_cusolver.DnCheevjBatched_bufferSize
        dtype_eig_val = 'f'
    elif dtype == 'D':
        eigh = my_cusolver.DnZheevjBatched
        eigh_bufferSize = my_cusolver.DnZheevjBatched_bufferSize
        dtype_eig_val = 'd'

    jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR

    if upper_or_lower: # Hermitian行列だから,右上(upper)か左下(lower)のみ見れば良い.どちらを見るか
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    eig_val = cupy.empty((batchSize, n), dtype=dtype_eig_val) # for eigen vector

    buffersize = eigh_bufferSize(cusolver_handle, jobz, uplo, n, a.data.ptr, n, eig_val.data.ptr, params, batchSize)
    workspace = cupy.empty(buffersize, dtype=dtype)

    # LU factorization
    eigh(cusolver_handle, jobz, uplo, n, a.data.ptr, n, eig_val.data.ptr, workspace.data.ptr, buffersize, info.data.ptr, params, batchSize)

    if batchSize == 1:
        return eig_val[0].prod()
    else:
        eig_val = eig_val.prod(axis=1)
        return eig_val.reshape(shape_array[:-2])
Ejemplo n.º 30
0
def _syevj_batched(a, UPLO, with_eigen_vector):
    if a.dtype == 'f' or a.dtype == 'e':
        dtype = 'f'
        inp_w_dtype = 'f'
        inp_v_dtype = 'f'
        ret_w_dtype = a.dtype
        ret_v_dtype = a.dtype
    elif a.dtype == 'd':
        dtype = 'd'
        inp_w_dtype = 'd'
        inp_v_dtype = 'd'
        ret_w_dtype = 'd'
        ret_v_dtype = 'd'
    elif a.dtype == 'F':
        dtype = 'F'
        inp_w_dtype = 'f'
        inp_v_dtype = 'F'
        ret_w_dtype = 'f'
        ret_v_dtype = 'F'
    elif a.dtype == 'D':
        dtype = 'D'
        inp_w_dtype = 'd'
        inp_v_dtype = 'D'
        ret_w_dtype = 'd'
        ret_v_dtype = 'D'
    else:
        # NumPy uses float64 when an input is not floating point number.
        dtype = 'd'
        inp_w_dtype = 'd'
        inp_v_dtype = 'd'
        ret_w_dtype = 'd'
        ret_v_dtype = 'd'

    *batch_shape, m, lda = a.shape
    batch_size = numpy.prod(batch_shape)
    a = a.reshape(batch_size, m, lda)
    v = cupy.array(a.swapaxes(-2, -1), order='C', copy=True, dtype=inp_v_dtype)

    w = cupy.empty((batch_size, m), inp_w_dtype).swapaxes(-2, 1)
    dev_info = cupy.empty((), numpy.int32)
    handle = device.Device().cusolver_handle

    if with_eigen_vector:
        jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR
    else:
        jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR

    if UPLO == 'L':
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:  # UPLO == 'U'
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    if dtype == 'f':
        buffer_size = cusolver.ssyevjBatched_bufferSize
        syevjBatched = cusolver.ssyevjBatched
    elif dtype == 'd':
        buffer_size = cusolver.dsyevjBatched_bufferSize
        syevjBatched = cusolver.dsyevjBatched
    elif dtype == 'F':
        buffer_size = cusolver.cheevjBatched_bufferSize
        syevjBatched = cusolver.cheevjBatched
    elif dtype == 'D':
        buffer_size = cusolver.zheevjBatched_bufferSize
        syevjBatched = cusolver.zheevjBatched
    else:
        raise RuntimeError('Only float and double and cuComplex and ' +
                           'cuDoubleComplex are supported')

    params = cusolver.createSyevjInfo()
    work_size = buffer_size(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr,
                            params, batch_size)
    work = cupy.empty(work_size, inp_v_dtype)
    syevjBatched(handle, jobz, uplo, m, v.data.ptr, lda, w.data.ptr,
                 work.data.ptr, work_size, dev_info.data.ptr, params,
                 batch_size)
    cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed(
        syevjBatched, dev_info)

    cusolver.destroySyevjInfo(params)

    w = w.astype(ret_w_dtype, copy=False)
    w = w.swapaxes(-2, -1).reshape(*batch_shape, m)
    if not with_eigen_vector:
        return w
    v = v.astype(ret_v_dtype, copy=False)
    v = v.swapaxes(-2, -1).reshape(*batch_shape, m, m)
    return w, v