Example #1
    def test_copy_multi_device_with_stream(self):
        # A kernel that runs long enough and then finally writes values.
        src = _test_copy_multi_device_with_stream_src
        if runtime.is_hip and driver.get_build_version() >= 5_00_00000:
            src = '#include <ctime>\n' + src
        kern = cupy.RawKernel(src, 'wait_and_write')

        # Allocate memory and launch the kernel on a device with its own
        # stream.
        with cuda.Device(0):
            # Keep this stream alive over the D2D copy below for HIP
            with cuda.Stream() as s1:  # NOQA
                a = cupy.zeros((2,), dtype=numpy.uint64)
                kern((1,), (1,), a)

        # A D2D copy to another device on another stream should see the
        # original values of the memory, because the kernel on the first
        # device has not made its write yet.
        with cuda.Device(1):
            with cuda.Stream():
                b = a.copy()
                testing.assert_array_equal(
                    b, numpy.array([0, 0], dtype=numpy.uint64))
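
A minimal, self-contained sketch of the same RawKernel/Stream pattern the test exercises; the kernel source below is a hypothetical stand-in for the module-level _test_copy_multi_device_with_stream_src string used above:

import cupy
import numpy

_src = r'''
extern "C" __global__ void write_value(unsigned long long* x) {
    x[0] = 1;
}
'''
kern = cupy.RawKernel(_src, 'write_value')
with cupy.cuda.Device(0):
    with cupy.cuda.Stream():
        a = cupy.zeros((2,), dtype=numpy.uint64)
        kern((1,), (1,), (a,))  # grid, block, kernel arguments
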
Example #2
 def __mul__(self, other):
     if cupy.isscalar(other):
         self.sum_duplicates()
         return self._with_data(self.data * other)
     elif isspmatrix_csr(other):
         self.sum_duplicates()
         other.sum_duplicates()
         if cusparse.check_availability('csrgemm2'):
             return cusparse.csrgemm2(self, other)
         elif cusparse.check_availability('csrgemm'):
             return cusparse.csrgemm(self, other)
         else:
             raise NotImplementedError
     elif csc.isspmatrix_csc(other):
         self.sum_duplicates()
         other.sum_duplicates()
         if cusparse.check_availability('csrgemm'):
             return cusparse.csrgemm(self, other.T, transb=True)
         elif cusparse.check_availability('csrgemm2'):
             b = other.tocsr()
             b.sum_duplicates()
             return cusparse.csrgemm2(self, b)
         else:
             raise NotImplementedError
     elif base.isspmatrix(other):
         return self * other.tocsr()
     elif base.isdense(other):
         if other.ndim == 0:
             self.sum_duplicates()
             return self._with_data(self.data * other)
         elif other.ndim == 1:
             self.sum_duplicates()
             other = cupy.asfortranarray(other)
             # need extra padding to ensure not stepping on the CUB bug,
             # see cupy/cupy#3679 for discussion
             is_cub_safe = (self.indptr.data.mem.size >
                            self.indptr.size * self.indptr.dtype.itemsize)
             # CUB spmv is buggy since CUDA 11.0, see
             # https://github.com/cupy/cupy/issues/3822#issuecomment-782607637
             is_cub_safe &= (driver.get_build_version() < 11000)
             for accelerator in _accelerator.get_routine_accelerators():
                 if (accelerator == _accelerator.ACCELERATOR_CUB
                         and is_cub_safe and other.flags.c_contiguous):
                     return cub.device_csrmv(self.shape[0], self.shape[1],
                                             self.nnz, self.data,
                                             self.indptr, self.indices,
                                             other)
             if (cusparse.check_availability('csrmvEx') and self.nnz > 0
                     and cusparse.csrmvExIsAligned(self, other)):
                 # csrmvEx does not work if nnz == 0
                 csrmv = cusparse.csrmvEx
             elif cusparse.check_availability('csrmv'):
                 csrmv = cusparse.csrmv
             elif cusparse.check_availability('spmv'):
                 csrmv = cusparse.spmv
             else:
                 raise NotImplementedError
             return csrmv(self, other)
         elif other.ndim == 2:
             self.sum_duplicates()
             if cusparse.check_availability('csrmm2'):
                 csrmm = cusparse.csrmm2
             elif cusparse.check_availability('spmm'):
                 csrmm = cusparse.spmm
             else:
                 raise NotImplementedError
             return csrmm(self, cupy.asfortranarray(other))
         else:
             raise ValueError('could not interpret dimensions')
     else:
         return NotImplemented
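
For context, a hedged usage sketch of the CSR __mul__ dispatch above; the small matrix and operands are hypothetical, and which backend routine is picked depends on the cuSPARSE/CUB availability checks shown in the method:

import cupy
import cupyx.scipy.sparse as sparse

# 2x3 CSR matrix [[1, 0, 2], [0, 3, 0]] built from (data, indices, indptr)
A = sparse.csr_matrix(
    (cupy.array([1., 2., 3.]),
     cupy.array([0, 2, 1], dtype=cupy.int32),
     cupy.array([0, 2, 3], dtype=cupy.int32)),
    shape=(2, 3))

C = A * 2.0                        # scalar branch: scales A.data
y = A * cupy.array([1., 2., 3.])   # ndim == 1: SpMV via cub/csrmvEx/csrmv/spmv
Y = A * cupy.ones((3, 4))          # ndim == 2: SpMM via csrmm2/spmm
B = A * A.T.tocsr()                # CSR x CSR: csrgemm2/csrgemm
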
Example #3
from cupy_backends.cuda.libs import nvrtc  # NOQA
from cupy_backends.cuda.libs import profiler  # NOQA

_available = None


class _UnavailableModule():
    available = False

    def __init__(self, name):
        self.__name__ = name


from cupy.cuda import cub  # NOQA

if not runtime.is_hip and driver.get_build_version() > 0:
    from cupy.cuda import jitify  # NOQA
else:
    jitify = None

try:
    from cupy_backends.cuda.libs import nvtx  # NOQA
except ImportError:
    nvtx = _UnavailableModule('cupy.cuda.nvtx')

try:
    from cupy.cuda import thrust  # NOQA
except ImportError:
    thrust = _UnavailableModule('cupy.cuda.thrust')
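
The _UnavailableModule placeholder lets callers probe optional backends uniformly. A small sketch of how that flag can be checked before use; the NVTX range calls are real CuPy APIs, while the surrounding guard is only illustrative:

from cupy import cuda

# cuda.nvtx is either the real module or an _UnavailableModule placeholder,
# so checking its `available` attribute (defaulting to True) is safe.
if getattr(cuda.nvtx, 'available', True):
    cuda.nvtx.RangePush('my-region')
    # ... work to be annotated in the profiler timeline ...
    cuda.nvtx.RangePop()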

Example #4
import os
import re
import shutil
import subprocess
import sys
import tempfile

from cupy.cuda import device
from cupy.cuda import function
from cupy_backends.cuda.api import driver
from cupy_backends.cuda.api import runtime
from cupy_backends.cuda.libs import nvrtc
from cupy import _util

if not runtime.is_hip:
    _cuda_version = driver.get_build_version()
    if _cuda_version > 0:
        from cupy.cuda.jitify import jitify


_nvrtc_version = None
_win32 = sys.platform.startswith('win32')
_rdc_flags = ('--device-c', '-dc', '-rdc=true',
              '--relocatable-device-code=true')
_cudadevrt = None


class NVCCException(Exception):
    pass
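
As a quick illustration of the version probes this module relies on, a hedged sketch that prints the backend versions at runtime; only the CUDA path is shown and the printed labels are illustrative:

from cupy_backends.cuda.api import driver, runtime
from cupy_backends.cuda.libs import nvrtc

if not runtime.is_hip:
    cuda_version = driver.get_build_version()  # e.g. 11020 for CUDA 11.2
    if cuda_version > 0:
        major, minor = nvrtc.getVersion()
        print('CUDA build version:', cuda_version)
        print('NVRTC version: %d.%d' % (major, minor))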

Example #5
def _compile_with_cache_hip(source, options, arch, cache_dir, extra_source,
                            backend='hiprtc', name_expressions=None,
                            log_stream=None, cache_in_memory=False,
                            use_converter=True):
    global _empty_file_preprocess_cache

    # TODO(leofang): this might be possible but is currently undocumented
    if _is_cudadevrt_needed(options):
        raise ValueError('separate compilation is not supported in HIP')

    # HIP's equivalent of -ftz=true, see ROCm-Developer-Tools/HIP#2252
    # Notes:
    # - For hipcc, this should just work, as invalid options would cause errors
    #   See https://clang.llvm.org/docs/ClangCommandLineReference.html.
    # - For hiprtc, this is a no-op until the compiler options like -D and -I
    #   are accepted, see ROCm-Developer-Tools/HIP#2182 and
    #   ROCm-Developer-Tools/HIP#2248
    options += ('-fcuda-flush-denormals-to-zero',)

    # Workaround ROCm 4.3 LLVM_PATH issue in hipRTC #5689
    rocm_build_version = driver.get_build_version()
    if rocm_build_version >= 40300000 and rocm_build_version < 40500000:
        options += (
            '-I' + get_rocm_path() + '/llvm/lib/clang/13.0.0/include/',)

    if cache_dir is None:
        cache_dir = get_cache_dir()
    # As of ROCm 3.5.0 hiprtc/hipcc can automatically pick up the
    # right arch without setting HCC_AMDGPU_TARGET, so we don't need
    # to tell the compiler which arch we are targeting. But, we still
    # need to know arch as part of the cache key:
    if arch is None:
        # On HIP, gcnArch is computed from "compute capability":
        # https://github.com/ROCm-Developer-Tools/HIP/blob/rocm-4.0.0/rocclr/hip_device.cpp#L202
        arch = device.Device().compute_capability
    if use_converter:
        source = _convert_to_hip_source(source, extra_source,
                                        is_hiprtc=(backend == 'hiprtc'))

    env = (arch, options, _get_nvrtc_version(), backend)
    base = _empty_file_preprocess_cache.get(env, None)
    if base is None:
        # This is for checking HIPRTC/HIPCC compiler internal version
        if backend == 'hiprtc':
            base = _preprocess_hiprtc('', options)
        else:
            base = _preprocess_hipcc('', options)
        _empty_file_preprocess_cache[env] = base

    key_src = '%s %s %s %s' % (env, base, source, extra_source)
    key_src = key_src.encode('utf-8')
    name = _hash_hexdigest(key_src) + '.hsaco'

    mod = function.Module()

    if not cache_in_memory:
        # Read from disk cache
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir, exist_ok=True)

        # To handle conflicts in concurrent situations, we adopt a lock-free
        # method to avoid performance degradation.
        # We force recompilation to retrieve the C++ mangled names if so
        # desired.
        path = os.path.join(cache_dir, name)
        if os.path.exists(path) and not name_expressions:
            with open(path, 'rb') as f:
                data = f.read()
            if len(data) >= _hash_length:
                hash_value = data[:_hash_length]
                binary = data[_hash_length:]
                binary_hash = _hash_hexdigest(binary).encode('ascii')
                if hash_value == binary_hash:
                    mod.load(binary)
                    return mod
    else:
        # Enforce compiling -- the resulting kernel will be cached elsewhere,
        # so we do nothing
        pass

    if backend == 'hiprtc':
        # compile_using_nvrtc calls hiprtc for hip builds
        binary, mapping = compile_using_nvrtc(
            source, options, arch, name + '.cu', name_expressions,
            log_stream, cache_in_memory)
        mod._set_mapping(mapping)
    else:
        binary = compile_using_hipcc(source, options, arch, log_stream)

    if not cache_in_memory:
        # Write to disk cache
        binary_hash = _hash_hexdigest(binary).encode('ascii')

        # shutil.move is not an atomic operation, so it could result in a
        # corrupted file. We detect this by prepending a hash to the
        # beginning of each cache file; if the file is corrupted, it will
        # be ignored the next time it is read.
        with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf:
            tf.write(binary_hash)
            tf.write(binary)
            temp_path = tf.name
        shutil.move(temp_path, path)

        # Save .cu source file along with .hsaco
        if _get_bool_env_variable('CUPY_CACHE_SAVE_CUDA_SOURCE', False):
            with open(path + '.cpp', 'w') as f:
                f.write(source)
    else:
        # we don't do any disk I/O
        pass

    mod.load(binary)
    return mod
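
A standalone sketch of the corruption check used for the disk cache above; hashlib.sha1 is only a stand-in for the module's _hash_hexdigest/_hash_length helpers, which are not shown in this excerpt:

import hashlib
import os

def _hash_hexdigest(data):
    return hashlib.sha1(data).hexdigest()

_hash_length = 40  # length of a sha1 hex digest

def load_cached_binary(path):
    """Return the cached binary, or None if the file is missing/corrupted."""
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        data = f.read()
    if len(data) < _hash_length:
        return None  # truncated file
    hash_value, binary = data[:_hash_length], data[_hash_length:]
    if hash_value != _hash_hexdigest(binary).encode('ascii'):
        return None  # corrupted file: ignore it and recompile
    return binary
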
Example #6
File: _csc.py Project: takagi/cupy
 def __mul__(self, other):
     if cupy.isscalar(other):
         self.sum_duplicates()
         return self._with_data(self.data * other)
     elif cupyx.scipy.sparse.isspmatrix_csr(other):
         self.sum_duplicates()
         other.sum_duplicates()
         if cusparse.check_availability('csrgemm') and not runtime.is_hip:
             # trans=True is still buggy as of ROCm 4.2.0
             a = self.T
             return cusparse.csrgemm(a, other, transa=True)
         elif cusparse.check_availability('csrgemm2'):
             a = self.tocsr()
             a.sum_duplicates()
             return cusparse.csrgemm2(a, other)
         else:
             raise NotImplementedError
     elif isspmatrix_csc(other):
         self.sum_duplicates()
         other.sum_duplicates()
         if cusparse.check_availability('csrgemm') and not runtime.is_hip:
             # trans=True is still buggy as of ROCm 4.2.0
             a = self.T
             b = other.T
             return cusparse.csrgemm(a, b, transa=True, transb=True)
         elif cusparse.check_availability('csrgemm2'):
             a = self.tocsr()
             b = other.tocsr()
             a.sum_duplicates()
             b.sum_duplicates()
             return cusparse.csrgemm2(a, b)
         else:
             raise NotImplementedError
     elif cupyx.scipy.sparse.isspmatrix(other):
         return self * other.tocsr()
     elif _base.isdense(other):
         if other.ndim == 0:
             self.sum_duplicates()
             return self._with_data(self.data * other)
         elif other.ndim == 1:
             self.sum_duplicates()
             if (cusparse.check_availability('csrmv')
                     and (not runtime.is_hip
                          or driver.get_build_version() >= 5_00_00000)):
                 # trans=True is buggy as of ROCm 4.2.0
                 csrmv = cusparse.csrmv
             elif (cusparse.check_availability('spmv')
                   and not runtime.is_hip):
                 # trans=True is buggy as of ROCm 4.2.0
                 # (I got HIPSPARSE_STATUS_INTERNAL_ERROR...)
                 csrmv = cusparse.spmv
             else:
                 raise NotImplementedError
             return csrmv(self.T, cupy.asfortranarray(other), transa=True)
         elif other.ndim == 2:
             self.sum_duplicates()
             if (cusparse.check_availability('csrmm2')
                     and (not runtime.is_hip
                          or driver.get_build_version() >= 5_00_00000)):
                 # trans=True is buggy as of ROCm 4.2.0
                 csrmm = cusparse.csrmm2
             elif cusparse.check_availability('spmm'):
                 csrmm = cusparse.spmm
             else:
                 raise NotImplementedError
             return csrmm(self.T, cupy.asfortranarray(other), transa=True)
         else:
             raise ValueError('could not interpret dimensions')
     else:
         return NotImplemented
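
A hedged usage sketch of the CSC path above. Note that it reuses the CSR kernels by transposing (csrmv/csrmm2 with transa=True), which is why the dense operand is made Fortran-contiguous first; the small matrix is hypothetical:

import cupy
import cupyx.scipy.sparse as sparse

# Same 2x3 matrix as in Example #2, converted to CSC
A = sparse.csr_matrix(
    (cupy.array([1., 2., 3.]),
     cupy.array([0, 2, 1], dtype=cupy.int32),
     cupy.array([0, 2, 3], dtype=cupy.int32)),
    shape=(2, 3)).tocsc()

y = A * cupy.arange(3, dtype=cupy.float64)  # ndim == 1: csrmv/spmv on A.T
Y = A * cupy.ones((3, 4))                   # ndim == 2: csrmm2/spmm on A.T
B = A * A.T                                 # CSC x CSR: csrgemm/csrgemm2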