Exemple #1
0
    def call(self, env, mask, var, val_id, *, width=None):
        """Build the call expression for this warp-shuffle builtin.

        Args:
            env: Compilation environment, forwarded to ``Data.init`` and
                ``_compile._astype_scalar``.
            mask: Object whose ``.obj`` attribute holds the integer lane
                mask. The value is rendered in hexadecimal in the emitted
                code. On HIP the mask is ignored (a warning is issued).
            var: Value to shuffle. Its dtype name must be listed in
                ``self._dtypes``.
            val_id: "delta" for shfl_{up, down}, "srcLane" for shfl, and
                "laneMask" for shfl_xor.
            width: Optional width argument. When it is a ``Constant`` it
                must be one of 2, 4, 8, 16, 32 (or 64 on HIP, which is the
                default used there). When omitted, 64 is used on HIP and 32
                otherwise.

        Returns:
            Data: ``name(mask, var, val_id, width)`` with the ctype of
            ``var``.

        Raises:
            TypeError: If the dtype of ``var`` is unsupported, or ``mask``
                does not wrap an integer.
            ValueError: If ``mask`` is outside ``[0, 0xffffffff]`` (non-HIP
                only) or a constant ``width`` is not an accepted value.
        """
        import operator

        name = self._name

        var = Data.init(var, env)
        ctype = var.ctype
        if ctype.dtype.name not in self._dtypes:
            raise TypeError(f'`{name}` does not support {ctype.dtype} input.')

        # ``mask`` must wrap an integer: it is formatted with ``hex`` below.
        # ``operator.index`` rejects floats/strings up front instead of
        # letting them fail later with an unrelated error message.
        try:
            mask = operator.index(mask.obj)
        except (AttributeError, TypeError):
            raise TypeError('mask must be an integer') from None
        if runtime.is_hip:
            warnings.warn(f'mask {mask} is ignored on HIP', RuntimeWarning)
        elif not (0x0 <= mask <= 0xffffffff):
            raise ValueError('mask is out of range')

        # val_id refers to "delta" for shfl_{up, down}, "srcLane" for shfl, and
        # "laneMask" for shfl_xor
        if self._op in ('up', 'down'):
            val_id_t = _cuda_types.uint32
        else:
            val_id_t = _cuda_types.int32
        val_id = _compile._astype_scalar(val_id, val_id_t, 'same_kind', env)
        val_id = Data.init(val_id, env)

        if width:
            if isinstance(width, Constant):
                # The HIP default below is 64, so passing 64 explicitly must
                # be accepted there as well.
                if runtime.is_hip:
                    valid_widths = (2, 4, 8, 16, 32, 64)
                else:
                    valid_widths = (2, 4, 8, 16, 32)
                if width.obj not in valid_widths:
                    raise ValueError('width needs to be power of 2')
        else:
            width = Constant(64) if runtime.is_hip else Constant(32)
        width = _compile._astype_scalar(width, _cuda_types.int32, 'same_kind',
                                        env)
        width = Data.init(width, env)

        code = f'{name}({hex(mask)}, {var.code}, {val_id.code}'
        code += f', {width.code})'
        return Data(code, ctype)
Exemple #2
0
 def call(self, env, array, index, value, value2=None):
     """Build ``name(&array[index], value[, value2])`` for this builtin.

     Args:
         env: Compilation environment, forwarded to the helpers.
         array: Operand whose ctype must be ``CArray`` or ``Ptr``.
         index: Index applied to ``array`` to select the target element.
         value: Operand cast to the target element type ('same_kind').
         value2: Second operand; required when ``self._op`` is ``'CAS'``
             and must be omitted otherwise.

     Returns:
         Data: The generated call expression, typed like the target.

     Raises:
         TypeError: If ``array`` is not of array type or the element dtype
             is not listed in ``self._dtypes``.
         RuntimeError: If the dtype/op combination needs a newer CUDA
             version or compute capability than is available.
     """
     func_name = self._name
     kind = self._op

     array = Data.init(array, env)
     array_kinds = (_cuda_types.CArray, _cuda_types.Ptr)
     if not isinstance(array.ctype, array_kinds):
         raise TypeError('The first argument must be of array type.')

     target = _compile._indexing(array, index, env)
     ctype = target.ctype
     if ctype.dtype.name not in self._dtypes:
         raise TypeError(
             f'`{func_name}` does not support {ctype.dtype} input.')

     # On HIP, 'e' is not supported and we will never reach here
     if kind == 'Add' and ctype.dtype.char == 'e':
         if runtime.runtimeGetVersion() < 10000:
             raise RuntimeError(
                 'float16 atomic operation is not supported before CUDA 10.0.')

     value = Data.init(
         _compile._astype_scalar(value, ctype, 'same_kind', env), env)

     if kind != 'CAS':
         # Only the compare-and-swap form takes a second operand.
         assert value2 is None
         return Data(f'{func_name}(&{target.code}, {value.code})', ctype)

     assert value2 is not None
     # On HIP, 'H' is not supported and we will never reach here
     if ctype.dtype.char == 'H':
         if runtime.runtimeGetVersion() < 10010:
             raise RuntimeError(
                 'uint16 atomic operation is not supported before CUDA 10.1')
         if int(device.get_compute_capability()) < 70:
             raise RuntimeError(
                 'uint16 atomic operation is not supported before sm_70')

     value2 = Data.init(
         _compile._astype_scalar(value2, ctype, 'same_kind', env), env)
     return Data(
         f'{func_name}(&{target.code}, {value.code}, {value2.code})', ctype)
Exemple #3
0
 def call(self, env, array, index, value):
     """Build ``name(&array[index], value)`` for this atomic builtin.

     Args:
         env: Compilation environment, forwarded to the helpers.
         array: Operand whose ctype must be ``CArray`` or ``Ptr``.
         index: Index applied to ``array`` to select the target element.
         value: Operand cast to the target element type ('same_kind').

     Returns:
         Data: The generated call expression, typed like the target.

     Raises:
         TypeError: If ``array`` is not of array type or the element dtype
             char is not listed in ``self._dtypes``.
         RuntimeError: If a float16 target is used with a runtime version
             below 10000 (CUDA 10.0).
     """
     name = self._name
     array = Data.init(array, env)
     if not isinstance(array.ctype, (_cuda_types.CArray, _cuda_types.Ptr)):
         raise TypeError('The first argument must be of array type.')
     target = _compile._indexing(array, index, env)
     ctype = target.ctype

     # Validate the element type before casting ``value`` to it, so an
     # unsupported dtype is reported as such rather than as a cast failure.
     if ctype.dtype.char not in self._dtypes:
         raise TypeError(f'`{name}` does not support {ctype.dtype} input.')
     if ctype.dtype.char == 'e' and runtime.runtimeGetVersion() < 10000:
         raise RuntimeError(
             'float16 atomic operation is not supported before CUDA 10.0.')

     value = _compile._astype_scalar(value, ctype, 'same_kind', env)
     value = Data.init(value, env)
     return Data(f'{name}(&{target.code}, {value.code})', ctype)
Exemple #4
0
    def call(self, env, *, mask=None):
        """Build a ``__syncwarp`` call expression.

        Args:
            env: Compilation environment, forwarded to the helpers.
            mask: Optional mask argument. On HIP a given mask is dropped
                after emitting a ``RuntimeWarning``. A ``Constant`` mask
                must lie in ``[0, 0xffffffff]``.

        Returns:
            Data: ``__syncwarp()`` or ``__syncwarp(mask)`` of type void.

        Raises:
            ValueError: If a constant mask is out of range.
        """
        if runtime.is_hip and mask is not None:
            warnings.warn(f'mask {mask} is ignored on HIP', RuntimeWarning)
            mask = None

        # No (usable) mask: emit the argument-less form.
        if not mask:
            return Data('__syncwarp()', _cuda_types.void)

        # Only compile-time constants can be range-checked here.
        if isinstance(mask, Constant) and not (0x0 <= mask.obj <= 0xffffffff):
            raise ValueError('mask is out of range')

        mask_arg = _compile._astype_scalar(
            mask, _cuda_types.int32, 'same_kind', env)
        mask_arg = Data.init(mask_arg, env)
        return Data(f'__syncwarp({mask_arg.code})', _cuda_types.void)
Exemple #5
0
    def call(self,
             env,
             group,
             dst,
             dst_idx,
             src,
             src_idx,
             size,
             *,
             aligned_size=None):
        """Build a ``cg::memcpy_async`` call expression.

        Args:
            env: Compilation environment, forwarded to the helpers.
            group: Object whose ``.code`` is emitted as the first argument.
            dst: Destination; its ctype must be ``CArray`` or ``Ptr``.
            dst_idx: Index applied to ``dst``.
            src: Source; its ctype must be ``CArray`` or ``Ptr``.
            src_idx: Index applied to ``src``.
            size: Size argument, cast to uint32 ('same_kind').
            aligned_size: Optional compile-time constant; when given, the
                size is wrapped in ``cuda::aligned_size_t<...>``.

        Returns:
            Data: The generated call expression of type void.

        Raises:
            RuntimeError: If the runtime version is below 11010.
            TypeError: If ``dst`` or ``src`` is not of array type.
            ValueError: If ``aligned_size`` is given but is not a constant.
        """
        # the overloaded version of memcpy_async that we use does not yet
        # exist in CUDA 11.0
        if _runtime.runtimeGetVersion() < 11010:
            raise RuntimeError("not supported in CUDA < 11.1")
        for header in ('cg', 'cg_memcpy_async'):
            _check_include(env, header)

        dst = _Data.init(dst, env)
        src = _Data.init(src, env)
        if not all(
                isinstance(buf.ctype, (_cuda_types.CArray, _cuda_types.Ptr))
                for buf in (dst, src)):
            raise TypeError('dst/src must be of array type.')
        dst_elem = _compile._indexing(dst, dst_idx, env)
        src_elem = _compile._indexing(src, src_idx, env)

        # it's very unlikely that the size would exceed 2^32, so we just
        # pick uint32 for simplicity
        size_arg = _compile._astype_scalar(
            size, _cuda_types.uint32, 'same_kind', env)
        size_arg = _Data.init(size_arg, env)
        size_code = size_arg.code

        if aligned_size:
            if not isinstance(aligned_size, _Constant):
                raise ValueError(
                    'aligned_size must be a compile-time constant')
            _check_include(env, 'cuda_barrier')
            alignment = aligned_size.obj
            size_code = f'cuda::aligned_size_t<{alignment}>({size_code})'

        call_code = (f'cg::memcpy_async({group.code}, &({dst_elem.code}), '
                     f'&({src_elem.code}), {size_code})')
        return _Data(call_code, _cuda_types.void)