def call(self, env, mask, var, val_id, *, width=None):
    """Emit a warp-shuffle intrinsic call for ``var``.

    Validates the input dtype against ``self._dtypes``, checks ``mask``
    (ignored with a warning on HIP), casts ``val_id`` and ``width`` to the
    expected scalar types, and returns the generated call expression.
    """
    fn_name = self._name
    var = Data.init(var, env)
    ctype = var.ctype
    if ctype.dtype.name not in self._dtypes:
        raise TypeError(f'`{fn_name}` does not support {ctype.dtype} input.')
    try:
        mask = mask.obj
    except Exception:
        raise TypeError('mask must be an integer')
    if runtime.is_hip:
        # HIP has no notion of a participation mask; warn and carry on.
        warnings.warn(f'mask {mask} is ignored on HIP', RuntimeWarning)
    elif not (0x0 <= mask <= 0xffffffff):
        raise ValueError('mask is out of range')

    # val_id refers to "delta" for shfl_{up, down}, "srcLane" for shfl, and
    # "laneMask" for shfl_xor
    id_ctype = (_cuda_types.uint32 if self._op in ('up', 'down')
                else _cuda_types.int32)
    val_id = Data.init(
        _compile._astype_scalar(val_id, id_ctype, 'same_kind', env), env)

    if not width:
        # Default to the full warp width (64 lanes on HIP, 32 on CUDA).
        width = Constant(64) if runtime.is_hip else Constant(32)
    elif isinstance(width, Constant) and width.obj not in (2, 4, 8, 16, 32):
        raise ValueError('width needs to be power of 2')
    width = Data.init(
        _compile._astype_scalar(width, _cuda_types.int32, 'same_kind', env),
        env)

    code = f'{fn_name}({hex(mask)}, {var.code}, {val_id.code}'
    code = code + f', {width.code})'
    return Data(code, ctype)
def call(self, env, array, index, value, value2=None):
    """Emit an atomic operation on ``array[index]``.

    ``value2`` is required iff the operation is ``CAS`` (compare-and-swap);
    all operands are cast to the element ctype before codegen.
    """
    fn = self._name
    kind = self._op
    array = Data.init(array, env)
    if not isinstance(array.ctype, (_cuda_types.CArray, _cuda_types.Ptr)):
        raise TypeError('The first argument must be of array type.')
    target = _compile._indexing(array, index, env)
    ctype = target.ctype
    if ctype.dtype.name not in self._dtypes:
        raise TypeError(f'`{fn}` does not support {ctype.dtype} input.')
    # On HIP, 'e' is not supported and we will never reach here
    if (kind == 'Add' and ctype.dtype.char == 'e'
            and runtime.runtimeGetVersion() < 10000):
        raise RuntimeError(
            'float16 atomic operation is not supported before CUDA 10.0.')
    value = Data.init(
        _compile._astype_scalar(value, ctype, 'same_kind', env), env)

    if kind != 'CAS':
        assert value2 is None
        return Data(f'{fn}(&{target.code}, {value.code})', ctype)

    assert value2 is not None
    # On HIP, 'H' is not supported and we will never reach here
    if ctype.dtype.char == 'H':
        if runtime.runtimeGetVersion() < 10010:
            raise RuntimeError(
                'uint16 atomic operation is not supported before '
                'CUDA 10.1')
        if int(device.get_compute_capability()) < 70:
            raise RuntimeError(
                'uint16 atomic operation is not supported before '
                'sm_70')
    value2 = Data.init(
        _compile._astype_scalar(value2, ctype, 'same_kind', env), env)
    return Data(f'{fn}(&{target.code}, {value.code}, {value2.code})', ctype)
def call(self, env, array, index, value):
    """Emit a simple (non-CAS) atomic operation on ``array[index]``.

    The element ctype of the indexed target determines both the dtype
    check and the cast applied to ``value``.
    """
    name = self._name
    array = Data.init(array, env)
    if not isinstance(array.ctype, (_cuda_types.CArray, _cuda_types.Ptr)):
        raise TypeError('The first argument must be of array type.')
    target = _compile._indexing(array, index, env)
    ctype = target.ctype
    # Validate dtype support BEFORE attempting the cast, matching the check
    # order of the general atomic-op handler, so an unsupported dtype gets
    # the intended TypeError rather than failing inside the cast.
    if ctype.dtype.char not in self._dtypes:
        raise TypeError(f'`{name}` does not support {ctype.dtype} input.')
    if ctype.dtype.char == 'e' and runtime.runtimeGetVersion() < 10000:
        raise RuntimeError(
            'float16 atomic operation is not supported in this CUDA '
            'version.')
    value = _compile._astype_scalar(value, ctype, 'same_kind', env)
    value = Data.init(value, env)
    return Data(f'{name}(&{target.code}, {value.code})', ctype)
def call(self, env, *, mask=None):
    """Emit ``__syncwarp()``, optionally with an explicit lane mask.

    On HIP the mask is not supported: a warning is emitted and the
    mask-less form is generated instead.
    """
    if runtime.is_hip:
        if mask is not None:
            warnings.warn(f'mask {mask} is ignored on HIP', RuntimeWarning)
        mask = None

    if not mask:
        return Data('__syncwarp()', _cuda_types.void)

    # A compile-time constant mask must fit in 32 bits.
    if isinstance(mask, Constant) and not (0x0 <= mask.obj <= 0xffffffff):
        raise ValueError('mask is out of range')
    mask = Data.init(
        _compile._astype_scalar(mask, _cuda_types.int32, 'same_kind', env),
        env)
    return Data(f'__syncwarp({mask.code})', _cuda_types.void)
def call(self, env, group, dst, dst_idx, src, src_idx, size, *,
         aligned_size=None):
    """Emit a ``cg::memcpy_async`` call for the given cooperative group.

    ``dst``/``src`` must be array-typed; ``dst_idx``/``src_idx`` select the
    starting elements. ``aligned_size``, if given, must be a compile-time
    constant and wraps the byte count in ``cuda::aligned_size_t``.
    """
    if _runtime.runtimeGetVersion() < 11010:
        # the overloaded version of memcpy_async that we use does not yet
        # exist in CUDA 11.0
        raise RuntimeError("not supported in CUDA < 11.1")
    _check_include(env, 'cg')
    _check_include(env, 'cg_memcpy_async')

    dst = _Data.init(dst, env)
    src = _Data.init(src, env)
    for operand in (dst, src):
        if not isinstance(
                operand.ctype, (_cuda_types.CArray, _cuda_types.Ptr)):
            raise TypeError('dst/src must be of array type.')
    dst = _compile._indexing(dst, dst_idx, env)
    src = _compile._indexing(src, src_idx, env)

    # it's very unlikely that the size would exceed 2^32, so we just
    # pick uint32 for simplicity
    size = _Data.init(
        _compile._astype_scalar(size, _cuda_types.uint32, 'same_kind', env),
        env)

    size_code = f'{size.code}'
    if aligned_size:
        if not isinstance(aligned_size, _Constant):
            raise ValueError(
                'aligned_size must be a compile-time constant')
        _check_include(env, 'cuda_barrier')
        size_code = f'cuda::aligned_size_t<{aligned_size.obj}>({size_code})'
    return _Data(
        f'cg::memcpy_async({group.code}, &({dst.code}), '
        f'&({src.code}), {size_code})', _cuda_types.void)