def setUp(self):
    compute_capability = int(device.get_compute_capability())
    # float16 ('e') operands require compute capability 7.0 or newer.
    if compute_capability < 70 and 'e' in self.dtype_combo:
        self.skipTest("Not supported")
    dtype_chars = list(self.dtype_combo)
    self.a_dtype = numpy.dtype(dtype_chars[0])
    self.b_dtype = numpy.dtype(dtype_chars[1])
    self.c_dtype = numpy.dtype(dtype_chars[2])
    self.tol = self._tol[dtype_chars[2].lower()]
    self.compute_type = cupy.core.core.COMPUTE_TYPE_DEFAULT
    if self.compute_type_hint == 'down-convert':
        # Accumulate in a lower-precision compute type than the output
        # dtype, so loosen the tolerance accordingly.
        if self.c_dtype.char in 'fF':
            self.compute_type = cupy.core.core.COMPUTE_TYPE_FP16
            self.tol = self._tol['e']
        elif self.c_dtype.char in 'dD':
            self.compute_type = cupy.core.core.COMPUTE_TYPE_FP32
            self.tol = self._tol['f']
    elif self.compute_type_hint == 'TF32':
        if self.c_dtype.char in 'fF':
            self.compute_type = cupy.core.core.COMPUTE_TYPE_TF32
            self.tol = self._tol['e']
    m, n, k = self.shape
    self.a = self.make_matrix((m, k), self.a_dtype)
    self.b = self.make_matrix((k, n), self.b_dtype)
    self.c = self.make_matrix((m, n), self.c_dtype)
    self.c_ref = self.alpha * cupy.matmul(self.a, self.b)
    self.c_ref += self.beta * self.c
    # Remember the current compute type so it can be restored after the test.
    self.old_compute_type = cupy.core.get_compute_type(self.c_dtype)
    cupy.core.set_compute_type(self.c_dtype, self.compute_type)
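# Illustrative sketch only: the setUp above expects a parameterized test class
# that supplies `dtype_combo`, `compute_type_hint`, `shape`, `alpha`, `beta`,
# a per-dtype tolerance table `_tol`, and a `make_matrix` helper. The class
# name, parameter values, and tolerances below are assumptions made for
# illustration, not the actual test suite.
import unittest

import cupy
from cupy import testing


@testing.parameterize(*testing.product({
    'dtype_combo': ['eee', 'fff', 'ddd'],
    'compute_type_hint': [None, 'down-convert', 'TF32'],
    'shape': [(64, 96, 32)],
    'alpha': [1.0],
    'beta': [0.5],
}))
class _MatmulComputeTypeBase(unittest.TestCase):

    # Tolerance keyed by the (lower-cased) output dtype character.
    _tol = {'e': 1e-3, 'f': 1e-6, 'd': 1e-14}

    def make_matrix(self, shape, dtype):
        # Random operands in [0, 1).
        return testing.shaped_random(shape, cupy, dtype, scale=1.0)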
def check_availability(name):
    if name not in _available_cuda_version:
        msg = 'No available version information specified for {}'.format(name)
        raise ValueError(msg)
    version_added, version_removed = _available_cuda_version[name]
    cuda_version = _runtime.runtimeGetVersion()
    if version_added is not None and cuda_version < version_added:
        return False
    if version_removed is not None and cuda_version >= version_removed:
        return False
    if name in _available_compute_capability:
        compute_capability = int(_device.get_compute_capability())
        if compute_capability < _available_compute_capability[name]:
            return False
    return True
def test_contraction(self):
    compute_capability = int(device.get_compute_capability())
    if compute_capability < 70 and self.dtype == numpy.float16:
        self.skipTest('Not supported.')
    desc_a = cutensor.create_tensor_descriptor(self.a)
    desc_b = cutensor.create_tensor_descriptor(self.b)
    desc_c = cutensor.create_tensor_descriptor(self.c)
    d = cutensor.contraction(self.alpha, self.a, desc_a, self.mode_a,
                             self.b, desc_b, self.mode_b,
                             self.beta, self.c, desc_c, self.mode_c)
    # The contraction is written into `c` in place and the same array is
    # returned.
    assert self.c is d
    testing.assert_allclose(
        self.alpha * self.a_transposed * self.b_transposed +
        self.beta * self.c_transposed,
        d, rtol=self.tol, atol=self.tol)
def test_atomic_cas(self, dtype):
    if dtype == cupy.uint16:
        if (runtime.is_hip or runtime.runtimeGetVersion() < 10010
                or int(device.get_compute_capability()) < 70):
            self.skipTest('not supported')

    @jit.rawkernel()
    def f(x, y, out):
        tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x
        if tid < x.size:
            # out[tid] = y[tid] if out[tid] == x[tid] else out[tid]
            jit.atomic_cas(out, tid, x[tid], y[tid])

    x = cupy.arange(1024, dtype=dtype)
    y = x.copy()
    y[512:] = 0
    out = x.copy()
    out[:512] = 0
    f((32,), (32,), (x, y, out))
    expected = cupy.zeros_like(out)
    self._check(out, expected)
def check_availability(name):
    if not _runtime.is_hip:
        available_version = _available_cuda_version
        version = _runtime.runtimeGetVersion()
    else:
        available_version = _available_hip_version
        # TODO(leofang): use HIP_VERSION instead?
        version = _cusolver._getVersion()
        version = version[0] * 100 + version[1]
    if name not in available_version:
        msg = 'No available version information specified for {}'.format(name)
        raise ValueError(msg)
    version_added, version_removed = available_version[name]
    if version_added is not None and version < version_added:
        return False
    if version_removed is not None and version >= version_removed:
        return False
    # CUDA specific stuff
    if name in _available_compute_capability:
        compute_capability = int(_device.get_compute_capability())
        if compute_capability < _available_compute_capability[name]:
            return False
    return True
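# Illustrative only: check_availability above consults module-level tables that
# map a routine name to (version_added, version_removed) and, optionally, to a
# minimum compute capability. The entry names and numbers below are placeholder
# assumptions for illustration, not the library's actual support matrix.
_available_cuda_version = {
    'gesvdj': (9000, None),              # placeholder: added in CUDA 9.0, never removed
    'some_removed_routine': (8000, 11000),  # placeholder: dropped as of CUDA 11.0
}
_available_hip_version = {
    'gesvdj': (305, None),               # placeholder: rocSOLVER major * 100 + minor
}
_available_compute_capability = {
    'some_tensor_core_routine': 70,      # placeholder: requires sm_70 or newer
}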
def call(self, env, array, index, value, value2=None):
    name = self._name
    op = self._op
    array = Data.init(array, env)
    if not isinstance(array.ctype, (_cuda_types.CArray, _cuda_types.Ptr)):
        raise TypeError('The first argument must be of array type.')
    target = _compile._indexing(array, index, env)
    ctype = target.ctype
    if ctype.dtype.name not in self._dtypes:
        raise TypeError(f'`{name}` does not support {ctype.dtype} input.')
    # On HIP, 'e' is not supported and we will never reach here
    if (op == 'Add' and ctype.dtype.char == 'e'
            and runtime.runtimeGetVersion() < 10000):
        raise RuntimeError(
            'float16 atomic operation is not supported before CUDA 10.0.')
    value = _compile._astype_scalar(value, ctype, 'same_kind', env)
    value = Data.init(value, env)
    if op == 'CAS':
        assert value2 is not None
        # On HIP, 'H' is not supported and we will never reach here
        if ctype.dtype.char == 'H':
            if runtime.runtimeGetVersion() < 10010:
                raise RuntimeError(
                    'uint16 atomic operation is not supported before '
                    'CUDA 10.1')
            if int(device.get_compute_capability()) < 70:
                raise RuntimeError(
                    'uint16 atomic operation is not supported before '
                    'sm_70')
        value2 = _compile._astype_scalar(value2, ctype, 'same_kind', env)
        value2 = Data.init(value2, env)
        code = f'{name}(&{target.code}, {value.code}, {value2.code})'
    else:
        assert value2 is None
        code = f'{name}(&{target.code}, {value.code})'
    return Data(code, ctype)
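# Illustrative usage only: the `call` above is the compilation path behind the
# `jit.atomic_*` helpers inside a cupyx.jit kernel. A minimal kernel exercising
# it is sketched below; the kernel name, sample data, and bin count are
# assumptions for illustration, while `jit.rawkernel` and
# `jit.atomic_add(array, index, value)` are the user-facing API.
import cupy
from cupyx import jit


@jit.rawkernel()
def histogram(samples, bins):
    tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x
    if tid < samples.size:
        # Emitted as atomicAdd(&bins[samples[tid]], 1) via the code path above.
        jit.atomic_add(bins, samples[tid], 1)


samples = cupy.random.randint(0, 16, 4096, dtype=cupy.int32)
bins = cupy.zeros(16, dtype=cupy.int32)
histogram((32,), (128,), (samples, bins))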
def is_tensor_core_available(dtype):
    # Tensor Core math is enabled only for float16 with cuDNN 7.0+ on an
    # sm_70 device.
    if (dtype == numpy.float16 and _cudnn_version >= 7000
            and int(device.get_compute_capability()) == 70):
        return True
    return False
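# Illustrative only: a hypothetical helper showing how a caller might branch on
# is_tensor_core_available(). The helper name and the fallback policy are
# assumptions for illustration, not part of this module.
import numpy


def _choose_accum_dtype(x_dtype):
    # Keep float16 accumulation only when the check above says Tensor Cores
    # are usable; otherwise accumulate in float32 for accuracy.
    if is_tensor_core_available(numpy.dtype(x_dtype)):
        return numpy.float16
    return numpy.float32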