    buffer_dtype=torch.float16,
    reduce_dtype=torch.float16,
)

# Params and buffers are not cast, comm only happens
# in reduced precision.
mp_only_reduce = MixedPrecision(reduce_dtype=torch.float16)

# Only parameters are cast (thus comm should happen in the param_dtype precision)
mp_only_param_and_buf = MixedPrecision(
    param_dtype=torch.float16, buffer_dtype=torch.float16
)

# Nothing is cast (thus param, comm, grad, and buffer should be in the full precision)
mp_no_mixed_precision = MixedPrecision()

nccl_supports_bf16 = (
    CUDA11OrLater and dist.is_nccl_available() and nccl.version() >= (2, 10)
)

mp_configs = [default_mp, mp_only_reduce, mp_only_param_and_buf, mp_no_mixed_precision]
if nccl_supports_bf16:
    mp_diff_buffer_and_reduce = MixedPrecision(
        param_dtype=torch.float16,
        buffer_dtype=torch.bfloat16,
        reduce_dtype=torch.float32,
    )
    mp_configs.extend([mp_diff_buffer_and_reduce])

# Buffer original dtype, which can differ from model params.
_BUFFER_ORIG_DTYPE = torch.float64

params = "mp_config,cpu_offload,backward_prefetch,full_precision_param_dtype,sharded_grad_scaler"
cpu_offload_config = [
    CPUOffload(offload_params=True),
    CPUOffload(offload_params=False),
]
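
# Illustrative sketch (not part of the test parametrization above): a hypothetical
# helper showing how one entry from `mp_configs` and one from `cpu_offload_config`
# would typically be handed to the FSDP constructor, which is what the tests in
# this file exercise across the parametrized combinations. The helper and its
# argument names are assumptions for illustration only.
def _example_fsdp_wrap(module, mp_config, cpu_offload, backward_prefetch):
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

    # Each keyword argument maps to one dimension of the parametrized sweep:
    # the mixed-precision policy, optional CPU offload of parameters, and the
    # backward prefetch strategy.
    return FSDP(
        module,
        mixed_precision=mp_config,
        cpu_offload=cpu_offload,
        backward_prefetch=backward_prefetch,
    )
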
import re

HIP_VERSION = 0.0 if torch.version.hip is None else float(
    re.search(r"^\d+\.\d+", torch.version.hip)[0]
)

# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests

nGPUs = torch.cuda.device_count()
if not TEST_CUDA:
    print('CUDA not available, skipping tests', file=sys.stderr)
    TestCase = object  # noqa: F811

datatypes = [torch.float]
if (
    TEST_CUDA and CUDA11OrLater and c10d.is_nccl_available() and nccl.version() >= (2, 10)
) or TEST_WITH_ROCM:
    datatypes.append(torch.bfloat16)


class TestNCCL(TestCase):
    @sandcastle_skip_if(IS_WINDOWS, "NCCL doesn't support Windows")
    def test_unique_id(self, device):
        uid = nccl.unique_id()
        self.assertIsInstance(uid, bytes)
        self.assertGreater(len(uid), 1)

    @sandcastle_skip_if(TEST_WITH_ROCM and HIP_VERSION < 3.5, 'Skip NCCL tests for ROCm')
    @sandcastle_skip_if(IS_WINDOWS, "NCCL doesn't support Windows")
    @sandcastle_skip_if(not TEST_MULTIGPU, "only one GPU detected")
    @dtypes(*datatypes)