Code example #1

# Params, buffers, and gradient reduction all in fp16.
default_mp = MixedPrecision(
    param_dtype=torch.float16,
    buffer_dtype=torch.float16,
    reduce_dtype=torch.float16,
)

# Params and buffers are not cast; only communication (gradient reduction)
# happens in reduced precision.
mp_only_reduce = MixedPrecision(reduce_dtype=torch.float16)

# Only parameters and buffers are cast (no reduce_dtype, so comm happens in the param_dtype precision)
mp_only_param_and_buf = MixedPrecision(param_dtype=torch.float16, buffer_dtype=torch.float16)

# Nothing is cast (params, comm, grads, and buffers all stay in full precision)
mp_no_mixed_precision = MixedPrecision()

nccl_supports_bf16 = (
    CUDA11OrLater and dist.is_nccl_available() and nccl.version() >= (2, 10)
)

mp_configs = [default_mp, mp_only_reduce, mp_only_param_and_buf, mp_no_mixed_precision]
if nccl_supports_bf16:
    mp_diff_buffer_and_reduce = MixedPrecision(
        param_dtype=torch.float16,
        buffer_dtype=torch.bfloat16,
        reduce_dtype=torch.float32
    )
    mp_configs.append(mp_diff_buffer_and_reduce)

# Original buffer dtype, which can differ from the model's param dtype.
_BUFFER_ORIG_DTYPE = torch.float64

params = "mp_config,cpu_offload,backward_prefetch,full_precision_param_dtype,sharded_grad_scaler"
Code example #2

# Params, buffers, and gradient reduction all in fp16.
default_mp = MixedPrecision(
    param_dtype=torch.float16,
    buffer_dtype=torch.float16,
    reduce_dtype=torch.float16,
)

# Params and buffers are not cast; only communication (gradient reduction)
# happens in reduced precision.
mp_only_reduce = MixedPrecision(reduce_dtype=torch.float16)

# Only parameters and buffers are cast (no reduce_dtype, so comm happens
# in the param_dtype precision)
mp_only_param_and_buf = MixedPrecision(param_dtype=torch.float16,
                                       buffer_dtype=torch.float16)

# Nothing is cast (params, comm, grads, and buffers all stay in full precision)
mp_no_mixed_precision = MixedPrecision()

nccl_supports_bf16 = (CUDA11OrLater and dist.is_nccl_available()
                      and nccl.version() >= (2, 10))

mp_configs = [
    default_mp, mp_only_reduce, mp_only_param_and_buf, mp_no_mixed_precision
]
if nccl_supports_bf16:
    mp_diff_buffer_and_reduce = MixedPrecision(param_dtype=torch.float16,
                                               buffer_dtype=torch.bfloat16,
                                               reduce_dtype=torch.float32)
    mp_configs.append(mp_diff_buffer_and_reduce)

# Original buffer dtype, which can differ from the model's param dtype.
_BUFFER_ORIG_DTYPE = torch.float64

params = "mp_config,cpu_offload,backward_prefetch,full_precision_param_dtype"
cpu_offload_config = [
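The cpu_offload_config above feeds the cpu_offload slot of the parametrization string. The sketch below (function and module names are assumptions) shows how such a CPUOffload setting is combined with a MixedPrecision policy when wrapping a module; it must run after a process group has been initialized, as in the previous sketch.

import torch
import torch.nn as nn
from torch.distributed.fsdp import CPUOffload, FullyShardedDataParallel as FSDP, MixedPrecision

def wrap_with_offload(module: nn.Module) -> FSDP:
    # offload_params=True keeps the sharded parameters on CPU between uses and
    # moves them to the GPU only around forward/backward.
    return FSDP(
        module,
        cpu_offload=CPUOffload(offload_params=True),
        mixed_precision=MixedPrecision(
            param_dtype=torch.float16,
            reduce_dtype=torch.float16,
        ),
    )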
Code example #3

import re
HIP_VERSION = 0.0 if torch.version.hip is None else float(
    re.search(r"^\d+\.\d+", torch.version.hip)[0])

# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests

nGPUs = torch.cuda.device_count()
if not TEST_CUDA:
    print('CUDA not available, skipping tests', file=sys.stderr)
    TestCase = object  # noqa: F811

datatypes = [torch.float]
if (TEST_CUDA and CUDA11OrLater and c10d.is_nccl_available()
        and nccl.version() >= (2, 10)) or TEST_WITH_ROCM:
    datatypes.append(torch.bfloat16)


class TestNCCL(TestCase):
    @sandcastle_skip_if(IS_WINDOWS, "NCCL doesn't support Windows")
    def test_unique_id(self, device):
        uid = nccl.unique_id()
        self.assertIsInstance(uid, bytes)
        self.assertGreater(len(uid), 1)

    @sandcastle_skip_if(TEST_WITH_ROCM and HIP_VERSION < 3.5,
                        'Skip NCCL tests for ROCm')
    @sandcastle_skip_if(IS_WINDOWS, "NCCL doesn't support Windows")
    @sandcastle_skip_if(not TEST_MULTIGPU, "only one GPU detected")
    @dtypes(*datatypes)