def _test__native_dist_model_create_from_backend_dist(init_method, local_rank, rank, world_size, backend, true_device):
    """Exercise ``_NativeDistModel.create_from_backend`` with an explicit ``init_method``.

    The helper exports ``RANK``, creates the model, checks that
    ``torch.distributed`` reports an initialized group with the requested
    backend, checks that a second creation is rejected with ``RuntimeError``,
    and hands the expected attribute values to ``_assert_model``.

    Args:
        init_method: forwarded to ``create_from_backend``; when ``None`` the
            model's ``_init_method`` is expected to be ``"env://"``.
        local_rank: expected ``local_rank`` entry passed to ``_assert_model``.
        rank: exported as ``RANK`` and expected ``rank`` entry.
        world_size: expected ``world_size`` and ``nproc_per_node`` entries.
        backend: backend name passed to ``create_from_backend``.
        true_device: expected ``device`` entry.
    """
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=20)

    os.environ["RANK"] = f"{rank}"
    try:
        assert "MASTER_ADDR" not in os.environ
        assert "MASTER_PORT" not in os.environ

        model = _NativeDistModel.create_from_backend(backend=backend, timeout=timeout, init_method=init_method)
        try:
            assert dist.is_available() and dist.is_initialized()
            assert dist.get_backend() == backend

            # While the first model is alive, a second creation must be refused.
            with pytest.raises(RuntimeError, match=r"Can not create new distributed process group if default one is"):
                _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)

            _assert_model(
                model,
                {
                    "device": true_device,
                    "local_rank": local_rank,
                    "rank": rank,
                    "world_size": world_size,
                    "node_index": 0,
                    "nnodes": 1,
                    "nproc_per_node": world_size,
                },
            )

            if init_method is None:
                assert model._init_method == "env://"
            else:
                assert model._init_method == init_method
        finally:
            # Finalize even when a check above fails, so the model created here
            # (presumably holding the default process group) is not left behind
            # for the tests that run afterwards.
            model.finalize()
    finally:
        # Never let the RANK variable set by this helper leak into other tests.
        os.environ.pop("RANK", None)

    # After finalization and cleanup none of these variables may remain.
    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ
    assert "RANK" not in os.environ
def test__native_dist_model_create_from_backend_bad_config():
    """Creation must fail when only ``RANK`` is defined in the environment.

    With ``RANK`` exported and no other setup done by this test,
    ``create_from_backend`` is expected to raise ``RuntimeError`` whose message
    matches "PyTorch distributed configuration should define env variables".
    """
    import os
    from datetime import timedelta

    os.environ["RANK"] = "1"
    try:
        with pytest.raises(RuntimeError, match=r"PyTorch distributed configuration should define env variables"):
            _NativeDistModel.create_from_backend(backend="gloo", timeout=timedelta(seconds=10))
    finally:
        # Remove RANK even if the expectation above fails, so the variable
        # cannot influence the tests that run afterwards.
        os.environ.pop("RANK", None)
def test__native_dist_model_init_method_is_not_none(world_size, local_rank, get_fixed_dirname):
    """An ``init_method`` with only one of ``rank``/``world_size`` must be rejected.

    Args:
        world_size: value passed alone as ``world_size`` in the first attempt.
        local_rank: value passed alone as ``rank`` in the second attempt.
        get_fixed_dirname: callable returning a directory path for the given
            name; used to build a ``file://`` init method.
    """
    shared_dir = get_fixed_dirname('native_dist_model_init_method_is_not_none')
    init_method = f"file://{shared_dir}/shared"

    # Each attempt supplies exactly one of the two required arguments.
    incomplete_args = (
        {"world_size": world_size},
        {"rank": local_rank},
    )
    for partial in incomplete_args:
        with pytest.raises(ValueError, match=r"Both rank and world_size should be provided"):
            _NativeDistModel.create_from_backend(backend="gloo", init_method=init_method, **partial)
def test__native_dist_model_create_from_backend_bad_slurm_config():
    """Check how ``create_from_backend`` reacts to incomplete or conflicting SLURM settings.

    Three situations are covered in order:

    1. only ``SLURM_JOB_ID`` is set -> ``RuntimeError``;
    2. ``rank``/``world_size`` are passed together with ``SLURM_JOB_ID`` ->
       ``ValueError``;
    3. the SLURM variables set below plus ``RANK`` are defined -> the model is
       created and a ``UserWarning`` is emitted.
    """
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=10)

    # Variables added once the "missing configuration" cases have been checked.
    remaining_env = {
        "SLURM_PROCID": "0",
        "SLURM_LOCALID": "0",
        "SLURM_NTASKS": "1",
        "SLURM_JOB_NODELIST": "localhost",
        "SLURM_JOB_NUM_NODES": "1",
        "RANK": "1",
    }

    os.environ["SLURM_JOB_ID"] = "1"
    try:
        with pytest.raises(RuntimeError, match=r"SLURM distributed configuration is missing"):
            _NativeDistModel.create_from_backend(backend="gloo", timeout=timeout)

        with pytest.raises(ValueError, match=r"Arguments rank and world_size should not be specified with SLURM"):
            _NativeDistModel.create_from_backend(
                backend="gloo", timeout=timeout, rank=1, init_method="", world_size=1
            )

        os.environ.update(remaining_env)

        with pytest.warns(UserWarning, match=r"We detected the following env variables"):
            model = _NativeDistModel.create_from_backend(backend="gloo", timeout=timeout)
        model.finalize()
    finally:
        # Drop every variable this test may have set, even when a check above
        # fails, so the SLURM settings cannot leak into other tests.
        for name in ("SLURM_JOB_ID", *remaining_env):
            os.environ.pop(name, None)
def test__native_dist_model():
    """``available_backends`` must mirror what ``torch.distributed`` reports.

    Each of ``nccl``, ``gloo`` and ``mpi`` has to be listed exactly when the
    matching ``dist.is_*_available()`` query is truthy, and an unknown backend
    name must be rejected with ``ValueError``.
    """
    available_backends = _NativeDistModel.available_backends

    reported = {
        "nccl": dist.is_nccl_available(),
        "gloo": dist.is_gloo_available(),
        "mpi": dist.is_mpi_available(),
    }
    for backend_name, is_available in reported.items():
        if is_available:
            assert backend_name in available_backends
        else:
            assert backend_name not in available_backends

    with pytest.raises(ValueError, match=r"Backend should be one of"):
        _NativeDistModel.create_from_backend("abc")
def _test__native_dist_model_create_from_backend_dist(local_rank, rank, world_size, backend, true_device):
    """Exercise ``_NativeDistModel.create_from_backend`` with ``RANK`` exported.

    The helper exports ``RANK``, creates the model, checks that
    ``torch.distributed`` reports an initialized group with the requested
    backend, checks that a second creation is rejected with ``RuntimeError``,
    and hands the expected attribute values to ``_assert_model``.

    Args:
        local_rank: expected ``local_rank`` entry passed to ``_assert_model``.
        rank: exported as ``RANK`` and expected ``rank`` entry.
        world_size: expected ``world_size`` and ``nproc_per_node`` entries.
        backend: backend name passed to ``create_from_backend``.
        true_device: expected ``device`` entry.
    """
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=20)

    os.environ["RANK"] = f"{rank}"
    try:
        model = _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)
        try:
            assert dist.is_available() and dist.is_initialized()
            assert dist.get_backend() == backend

            # While the first model is alive, a second creation must be refused.
            with pytest.raises(RuntimeError, match=r"Can not create new distributed process group if default one is"):
                _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)

            _assert_model(
                model,
                {
                    "device": true_device,
                    "local_rank": local_rank,
                    "rank": rank,
                    "world_size": world_size,
                    "node_index": 0,
                    "nnodes": 1,
                    "nproc_per_node": world_size,
                },
            )
        finally:
            # Finalize even when a check above fails, so the model created here
            # (presumably holding the default process group) is not left behind
            # for the tests that run afterwards.
            model.finalize()
    finally:
        # Never let the RANK variable set by this helper leak into other tests.
        os.environ.pop("RANK", None)
def test__native_dist_model_create_from_backend_bad_slurm_config():
    """Check how ``create_from_backend`` reacts to incomplete or conflicting SLURM settings.

    Four situations are covered in order:

    1. only ``SLURM_JOB_ID`` is set -> ``RuntimeError``;
    2. ``rank``/``world_size`` are passed together with ``SLURM_JOB_ID`` ->
       ``ValueError``;
    3. the remaining SLURM variables are set -> ``FileNotFoundError`` naming
       ``scontrol`` (the test expects that executable to be absent);
    4. ``RANK`` is additionally defined -> ``RuntimeError`` matching
       "Defined env variables".
    """
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=10)

    # Variables added once the "missing configuration" cases have been checked.
    remaining_slurm_env = {
        "SLURM_PROCID": "0",
        "SLURM_LOCALID": "0",
        "SLURM_NTASKS": "1",
        "SLURM_JOB_NODELIST": "localhost",
    }

    os.environ["SLURM_JOB_ID"] = "1"
    try:
        with pytest.raises(RuntimeError, match=r"SLURM distributed configuration is missing"):
            _NativeDistModel.create_from_backend(backend="gloo", timeout=timeout)

        with pytest.raises(ValueError, match=r"Arguments rank and world_size should not be specified with SLURM"):
            _NativeDistModel.create_from_backend(
                backend="gloo", timeout=timeout, rank=1, init_method="", world_size=1
            )

        os.environ.update(remaining_slurm_env)

        with pytest.raises(FileNotFoundError, match=r"No such file or directory: 'scontrol'"):
            _NativeDistModel.create_from_backend(backend="gloo", timeout=timeout)

        os.environ["RANK"] = "1"

        with pytest.raises(RuntimeError, match=r"Defined env variables"):
            _NativeDistModel.create_from_backend(backend="gloo", timeout=timeout)
    finally:
        # Drop every variable this test may have set, even when a check above
        # fails, so the SLURM settings cannot leak into other tests.
        for name in ("SLURM_JOB_ID", *remaining_slurm_env, "RANK"):
            os.environ.pop(name, None)
def _test__native_dist_model_create_from_backend_no_dist(backend, true_device):
    """Exercise ``create_from_backend`` when this helper sets up no distributed configuration.

    The model is created with only ``backend`` and ``timeout``; afterwards
    ``torch.distributed`` must report an initialized group with that backend
    and ``_assert_model`` receives the single-process layout (rank 0, world
    size 1, one node, one process per node).

    Args:
        backend: backend name passed to ``create_from_backend``.
        true_device: expected ``device`` entry passed to ``_assert_model``.
    """
    from datetime import timedelta

    model = _NativeDistModel.create_from_backend(backend=backend, timeout=timedelta(seconds=20))
    try:
        assert dist.is_available() and dist.is_initialized()
        assert dist.get_backend() == backend

        _assert_model(
            model,
            {
                "device": true_device,
                "local_rank": 0,
                "rank": 0,
                "world_size": 1,
                "node_index": 0,
                "nnodes": 1,
                "nproc_per_node": 1,
            },
        )
    finally:
        # Finalize even when a check above fails, so the model created here is
        # not left behind for the tests that run afterwards.
        model.finalize()
def test__native_dist_model_init_method_is_none(world_size):
    """Passing ``world_size`` without an ``init_method`` must raise ``ValueError``.

    Args:
        world_size: value forwarded as ``world_size`` to ``create_from_backend``.
    """
    expect_rejection = pytest.raises(ValueError, match=r"Arguments rank and world_size should be None")
    with expect_rejection:
        _NativeDistModel.create_from_backend(backend="gloo", world_size=world_size)
def _test__native_dist_model_create_from_backend_slurm(local_rank, rank, world_size, backend, true_device):
    """Exercise ``create_from_backend`` when the configuration comes from SLURM variables.

    ``WORLD_SIZE`` and ``LOCAL_RANK`` are removed from the environment (they
    must be present on entry, otherwise ``KeyError`` is raised) and replaced by
    ``SLURM_*`` variables describing a single-node job. After the checks the
    SLURM variables are removed and ``WORLD_SIZE``/``LOCAL_RANK`` are set again
    from ``world_size`` and ``local_rank``.

    Args:
        local_rank: exported as ``SLURM_LOCALID``; expected ``local_rank`` entry.
        rank: exported as ``SLURM_PROCID``; expected ``rank`` entry.
        world_size: exported as ``SLURM_NTASKS``; expected ``world_size`` and
            ``nproc_per_node`` entries.
        backend: backend name passed to ``create_from_backend``.
        true_device: expected ``device`` entry passed to ``_assert_model``.
    """
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=20)

    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ

    slurm_env = {
        "SLURM_JOB_ID": "15000",
        "SLURM_PROCID": str(rank),
        "SLURM_LOCALID": str(local_rank),
        "SLURM_NTASKS": str(world_size),
        "SLURM_JOB_NODELIST": "localhost",
        "SLURM_JOB_NUM_NODES": "1",
    }

    del os.environ["WORLD_SIZE"]
    del os.environ["LOCAL_RANK"]
    try:
        os.environ.update(slurm_env)

        model = _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)
        try:
            assert dist.is_available() and dist.is_initialized()
            assert dist.get_backend() == backend

            # While the first model is alive, a second creation must be refused.
            with pytest.raises(RuntimeError, match=r"Can not create new distributed process group if default one is"):
                _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)

            _assert_model(
                model,
                {
                    "device": true_device,
                    "local_rank": local_rank,
                    "rank": rank,
                    "world_size": world_size,
                    "node_index": 0,
                    "nnodes": 1,
                    "nproc_per_node": world_size,
                },
            )
        finally:
            # Finalize even when a check above fails, so the model created here
            # is not left behind for the tests that run afterwards.
            model.finalize()

        # After finalization none of these variables may remain.
        assert "MASTER_ADDR" not in os.environ
        assert "MASTER_PORT" not in os.environ
        assert "RANK" not in os.environ
    finally:
        # Undo the environment changes on every exit path: remove the SLURM
        # variables and put back the two variables deleted above.
        for name in slurm_env:
            os.environ.pop(name, None)
        os.environ["WORLD_SIZE"] = str(world_size)
        os.environ["LOCAL_RANK"] = str(local_rank)