def test_random_master_port():
    """Check that a master port is provided when the user gave none, and that it stays fixed."""
    environment = LightningEnvironment()
    first_port = environment.master_port()
    assert isinstance(first_port, int)
    # asking a second time must hand back the very same port, not a fresh one
    second_port = environment.master_port()
    assert second_port == first_port
Esempio n. 2
0
def test_default_attributes():
    """Verify what a fresh ``LightningEnvironment`` reports when no environment variables are set."""
    environment = LightningEnvironment()

    # launch mode and rendezvous endpoint
    assert not environment.creates_processes_externally
    assert environment.main_address == "127.0.0.1"
    assert isinstance(environment.main_port, int)

    # single-process defaults
    assert environment.world_size() == 1
    assert environment.local_rank() == 0
    assert environment.node_rank() == 0
Esempio n. 3
0
def environment_combinations():
    """Yield ``(environment, variables, expected)`` triples for three cluster environments.

    One triple each is produced for ``LightningEnvironment``, ``SLURMEnvironment`` and
    ``TorchElasticEnvironment``, in that order. ``variables`` is a dict of environment-variable
    names to string values; the same ``expected`` dict of ranks and world size is shared by
    all three triples.
    """
    expected_ranks = {"global_rank": 3, "local_rank": 1, "node_rank": 1, "world_size": 4}

    # Lightning
    # NOTE(review): WORLD_SIZE is "8" here while the expected world size is 4 - presumably
    # LightningEnvironment does not take its world size from this variable; confirm.
    lightning_vars = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "NODE_RANK": "1",
        "WORLD_SIZE": "8",
    }
    yield LightningEnvironment(), lightning_vars, expected_ranks

    # SLURM
    slurm_vars = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_LOCALID": "1",
        "SLURM_NODEID": "1",
        "SLURM_PROCID": "3",
        "SLURM_NTASKS": "4",
    }
    yield SLURMEnvironment(), slurm_vars, expected_ranks

    # TorchElastic
    torchelastic_vars = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "GROUP_RANK": "1",
        "RANK": "3",
        "WORLD_SIZE": "4",
        "LOCAL_WORLD_SIZE": "2",
    }
    yield TorchElasticEnvironment(), torchelastic_vars, expected_ranks
Esempio n. 4
0
def test_sync_batchnorm_ddp(tmpdir):
    """Fit a ``SyncBNModule`` on two CUDA devices with ``DDPSpawnStrategy`` and sync batchnorm.

    Batchnorm outputs of the first three batches are first recorded outside the Trainer and
    handed to the model used for the distributed run as ``bn_targets``. After ``fit`` the test
    checks that ``model.bn_layer`` is a regular batchnorm layer and no longer a ``SyncBatchNorm``.
    """
    seed_everything(234)
    set_random_main_port()

    # define datamodule and dataloader
    dm = MNISTDataModule()
    dm.prepare_data()
    dm.setup(stage=None)

    train_dataloader = dm.train_dataloader()
    model = SyncBNModule()

    # reference batchnorm outputs, one entry per batch
    bn_outputs = []

    # shuffle is false by default
    for batch_idx, batch in enumerate(train_dataloader):
        x, _ = batch

        _, out_bn = model.forward(x, batch_idx)
        bn_outputs.append(out_bn)

        # get 3 steps
        if batch_idx == 2:
            break

    # move the recorded outputs onto the GPU
    bn_outputs = [x.cuda() for x in bn_outputs]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    dm = MNISTDataModule(batch_size=16, dist_sampler=True)
    dm.prepare_data()
    dm.setup(stage=None)

    # NOTE(review): how SyncBNModule uses ``bn_targets`` is not visible here - presumably it
    # compares them with the batchnorm outputs of the distributed run; confirm.
    model = SyncBNModule(gpu_count=2, bn_targets=bn_outputs)
    ddp = DDPSpawnStrategy(
        parallel_devices=[torch.device("cuda", 0),
                          torch.device("cuda", 1)],
        num_nodes=1,
        sync_batchnorm=True,
        cluster_environment=LightningEnvironment(),
        find_unused_parameters=True,
    )

    # max_steps=3 matches the three reference batches recorded above
    # NOTE(review): replace_sampler_ddp=False together with dist_sampler=True suggests the
    # datamodule supplies its own distributed sampler - confirm.
    trainer = Trainer(
        default_root_dir=tmpdir,
        gpus=2,
        num_nodes=1,
        strategy=ddp,
        max_epochs=1,
        max_steps=3,
        sync_batchnorm=True,
        num_sanity_val_steps=0,
        replace_sampler_ddp=False,
    )

    trainer.fit(model, dm)
    # the strategy is responsible for tearing down the batchnorm wrappers
    assert not isinstance(model.bn_layer,
                          torch.nn.modules.batchnorm.SyncBatchNorm)
    assert isinstance(model.bn_layer, torch.nn.modules.batchnorm._BatchNorm)
Esempio n. 5
0
def test_default_attributes():
    """Verify what a fresh ``LightningEnvironment`` reports when no environment variables are set."""
    environment = LightningEnvironment()

    # launch mode and rendezvous endpoint
    assert not environment.creates_children()
    assert environment.master_address() == "127.0.0.1"
    assert isinstance(environment.master_port(), int)

    # world size is unset by default; both ranks start at zero
    assert environment.world_size() is None
    assert environment.local_rank() == 0
    assert environment.node_rank() == 0
Esempio n. 6
0
def test_sync_batchnorm_ddp(tmpdir):
    """Fit a ``SyncBNModule`` on two CUDA devices with ``DDPSpawnPlugin`` and sync batchnorm.

    Batchnorm outputs of the first three batches are first recorded outside the Trainer and
    handed to the model used for the distributed run as ``bn_targets``. The test passes when
    the trainer reports a finished state after ``fit``.
    """
    seed_everything(234)
    set_random_master_port()

    # define datamodule and dataloader
    dm = MNISTDataModule()
    dm.prepare_data()
    dm.setup(stage=None)

    train_dataloader = dm.train_dataloader()
    model = SyncBNModule()

    # reference batchnorm outputs, one entry per batch
    bn_outputs = []

    # shuffle is false by default
    for batch_idx, batch in enumerate(train_dataloader):
        x, _ = batch

        _, out_bn = model.forward(x, batch_idx)
        bn_outputs.append(out_bn)

        # get 3 steps
        if batch_idx == 2:
            break

    # move the recorded outputs onto the GPU
    bn_outputs = [x.cuda() for x in bn_outputs]

    # reset datamodule
    # batch-size = 16 because 2 GPUs in DDP
    dm = MNISTDataModule(batch_size=16, dist_sampler=True)
    dm.prepare_data()
    dm.setup(stage=None)

    # NOTE(review): how SyncBNModule uses ``bn_targets`` is not visible here - presumably it
    # compares them with the batchnorm outputs of the distributed run; confirm.
    model = SyncBNModule(gpu_count=2, bn_targets=bn_outputs)
    ddp = DDPSpawnPlugin(
        parallel_devices=[torch.device("cuda", 0),
                          torch.device("cuda", 1)],
        num_nodes=1,
        sync_batchnorm=True,
        cluster_environment=LightningEnvironment(),
        find_unused_parameters=True,
    )

    # max_steps=3 matches the three reference batches recorded above; the plugin built
    # above is passed in explicitly through ``plugins``
    trainer = Trainer(
        default_root_dir=tmpdir,
        gpus=2,
        num_nodes=1,
        accelerator="ddp_spawn",
        max_epochs=1,
        max_steps=3,
        sync_batchnorm=True,
        num_sanity_val_steps=0,
        replace_sampler_ddp=False,
        plugins=[ddp],
    )

    trainer.fit(model, dm)
    assert trainer.state.finished, "Sync batchnorm failing with DDP"
Esempio n. 7
0
def test_attributes_from_environment_variables():
    """Check that the default cluster environment picks its attributes up from environment variables.

    The environment variables themselves are expected to be set up outside this function
    (that setup is not part of this block).
    """
    environment = LightningEnvironment()

    # rendezvous endpoint
    assert environment.master_address() == "1.2.3.4"
    assert environment.master_port() == 500

    # world size stays unset; ranks carry the configured values
    assert environment.world_size() is None
    assert environment.local_rank() == 2
    assert environment.node_rank() == 3
Esempio n. 8
0
 def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
     """Pick the cluster environment to use.

     Order of precedence: a ``ClusterEnvironment`` instance stored in
     ``_cluster_environment_flag``, then SLURM when it manages the tasks, then the first
     environment type whose ``detect()`` is truthy, and finally ``LightningEnvironment``.
     """
     user_choice = self._cluster_environment_flag
     if isinstance(user_choice, ClusterEnvironment):
         return user_choice

     if self._is_slurm_managing_tasks():
         rank_zero_info("Multiprocessing is handled by SLURM.")
         return SLURMEnvironment()

     candidates = (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment)
     # candidates are probed lazily and in order; probing stops at the first hit
     detected = next((candidate for candidate in candidates if candidate.detect()), None)
     if detected is None:
         return LightningEnvironment()
     return detected()
 def select_cluster_environment(self) -> ClusterEnvironment:
     """Return the cluster environment to use.

     An already stored ``_cluster_environment`` wins; otherwise SLURM is chosen when
     ``is_slurm_managing_tasks`` is truthy, TorchElastic when it reports being in use, and
     ``LightningEnvironment`` in every remaining case.
     """
     if self._cluster_environment is not None:
         return self._cluster_environment
     if self.is_slurm_managing_tasks:
         return SLURMEnvironment()
     if TorchElasticEnvironment.is_using_torchelastic():
         return TorchElasticEnvironment()
     return LightningEnvironment()
 def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
     """Pick the cluster environment to use.

     Order of precedence: a ``ClusterEnvironment`` instance stored in
     ``_cluster_environment_flag``, then SLURM when it manages the tasks, then the first
     environment type whose ``detect()`` is truthy, and finally ``LightningEnvironment``.
     """
     flagged = self._cluster_environment_flag
     if isinstance(flagged, ClusterEnvironment):
         return flagged

     if self._is_slurm_managing_tasks():
         rank_zero_info("Multiprocessing is handled by SLURM.")
         return SLURMEnvironment()

     known_envs = (BaguaEnvironment, TorchElasticEnvironment,
                   KubeflowEnvironment, LSFEnvironment)
     # lazily probe each type in order and keep the first one that detects itself
     matches = (candidate for candidate in known_envs if candidate.detect())
     first_match = next(matches, None)
     if first_match is None:
         return LightningEnvironment()
     # Ignore type error because it is a false positive: https://github.com/python/mypy/issues/13044
     return first_match()  # type: ignore[abstract]
    def select_cluster_environment(self) -> ClusterEnvironment:
        """Return the cluster environment to use.

        An already stored ``_cluster_environment`` wins. Otherwise SLURM is chosen when it
        manages the tasks, then the first of TorchElastic / Kubeflow / LSF whose ``detect()``
        is truthy, and ``LightningEnvironment`` when nothing else applies.
        """
        configured = self._cluster_environment
        if configured is not None:
            return configured

        if self._is_slurm_managing_tasks():
            rank_zero_info("Multiprocessing is handled by SLURM.")
            return SLURMEnvironment()

        candidates = (TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment)
        # candidates are probed lazily and in order; probing stops at the first hit
        detected = next((candidate for candidate in candidates if candidate.detect()), None)
        return LightningEnvironment() if detected is None else detected()
Esempio n. 12
0
 def select_cluster_environment(self) -> ClusterEnvironment:
     """Return the cluster environment to use.

     An already stored ``_cluster_environment`` wins. Otherwise the checks run in this
     order: SLURM, TorchElastic, Kubeflow, LSF; ``LightningEnvironment`` is the fallback.
     """
     if self._cluster_environment is not None:
         return self._cluster_environment

     if self._is_slurm_managing_tasks():
         # the environment is created before the info message is emitted
         slurm_env = SLURMEnvironment()
         rank_zero_info("Multiprocessing is handled by SLURM.")
         return slurm_env

     if TorchElasticEnvironment.is_using_torchelastic():
         return TorchElasticEnvironment()
     if KubeflowEnvironment.is_using_kubeflow():
         return KubeflowEnvironment()
     if LSFEnvironment.is_using_lsf():
         return LSFEnvironment()
     return LightningEnvironment()
Esempio n. 13
0
def test_v1_4_0_deprecated_lightning_distributed_data_parallel(tmpdir):
    """Run a fast-dev-run fit on two CUDA devices through a ``CustomDDPPlugin``.

    NOTE(review): the deprecation this test targets is presumably exercised inside
    ``CustomDDPPlugin``, which is not visible here - confirm.
    """
    model = BoringModel()

    cuda_devices = [torch.device("cuda", 0), torch.device("cuda", 1)]
    ddp_plugin = CustomDDPPlugin(
        parallel_devices=cuda_devices,
        cluster_environment=LightningEnvironment(),
    )

    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        gpus=2,
        accelerator="ddp_spawn",
        plugins=[ddp_plugin],
    )
    trainer.fit(model)
Esempio n. 14
0
    class CustomParallelStrategy(DDPStrategy):
        """``DDPStrategy`` subclass whose ``_layer_sync`` is reset to ``None`` after construction."""
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            # Set to None so it will be overwritten by the accelerator connector.
            self._layer_sync = None

    # a freshly built strategy carries no layer-sync object
    strategy = CustomParallelStrategy()
    assert strategy._layer_sync is None
    # building a Trainer with sync_batchnorm=True is expected to populate it on the strategy
    Trainer(strategy=strategy, sync_batchnorm=True)
    assert isinstance(strategy._layer_sync, NativeSyncBatchNorm)


@pytest.mark.parametrize(
    ["plugins", "expected"],
    [
        ([LightningEnvironment(), SLURMEnvironment()], "ClusterEnvironment"),
        ([TorchCheckpointIO(), TorchCheckpointIO()], "CheckpointIO"),
        (
            [
                PrecisionPlugin(),
                DoublePrecisionPlugin(),
                LightningEnvironment(),
                SLURMEnvironment()
            ],
            "PrecisionPlugin, ClusterEnvironment",
        ),
    ],
)
def test_plugin_only_one_instance_for_one_type(plugins, expected):
    """Check that passing several plugins of the same kind to ``Trainer`` is rejected.

    Args:
        plugins: list of plugin instances containing at least two of the same kind.
        expected: the kind name(s) that must appear in the error message after
            "Received multiple values for".
    """
    with pytest.raises(MisconfigurationException,
                       match=f"Received multiple values for {expected}"):
        # constructing the Trainer is the action that must raise
        Trainer(plugins=plugins)
def test_teardown():
    """Check that ``teardown()`` removes ``WORLD_SIZE`` from ``os.environ``.

    ``WORLD_SIZE`` must already be present when the test starts; that setup is not part
    of this block.
    """
    environment = LightningEnvironment()
    assert "WORLD_SIZE" in os.environ

    environment.teardown()

    assert "WORLD_SIZE" not in os.environ
def test_node_rank_from_group_rank():
    """Check that a node rank of 1 is reported even though ``NODE_RANK`` is not set.

    NOTE(review): the value is presumably taken from a ``GROUP_RANK`` variable patched in
    outside this block - confirm.
    """
    environment = LightningEnvironment()
    # NODE_RANK itself must be absent for the substitution to be meaningful
    assert "NODE_RANK" not in os.environ
    assert environment.node_rank() == 1
def test_manual_user_launch(environ, creates_children):
    """Check ``creates_children()`` against the expected value for a given set of env variables.

    Args:
        environ: mapping of environment variables patched into ``os.environ`` for the check.
        creates_children: the value ``creates_children()`` must return under that patch.
    """
    with mock.patch.dict(os.environ, environ):
        # the environment object is created while the patch is active
        reported = LightningEnvironment().creates_children()
        assert reported == creates_children
Esempio n. 18
0
    def __init__(self, num_processes: int = 1,
                 use_ipex: bool = False,
                 enable_bf16: bool = False,
                 distributed_backend: str = "spawn",
                 cpu_for_each_process: Optional[List[List[int]]] = None,
                 *args: Any, **kwargs: Any) -> None:
        """
        A pytorch lightning trainer that uses bigdl-nano optimization.

        Any ``accelerator`` or ``plugins`` keyword argument supplied by the caller is dropped
        with a warning, because both are chosen here.

        :param num_processes: number of processes in distributed training. default: 1.
        :param use_ipex: whether we use ipex as accelerator for trainer. default: False.
            It is switched off with a warning when ``check_avx512()`` is falsy.
        :param enable_bf16: forwarded to ``IPEXAccelerator`` when ipex is used. default: False.
        :param distributed_backend: backend used when ``num_processes`` is not 1; must be a
            member of ``distributed_backends``. default: "spawn".
        :param cpu_for_each_process: A list of length `num_processes`, each containing a list of
            indices of cpus each process will be using. default: None, and the cpu will be
            automatically and evenly distributed among processes.
        :raises ValueError: if ``cpu_for_each_process`` is given and its length differs from
            ``num_processes``.
        :raises AssertionError: if ``num_processes`` is not 1 and ``distributed_backend`` is
            not a supported backend.
        """

        # Check keyword arguments
        if "accelerator" in kwargs:
            warning(f"""Accelerator will be specified by bigdl-nano,
            accelerator entered {kwargs['accelerator']} will be ignored. """)

            kwargs.pop('accelerator')
        if "plugins" in kwargs:
            warning(f"""Plugins will be specified by bigdl-nano,
             plugines entered {kwargs['plugins']} will be ignored. """)

            kwargs.pop('plugins')
        if cpu_for_each_process is not None:
            if len(cpu_for_each_process) != num_processes:
                raise ValueError(f"The length of `cpu_for_each_process` ("
                                 f"{len(cpu_for_each_process)}) is not equal to the number of"
                                 f" processes {num_processes}.")

        # Initialize trainer
        if use_ipex and not check_avx512():
            warning("Enable ipex in a cpu instruction set"
                    " without avx512 may cause some random error."
                    "Fall back to cpu device.")
            use_ipex = False

        if num_processes == 1:
            # Single process: no distributed plugin, only an optional ipex accelerator.
            accelerator = None
            if use_ipex:
                from bigdl.nano.pytorch.accelerators.ipex_accelerator import IPEXAccelerator
                accelerator = IPEXAccelerator(enable_bf16=enable_bf16)
            super().__init__(*args, accelerator=accelerator, **kwargs)
        else:
            plugin = None
            # The `f` prefix belongs on the literal holding the placeholder; otherwise the
            # text "{distributed_backend}" is printed verbatim instead of the value.
            assert distributed_backend in distributed_backends, \
                "Distributed backends supported now are spawn and ray," \
                f" but get {distributed_backend}."
            if distributed_backend == "spawn":
                if use_ipex:
                    import intel_pytorch_extension as ipex
                    device = ipex.DEVICE
                else:
                    device = "cpu"
                # One device entry per process; the cpu pinning is only passed to this plugin.
                plugin = DDPSpawnPlugin(parallel_devices=[
                    torch.device(device) for _ in range(num_processes)],
                    cpu_for_each_process=cpu_for_each_process,
                    cluster_environment=LightningEnvironment())
            elif distributed_backend == "ray":
                # Import RayPlugins may entangle with openmp even if it has not been used,
                # which leads to an unacceptably low performance.
                # So we import when we need.
                from bigdl.nano.pytorch.plugins.ray_distributed import RayPlugin
                plugin = RayPlugin(num_workers=num_processes,  # type: ignore
                                   use_ipex=use_ipex)

            accelerator = None
            if use_ipex:
                from bigdl.nano.pytorch.accelerators.ipex_accelerator import IPEXAccelerator
                accelerator = IPEXAccelerator(training_type_plugin=plugin,  # type: ignore
                                              enable_bf16=enable_bf16)

            super().__init__(*args, accelerator=accelerator,
                             plugins=[plugin], **kwargs)
Esempio n. 19
0
def test_detect():
    """``LightningEnvironment.detect()`` must return a truthy value."""
    detected = LightningEnvironment.detect()
    assert detected
Esempio n. 20
0
def test_attributes_from_environment_variables():
    """Check that the default cluster environment picks its attributes up from environment variables.

    The environment variables themselves are expected to be set up outside this function
    (that setup is not part of this block). The setters for global rank and world size are
    exercised as well.
    """
    environment = LightningEnvironment()

    # rendezvous endpoint
    assert environment.main_address == "1.2.3.4"
    assert environment.main_port == 500

    # initial world size and ranks
    assert environment.world_size() == 1
    assert environment.global_rank() == 0
    assert environment.local_rank() == 2
    assert environment.node_rank() == 3

    # values written through the setters must be read back unchanged
    environment.set_global_rank(100)
    assert environment.global_rank() == 100
    environment.set_world_size(100)
    assert environment.world_size() == 100
@RunIf(skip_windows=True)
def test_sync_batchnorm_set_in_custom_strategy(tmpdir):
    """Check that a custom strategy gets its ``_layer_sync`` filled in when a Trainer is built."""

    class CustomParallelStrategy(DDPStrategy):
        """``DDPStrategy`` whose ``_layer_sync`` is cleared right after construction."""

        def __init__(self, **strategy_kwargs):
            super().__init__(**strategy_kwargs)
            # Cleared on purpose so that the accelerator connector has to overwrite it.
            self._layer_sync = None

    custom_strategy = CustomParallelStrategy()
    assert custom_strategy._layer_sync is None

    # building the Trainer with sync_batchnorm=True must set the layer-sync object
    Trainer(strategy=custom_strategy, sync_batchnorm=True)
    assert isinstance(custom_strategy._layer_sync, NativeSyncBatchNorm)


@pytest.mark.parametrize(
    ["plugins", "expected"],
    [
        ([LightningEnvironment(), SLURMEnvironment()], "ClusterEnvironment"),
        ([TorchCheckpointIO(), TorchCheckpointIO()], "CheckpointIO"),
        (
            [
                PrecisionPlugin(),
                DoublePrecisionPlugin(),
                LightningEnvironment(),
                SLURMEnvironment(),
            ],
            "PrecisionPlugin, ClusterEnvironment",
        ),
    ],
)
def test_plugin_only_one_instance_for_one_type(plugins, expected):
    """Check that passing several plugins of the same kind to ``Trainer`` is rejected.

    ``expected`` holds the kind name(s) that must follow "Received multiple values for"
    in the error message.
    """
    error_pattern = f"Received multiple values for {expected}"
    with pytest.raises(MisconfigurationException, match=error_pattern):
        Trainer(plugins=plugins)