Example #1
0
def test_ddp_configure_ddp():
    """Check when ``DDPStrategy`` wraps the module in ``DistributedDataParallel``.

    Two fresh trainers are driven by hand through ``connect`` ->
    ``setup_environment`` -> ``setup``: one with the trainer state set to
    ``FITTING`` (the model must end up wrapped) and one with it set to
    ``VALIDATING`` (the model must stay a plain ``LightningModule``).
    """
    model = BoringModel()

    def _connected_trainer(trainer_fn):
        # Build a new strategy/trainer pair, attach the shared model and run
        # the environment setup, stopping just before ``strategy.setup``.
        fresh_strategy = DDPStrategy()
        fresh_trainer = Trainer(max_epochs=1, strategy=fresh_strategy)
        fresh_trainer.state.fn = trainer_fn
        fresh_trainer.strategy.connect(model)
        fresh_trainer.lightning_module.trainer = fresh_trainer
        fresh_trainer.strategy.setup_environment()
        return fresh_trainer

    # Fitting: unwrapped before ``setup``, wrapped by configure_ddp() afterwards.
    trainer = _connected_trainer(TrainerFn.FITTING)
    assert isinstance(trainer.model, LightningModule)
    trainer.strategy.setup(trainer)
    assert isinstance(trainer.model, DistributedDataParallel)

    # Not fitting: ``setup`` must leave the model as a LightningModule.
    trainer = _connected_trainer(TrainerFn.VALIDATING)
    trainer.strategy.setup(trainer)
    assert isinstance(trainer.model, LightningModule)
Example #2
0
def test_ddp_post_local_sgd_comm_hook(tmpdir):
    """Fit with the post-localSGD hook and check that DDP's logging data names it."""
    boring_model = BoringModel()

    local_sgd_state = post_localSGD.PostLocalSGDState(
        process_group=None,
        subgroup=None,
        start_localSGD_iter=8,
    )
    local_sgd_strategy = DDPStrategy(
        ddp_comm_state=local_sgd_state,
        ddp_comm_hook=post_localSGD.post_localSGD_hook,
        model_averaging_period=4,
    )
    trainer = Trainer(
        fast_dev_run=True,
        gpus=2,
        strategy=local_sgd_strategy,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
    )
    trainer.fit(boring_model)

    # The wrapped model reports the registered hook by its qualified name.
    ddp_logging_data = trainer.strategy.model.get_ddp_logging_data()
    assert ddp_logging_data.comm_hook == post_localSGD.post_localSGD_hook.__qualname__
    assert trainer.state.finished, f"Training failed with {trainer.state}"
Example #3
0
def test_pluggable_accelerator():
    """A user-defined ``Accelerator`` can be handed to the Trainer directly or via the strategy."""

    class TestAccelerator(Accelerator):
        """Stub accelerator: always available, auto-detects 3 devices, every device is ``"foo"``."""

        @staticmethod
        def name():
            return "custom_acc_name"

        @staticmethod
        def is_available():
            return True

        @staticmethod
        def auto_device_count():
            return 3

        @staticmethod
        def parse_devices(devices):
            return devices

        @staticmethod
        def get_parallel_devices(devices):
            return ["foo"] * devices

    # Accelerator instance passed to the Trainer, explicit device count.
    explicit_devices = ["foo"] * 2
    trainer = Trainer(accelerator=TestAccelerator(), devices=2, strategy="ddp")
    assert isinstance(trainer.accelerator, TestAccelerator)
    assert isinstance(trainer.strategy, DDPStrategy)
    assert trainer.strategy.parallel_devices == explicit_devices

    # Accelerator instance passed through the strategy, device count from auto_device_count().
    auto_devices = ["foo"] * 3
    trainer = Trainer(strategy=DDPStrategy(TestAccelerator()), devices="auto")
    assert isinstance(trainer.accelerator, TestAccelerator)
    assert isinstance(trainer.strategy, DDPStrategy)
    assert trainer.strategy.parallel_devices == auto_devices
Example #4
0
def test_post_local_sgd_model_averaging_value_error(average_parameters_mock, tmpdir):
    """Test that DDP with post-localSGD raises a ValueError when the optimizer is a
    ZeroRedundancyOptimizer, and that ``average_parameters_mock`` is never called."""
    from torch.distributed.optim import ZeroRedundancyOptimizer

    class OptimizerModel(BoringModel):
        """BoringModel whose optimizer is a ZeroRedundancyOptimizer built around Adam."""

        def configure_optimizers(self):
            zero_optimizer = ZeroRedundancyOptimizer(
                params=self.parameters(),
                optimizer_class=torch.optim.Adam,
                lr=0.01,
            )
            return zero_optimizer

    model = OptimizerModel()

    comm_state = post_localSGD.PostLocalSGDState(
        process_group=None,
        subgroup=None,
        start_localSGD_iter=8,
    )
    strategy = DDPStrategy(
        ddp_comm_state=comm_state,
        ddp_comm_hook=post_localSGD.post_localSGD_hook,
        model_averaging_period=4,
    )
    trainer = Trainer(
        fast_dev_run=True,
        gpus=2,
        strategy=strategy,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
    )

    expected_message = "Currently model averaging cannot work with a distributed optimizer"
    with pytest.raises(ValueError, match=expected_message):
        trainer.fit(model)

    average_parameters_mock.assert_not_called()
def test_tpu_invalid_raises():
    """Building a Trainer from a mismatched TPU accelerator/plugin/strategy combination raises ``ValueError``."""
    # TPU accelerator paired with the generic precision plugin.
    wrong_precision = TPUSpawnStrategy(
        accelerator=TPUAccelerator(),
        precision_plugin=PrecisionPlugin(),
    )
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"):
        Trainer(strategy=wrong_precision, devices=8)

    # TPU accelerator paired with a non-TPU strategy.
    wrong_strategy = DDPStrategy(
        accelerator=TPUAccelerator(),
        precision_plugin=TPUPrecisionPlugin(),
    )
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"):
        Trainer(strategy=wrong_strategy, devices=8)
Example #6
0
def test_configure_launcher_create_processes_externally():
    """``DDPStrategy`` must not create a launcher when the cluster environment spawns the processes.

    The launcher is checked to be ``None`` both right after construction and
    after an explicit ``_configure_launcher()`` call.
    """

    class MyClusterEnvironment(ClusterEnvironment):
        """Minimal single-process environment that reports externally created processes."""

        @property
        def creates_processes_externally(self):
            return True

        @property
        def main_address(self):
            return ""

        @property
        def main_port(self):
            return 8080

        @staticmethod
        def detect():
            return True

        def world_size(self):
            return 1

        # The setters accept (and ignore) the value so that their signatures
        # match the ``ClusterEnvironment`` interface; without the parameter a
        # caller passing the size/rank would fail with a ``TypeError``.
        def set_world_size(self, size):
            pass

        def global_rank(self):
            return 0

        def set_global_rank(self, rank):
            pass

        def local_rank(self):
            return 0

        def node_rank(self):
            return 0

    ddp_strategy = DDPStrategy(cluster_environment=MyClusterEnvironment())
    assert ddp_strategy.launcher is None
    ddp_strategy._configure_launcher()
    assert ddp_strategy.launcher is None
Example #7
0
def test_ddp_dont_configure_sync_batchnorm(trainer_fn):
    """With ``sync_batchnorm=True``, BatchNorm layers must stay unconverted for ``trainer_fn``.

    ``trainer_fn`` is supplied from outside this function (presumably a
    parametrization over the non-fitting trainer functions -- confirm at the
    decorator site).
    """
    model = BoringModelGPU()
    model.layer = torch.nn.BatchNorm1d(10)

    trainer = Trainer(accelerator="gpu", devices=1, strategy=DDPStrategy(), sync_batchnorm=True)
    trainer.state.fn = trainer_fn

    strategy = trainer.strategy
    strategy.connect(model)
    trainer.lightning_module.trainer = trainer
    strategy.setup_environment()
    assert isinstance(trainer.model, LightningModule)

    strategy.setup(trainer)
    # because TrainerFn is not FITTING, the layer is not swapped for SyncBatchNorm
    converted_to_sync_bn = isinstance(strategy.model.layer, torch.nn.modules.batchnorm.SyncBatchNorm)
    assert not converted_to_sync_bn
Example #8
0
def main(v_cfg: DictConfig):
    """Train or evaluate ``Regress_hyper_parameters`` as directed by ``v_cfg["trainer"]``.

    The ``trainer`` section of the config is read for ``gpu``, ``evaluate``,
    ``auto_lr_find`` and ``resume_from_checkpoint``. When ``evaluate`` is set
    the model is only tested and no DDP strategy is configured; otherwise the
    model is fitted with a ``DDPStrategy``.

    Args:
        v_cfg: Run configuration. It is printed as YAML on start-up and passed
            unchanged to the model constructor.
    """
    print(OmegaConf.to_yaml(v_cfg))
    seed_everything(0)
    torch.autograd.set_detect_anomaly(True)

    # NOTE(review): this callback is built but not registered with the Trainer
    # below (only the checkpoint callback is) -- confirm whether that is intended.
    early_stop_callback = EarlyStopping(
        patience=100,
        monitor="Validation Loss"
    )

    model_check_point = ModelCheckpoint(
        monitor='Valid mean spearman boost',
        save_top_k=1,
        save_last=True,
        mode="max",
        auto_insert_metric_name=True,
        # train_time_interval=timedelta(seconds=60 * 60)
    )

    if v_cfg["trainer"]["evaluate"]:
        # Evaluation runs without an explicit distributed strategy.
        strategy = None
    else:
        strategy = DDPStrategy(
            # Fall back to gloo on Windows; nccl is used everywhere else.
            process_group_backend="gloo" if platform.system() == "Windows" else "nccl",
            find_unused_parameters=False
        )

    trainer = Trainer(gpus=v_cfg["trainer"].gpu, enable_model_summary=False,
                      strategy=strategy,
                      callbacks=[model_check_point],
                      auto_lr_find="learning_rate" if v_cfg["trainer"].auto_lr_find else False,
                      max_epochs=3000,
                      gradient_clip_val=0.1,
                      check_val_every_n_epoch=1,
                      replace_sampler_ddp=False
                      )

    model = Regress_hyper_parameters(v_cfg)
    if v_cfg["trainer"].resume_from_checkpoint is not None:
        # Load onto the CPU: the tensors are only copied into the freshly built
        # model, so there is no reason to restore them onto the CUDA device
        # they were saved from (which may be missing or shared between ranks).
        checkpoint = torch.load(v_cfg["trainer"].resume_from_checkpoint, map_location="cpu")
        state_dict = checkpoint["state_dict"]
        # strict=False tolerates keys that do not match the current model.
        model.load_state_dict(state_dict, strict=False)

    if v_cfg["trainer"].auto_lr_find:
        trainer.tune(model)
        print(model.learning_rate)

    if v_cfg["trainer"].evaluate:
        trainer.test(model)
    else:
        trainer.fit(model)
Example #9
0
def test_tpu_invalid_raises():
    """A ``TPUAccelerator`` combined with an unsupported precision plugin or strategy raises ``ValueError``."""
    # TPU strategy whose precision plugin is a plain Mock instead of a TPUPrecisionPlugin.
    precision_error = "TPUAccelerator` can only be used with a `TPUPrecisionPlugin"
    tpu_spawn_strategy = TPUSpawnStrategy(accelerator=TPUAccelerator(), precision_plugin=Mock())
    with pytest.raises(ValueError, match=precision_error):
        Trainer(strategy=tpu_spawn_strategy)

    # DDP strategy given a TPU accelerator.
    strategy_error = "TPUAccelerator` can only be used with a `SingleTPUStrategy`"
    ddp_on_tpu = DDPStrategy(accelerator=TPUAccelerator(), precision_plugin=TPUPrecisionPlugin())
    with pytest.raises(ValueError, match=strategy_error):
        Trainer(strategy=ddp_on_tpu)
Example #10
0
def test_ddp_fp16_compress_comm_hook(tmpdir):
    """Fit with the FP16 compression hook and check that DDP's logging data names it."""
    boring_model = BoringModel()
    fp16_strategy = DDPStrategy(ddp_comm_hook=default.fp16_compress_hook)
    trainer_kwargs = dict(
        max_epochs=1,
        gpus=2,
        strategy=fp16_strategy,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
        fast_dev_run=True,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(boring_model)

    # The wrapped model reports the registered hook by its qualified name.
    registered_hook = trainer.strategy.model.get_ddp_logging_data().comm_hook
    assert registered_hook == default.fp16_compress_hook.__qualname__
    assert trainer.state.finished, f"Training failed with {trainer.state}"
Example #11
0
def test_tpu_invalid_raises_set_precision_with_strategy():
    """A ``TPUAccelerator`` set on the strategy is rejected when the precision plugin or strategy is not a TPU one."""
    # TPU strategy carrying an arbitrary object as its precision plugin.
    tpu_strategy = TPUSpawnStrategy(accelerator=TPUAccelerator(), precision_plugin=object())
    precision_message = "`TPUAccelerator` can only be used with a `TPUPrecisionPlugin`"
    with pytest.raises(ValueError, match=precision_message):
        Trainer(strategy=tpu_strategy)

    # Non-TPU strategy carrying a TPU accelerator.
    ddp_strategy = DDPStrategy(accelerator=TPUAccelerator(), precision_plugin=TPUPrecisionPlugin())
    strategy_message = "The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `TPUSpawnStrategy"
    with pytest.raises(ValueError, match=strategy_message):
        Trainer(strategy=ddp_strategy)
Example #12
0
def test_post_local_sgd_model_averaging(average_parameters_mock, tmpdir):
    """Model averaging is skipped for plain DDP and performed for DDP with post-localSGD.

    Both runs share the same model instance and the same ``average_parameters_mock``.
    """
    model = BoringModel()

    # Settings common to both trainers.
    shared_kwargs = dict(
        fast_dev_run=True,
        accelerator="gpu",
        devices=2,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
    )

    # Regular DDP: the averaging mock must stay untouched.
    plain_trainer = Trainer(
        strategy="ddp",
        enable_progress_bar=False,
        enable_model_summary=False,
        **shared_kwargs,
    )
    plain_trainer.fit(model)
    average_parameters_mock.assert_not_called()

    # DDP with the post-localSGD hook: the averaging mock must have been called.
    local_sgd_state = post_localSGD.PostLocalSGDState(
        process_group=None,
        subgroup=None,
        start_localSGD_iter=8,
    )
    local_sgd_strategy = DDPStrategy(
        ddp_comm_state=local_sgd_state,
        ddp_comm_hook=post_localSGD.post_localSGD_hook,
        model_averaging_period=4,
    )
    local_sgd_trainer = Trainer(strategy=local_sgd_strategy, **shared_kwargs)
    local_sgd_trainer.fit(model)
    average_parameters_mock.assert_called()
Example #13
0
def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir):
    """Fit with the PowerSGD hook wrapped by the FP16 compression wrapper and check DDP reports the wrapped hook."""
    boring_model = BoringModel()

    power_sgd_state = powerSGD.PowerSGDState(process_group=None)
    wrapped_strategy = DDPStrategy(
        ddp_comm_state=power_sgd_state,
        ddp_comm_hook=powerSGD.powerSGD_hook,
        ddp_comm_wrapper=default.fp16_compress_wrapper,
    )
    trainer = Trainer(
        max_epochs=1,
        gpus=2,
        strategy=wrapped_strategy,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
        fast_dev_run=True,
    )
    trainer.fit(boring_model)

    registered_hook = trainer.strategy.model.get_ddp_logging_data().comm_hook
    # Expected name is the qualified name of the hook after applying the wrapper ourselves.
    wrapped_hook = default.fp16_compress_wrapper(powerSGD.powerSGD_hook)
    assert registered_hook == wrapped_hook.__qualname__
    assert trainer.state.finished, f"Training failed with {trainer.state}"
def test_ddp_strategy_set_timeout(mock_init_process_group):
    """The ``timeout`` given to ``DDPStrategy`` is forwarded when the process group is initialised.

    ``mock_init_process_group`` is injected from outside this function
    (presumably a patch of the process-group initialiser -- confirm at the
    decorator site); its last call is checked after ``setup_environment()``.
    """
    expected_timeout = timedelta(seconds=30)
    model = BoringModel()
    trainer = Trainer(
        max_epochs=1,
        strategy=DDPStrategy(timeout=expected_timeout),
    )

    # Drive the strategy by hand up to the environment setup.
    trainer.state.fn = TrainerFn.FITTING
    strategy = trainer.strategy
    strategy.connect(model)
    trainer.lightning_module.trainer = trainer
    strategy.setup_environment()

    backend = strategy._get_process_group_backend()
    cluster_env = strategy.cluster_environment
    expected_kwargs = dict(
        rank=cluster_env.global_rank(),
        world_size=cluster_env.world_size(),
        timeout=expected_timeout,
    )
    mock_init_process_group.assert_called_with(backend, **expected_kwargs)
Example #15
0
def process_args(args=None, return_io=False):
    """
    Process arguments for running training

    Builds the Trainer keyword arguments, the data module and the model from
    the command-line arguments. As side effects, the module-level ``RANK``
    and ``SIZE`` are updated when an LSF or SLURM environment is selected,
    and several attributes of *args* are added, rewritten or deleted
    (``loader_kwargs``, ``gpus``, ``lr_find``, ``sanity``, ``checkpoint``,
    ``classify``, ``n_outputs``, ``input_nc``, and ``num_workers`` under LSF).

    Args:
        args:       an ``argparse.Namespace``; anything else is handed to
                    ``parse_args`` to produce one
        return_io:  if true, ``io`` is added to the returned tuple just
                    before the data module

    Returns:
        tuple: ``(model, args, targs, data_mod)``, or
        ``(model, args, targs, io, data_mod)`` when *return_io* is true

    Raises:
        ValueError: if more than one GPU is requested without ``--lsf`` or
                    ``--slurm``, or if a features checkpoint is combined with
                    the manifold loss
    """
    # Declared up front; the statement applies to the whole function.
    global RANK
    global SIZE

    if not isinstance(args, argparse.Namespace):
        args = parse_args(args)

    args.loader_kwargs = dict()

    targs = dict(max_epochs=args.epochs, )

    targs['accumulate_grad_batches'] = args.accumulate

    env = None

    if args.ipu:
        targs['accelerator'] = 'ipu'
        targs['devices'] = process_gpus(args.gpus)
    else:
        targs['gpus'] = process_gpus(args.gpus)
        targs['num_nodes'] = args.num_nodes
        if args.lsf:
            ##########################################################################################
            # Currently coding against pytorch-lightning 1.4.3
            ##########################################################################################
            if args.num_workers > 4:
                print0(
                    "num_workers (-k) > 4 can lead to hanging on Summit -- setting to 4",
                    file=sys.stderr)
                args.num_workers = 4
            args.loader_kwargs[
                'num_workers'] = 1  # Set as a default. This will get overridden elsewhere
            args.loader_kwargs['multiprocessing_context'] = 'spawn'
            env = LSFEnvironment()
        elif args.slurm:
            env = SLURMEnvironment()

        if env is not None:
            try:
                RANK = env.global_rank()
                SIZE = env.world_size()
            except Exception:
                # Best-effort fallback to a single-process layout. Only
                # ordinary errors are caught, so KeyboardInterrupt and
                # SystemExit still propagate.
                print(
                    ">>> Could not get global rank -- setting RANK to 0 and SIZE to 1",
                    file=sys.stderr)
                RANK = 0
                SIZE = 1

        if targs['gpus'] is not None:
            targs['accelerator'] = 'gpu'
            if targs['gpus'] == 1:
                targs['devices'] = 1
            else:
                if env is None:
                    raise ValueError(
                        'Please specify environment (--lsf or --slurm) if using more than one GPU'
                    )
                # parallel_devices = [torch.device(i) for i in range(torch.cuda.device_count()) if i < targs['gpus']]
                # precision_plugin = NativeMixedPrecisionPlugin(16, 'cuda')
                torch.cuda.set_device(env.local_rank())
                targs['devices'] = targs['gpus']
                targs['strategy'] = DDPStrategy(
                    find_unused_parameters=False,
                    cluster_environment=env,
                    #accelerator=GPUAccelerator(),
                    #parallel_devices=parallel_devices,
                    #precision_plugin=precision_plugin,
                )

                print(
                    "---- Rank %s  -  Using GPUAccelerator with DDPStrategy" %
                    env.global_rank(),
                    file=sys.stderr)
        else:
            targs['accelerator'] = 'cpu'

    del args.gpus

    if args.sanity:
        # A string value is taken as the batch limit; any other truthy value
        # selects the default limit of 4000.
        if isinstance(args.sanity, str):
            args.sanity = int(args.sanity)
        else:
            args.sanity = 4000
        targs['limit_train_batches'] = args.sanity
        targs['limit_val_batches'] = args.sanity // 4

    if args.lr_find:
        targs['auto_lr_find'] = True
    del args.lr_find

    if args.checkpoint is not None:
        if os.path.exists(args.checkpoint):
            targs['resume_from_checkpoint'] = args.checkpoint
        else:
            # f-string so that the missing path actually appears in the message
            warnings.warn(
                f"Ignoring -c/--checkpoint argument because {args.checkpoint} does not exist."
            )
            args.checkpoint = None

    if args.cuda_profile:
        # Zero-pad the rank to the number of digits in SIZE.
        targs['profiler'] = PyTorchProfiler(
            filename=f'pytorch_prof.{RANK:0{len(str(SIZE))}}', emit_nvtx=True)

    targs['replace_sampler_ddp'] = False

    # NOTE(review): this second reset discards the 'num_workers' and
    # 'multiprocessing_context' entries set in the LSF branch above. Kept
    # as-is to preserve behaviour -- confirm whether it is intentional.
    args.loader_kwargs = dict()

    # make sure we are classifying if we are using adding classifier layers
    # to a resnet features model
    if args.features_checkpoint is not None:
        if args.manifold:
            raise ValueError(
                'Cannot use manifold loss (i.e. -M) if adding classifier (i.e. -F)'
            )
        args.classify = True

    data_mod = DeepIndexDataModule(args,
                                   keep_open=True,
                                   seed=args.seed + RANK,
                                   rank=RANK,
                                   size=SIZE)

    # if classification problem, use the number of taxa as the number of outputs
    if args.classify:
        args.n_outputs = data_mod.dataset.n_outputs

    args.input_nc = 136 if args.tnf else len(data_mod.dataset.vocab)

    model = process_model(args, taxa_table=data_mod.dataset.difile.taxa_table)

    if args.num_workers > 0:
        data_mod.dataset.close()

    ret = [model, args, targs]
    if return_io:
        # NOTE(review): ``io`` is not bound anywhere in this function, so this
        # resolves to a module-level name (or raises NameError) -- confirm
        # which object callers expect here.
        ret.append(io)

    ret.append(data_mod)

    return tuple(ret)
Example #16
0
def test_parallel_devices_in_strategy_confilict_with_accelerator(parallel_devices, accelerator):
    """Passing ``parallel_devices`` via the strategy together with ``accelerator`` raises ``MisconfigurationException``.

    Both arguments are supplied from outside this function (presumably a
    parametrization over conflicting combinations -- confirm at the decorator site).
    """
    expected_error = pytest.raises(MisconfigurationException, match=r"parallel_devices set through")
    with expected_error:
        # The strategy is built inside the context, exactly where the error is expected.
        conflicting_strategy = DDPStrategy(parallel_devices=parallel_devices)
        Trainer(strategy=conflicting_strategy, accelerator=accelerator)
@RunIf(min_gpus=2)
@mock.patch.dict(
    os.environ,
    {
        "CUDA_VISIBLE_DEVICES": "0,1",
        "SLURM_NTASKS": "2",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_NODEID": "0",
        "SLURM_PROCID": "1",
        "SLURM_LOCALID": "1",
    },
)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()])
def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy):
    """Under the patched SLURM variables, DDP on two GPUs picks the SLURM cluster environment.

    Runs once with the ``"ddp"`` string and once with a ``DDPStrategy``
    instance; the local rank is expected to be 1, matching the patched
    ``SLURM_LOCALID``.
    """
    trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2)

    connector = trainer._accelerator_connector
    assert connector._is_slurm_managing_tasks()
    assert isinstance(trainer.accelerator, GPUAccelerator)

    selected_strategy = trainer.strategy
    assert isinstance(selected_strategy, DDPStrategy)

    cluster_env = selected_strategy.cluster_environment
    assert isinstance(cluster_env, SLURMEnvironment)
    assert cluster_env.local_rank() == 1
    assert selected_strategy.local_rank == 1


@mock.patch.dict(
    os.environ,
    {
        "CUDA_VISIBLE_DEVICES": "0,1",
        "SLURM_NTASKS": "2",