Example #1
def test_distributed_backend_set_when_using_tpu(tmpdir, tpu_cores):
    """Test if distributed_backend is set to `tpu` when tpu_cores is not None"""
    assert Trainer(tpu_cores=tpu_cores).distributed_backend == "tpu"
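# Note: the pytest parametrization that supplies `tpu_cores` above is not part of this
# snippet; a minimal, assumed sketch of how such a test could be parametrized:
#
#     @pytest.mark.parametrize("tpu_cores", [1, 8, [1]])
#     def test_distributed_backend_set_when_using_tpu(tmpdir, tpu_cores):
#         ...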
Example #2
        
        return logger
    


    def initialize_checkpoint_callback(self):

        checkpoint_callback = ModelCheckpoint(
            monitor="val_acc_epoch",
            dirpath=f"model_checkpoints/" \
                    f"{self.logger_subdir}/" \
                    f"{self.logger_run_name}",
            filename='{epoch:02d}-{val_acc_epoch:.4f}',
            save_top_k=self.configs["train_num_epochs"],
            mode='max',
        )

        return checkpoint_callback



if __name__ == '__main__':
    
    model = SupervisedModel()
    logger = model.initialize_logger()
    checkpoint_callback = model.initialize_checkpoint_callback()
    lr_monitor = LearningRateMonitor(logging_interval='epoch')

    trainer = Trainer(
        gpus=1,
        deterministic=True,
        max_epochs=model.configs['train_num_epochs'],
        callbacks=[checkpoint_callback, lr_monitor],
        logger=logger,
        fast_dev_run=False,
    )
    trainer.fit(model)
    
Example #3
def test_val_step_epoch_step_metrics(tmpdir):
    """
    Make sure the logged + pbar metrics are allocated accordingly at every step when requested
    """
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_epoch_step_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    batches = 3
    epochs = 3
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        limit_val_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    assert len(trainer.logger_connector.callback_metrics) == 11
    expected_metrics = {
        'early_stop_on', 'checkpoint_on', 'debug_epoch', 'val_step_pbar_acc',
        'epoch_val_step_pbar_acc', 'val_step_log_acc',
        'epoch_val_step_log_acc', 'val_step_log_pbar_acc',
        'epoch_val_step_log_pbar_acc', 'val_step_batch_idx',
        'epoch_val_step_batch_idx'
    }
    seen_metrics = set(trainer.logger_connector.callback_metrics)
    assert expected_metrics == seen_metrics

    # make sure correct steps were called
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    # no early stopping
    assert len(trainer.dev_debugger.early_stopping_history) == 0

    # make sure we logged the exact number of metrics
    assert len(trainer.dev_debugger.logged_metrics) == epochs * batches + epochs
    assert len(trainer.dev_debugger.pbar_added_metrics) == epochs * batches + epochs

    # make sure we logged the correct epoch metrics
    for metric_idx in range(0, len(trainer.dev_debugger.logged_metrics), batches + 1):
        batch_metrics = trainer.dev_debugger.logged_metrics[metric_idx:metric_idx + batches]
        epoch_metric = trainer.dev_debugger.logged_metrics[metric_idx + batches]
        epoch = epoch_metric['epoch']

        # make sure the metric was split
        for batch_metric in batch_metrics:
            assert f'step_val_step_log_acc/epoch_{epoch}' in batch_metric
            assert f'step_val_step_log_pbar_acc/epoch_{epoch}' in batch_metric

        # make sure the epoch split was correct
        assert 'epoch_val_step_log_acc' in epoch_metric
        assert 'epoch_val_step_log_pbar_acc' in epoch_metric

    # make sure we logged the correct pbar metrics
    for metric_idx in range(0, len(trainer.dev_debugger.pbar_added_metrics), batches + 1):
        batch_metrics = trainer.dev_debugger.pbar_added_metrics[metric_idx:metric_idx + batches]
        epoch_metric = trainer.dev_debugger.pbar_added_metrics[metric_idx + batches]

        # make sure the metric was split
        for batch_metric in batch_metrics:
            assert 'step_val_step_pbar_acc' in batch_metric
            assert 'step_val_step_log_pbar_acc' in batch_metric

        # make sure the epoch split was correct
        assert 'epoch_val_step_pbar_acc' in epoch_metric
        assert 'epoch_val_step_log_pbar_acc' in epoch_metric

    # only 1 checkpoint expected since values didn't change after that
    assert len(trainer.dev_debugger.checkpoint_callback_history) == 1

    # make sure the last known metric is correct
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 189
Example #4
def test_lr_finder_fails_fast_on_bad_config(tmpdir):
    """ Test that tune fails if the model does not have a lr BEFORE running lr find """
    trainer = Trainer(default_root_dir=tmpdir, max_steps=2, auto_lr_find=True)
    with pytest.raises(MisconfigurationException,
                       match='should have one of these fields'):
        trainer.tune(BoringModel())
Example #5
                  data_path=os.path.join(
                      PARENT_DIR, 'datasets',
                      'pendulum-gym-image-dataset-train.pkl'))
    checkpoint_callback = ModelCheckpoint(monitor='loss',
                                          prefix=args.name +
                                          f'-T_p={args.T_pred}-',
                                          save_top_k=1,
                                          save_last=True)
    trainer = Trainer.from_argparse_args(
        args,
        deterministic=True,
        default_root_dir=os.path.join(PARENT_DIR, 'logs', args.name),
        checkpoint_callback=checkpoint_callback)
    trainer.fit(model)


if __name__ == '__main__':
    parser = ArgumentParser(add_help=False)
    parser.add_argument('--name', default='ablation-pend-lag-caAE', type=str)
    parser.add_argument('--T_pred', default=4, type=int)
    parser.add_argument('--solver', default='euler', type=str)
    parser.add_argument('--homo_u', dest='homo_u', action='store_true')
    # add args from trainer
    parser = Trainer.add_argparse_args(parser)
    # give the module a chance to add its own params
    # good practice to define LightningModule-specific params in the module
    parser = Model.add_model_specific_args(parser)
    # parse params
    args = parser.parse_args()

    main(args)
Example #6
def test_training_step_with_dataloader_access(tmpdir) -> None:
    """A baseline functional test for `training_step` with dataloader access."""
    trainer = Trainer(max_epochs=1, default_root_dir=tmpdir)
    m = AsyncBoringModel()
    trainer.fit(m)
    assert m.num_batches_processed == DATASET_LEN, f"Expect all {DATASET_LEN} batches to be processed."
Example #7
def test_logger_after_fit_predict_test_calls(tmpdir):
    """Make sure logger outputs are finalized after fit, prediction, and test calls."""
    class BufferLogger(LightningLoggerBase):
        def __init__(self):
            super().__init__()
            self.buffer = {}
            self.logs = {}

        def log_metrics(self,
                        metrics: Dict[str, float],
                        step: Optional[int] = None) -> None:
            self.buffer.update(metrics)

        def finalize(self, status: str) -> None:
            self.logs.update(self.buffer)
            self.buffer = {}

        @property
        def experiment(self) -> Any:
            return None

        @property
        def version(self) -> Union[int, str]:
            return 1

        @property
        def name(self) -> str:
            return "BufferLogger"

        def log_hyperparams(self, *args, **kwargs) -> None:
            return None

    class LoggerCallsObserver(Callback):
        def on_fit_end(self, trainer: "pl.Trainer",
                       pl_module: "pl.LightningModule") -> None:
            trainer.logger.log_metrics({"fit": 1})

        def on_validation_end(self, trainer: "pl.Trainer",
                              pl_module: "pl.LightningModule") -> None:
            trainer.logger.log_metrics({"validate": 1})

        def on_predict_end(self, trainer: "pl.Trainer",
                           pl_module: "pl.LightningModule") -> None:
            trainer.logger.log_metrics({"predict": 1})

        def on_test_end(self, trainer: "pl.Trainer",
                        pl_module: "pl.LightningModule") -> None:
            trainer.logger.log_metrics({"test": 1})

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=1,
        limit_val_batches=1,
        max_epochs=1,
        logger=BufferLogger(),
        callbacks=[LoggerCallsObserver()],
    )

    assert not trainer.logger.logs
    trainer.fit(model)
    assert trainer.logger.logs == {"fit": 1, "validate": 1}
    trainer.test(model)
    assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1}
    trainer.predict(model)
    assert trainer.logger.logs == {
        "fit": 1,
        "validate": 1,
        "test": 1,
        "predict": 1
    }
Example #8
def test_gradient_accumulation_scheduling(tmpdir):
    """
    Test gradient accumulation by checking the frequency of optimizer updates.
    """

    # test incorrect configs
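    # note: inside a `pytest.raises` block, execution stops at the first statement
    # that raises, so each block effectively exercises only its first failing config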
    with pytest.raises(IndexError):
        assert Trainer(accumulate_grad_batches={0: 3, 1: 4, 4: 6})
        assert Trainer(accumulate_grad_batches={-2: 3})

    with pytest.raises(TypeError):
        assert Trainer(accumulate_grad_batches={})
        assert Trainer(accumulate_grad_batches=[[2, 3], [4, 6]])
        assert Trainer(accumulate_grad_batches={1: 2, 3.: 4})
        assert Trainer(accumulate_grad_batches={1: 2.5, 3: 5})

    # test optimizer call freq matches scheduler
    def _optimizer_step(self,
                        epoch,
                        batch_idx,
                        optimizer,
                        optimizer_idx,
                        second_order_closure=None):
        # only test the first 12 batches in epoch
        if batch_idx < 12:
            if epoch == 0:
                # reset counter when starting epoch
                if batch_idx == 0:
                    self.prev_called_batch_idx = 0

                    # use this opportunity to test once
                    assert self.trainer.accumulate_grad_batches == 1

                assert batch_idx == self.prev_called_batch_idx
                self.prev_called_batch_idx += 1

            elif 1 <= epoch <= 2:
                # reset counter when starting epoch
                if batch_idx == 1:
                    self.prev_called_batch_idx = 1

                    # use this opportunity to test once
                    assert self.trainer.accumulate_grad_batches == 2

                assert batch_idx == self.prev_called_batch_idx
                self.prev_called_batch_idx += 2

            else:
                if batch_idx == 3:
                    self.prev_called_batch_idx = 3

                    # use this opportunity to test once
                    assert self.trainer.accumulate_grad_batches == 4

                assert batch_idx == self.prev_called_batch_idx
                self.prev_called_batch_idx += 3

        optimizer.step()

        # clear gradients
        optimizer.zero_grad()

    model = EvalModelTemplate()
    schedule = {1: 2, 3: 4}
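    # the dict maps the epoch at which a new accumulation factor takes effect:
    # epoch 0 uses the default of 1, epochs 1-2 accumulate 2 batches,
    # and epoch 3 onwards would accumulate 4 batches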

    trainer = Trainer(accumulate_grad_batches=schedule,
                      train_percent_check=0.1,
                      val_percent_check=0.1,
                      max_epochs=2,
                      default_root_dir=tmpdir)

    # for the test
    trainer.optimizer_step = _optimizer_step
    model.prev_called_batch_idx = 0

    trainer.fit(model)
Example #9
def test_trainer_flag(caplog):
    class TestModel(BoringModel):
        def on_fit_start(self):
            raise SystemExit()

    trainer = Trainer(max_time=dict(seconds=1337))
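    # `max_time` also accepts a `datetime.timedelta` or a "DD:HH:MM:SS" string;
    # a dict of timedelta kwargs, as here, is turned into a `Timer` callback internally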
    with pytest.raises(SystemExit):
        trainer.fit(TestModel())
    timer = [c for c in trainer.callbacks if isinstance(c, Timer)][0]
    assert timer._duration == 1337

    trainer = Trainer(max_time=dict(seconds=1337), callbacks=[Timer()])
    with pytest.raises(SystemExit), caplog.at_level(level=logging.INFO):
        trainer.fit(TestModel())
    assert "callbacks list already contains a Timer" in caplog.text

    # Make sure max_time still honored even if max_epochs == -1
    trainer = Trainer(max_time=dict(seconds=1), max_epochs=-1)
    with pytest.raises(SystemExit):
        trainer.fit(TestModel())
    timer = [c for c in trainer.callbacks if isinstance(c, Timer)][0]
    assert timer._duration == 1
    assert trainer.max_epochs == -1
    assert trainer.max_steps == -1
Example #10
def test_dp_resume(tmpdir):
    """Make sure DP continues training correctly."""
    model = CustomClassificationModelDP(lr=0.1)
    dm = ClassifDataModule()

    trainer_options = dict(max_epochs=1,
                           gpus=2,
                           accelerator='dp',
                           default_root_dir=tmpdir)

    # get logger
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    # logger file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    # add these to the trainer options
    trainer_options['logger'] = logger
    trainer_options['callbacks'] = [checkpoint]

    # fit model
    trainer = Trainer(**trainer_options)
    trainer.is_slurm_managing_tasks = True
    trainer.fit(model, datamodule=dm)

    # track epoch before saving. Increment since we finished the current epoch, don't want to rerun
    real_global_epoch = trainer.current_epoch + 1

    # correct result and ok accuracy
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # ---------------------------
    # HPC LOAD/SAVE
    # ---------------------------
    # save
    trainer.checkpoint_connector.hpc_save(tmpdir, logger)

    # init new trainer
    new_logger = tutils.get_default_logger(tmpdir, version=logger.version)
    trainer_options['logger'] = new_logger
    trainer_options['callbacks'] = [ModelCheckpoint(dirpath=tmpdir)]
    trainer_options['limit_train_batches'] = 0.5
    trainer_options['limit_val_batches'] = 0.2
    trainer_options['max_epochs'] = 1
    new_trainer = Trainer(**trainer_options)

    class CustomModel(CustomClassificationModelDP):
        def __init__(self):
            super().__init__()
            self.on_train_start_called = False

        # set the epoch start hook so we can predict before the model does the full training
        def on_train_start(self):
            assert self.trainer.current_epoch == real_global_epoch and self.trainer.current_epoch > 0

            # if model and state loaded correctly, predictions will be good even though we
            # haven't trained with the new loaded model
            new_trainer._running_stage = RunningStage.VALIDATING

            dataloader = self.train_dataloader()
            tpipes.run_prediction_eval_model_template(
                self.trainer.lightning_module, dataloader=dataloader)
            self.on_train_start_called = True

    # new model
    model = CustomModel()

    # fit new model which should load hpc weights
    new_trainer.fit(model, datamodule=dm)
    assert model.on_train_start_called

    # test freeze on gpu
    model.freeze()
    model.unfreeze()
Example #11
def test_resume_from_checkpoint_epoch_restored(tmpdir):
    """Verify resuming from checkpoint runs the right number of epochs"""

    hparams = EvalModelTemplate.get_default_hparams()

    def _new_model():
        # Create a model that tracks epochs and batches seen
        model = EvalModelTemplate(**hparams)
        model.num_epochs_seen = 0
        model.num_batches_seen = 0
        model.num_on_load_checkpoint_called = 0

        def increment_epoch(self):
            self.num_epochs_seen += 1

        def increment_batch(self, _):
            self.num_batches_seen += 1

        def increment_on_load_checkpoint(self, _):
            self.num_on_load_checkpoint_called += 1

        # Bind methods to keep track of epoch numbers, batch numbers it has seen
        # as well as number of times it has called on_load_checkpoint()
        model.on_epoch_end = types.MethodType(increment_epoch, model)
        model.on_batch_start = types.MethodType(increment_batch, model)
        model.on_load_checkpoint = types.MethodType(
            increment_on_load_checkpoint, model)
        return model

    model = _new_model()

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        train_percent_check=0.65,
        val_percent_check=1,
        checkpoint_callback=ModelCheckpoint(tmpdir, save_top_k=-1),
        default_root_dir=tmpdir,
        early_stop_callback=False,
        val_check_interval=1.,
    )

    trainer = Trainer(**trainer_options)
    # fit model
    trainer.fit(model)

    training_batches = trainer.num_training_batches

    assert model.num_epochs_seen == 2
    assert model.num_batches_seen == training_batches * 2
    assert model.num_on_load_checkpoint_called == 0

    # Other checkpoints can be uncommented if/when resuming mid-epoch is supported
    checkpoints = sorted(
        glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, '*.ckpt')))

    for check in checkpoints:
        next_model = _new_model()
        state = torch.load(check)

        # Resume training
        trainer_options['max_epochs'] = 2
        new_trainer = Trainer(**trainer_options, resume_from_checkpoint=check)
        new_trainer.fit(next_model)
        assert (state['global_step'] + next_model.num_batches_seen
                == training_batches * trainer_options['max_epochs'])
        assert next_model.num_on_load_checkpoint_called == 1
Example #12
def test_test_checkpoint_path(tmpdir, ckpt_path, save_top_k):
    hparams = EvalModelTemplate.get_default_hparams()

    loaded_checkpoint_path = ''

    class TestBestModel(EvalModelTemplate):
        @classmethod
        def load_from_checkpoint(cls, checkpoint_path, *args, **kwargs):
            nonlocal loaded_checkpoint_path
            loaded_checkpoint_path = checkpoint_path
            return super().load_from_checkpoint(checkpoint_path, *args,
                                                **kwargs)

    model = TestBestModel(**hparams)
    trainer = Trainer(
        max_epochs=2,
        progress_bar_refresh_rate=0,
        default_root_dir=tmpdir,
        checkpoint_callback=ModelCheckpoint(save_top_k=save_top_k),
    )
    trainer.fit(model)
    if ckpt_path == 'best':
        # ckpt_path is 'best', meaning we load the best weights
        if save_top_k <= 0:
            with pytest.raises(MisconfigurationException,
                               match='.*is not configured to save the best.*'):
                trainer.test(ckpt_path=ckpt_path)
        else:
            trainer.test(ckpt_path=ckpt_path)
            assert loaded_checkpoint_path == trainer.checkpoint_callback.best_model_path
    elif ckpt_path is None:
        # ckpt_path is None, meaning we don't load any checkpoints and
        # use the weights from the end of training
        trainer.test(ckpt_path=ckpt_path)
        assert loaded_checkpoint_path == ''
    else:
        # specific checkpoint, pick one from saved ones
        if save_top_k == 0:
            with pytest.raises(FileNotFoundError):
                trainer.test(ckpt_path='random.ckpt')
        else:
            ckpt_path = str(
                list((Path(tmpdir) /
                      'lightning_logs/version_0/checkpoints').iterdir())[0])
            trainer.test(ckpt_path=ckpt_path)
            assert loaded_checkpoint_path == ckpt_path
Example #13
def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir,
                                               tmpdir_server, url_ckpt):
    """Verify resuming from checkpoint runs the right number of epochs"""
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
    monkeypatch.setenv('TORCH_HOME', tmpdir)

    hparams = EvalModelTemplate.get_default_hparams()

    def _new_model():
        # Create a model that tracks epochs and batches seen
        model = EvalModelTemplate(**hparams)
        model.num_epochs_seen = 0
        model.num_batches_seen = 0
        model.num_on_load_checkpoint_called = 0

        def increment_epoch(self):
            self.num_epochs_seen += 1

        def increment_batch(self, _):
            self.num_batches_seen += 1

        def increment_on_load_checkpoint(self, _):
            self.num_on_load_checkpoint_called += 1

        # Bind methods to keep track of epoch numbers, batch numbers it has seen
        # as well as number of times it has called on_load_checkpoint()
        model.on_epoch_end = types.MethodType(increment_epoch, model)
        model.on_batch_start = types.MethodType(increment_batch, model)
        model.on_load_checkpoint = types.MethodType(
            increment_on_load_checkpoint, model)
        return model

    model = _new_model()

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        limit_train_batches=0.65,
        limit_val_batches=1,
        checkpoint_callback=ModelCheckpoint(tmpdir, save_top_k=-1),
        default_root_dir=tmpdir,
        early_stop_callback=False,
        val_check_interval=1.,
    )

    trainer = Trainer(**trainer_options)
    # fit model
    trainer.fit(model)

    training_batches = trainer.num_training_batches

    assert model.num_epochs_seen == 2
    assert model.num_batches_seen == training_batches * 2
    assert model.num_on_load_checkpoint_called == 0

    # Other checkpoints can be uncommented if/when resuming mid-epoch is supported
    checkpoints = sorted(
        glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, '*.ckpt')))
    if url_ckpt:
        # transform local paths into url checkpoints
        ip, port = tmpdir_server
        checkpoints = [
            f'http://{ip}:{port}/' + os.path.basename(check)
            for check in checkpoints
        ]

    for check in checkpoints:
        next_model = _new_model()
        state = pl_load(check)

        # Resume training
        trainer_options['max_epochs'] = 2
        new_trainer = Trainer(**trainer_options, resume_from_checkpoint=check)
        new_trainer.fit(next_model)
        assert (state['global_step'] + next_model.num_batches_seen
                == training_batches * trainer_options['max_epochs'])
        assert next_model.num_on_load_checkpoint_called == 1
Example #14
def test_trainer_pickle(tmpdir):
    trainer = Trainer(max_epochs=1, default_root_dir=tmpdir)
    pickle.dumps(trainer)
    cloudpickle.dumps(trainer)
Example #15
def test_step_with_optimizer_closure_with_different_frequencies(
        mock_sgd_step, mock_adam_step, tmpdir):
    """Tests that `step` works with optimizer_closure and different accumulated_gradient frequency."""
    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.automatic_optimization = False

        def training_step(self, batch, batch_idx):

            # emulate GAN training
            opt_gen, opt_dis = self.optimizers()

            # Note: be careful not to log on the same key with self.log in both closures,
            # as the values will be aggregated together on epoch_end

            def compute_loss():
                x = batch[0]
                x = F.dropout(x, 0.1)
                predictions = self(x)
                predictions = F.dropout(predictions, 0.1)
                loss = self.loss(None, predictions)
                return loss

            def gen_closure():
                loss_gen = compute_loss()
                self.log("loss_gen", loss_gen, on_step=True, on_epoch=True)
                self.manual_backward(loss_gen)

            def dis_closure():
                loss_dis = compute_loss()
                self.log("loss_dis", loss_dis, on_step=True, on_epoch=True)
                self.manual_backward(loss_dis)

            # accumulate gradients on every batch and call opt_gen.step() every 2 batches
            gen_closure()
            if batch_idx % 2 == 0:
                opt_gen.step(closure=gen_closure, optim="sgd")
                opt_gen.zero_grad()

            # update discriminator every 4 batches
            # therefore, no gradient accumulation for discriminator
            if batch_idx % 4 == 0:
                opt_dis.step(closure=dis_closure)
                opt_dis.zero_grad()

        def configure_optimizers(self):
            optimizer_gen = torch.optim.SGD(self.layer.parameters(), lr=0.1)
            optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001)
            return [optimizer_gen, optimizer_dis]

    model = TestModel()
    model.val_dataloader = None
    model.training_epoch_end = None

    limit_train_batches = 8
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        max_epochs=1,
        log_every_n_steps=1,
    )

    trainer.fit(model)
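    # with 8 training batches, opt_gen steps on batch_idx 0, 2, 4 and 6 (4 SGD calls)
    # while opt_dis steps on batch_idx 0 and 4 (2 Adam calls), matching the assertions below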
    expected_calls = [call(closure=ANY, optim="sgd") for _ in range(4)]
    mock_sgd_step.assert_has_calls(expected_calls)
    expected_calls = [call(closure=ANY) for _ in range(2)]
    mock_adam_step.assert_has_calls(expected_calls)
Example #16
def main(hparams):
    if hparams.logging_location == "s3":
        logging_dir = os.path.join(S3_LIGHTNING_LOGS_DIR, hparams.name)
    else:
        logging_dir = os.path.join(LIGHTNING_LOGS_DIR, hparams.name)

    # main LightningModule
    if hparams.checkpoint_path is not None:
        pretrain_system = PreTrainSystem.load_from_checkpoint(
            hparams.adversarial_system)
    else:
        pretrain_system = PreTrainSystem(**vars(hparams))

    pretrain_checkpoints = ModelCheckpoint(
        dirpath=os.path.join(MODEL_CHECKPOINTS_DIR, hparams.version),
        monitor="Val/loss",
        verbose=True,
        mode="min",
        save_top_k=hparams.save_top_k,
    )

    pretrain_early_stopping = EarlyStopping(
        monitor="Val/loss",
        min_delta=0.00,
        patience=hparams.patience,
        verbose=False,
        mode="min",
    )

    gpu_stats = GPUStatsMonitor(temperature=True)

    log_recolored_to_tensorboard = LogPairRecoloringToTensorboard()
    log_hyperparams_to_tensorboard = LogHyperparamsToTensorboard(
        hp_metric="Test/loss")

    notify = Notify(test_metric_name="Test/loss")

    logger = TensorBoardLogger(
        logging_dir,
        name=hparams.name,
        version=hparams.version,
        log_graph=True,
        default_hp_metric=False,
    )

    trainer = Trainer.from_argparse_args(
        hparams,
        resume_from_checkpoint=hparams.checkpoint_path,
        logger=logger,
        checkpoint_callback=pretrain_checkpoints,
        callbacks=[
            pretrain_early_stopping,
            log_recolored_to_tensorboard,
            log_hyperparams_to_tensorboard,
            gpu_stats,
            notify,
        ],
        profiler="simple",
        benchmark=True,
    )

    datamodule = PreTrainDataModule(**vars(hparams))

    trainer.fit(pretrain_system, datamodule=datamodule)

    # lightning automatically uses the best model checkpoint for testing
    trainer.test(pretrain_system, datamodule=datamodule)

    if hparams.upload_model_to_s3:
        # upload best model to S3
        best_model_path = pretrain_checkpoints.best_model_path
        S3_best_model_path = os.path.join(
            S3_MODEL_CHECKPOINTS_RELATIVE_DIR,
            hparams.name,
            ".".join([hparams.version,
                      best_model_path.split(".")[-1]]),
        )
        upload_to_s3(best_model_path, S3_best_model_path)
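
# `upload_to_s3` used above is a project-specific helper that is not shown in this
# example. A minimal sketch of what such a helper could look like, assuming boto3
# and a bucket name chosen purely for illustration:
import boto3


def upload_to_s3(local_path: str, s3_key: str, bucket: str = "my-model-checkpoints") -> None:
    """Hypothetical helper: upload a local file to S3 under the given key."""
    s3 = boto3.client("s3")
    s3.upload_file(local_path, bucket, s3_key)
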
Example #17
def test_multiple_optimizers_manual_no_return(tmpdir, kwargs):
    apex_optimizer_patches = []
    apex_optimizer_steps = []

    class TestModel(ManualOptModel):
        def training_step(self, batch, batch_idx):
            # avoid returning a value
            super().training_step(batch, batch_idx)

        def training_epoch_end(self, outputs):
            # outputs is empty as training_step does not return
            # and it is not automatic optimization
            assert not outputs

        def on_train_start(self):
            if kwargs.get("amp_backend") != "apex":
                return
            # extremely ugly. APEX patches all the native torch optimizers on `_initialize` which we call on
            # `ApexMixedPrecisionPlugin.dispatch`. Additionally, their replacement `new_step` functions are locally
            # defined so can't even patch those, thus we need to create the mock after APEX has been initialized
            nonlocal apex_optimizer_patches, apex_optimizer_steps
            for opt in self.trainer.optimizers:
                # `amp.scale_loss` will also patch the step to avoid it when gradient overflow happens. avoid it
                opt._amp_stash.already_patched = True
                patch = mock.patch.object(opt, "step")
                apex_optimizer_patches.append(patch)
                apex_optimizer_steps.append(patch.start())

        def on_train_end(self):
            if kwargs.get("amp_backend") == "apex":
                for p in apex_optimizer_patches:
                    p.stop()

    model = TestModel()
    model.val_dataloader = None

    limit_train_batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        max_epochs=1,
        log_every_n_steps=1,
        weights_summary=None,
        **kwargs,
    )

    if kwargs.get("amp_backend") == "native":
        # mock the scaler instead of the optimizer step because it can be skipped with NaNs
        scaler_step_patch = mock.patch.object(
            trainer.precision_plugin.scaler,
            "step",
            wraps=trainer.precision_plugin.scaler.step)
        scaler_step = scaler_step_patch.start()

    with mock.patch.object(Accelerator,
                           "backward",
                           wraps=trainer.accelerator.backward) as bwd_mock:
        trainer.fit(model)
    assert bwd_mock.call_count == limit_train_batches * 3

    if kwargs.get("amp_backend") == "native":
        scaler_step_patch.stop()
        assert scaler_step.call_count == len(
            model.optimizers()) * limit_train_batches
    if kwargs.get("amp_backend") == "apex":
        assert [s.call_count for s in apex_optimizer_steps
                ] == [len(model.optimizers())] * limit_train_batches
Example #18
def test_lightning_optimizer_automatic_optimization_optimizer_zero_grad_make_optimizer_step(tmpdir):
    """
    Test that LightningOptimizer works with `optimizer_zero_grad` overrides and `make_optimizer_step`
    in automatic optimization.
    """

    try:
        with patch("torch.optim.Adam.zero_grad") as adam_zero_grad, \
             patch("torch.optim.SGD.zero_grad") as sgd_zero_grad:

            class TestModel(BoringModel):

                def training_step(self, batch, batch_idx, optimizer_idx=None):
                    output = self.layer(batch)
                    loss = self.loss(batch, output)
                    return {"loss": loss}

                def training_epoch_end(self, outputs):
                    outputs = sum(outputs, [])
                    torch.stack([x["loss"] for x in outputs]).mean()

                def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):
                    if optimizer_idx == 0:
                        if batch_idx % 2 == 0:
                            optimizer.zero_grad()

                    if optimizer_idx == 1:
                        if batch_idx % 5 == 0:
                            optimizer.zero_grad()

                def optimizer_step(
                    self,
                    epoch,
                    batch_idx,
                    optimizer,
                    optimizer_idx,
                    optimizer_closure,
                    on_tpu,
                    using_native_amp,
                    using_lbfgs,
                ):

                    assert optimizer_closure.__name__ == "train_step_and_backward_closure"

                    if optimizer_idx == 0:
                        optimizer.step(closure=optimizer_closure, make_optimizer_step=batch_idx % 3 == 0)
                        return
                    optimizer.step(closure=optimizer_closure)

                def configure_optimizers(self):
                    optimizer_1 = torch.optim.SGD(self.layer.parameters(), lr=0.1)
                    optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1)
                    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1)
                    return [optimizer_1, optimizer_2], [lr_scheduler]

            model = TestModel()
            trainer = Trainer(
                default_root_dir=tmpdir,
                limit_train_batches=20,
                limit_val_batches=1,
                max_epochs=1,
                weights_summary=None,
            )
            trainer.fit(model)

            assert adam_zero_grad.call_count == 4
            assert sgd_zero_grad.call_count == 10

    except MisconfigurationException as e:
        assert "When overriding LightningModule `optimizer_zero_grad`, make_optimizer_step is not allowed" in str(e)
Example #19
def _test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset(rank, worldsize):
    if worldsize > 1:
        _setup_ddp(rank, worldsize)

    def all_gather(tensor, world_size):
        tensor_list = [torch.zeros_like(tensor, dtype=torch.int64) for _ in range(world_size)]
        torch.distributed.all_gather(tensor_list, tensor)
        return tensor_list

    initial_seed = seed_everything(42)

    generator = torch.Generator()
    generator.manual_seed(initial_seed)

    num_workers = 2
    batch_size = 4
    dataset_length = 60
    num_classes = 10

    labels = np.random.randint(0, num_classes, dataset_length)

    dataset = ClassificationDataset(range(dataset_length), labels)
    dataset = MetaLearningDataset(
        dataset,
        batch_size=batch_size,
        drop_last=True,
        num_workers=num_workers,
        global_rank=rank,
        world_size=worldsize,
        initial_seed=initial_seed,
        debugging=True,
        shuffle=True,
    )
    dataset = CaptureIterableDataset(dataset, initial_seed=initial_seed)
    dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=1, generator=generator)
    Trainer._add_sampler_metadata_collate(dataloader)

    epoch_results = []
    for _ in range(2):
        iter_dataloader = iter(dataloader)
        batches = []
        while True:
            try:
                batches.append(next(iter_dataloader))
            except StopIteration:
                break
        epoch_results.append(batches)
        dataloader.dataset.dataset.current_task_iteration += 1

    assert len(epoch_results) == 2

    assert len(epoch_results[0]) == math.ceil((dataset_length / (num_workers * worldsize)) / batch_size) + 2

    if worldsize == 1:
        assert epoch_results[0][0]["data"]["task_length"] == epoch_results[0][1]["data"]["task_length"]
        assert torch.equal(
            epoch_results[0][0]["data"]["selected_indexes"], epoch_results[0][1]["data"]["selected_indexes"]
        )
        assert 0 in epoch_results[0][2][AutoRestartBatchKeys.PL_SAMPLERS]["iter_sampler"]  # worker id 0
        assert 1 in epoch_results[0][3][AutoRestartBatchKeys.PL_SAMPLERS]["iter_sampler"]  # worker id 1
        assert not torch.equal(epoch_results[0][2]["data"][0], epoch_results[0][3]["data"][0])
    else:
        first_task_metadata = all_gather(epoch_results[0][0]["data"]["task_length"], worldsize)
        second_task_metadata = all_gather(epoch_results[0][1]["data"]["task_length"], worldsize)
        assert torch.equal(first_task_metadata[0], first_task_metadata[1])
        assert torch.equal(second_task_metadata[0], second_task_metadata[1])
        assert torch.equal(first_task_metadata[0], second_task_metadata[1])

        first_batch_list = all_gather(epoch_results[0][2]["data"][0], worldsize)
        assert not torch.equal(first_batch_list[0], first_batch_list[1])
        second_batch_list = all_gather(epoch_results[0][3]["data"][0], worldsize)
        assert not torch.equal(second_batch_list[0], second_batch_list[1])

    # restarting on epoch 0 / real batch 2
    state_dict = {"iter_sampler": {}}
    for batch in epoch_results[0][2:4]:
        batch, _state_dict = CaptureIterableDataset.extract_samplers_state_dict_from_batch(batch)
        for k, v in _state_dict[0].items():
            state_dict[k].update(v)

    dataset = ClassificationDataset(range(dataset_length), labels)
    dataset = MetaLearningDataset(
        dataset,
        batch_size=batch_size,
        drop_last=True,
        num_workers=num_workers,
        global_rank=rank,
        world_size=worldsize,
        initial_seed=initial_seed,
        debugging=True,
        shuffle=True,
    )

    dataset = CaptureIterableDataset(dataset, initial_seed=initial_seed)
    dataset.load_state_dict(state_dict)
    dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=1, generator=generator)
    Trainer._add_sampler_metadata_collate(dataloader)

    epoch_results_restart = []
    for _ in range(2):
        iter_dataloader = iter(dataloader)
        batches = []
        while True:
            try:
                batches.append(next(iter_dataloader))
            except StopIteration:
                break
        epoch_results_restart.append(batches)
        dataloader.dataset.dataset.increment_iteration()
        dataloader.dataset.reset_on_epoch()

    assert len(epoch_results_restart[0]) + 2 == len(epoch_results[0])
    epoch_tensors = [e["data"][0] for e in epoch_results[0][4:]]
    epoch_tensors_restart = [e["data"][0] for e in epoch_results_restart[0][2:]]

    for t, tr in zip(epoch_tensors, epoch_tensors_restart):
        assert torch.equal(t, tr)

    epoch_tensors = [e["data"][0] for e in epoch_results[1][2:]]
    epoch_tensors_restart = [e["data"][0] for e in epoch_results_restart[1][2:]]

    for t, tr in zip(epoch_tensors, epoch_tensors_restart):
        assert torch.equal(t, tr)
Example #20
def train(config: DictConfig, resume_from_checkpoint: str = None):
    filter_warnings()
    print_config(config)
    seed_everything(config.seed)

    known_models = {
        "token": get_token_based,
        "vuldeepecker": get_VDP,
        "vgdetector": get_VGD,
        "sysevr": get_SYS,
        "mulvuldeepecker": get_MULVDP,
        "code2seq": get_C2S,
        "code2vec": get_C2V
    }

    vocab = {
        "token": Vocabulary_token,
        "vuldeepecker": Vocabulary_token,
        "vgdetector": Vocabulary_token,
        "sysevr": Vocabulary_token,
        "mulvuldeepecker": Vocabulary_token,
        "code2seq": Vocabulary_c2s,
        "code2vec": Vocabulary_c2s
    }
    if config.name not in known_models:
        print(f"Unknown model: {config.name}, try on of {known_models.keys()}")
        return
    if os.path.exists(
            join(config.data_folder, config.name, config.dataset.name,
                 "vocab.pkl")):
        vocabulary = vocab[config.name].load_vocabulary(
            join(config.data_folder, config.name, config.dataset.name,
                 "vocab.pkl"))
    else:
        vocabulary = None
    model, data_module = known_models[config.name](config, vocabulary)
    # define logger
    # wandb logger
    # wandb_logger = WandbLogger(project=f"{config.name}-{config.dataset.name}",
    #                            log_model=True,
    #                            offline=config.log_offline)
    # wandb_logger.watch(model)
    # checkpoint_callback = ModelCheckpoint(
    #     dirpath=wandb_logger.experiment.dir,
    #     filename="{epoch:02d}-{val_loss:.4f}",
    #     period=config.save_every_epoch,
    #     save_top_k=-1,
    # )
    # upload_checkpoint_callback = UploadCheckpointCallback(
    #     wandb_logger.experiment.dir)

    # tensorboard logger
    tensorlogger = TensorBoardLogger(join("ts_logger", config.name),
                                     config.dataset.name)
    # define model checkpoint callback
    checkpoint_callback = ModelCheckpoint(
        dirpath=join(tensorlogger.log_dir, "checkpoints"),
        monitor="val_loss",
        filename="{epoch:02d}-{val_loss:.4f}",
        period=config.save_every_epoch,
        save_top_k=3,
    )
    upload_checkpoint_callback = UploadCheckpointCallback(
        join(tensorlogger.log_dir, "checkpoints"))

    # define early stopping callback
    early_stopping_callback = EarlyStopping(
        patience=config.hyper_parameters.patience,
        monitor="val_loss",
        verbose=True,
        mode="min")
    # define callback for printing intermediate result
    print_epoch_result_callback = PrintEpochResultCallback("train", "val")
    collect_test_res_callback = CollectTestResCallback(config)
    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateMonitor("step")
    trainer = Trainer(
        max_epochs=config.hyper_parameters.n_epochs,
        gradient_clip_val=config.hyper_parameters.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.val_every_epoch,
        log_every_n_steps=config.log_every_epoch,
        logger=[tensorlogger],
        reload_dataloaders_every_epoch=config.hyper_parameters.reload_dataloader,
        gpus=gpu,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        callbacks=[
            lr_logger, early_stopping_callback, checkpoint_callback,
            print_epoch_result_callback, upload_checkpoint_callback,
            collect_test_res_callback
        ],
        resume_from_checkpoint=resume_from_checkpoint,
    )

    trainer.fit(model=model, datamodule=data_module)
    trainer.test()
Example #21
def test_can_prepare_data(tmpdir):

    dm = TrialMNISTDataModule()
    trainer = Trainer()
    trainer.datamodule = dm

    # 1 no DM
    # prepare_data_per_node = True
    # local rank = 0   (True)
    trainer.prepare_data_per_node = True
    trainer.local_rank = 0
    assert trainer.data_connector.can_prepare_data()

    # local rank = 1   (False)
    trainer.local_rank = 1
    assert not trainer.data_connector.can_prepare_data()

    # prepare_data_per_node = False (prepare across all nodes)
    # global rank = 0   (True)
    trainer.prepare_data_per_node = False
    trainer.node_rank = 0
    trainer.local_rank = 0
    assert trainer.data_connector.can_prepare_data()

    # global rank = 1   (False)
    trainer.node_rank = 1
    trainer.local_rank = 0
    assert not trainer.data_connector.can_prepare_data()
    trainer.node_rank = 0
    trainer.local_rank = 1
    assert not trainer.data_connector.can_prepare_data()

    # 2 dm
    # prepare_data_per_node = True
    # local rank = 0 (True)
    trainer.prepare_data_per_node = True
    trainer.local_rank = 0

    # is_overridden prepare data = True
    # has been called
    # False
    dm._has_prepared_data = True
    assert not trainer.data_connector.can_prepare_data()

    # has not been called
    # True
    dm._has_prepared_data = False
    assert trainer.data_connector.can_prepare_data()

    # is_overridden prepare data = False
    # True
    dm.prepare_data = None
    assert trainer.data_connector.can_prepare_data()
Example #22
        file.close()
        if _hyperparams.k_fold_validation:
            all_subjects = list(range(len(keys)))
            for leave_one_out_idx in all_subjects:
                _train_subjects.append(all_subjects[:leave_one_out_idx] + all_subjects[leave_one_out_idx + 1:])
                _valid_subjects.append([leave_one_out_idx])  # Note that this is a hack and should not be used to get results for papers
                _test_subjects.append([leave_one_out_idx])
        else:
            _train_subjects.append(keys[1:])
            _valid_subjects.append([keys[0]])
            _test_subjects.append([keys[0]])

    for fold, (train_s, valid_s, test_s) in enumerate(zip(_train_subjects, _valid_subjects, _test_subjects)):
        complete_path = os.path.abspath(os.path.join(_hyperparams.save_dir, "fold_{}/".format(fold)))

        _model = TrainRTGENE(hparams=_hyperparams, train_subjects=train_s, validate_subjects=valid_s, test_subjects=test_s)
        # save all models
        checkpoint_callback = ModelCheckpoint(filepath=os.path.join(complete_path, "{epoch}-{val_loss:.3f}"), monitor='val_loss', mode='min', verbose=True,
                                              save_top_k=-1 if not _hyperparams.augment else 5)
        early_stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.00, verbose=True, patience=20 if _hyperparams.augment else 2, mode='min')
        # start training
        trainer = Trainer(gpus=_hyperparams.gpu,
                          checkpoint_callback=checkpoint_callback,
                          early_stop_callback=early_stop_callback,
                          progress_bar_refresh_rate=1,
                          min_epochs=64 if _hyperparams.augment else 3,
                          max_epochs=128 if _hyperparams.augment else 5,
                          accumulate_grad_batches=_hyperparams.accumulate_grad_batches)
        trainer.fit(_model)
        trainer.test()
Example #23
def test_trainer_callback_system(tmpdir):
    """Test the callback system."""

    hparams = EvalModelTemplate.get_default_hparams()
    model = EvalModelTemplate(**hparams)

    def _check_args(trainer, pl_module):
        assert isinstance(trainer, Trainer)
        assert isinstance(pl_module, LightningModule)

    class TestCallback(Callback):
        def __init__(self):
            super().__init__()
            self.setup_called = False
            self.teardown_called = False
            self.on_init_start_called = False
            self.on_init_end_called = False
            self.on_fit_start_called = False
            self.on_fit_end_called = False
            self.on_sanity_check_start_called = False
            self.on_sanity_check_end_called = False
            self.on_epoch_start_called = False
            self.on_epoch_end_called = False
            self.on_batch_start_called = False
            self.on_batch_end_called = False
            self.on_validation_batch_start_called = False
            self.on_validation_batch_end_called = False
            self.on_test_batch_start_called = False
            self.on_test_batch_end_called = False
            self.on_train_start_called = False
            self.on_train_end_called = False
            self.on_validation_start_called = False
            self.on_validation_end_called = False
            self.on_test_start_called = False
            self.on_test_end_called = False

        def setup(self, trainer, step: str):
            assert isinstance(trainer, Trainer)
            self.setup_called = True

        def teardown(self, trainer, step: str):
            assert isinstance(trainer, Trainer)
            self.teardown_called = True

        def on_init_start(self, trainer):
            assert isinstance(trainer, Trainer)
            self.on_init_start_called = True

        def on_init_end(self, trainer):
            assert isinstance(trainer, Trainer)
            self.on_init_end_called = True

        def on_fit_start(self, trainer):
            assert isinstance(trainer, Trainer)
            self.on_fit_start_called = True

        def on_fit_end(self, trainer):
            assert isinstance(trainer, Trainer)
            self.on_fit_end_called = True

        def on_sanity_check_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_sanity_check_start_called = True

        def on_sanity_check_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_sanity_check_end_called = True

        def on_epoch_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_epoch_start_called = True

        def on_epoch_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_epoch_end_called = True

        def on_batch_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_batch_start_called = True

        def on_batch_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_batch_end_called = True

        def on_validation_batch_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_validation_batch_start_called = True

        def on_validation_batch_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_validation_batch_end_called = True

        def on_test_batch_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_test_batch_start_called = True

        def on_test_batch_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_test_batch_end_called = True

        def on_train_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_train_start_called = True

        def on_train_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_train_end_called = True

        def on_validation_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_validation_start_called = True

        def on_validation_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_validation_end_called = True

        def on_test_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_test_start_called = True

        def on_test_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_test_end_called = True

    test_callback = TestCallback()

    trainer_options = dict(
        callbacks=[test_callback],
        max_epochs=1,
        limit_val_batches=0.1,
        limit_train_batches=0.2,
        progress_bar_refresh_rate=0,
    )

    assert not test_callback.setup_called
    assert not test_callback.teardown_called
    assert not test_callback.on_init_start_called
    assert not test_callback.on_init_end_called
    assert not test_callback.on_fit_start_called
    assert not test_callback.on_fit_end_called
    assert not test_callback.on_sanity_check_start_called
    assert not test_callback.on_sanity_check_end_called
    assert not test_callback.on_epoch_start_called
    assert not test_callback.on_epoch_end_called
    assert not test_callback.on_batch_start_called
    assert not test_callback.on_batch_end_called
    assert not test_callback.on_validation_batch_start_called
    assert not test_callback.on_validation_batch_end_called
    assert not test_callback.on_test_batch_start_called
    assert not test_callback.on_test_batch_end_called
    assert not test_callback.on_train_start_called
    assert not test_callback.on_train_end_called
    assert not test_callback.on_validation_start_called
    assert not test_callback.on_validation_end_called
    assert not test_callback.on_test_start_called
    assert not test_callback.on_test_end_called

    # fit model
    trainer = Trainer(**trainer_options)

    assert trainer.callbacks[0] == test_callback
    assert test_callback.on_init_start_called
    assert test_callback.on_init_end_called
    assert not test_callback.setup_called
    assert not test_callback.teardown_called
    assert not test_callback.on_fit_start_called
    assert not test_callback.on_fit_end_called
    assert not test_callback.on_sanity_check_start_called
    assert not test_callback.on_sanity_check_end_called
    assert not test_callback.on_epoch_start_called
    assert not test_callback.on_epoch_end_called
    assert not test_callback.on_batch_start_called
    assert not test_callback.on_batch_end_called
    assert not test_callback.on_validation_batch_start_called
    assert not test_callback.on_validation_batch_end_called
    assert not test_callback.on_test_batch_start_called
    assert not test_callback.on_test_batch_end_called
    assert not test_callback.on_train_start_called
    assert not test_callback.on_train_end_called
    assert not test_callback.on_validation_start_called
    assert not test_callback.on_validation_end_called
    assert not test_callback.on_test_start_called
    assert not test_callback.on_test_end_called

    trainer.fit(model)

    assert test_callback.setup_called
    assert test_callback.teardown_called
    assert test_callback.on_init_start_called
    assert test_callback.on_init_end_called
    assert test_callback.on_fit_start_called
    assert test_callback.on_fit_end_called
    assert test_callback.on_sanity_check_start_called
    assert test_callback.on_sanity_check_end_called
    assert test_callback.on_epoch_start_called
    assert test_callback.on_epoch_end_called
    assert test_callback.on_batch_start_called
    assert test_callback.on_batch_end_called
    assert test_callback.on_validation_batch_start_called
    assert test_callback.on_validation_batch_end_called
    assert test_callback.on_train_start_called
    assert test_callback.on_train_end_called
    assert test_callback.on_validation_start_called
    assert test_callback.on_validation_end_called
    assert not test_callback.on_test_batch_start_called
    assert not test_callback.on_test_batch_end_called
    assert not test_callback.on_test_start_called
    assert not test_callback.on_test_end_called

    # reset setup teardown callback
    test_callback.teardown_called = False
    test_callback.setup_called = False

    test_callback = TestCallback()
    trainer_options.update(callbacks=[test_callback])
    trainer = Trainer(**trainer_options)
    trainer.test(model)

    assert test_callback.setup_called
    assert test_callback.teardown_called
    assert test_callback.on_test_batch_start_called
    assert test_callback.on_test_batch_end_called
    assert test_callback.on_test_start_called
    assert test_callback.on_test_end_called
    assert not test_callback.on_validation_start_called
    assert not test_callback.on_validation_end_called
    assert not test_callback.on_validation_batch_end_called
    assert not test_callback.on_validation_batch_start_called
Example #24
def test_manual_optimization_and_accumulated_gradient(tmpdir):
    """This test verifies that in `automatic_optimization=False`, `step` is only called when we
    shouldn't accumulate."""
    seed_everything(234)

    class ExtendedModel(BoringModel):

        count = 1
        called = collections.defaultdict(int)
        detach = False

        def __init__(self):
            super().__init__()
            self.automatic_optimization = False

        @property
        def should_update(self):
            return self.count % 2 == 0

        @property
        def should_have_updated(self):
            return self.count % 4 == 0

        @property
        def has_gradient(self):
            return self.layer.weight.grad is not None

        def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
            self.called["on_train_batch_start"] += 1
            self.weight_before = self.layer.weight.clone()

        def training_step(self, batch, batch_idx):
            self.called["training_step"] += 1
            opt = self.optimizers()
            output = self.layer(batch)

            loss = self.loss(batch, output)
            loss /= loss.clone().detach()
            loss *= 0.1

            if self.should_update:

                self.manual_backward(loss)
                if self.should_have_updated:
                    opt.step()
                    opt.zero_grad()

            return loss.detach() if self.detach else loss

        def on_train_batch_end(self, outputs, batch, batch_idx,
                               dataloader_idx):
            self.called["on_train_batch_end"] += 1
            weight_after = self.layer.weight.clone()
            if self.should_update and self.should_have_updated:
                assert not torch.equal(self.weight_before,
                                       weight_after), self.count
                assert torch.all(self.layer.weight.grad == 0)
            else:
                assert torch.equal(self.weight_before, weight_after)
                if self.count > 1:
                    if self.count % 4 == 1:
                        assert torch.all(self.layer.weight.grad == 0)
                    else:
                        assert torch.sum(self.layer.weight.grad) != 0
            self.count += 1

        def on_train_epoch_end(self, *_, **__):
            assert self.called["training_step"] == 20
            assert self.called["on_train_batch_start"] == 20
            assert self.called["on_train_batch_end"] == 20

    model = ExtendedModel()
    model.training_step_end = None
    model.training_epoch_end = None

    trainer = Trainer(
        max_epochs=1,
        default_root_dir=tmpdir,
        limit_train_batches=20,
        limit_test_batches=0,
        limit_val_batches=0,
        precision=16,
        amp_backend="native",
        gpus=1,
    )
    trainer.fit(model)
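# A rough, hedged illustration (not part of the test above) of the update
# schedule encoded by `should_update` (count % 2 == 0) and
# `should_have_updated` (count % 4 == 0): manual_backward runs on every second
# batch and the optimizer steps on every fourth, so two backward passes are
# accumulated per optimizer step.
for count in range(1, 9):
    does_backward = count % 2 == 0
    does_step = count % 4 == 0
    print(f"batch {count}: "
          f"{'backward' if does_backward else 'no backward'}, "
          f"{'step + zero_grad' if does_backward and does_step else 'no step'}")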
Example No. 25
def create_lightning_trainer(container: LightningContainer,
                             resume_from_checkpoint: Optional[Path] = None,
                             num_nodes: int = 1,
                             **kwargs: Dict[str, Any]) -> \
        Tuple[Trainer, Optional[StoringLogger]]:
    """
    Creates a PyTorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
    and loggers, including a diagnostic logger for use in unit tests, which is also returned as the second
    return value.
    :param container: The container with model and data.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint.
    :param num_nodes: The number of nodes to use in distributed training.
    :param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer.
    :return: A tuple [Trainer object, diagnostic logger]
    """
    # For now, stick with the legacy behaviour of always saving only the last epoch checkpoint. For large segmentation
    # models, this still appears to be the best way of choosing them because validation loss on the relatively small
    # training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
    # not for the HeadAndNeck model.
    best_checkpoint_callback = ModelCheckpoint(
        dirpath=str(container.checkpoint_folder),
        # filename=BEST_CHECKPOINT_FILE_NAME,
        # monitor=f"{VALIDATION_PREFIX}{MetricType.LOSS.value}",
        # save_top_k=1,
        save_last=True)

    # Recovery checkpoints: {epoch} will turn into a string like "epoch=1"
    # Store 1 recovery checkpoint every recovery_checkpoint_save_interval epochs, keep the last
    # recovery_checkpoints_save_last_k.
    recovery_checkpoint_callback = InnerEyeRecoveryCheckpointCallback(
        container)

    num_gpus = container.num_gpus_per_node
    effective_num_gpus = num_gpus * num_nodes
    # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
    # For unit tests, only "ddp_spawn" works
    accelerator = "ddp" if effective_num_gpus > 1 else None
    if effective_num_gpus > 1:
        # Initialize the DDP plugin with the container's find_unused_parameters setting (False here by default).
        # When left at the framework default of True, it prints out lengthy warnings about the performance
        # impact of find_unused_parameters.
        plugins = [
            InnerEyeDDPPlugin(
                num_nodes=num_nodes,
                sync_batchnorm=True,
                find_unused_parameters=container.pl_find_unused_parameters)
        ]
    else:
        plugins = []
    logging.info(
        f"Using {num_gpus} GPUs per node with accelerator '{accelerator}'")
    tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder),
                                           name="Lightning",
                                           version="")
    loggers = [tensorboard_logger, AzureMLLogger()]
    storing_logger: Optional[StoringLogger]
    if isinstance(container, InnerEyeContainer):
        storing_logger = StoringLogger()
        loggers.append(storing_logger)
    else:
        storing_logger = None
    # Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
    precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
    # The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
    # https://pytorch.org/docs/stable/notes/randomness.html
    # For the classification models, we observed only a small performance deterioration (increase in 10sec on total
    # training time of 22min) when switching to deterministic.
    if container.pl_deterministic:
        deterministic = True
        benchmark = False
    else:
        deterministic = False
        benchmark = True
    # If the user provides additional callbacks via get_trainer_arguments (for custom containers),
    # add them to the default checkpoint callbacks.
    callbacks = [best_checkpoint_callback, recovery_checkpoint_callback]
    if "callbacks" in kwargs:
        callbacks.append(kwargs.pop("callbacks"))  # type: ignore
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
    if progress_bar_refresh_rate is None and is_azureml_run:
        # When running in AzureML, the default progress bar clutters the output files with thousands of lines.
        progress_bar_refresh_rate = 50
        logging.info(
            f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
            f"To change, modify the pl_progress_bar_refresh_rate field of the container."
        )
    # Read out additional model-specific args here.
    # We probably want to keep essential ones like num_gpus and logging.
    trainer = Trainer(default_root_dir=str(container.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      max_epochs=container.num_epochs,
                      num_sanity_val_steps=container.pl_num_sanity_val_steps,
                      callbacks=callbacks,
                      logger=loggers,
                      progress_bar_refresh_rate=progress_bar_refresh_rate,
                      num_nodes=num_nodes,
                      gpus=num_gpus,
                      precision=precision,
                      sync_batchnorm=True,
                      terminate_on_nan=container.detect_anomaly,
                      resume_from_checkpoint=str(resume_from_checkpoint)
                      if resume_from_checkpoint else None,
                      plugins=plugins,
                      **kwargs)
    return trainer, storing_logger
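# The precision selection above can be restated as a small, self-contained
# helper; this is a hedged sketch for reference (the function name is an
# assumption, not part of the source):
def select_precision(num_gpus: int, use_mixed_precision: bool) -> int:
    # CPU runs always use 32-bit; on GPU, mixed precision maps to 16-bit.
    return 32 if num_gpus == 0 else (16 if use_mixed_precision else 32)


assert select_precision(0, True) == 32
assert select_precision(1, True) == 16
assert select_precision(1, False) == 32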
def test_multiple_optimizers_step(tmpdir):
    """Tests that `step` works with several optimizers."""
    class TestModel(ManualOptModel):

        called = False

        def on_before_optimizer_step(self, *args):
            self.called = True
            norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)
            if not (torch.isinf(norm) or torch.isnan(norm)):
                assert norm.item() < 100, norm.item()

        def training_step(self, batch, batch_idx):
            # manual
            opt_a, opt_b = self.optimizers()
            x = batch[0]

            loss_1 = self(x)
            loss_1 = self.loss(loss_1, loss_1)

            # make sure there are no grads
            if self.layer.weight.grad is not None:
                assert torch.all(self.layer.weight.grad == 0)

            self.manual_backward(loss_1)
            opt_a.step()

            # fake discriminator
            loss_2 = self(x)
            loss_2 = self.loss(loss_2, loss_2)

            # ensure we forward the correct params to the optimizer
            # without retain_graph we can't do multiple backward passes
            self.manual_backward(loss_2, retain_graph=True)
            self.manual_backward(loss_2, retain_graph=True)

            assert self.layer.weight.grad is not None
            opt_b.step()
            opt_b.zero_grad()

            return {"loss1": loss_1.detach(), "loss2": loss_2.detach()}

        def training_epoch_end(self, outputs) -> None:
            # outputs should be an array with an entry per optimizer
            assert len(outputs) == 2

    model = TestModel()
    model.val_dataloader = None

    limit_train_batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        max_epochs=1,
        log_every_n_steps=1,
        weights_summary=None,
        precision=16,
        amp_backend="native",
        gpus=1,
    )

    with mock.patch.object(Accelerator,
                           "backward",
                           wraps=trainer.accelerator.backward) as bwd_mock:
        trainer.fit(model)
    assert bwd_mock.call_count == limit_train_batches * 3
    assert model.called
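# ManualOptModel is defined elsewhere in the test suite. A hedged, minimal
# sketch of the shape the test relies on (manual optimization plus two
# optimizers from configure_optimizers); the class name, layer sizes, and loss
# are assumptions, not the original implementation:
import torch
from pytorch_lightning import LightningModule


class TwoOptimizerModelSketch(LightningModule):

    def __init__(self):
        super().__init__()
        self.automatic_optimization = False
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def loss(self, batch, prediction):
        # toy loss against a constant target, in the style of BoringModel
        return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction))

    def configure_optimizers(self):
        opt_a = torch.optim.SGD(self.layer.parameters(), lr=0.1)
        opt_b = torch.optim.SGD(self.layer.parameters(), lr=0.1)
        return opt_a, opt_b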
def test_val_step_only_step_metrics(tmpdir):
    """
    Make sure the logged + pbar metrics are allocated accordingly at every step when requested
    """
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_only_step_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    batches = 3
    epochs = 3
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        limit_val_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    # no early stopping
    assert len(trainer.dev_debugger.early_stopping_history) == 0

    # make sure we logged the exact number of metrics
    assert len(
        trainer.dev_debugger.logged_metrics) == epochs * batches + (epochs)
    assert len(
        trainer.dev_debugger.pbar_added_metrics) == epochs * batches + (epochs)

    # make sure we logged the correct epoch metrics
    total_empty_epoch_metrics = 0
    epoch = 0
    for metric in trainer.dev_debugger.logged_metrics:
        if 'epoch' in metric:
            epoch += 1
        if len(metric) > 2:
            assert 'no_val_no_pbar' not in metric
            assert 'val_step_pbar_acc' not in metric
            assert metric[f'val_step_log_acc/epoch_{epoch}']
            assert metric[f'val_step_log_pbar_acc/epoch_{epoch}']
        else:
            total_empty_epoch_metrics += 1

    assert total_empty_epoch_metrics == 3

    # make sure we logged the correct epoch pbar metrics
    total_empty_epoch_metrics = 0
    for metric in trainer.dev_debugger.pbar_added_metrics:
        if 'epoch' in metric:
            epoch += 1
        if len(metric) > 2:
            assert 'no_val_no_pbar' not in metric
            assert 'val_step_log_acc' not in metric
            assert metric['val_step_log_pbar_acc']
            assert metric['val_step_pbar_acc']
        else:
            total_empty_epoch_metrics += 1

    assert total_empty_epoch_metrics == 3

    # only 1 checkpoint expected since values didn't change after that
    assert len(trainer.dev_debugger.checkpoint_callback_history) == 1

    # make sure the last known metric is correct
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 189
def test_step_with_optimizer_closure(tmpdir):
    """Tests that `step` works with optimizer_closure."""
    class TestModel(BoringModel):

        _losses = []

        def __init__(self):
            super().__init__()
            self.automatic_optimization = False

        def training_step(self, batch, batch_idx):
            # make sure there are no grads
            if self.layer.weight.grad is not None:
                assert torch.all(self.layer.weight.grad == 0)

            opt = self.optimizers()

            def compute_loss():
                x = batch[0]
                x = F.dropout(x, 0.1)
                predictions = self(x)
                predictions = F.dropout(predictions, 0.1)
                loss = self.loss(None, predictions)
                return loss

            def optimizer_closure():
                # emulate bayesian optimization.
                num_backward = 2
                losses = []
                for backward_idx in range(num_backward):
                    loss = compute_loss()
                    losses.append(loss)
                    retain_graph = (num_backward - 1) != backward_idx
                    self.manual_backward(loss, retain_graph=retain_graph)
                # emulate MC dropout training
                loss = torch.stack(losses).mean()
                self._losses.append(loss)
                self.log("train_loss",
                         loss,
                         on_step=True,
                         prog_bar=True,
                         on_epoch=True)
                assert losses[0] != losses[1]

            weight_before = self.layer.weight.clone()

            opt.step(closure=optimizer_closure)
            opt.zero_grad()

            weight_after = self.layer.weight.clone()
            assert not torch.equal(weight_before, weight_after)

        def configure_optimizers(self):
            return torch.optim.SGD(self.layer.parameters(), lr=0.1)

    model = TestModel()
    model.val_dataloader = None
    model.training_epoch_end = None

    limit_train_batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        max_epochs=1,
        log_every_n_steps=1,
    )

    with mock.patch.object(Accelerator,
                           "backward",
                           wraps=trainer.accelerator.backward) as bwd_mock:
        trainer.fit(model)
    assert bwd_mock.call_count == limit_train_batches * 2
    assert trainer.progress_bar_metrics["train_loss_step"] == model._losses[-1]
    assert trainer.progress_bar_metrics["train_loss_epoch"] == torch.stack(
        model._losses).mean()
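# Background sketch in plain PyTorch (not from the source): every torch
# optimizer's step() accepts an optional closure that re-evaluates the loss,
# and Lightning's manual optimization builds on the same mechanism when
# `opt.step(closure=...)` is used as in the test above.
import torch

_layer = torch.nn.Linear(4, 1)
_opt = torch.optim.SGD(_layer.parameters(), lr=0.1)
_x = torch.randn(8, 4)


def _closure():
    _opt.zero_grad()
    loss = _layer(_x).pow(2).mean()
    loss.backward()
    return loss


_opt.step(_closure)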
Example No. 29
def cli_main():

    parser = ArgumentParser()
    parser.add_argument("--DATA_PATH",
                        type=str,
                        help="path to folders with images")
    parser.add_argument("--MODEL_PATH",
                        default=None,
                        type=str,
                        help="path to model checkpoint")
    parser.add_argument("--batch_size",
                        default=128,
                        type=int,
                        help="batch size for SSL")
    parser.add_argument("--image_size",
                        default=256,
                        type=int,
                        help="image size for SSL")
    parser.add_argument("--num_workers",
                        default=1,
                        type=int,
                        help="number of CPU cores to use for data processing")
    parser.add_argument("--image_embedding_size",
                        default=128,
                        type=int,
                        help="size of image representation of SIMCLR")
    parser.add_argument("--epochs",
                        default=200,
                        type=int,
                        help="number of epochs to train model")
    parser.add_argument("--lr",
                        default=1e-3,
                        type=float,
                        help="learning rate for training model")
    parser.add_argument(
        "--patience",
        default=-1,
        type=int,
        help="automatically stops training if the validation loss does not improve for (patience) epochs. "
        "Leave at the default of -1 to disable validation-based early stopping."
    )
    parser.add_argument("--val_split",
                        default=0.2,
                        type=float,
                        help="percent in validation data")
    parser.add_argument(
        "--pretrain_encoder",
        # argparse's type=bool treats any non-empty string (even "False") as True,
        # so expose this as a plain store_true flag instead.
        action="store_true",
        help="initialize resnet encoder with pretrained imagenet weights. "
        "Cannot be used when passing a previous SSL model checkpoint.")
    parser.add_argument(
        "--withold_train_percent",
        default=0,
        type=float,
        help=
        "decimal from 0-1 representing how much of the training data to withold during SSL training"
    )
    parser.add_argument("--version",
                        default="0",
                        type=str,
                        help="version to name checkpoint for saving")
    parser.add_argument("--gpus",
                        default=1,
                        type=int,
                        help="number of gpus to use for training")
    parser.add_argument("--num_workers",
                        default=0,
                        type=int,
                        help="number of workers to use to fetch data")

    args = parser.parse_args()
    URL = args.DATA_PATH
    batch_size = args.batch_size
    image_size = args.image_size
    num_workers = args.num_workers
    embedding_size = args.image_embedding_size
    epochs = args.epochs
    lr = args.lr
    patience = args.patience
    val_split = args.val_split
    pretrain = args.pretrain_encoder
    withold_train_percent = args.withold_train_percent
    version = args.version
    model_checkpoint = args.MODEL_PATH
    gpus = args.gpus

    train_transform = SimCLRTrainDataTransform(256)
    val_transform = SimCLREvalDataTransform(256)
    dm = ImageDataModule(URL,
                         train_transform=train_transform,
                         val_transform=val_transform,
                         val_split=val_split,
                         num_workers=num_workers)
    dm.setup()

    # init model with batch size, num_samples (length of the dataset), epochs to train, and learning rate
    model = SimCLR(arch='resnet18',
                   batch_size=batch_size,
                   num_samples=dm.num_samples,
                   gpus=gpus,
                   dataset='None',
                   max_epochs=epochs,
                   learning_rate=lr)

    model.encoder = resnet18(pretrained=pretrain,
                             first_conv=model.first_conv,
                             maxpool1=model.maxpool1,
                             return_all_feature_maps=False)
    model.projection = Projection(input_dim=512,
                                  hidden_dim=256,
                                  output_dim=embedding_size)  # overrides the model's default projection head

    if patience > 0:
        cb = EarlyStopping('val_loss', patience=patience)
        trainer = Trainer(gpus=gpus,
                          max_epochs=epochs,
                          callbacks=[cb],
                          progress_bar_refresh_rate=5)
    else:
        trainer = Trainer(gpus=gpus,
                          max_epochs=epochs,
                          progress_bar_refresh_rate=5)

    if model_checkpoint is not None:
        model.load_state_dict(torch.load(model_checkpoint))
        print(
            'Successfully loaded your checkpoint. Keep in mind that this does not preserve the previous trainer state, only the model weights.'
        )

    model.cuda()

    print('Model Initialized')
    trainer.fit(model, dm)

    Path(f"./models/SSL/SIMCLR_SSL_{version}").mkdir(parents=True,
                                                     exist_ok=True)
    torch.save(model.state_dict(),
               f"./models/SSL/SIMCLR_SSL_{version}/SIMCLR_SSL_{version}.pt")
Example No. 30
def test_exception_when_no_tpu_found(tmpdir):
    """Test if exception is thrown when xla devices are not available"""

    with pytest.raises(MisconfigurationException, match='No TPU devices were found.'):
        Trainer(tpu_cores=8)