def test_check_val_every_n_epoch_with_max_steps(tmpdir):
    data_samples_train = 2
    check_val_every_n_epoch = 3
    max_epochs = 4

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.validation_called_at_step = set()

        def validation_step(self, *args):
            self.validation_called_at_step.add(self.global_step)
            return super().validation_step(*args)

        def train_dataloader(self):
            return DataLoader(RandomDataset(32, data_samples_train))

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_steps=data_samples_train * max_epochs,
        check_val_every_n_epoch=check_val_every_n_epoch,
        num_sanity_val_steps=0,
    )

    trainer.fit(model)

    assert trainer.current_epoch == max_epochs
    assert trainer.global_step == max_epochs * data_samples_train
    assert model.validation_called_at_step == {
        data_samples_train * check_val_every_n_epoch
    }
def main(args):

    wandb_logger = WandbLogger(offline=False,
                               project='Transformer',
                               save_dir='./lightning_logs/')
    wandb_logger.log_hyperparams(params=args)

    checkpoint = ModelCheckpoint(
        filepath='./lightning_logs/checkpoints/checkpoints',
        monitor='val_loss',
        verbose=0,
        save_top_k=2)

    model = TransformerModel(**vars(args))
    trainer = Trainer(
        logger=wandb_logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        auto_lr_find=False,
        # val_check_interval=1.0,
        # log_save_interval=50000,
        # row_log_interval=50000,
        max_epochs=args.epochs,
        min_epochs=1,
    )
    # lr_finder = trainer.lr_find(model)
    # print(lr_finder.results)
    trainer.fit(model)
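
# The listing dropped the pytest parametrization this test's signature expects; a plausible
# reconstruction (values are illustrative, chosen to be consistent with
# check_val_every_n_epoch=2 and limit_val_batches=2 below):
@pytest.mark.parametrize(
    "max_epochs, expected_val_loop_calls, expected_val_batches",
    [(1, 0, [0]), (4, 2, [0, 2, 0, 2]), (5, 2, [0, 2, 0, 2, 0])],
)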
def test_check_val_every_n_epoch(tmpdir, max_epochs, expected_val_loop_calls,
                                 expected_val_batches):
    class TestModel(BoringModel):
        val_epoch_calls = 0
        val_batches = []

        def on_train_epoch_end(self, *args, **kwargs):
            self.val_batches.append(
                self.trainer.progress_bar_callback.total_val_batches)

        def on_validation_epoch_start(self) -> None:
            self.val_epoch_calls += 1

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=max_epochs,
        num_sanity_val_steps=0,
        limit_val_batches=2,
        check_val_every_n_epoch=2,
        logger=False,
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert model.val_epoch_calls == expected_val_loop_calls
    assert model.val_batches == expected_val_batches
def test_validation_check_interval_exceed_data_length_wrong():
    trainer = Trainer(
        limit_train_batches=10,
        val_check_interval=100,
    )

    model = BoringModel()
    with pytest.raises(
            ValueError,
            match=
            "must be less than or equal to the number of the training batches"
    ):
        trainer.fit(model)
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=2000)
    checkpoint = ModelCheckpoint(
        filepath='exp/lightning_logs/version_2000/checkpoints/',
        monitor='val_mer',
        verbose=1,
        save_top_k=-1)
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        accumulate_grad_batches=4,
        checkpoint_callback=checkpoint,
        # checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path='exp/',
        val_check_interval=0.3,
        log_save_interval=50000,
        row_log_interval=50000,
        gpus=1,
        val_percent_check=1,
        # distributed_backend='dp',
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        precision=16,
        nb_sanity_val_steps=0,
        progress_bar_refresh_rate=1,
        resume_from_checkpoint=
        'exp/lightning_logs/version_2000/checkpoints/epoch=114_v1.ckpt')
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
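
# A minimal, hypothetical CLI entry point for main(hparams); the argument names mirror the
# attributes the function reads (seed, nb_gpu_nodes, epochs) but are not from the original.
if __name__ == '__main__':
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--nb_gpu_nodes', type=int, default=1)
    parser.add_argument('--epochs', type=int, default=100)
    main(parser.parse_args())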
def main(hparams):
    data_path = os.environ['HOME'] + '/data/asr_data/'
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=1020)
    checkpoint = ModelCheckpoint(filepath=data_path + '/checkpoints/',
                                 monitor='val_mer',
                                 verbose=1,
                                 save_top_k=-1)
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        # checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path=data_path,
        val_check_interval=1.0,
        log_save_interval=100,
        row_log_interval=10,
        gpus=1,
        precision=16,
        distributed_backend='dp',
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        amp_level='O1',
        nb_sanity_val_steps=0,
        log_gpu_memory='all')
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=5005)
    checkpoint = ModelCheckpoint(filepath='exp/lightning_logs/version_5005/checkpoints/',
                                 monitor='val_loss', verbose=True, save_top_k=-1, mode='min')
    trainer = Trainer(
        logger=logger,
        nb_sanity_val_steps=5,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=8,
        progress_bar_refresh_rate=10,
        default_save_path='exp/',
        val_check_interval=1.0,
        log_save_interval=50000,
        row_log_interval=50000,
        # gpus=1,
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        gpus=1,
        # num_nodes=1,
        # distributed_backend='dp',
        use_amp=False,
        precision=32,
        # amp_level='O1',
        resume_from_checkpoint='exp/lightning_logs/version_5005/checkpoints/epoch=108.ckpt'
    )
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=4000)
    checkpoint = ModelCheckpoint(
        filepath='exp/lightning_logs/version_4000/checkpoints/',
        monitor='val_mer',
        verbose=1,
        save_top_k=-1)
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=8,
        # checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path='exp/',
        val_check_interval=1.0,
        log_save_interval=50000,
        row_log_interval=50000,
        gpus=1,
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        amp_level='O1',
        nb_sanity_val_steps=0)
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
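
# The parametrization for the next test is missing from the listing; an illustrative
# reconstruction consistent with its asserts (max_steps = 12 // accumulate_grad_batches,
# val_check_interval=3):
@pytest.mark.parametrize("use_infinite_dataset", [True, False])
@pytest.mark.parametrize("accumulate_grad_batches", [1, 2])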
def test_validation_check_interval_exceed_data_length_correct(
        tmpdir, use_infinite_dataset, accumulate_grad_batches):
    data_samples_train = 4
    max_epochs = 3
    max_steps = data_samples_train * max_epochs
    max_opt_steps = max_steps // accumulate_grad_batches

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.validation_called_at_step = set()

        def validation_step(self, *args):
            self.validation_called_at_step.add(
                self.trainer.fit_loop.total_batch_idx + 1)
            return super().validation_step(*args)

        def train_dataloader(self):
            train_ds = (
                RandomIterableDataset(32, count=max_steps + 100)
                if use_infinite_dataset
                else RandomDataset(32, length=data_samples_train)
            )
            return DataLoader(train_ds)

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_val_batches=1,
        max_steps=max_opt_steps,
        val_check_interval=3,
        check_val_every_n_epoch=None,
        num_sanity_val_steps=0,
        accumulate_grad_batches=accumulate_grad_batches,
    )

    trainer.fit(model)

    assert trainer.current_epoch == (1 if use_infinite_dataset else max_epochs)
    assert trainer.global_step == max_opt_steps
    assert sorted(model.validation_called_at_step) == [3, 6, 9, 12]
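
# Likewise, the parametrization for test_val_check_interval is missing; illustrative values
# (any positive integers satisfy the asserts below):
@pytest.mark.parametrize("max_epochs", [1, 2, 3])
@pytest.mark.parametrize("denominator", [1, 3, 5])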
def test_val_check_interval(tmpdir, max_epochs, denominator):
    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.train_epoch_calls = 0
            self.val_epoch_calls = 0

        def on_train_epoch_start(self) -> None:
            self.train_epoch_calls += 1

        def on_validation_epoch_start(self) -> None:
            if not self.trainer.sanity_checking:
                self.val_epoch_calls += 1

    model = TestModel()
    trainer = Trainer(max_epochs=max_epochs,
                      val_check_interval=1 / denominator,
                      logger=False)
    trainer.fit(model)

    assert model.train_epoch_calls == max_epochs
    assert model.val_epoch_calls == max_epochs * denominator
def tune_model(config, ptl_model, dset, train_inds, n_workers, n_val = None, 
               val_inds = None, tune_metrics = None, mode = 'tune', **trainer_kwargs):
    ''' A generic function for hyperparameter tuning and model training with Ray Tune and PyTorch Lightning. '''
    
    model = ptl_model(config = config)
    
    if val_inds is None:
        shuffle(train_inds)

    train_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[n_val:] if val_inds is None else train_inds),
        batch_size = config['batch_size'],
        num_workers = n_workers,
        drop_last = True,
        shuffle = True
    )
    val_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[:n_val] if val_inds is None else val_inds),
        num_workers = n_workers,
        batch_size = config['batch_size'],
        drop_last = True,
        shuffle = False
    )
    
    callbacks = model.callbacks
    if mode == 'tune':
        callbacks += [
            TuneReportCallback(
                tune_metrics, 
                on = 'validation_end'
            )
        ]

    trainer = PLTrainer(callbacks = callbacks, **trainer_kwargs)
    trainer.fit(model, train_dl, val_dl)
    
    return trainer
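
# A hedged usage sketch for tune_model (not from the original): MyLightningModule and
# my_dataset are hypothetical placeholders; the Ray Tune calls (tune.with_parameters,
# tune.run, tune.choice, tune.loguniform) are standard API.
from ray import tune

search_space = {
    'batch_size': tune.choice([16, 32, 64]),
    'lr': tune.loguniform(1e-4, 1e-2),
}

analysis = tune.run(
    tune.with_parameters(
        tune_model,
        ptl_model=MyLightningModule,          # hypothetical LightningModule class
        dset=my_dataset,                      # hypothetical torch Dataset
        train_inds=list(range(len(my_dataset))),
        n_workers=4,
        n_val=500,
        tune_metrics={'val_loss': 'val_loss'},  # reported by TuneReportCallback
        max_epochs=10,                        # forwarded to PLTrainer via **trainer_kwargs
    ),
    config=search_space,
    num_samples=8,
    metric='val_loss',
    mode='min',
)
print(analysis.best_config)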
def main(config_path):
    seed_everything(42)
    initializer = Initializer(None)
    initializer.load_from_yaml(config_path)
    config = initializer.config
    train_loader = initializer.get_train_dataloader()
    val_loader = initializer.get_dev_dataloader()
    model = initializer.get_lightning_model()
    model_name = config.model['class'].split('.')[-1]
    logger = TensorBoardLogger(**config.logger_ckpt, name=model_name)
    file_path = f'{logger.save_dir}/{model_name}/version_{logger.version}/' + '{epoch}-{val_loss: .4f}-{val_mer: .4f}'
    model_checkpoint = ModelCheckpoint(filepath=file_path,
                                       monitor='val_loss',
                                       verbose=True,
                                       save_top_k=2)
    trainer = Trainer(
        **config.trainer,
        checkpoint_callback=model_checkpoint,
        logger=logger,
        profiler=True,
    )
    trainer.fit(model,
                train_dataloader=train_loader,
                val_dataloaders=val_loader)
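
# A hedged entry point, assuming the YAML config path is passed as the first CLI argument
# (the original listing does not show how main is invoked).
if __name__ == '__main__':
    import sys

    main(sys.argv[1])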
def test_no_val_on_train_epoch_loop_restart(tmpdir):
    """Test that training validation loop doesn't get triggered at the beginning of a restart."""
    trainer_kwargs = {
        "max_epochs": 1,
        "limit_train_batches": 1,
        "limit_val_batches": 1,
        "num_sanity_val_steps": 0,
        "enable_checkpointing": False,
    }
    trainer = Trainer(**trainer_kwargs)
    model = BoringModel()
    trainer.fit(model)
    ckpt_path = str(tmpdir / "last.ckpt")
    trainer.save_checkpoint(ckpt_path)

    trainer_kwargs["max_epochs"] = 2
    trainer = Trainer(**trainer_kwargs)

    with patch.object(trainer.fit_loop.epoch_loop.val_loop,
                      "advance",
                      wraps=trainer.fit_loop.epoch_loop.val_loop.advance
                      ) as advance_mocked:
        trainer.fit(model, ckpt_path=ckpt_path)
        assert advance_mocked.call_count == 1
class ClassificationDatasetFactory(DatasetFactory):
    # The listing begins mid-class here; this constructor is reconstructed from the call
    # site below (ClassificationDatasetFactory(hparams.dataset, hparams.dataroot)).
    def __init__(self, identifier: str, dataroot: str):
        self.__identifier = identifier
        self.__dataroot = dataroot

    def get_dataset_name(self) -> str:
        return self.__identifier

    def get_dataset(self, train: bool) -> VisionDataset:
        return get_mnist_dataset(self.__dataroot, train, False)

    def get_eval_dataset(self) -> VisionDataset:
        return get_mnist_dataset(self.__dataroot, False, False)


parser = parse_program_args()
hparams: BaseArguments = parser.parse_args()  # type: ignore


checkpoint_callback = ModelCheckpoint(
    verbose=True,
    monitor='avg_acc',
    mode='max'
)

dataset_factory: DatasetFactory = ClassificationDatasetFactory(hparams.dataset, hparams.dataroot)  # type: ignore
trainer = Trainer(gpus=1, callbacks=[checkpoint_callback])
image_dataset_data_module = ImageDatasetDataModule(dataset_factory, hparams.batch_size,
                                                   hparams.batch_size, hparams.workers)

model = ClassifierMNIST()
print('Starting training!')
trainer.fit(model, image_dataset_data_module)
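
# A hedged follow-up (not in the original): run evaluation after training, assuming
# ClassifierMNIST defines test_step and the data module builds its test split from
# get_eval_dataset().
trainer.test(model, datamodule=image_dataset_data_module)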