def test_check_val_every_n_epoch_with_max_steps(tmpdir):
    data_samples_train = 2
    check_val_every_n_epoch = 3
    max_epochs = 4

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.validation_called_at_step = set()

        def validation_step(self, *args):
            self.validation_called_at_step.add(self.global_step)
            return super().validation_step(*args)

        def train_dataloader(self):
            return DataLoader(RandomDataset(32, data_samples_train))

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_steps=data_samples_train * max_epochs,
        check_val_every_n_epoch=check_val_every_n_epoch,
        num_sanity_val_steps=0,
    )
    trainer.fit(model)

    assert trainer.current_epoch == max_epochs
    assert trainer.global_step == max_epochs * data_samples_train
    assert list(model.validation_called_at_step) == [data_samples_train * check_val_every_n_epoch]
def main(args):
    wand_logger = WandbLogger(offline=False, project='Transformer', save_dir='./lightning_logs/')
    wand_logger.log_hyperparams(params=args)
    checkpoint = ModelCheckpoint(
        filepath='./lightning_logs/checkpoints/checkpoints',
        monitor='val_loss',
        verbose=0,
        save_top_k=2)
    model = TransformerModel(**vars(args))
    trainer = Trainer(
        logger=wand_logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        auto_lr_find=False,
        # val_check_interval=1.0,
        # log_save_interval=50000,
        # row_log_interval=50000,
        max_epochs=args.epochs,
        min_epochs=1,
    )
    # lr_finder = trainer.lr_find(model)
    # print(lr_finder.results)
    trainer.fit(model)
def test_check_val_every_n_epoch(tmpdir, max_epochs, expected_val_loop_calls, expected_val_batches):
    class TestModel(BoringModel):
        val_epoch_calls = 0
        val_batches = []

        def on_train_epoch_end(self, *args, **kwargs):
            self.val_batches.append(self.trainer.progress_bar_callback.total_val_batches)

        def on_validation_epoch_start(self) -> None:
            self.val_epoch_calls += 1

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=max_epochs,
        num_sanity_val_steps=0,
        limit_val_batches=2,
        check_val_every_n_epoch=2,
        logger=False,
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert model.val_epoch_calls == expected_val_loop_calls
    assert model.val_batches == expected_val_batches
def test_validation_check_interval_exceed_data_length_wrong():
    trainer = Trainer(
        limit_train_batches=10,
        val_check_interval=100,
    )
    model = BoringModel()
    with pytest.raises(ValueError, match="must be less than or equal to the number of the training batches"):
        trainer.fit(model)
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=2000)
    checkpoint = ModelCheckpoint(
        filepath='exp/lightning_logs/version_2000/checkpoints/',
        monitor='val_mer',
        verbose=1,
        save_top_k=-1)
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        accumulate_grad_batches=4,
        checkpoint_callback=checkpoint,
        # checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path='exp/',
        val_check_interval=0.3,
        log_save_interval=50000,
        row_log_interval=50000,
        gpus=1,
        val_percent_check=1,
        # distributed_backend='dp',
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        precision=16,
        nb_sanity_val_steps=0,
        progress_bar_refresh_rate=1,
        resume_from_checkpoint='exp/lightning_logs/version_2000/checkpoints/epoch=114_v1.ckpt')
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def main(hparams):
    data_path = os.environ['HOME'] + '/data/asr_data/'
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=1020)
    checkpoint = ModelCheckpoint(filepath=data_path + '/checkpoints/',
                                 monitor='val_mer',
                                 verbose=1,
                                 save_top_k=-1)
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        # checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path=data_path,
        val_check_interval=1.0,
        log_save_interval=100,
        row_log_interval=10,
        gpus=1,
        precision=16,
        distributed_backend='dp',
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        amp_level='O1',
        nb_sanity_val_steps=0,
        log_gpu_memory='all')
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=5005)
    checkpoint = ModelCheckpoint(filepath='exp/lightning_logs/version_5005/checkpoints/',
                                 monitor='val_loss',
                                 verbose=True,
                                 save_top_k=-1,
                                 mode='min')
    trainer = Trainer(
        logger=logger,
        nb_sanity_val_steps=5,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=8,
        progress_bar_refresh_rate=10,
        default_save_path='exp/',
        val_check_interval=1.0,
        log_save_interval=50000,
        row_log_interval=50000,
        # gpus=1,
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        gpus=1,
        # num_nodes=1,
        # distributed_backend='dp',
        use_amp=False,
        precision=32,
        # amp_level='O1',
        resume_from_checkpoint='exp/lightning_logs/version_5005/checkpoints/epoch=108.ckpt'
    )
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True
    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=4000)
    checkpoint = ModelCheckpoint(
        filepath='exp/lightning_logs/version_4000/checkpoints/',
        monitor='val_mer',
        verbose=1,
        save_top_k=-1)
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=8,
        # checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path='exp/',
        val_check_interval=1.0,
        log_save_interval=50000,
        row_log_interval=50000,
        gpus=1,
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        amp_level='O1',
        nb_sanity_val_steps=0)
    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def test_validation_check_interval_exceed_data_length_correct(tmpdir, use_infinite_dataset, accumulate_grad_batches):
    data_samples_train = 4
    max_epochs = 3
    max_steps = data_samples_train * max_epochs
    max_opt_steps = max_steps // accumulate_grad_batches

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.validation_called_at_step = set()

        def validation_step(self, *args):
            self.validation_called_at_step.add(self.trainer.fit_loop.total_batch_idx + 1)
            return super().validation_step(*args)

        def train_dataloader(self):
            train_ds = (
                RandomIterableDataset(32, count=max_steps + 100)
                if use_infinite_dataset
                else RandomDataset(32, length=data_samples_train)
            )
            return DataLoader(train_ds)

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_val_batches=1,
        max_steps=max_opt_steps,
        val_check_interval=3,
        check_val_every_n_epoch=None,
        num_sanity_val_steps=0,
        accumulate_grad_batches=accumulate_grad_batches,
    )
    trainer.fit(model)

    # parenthesize the conditional so the assertion compares against the expected epoch count
    assert trainer.current_epoch == (1 if use_infinite_dataset else max_epochs)
    assert trainer.global_step == max_opt_steps
    assert sorted(model.validation_called_at_step) == [3, 6, 9, 12]
def test_val_check_interval(tmpdir, max_epochs, denominator):
    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.train_epoch_calls = 0
            self.val_epoch_calls = 0

        def on_train_epoch_start(self) -> None:
            self.train_epoch_calls += 1

        def on_validation_epoch_start(self) -> None:
            if not self.trainer.sanity_checking:
                self.val_epoch_calls += 1

    model = TestModel()
    trainer = Trainer(max_epochs=max_epochs, val_check_interval=1 / denominator, logger=False)
    trainer.fit(model)

    assert model.train_epoch_calls == max_epochs
    assert model.val_epoch_calls == max_epochs * denominator
def tune_model(config, ptl_model, dset, train_inds, n_workers, n_val=None, val_inds=None,
               tune_metrics=None, mode='tune', **trainer_kwargs):
    '''A generic function for hyperparameter tuning and model training with Ray Tune and PyTorch Lightning.'''
    model = ptl_model(config=config)

    # if no explicit validation indices are given, carve a validation split off the training indices
    if val_inds is None:
        shuffle(train_inds)

    train_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[n_val:] if val_inds is None else train_inds),
        batch_size=config['batch_size'],
        num_workers=n_workers,
        drop_last=True,
        shuffle=True,
    )
    val_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[:n_val] if val_inds is None else val_inds),
        num_workers=n_workers,
        batch_size=config['batch_size'],
        drop_last=True,
        shuffle=False,
    )

    callbacks = model.callbacks
    if mode == 'tune':
        callbacks += [TuneReportCallback(tune_metrics, on='validation_end')]

    trainer = PLTrainer(callbacks=callbacks, **trainer_kwargs)
    trainer.fit(model, train_dl, val_dl)
    return trainer
def main(config_path):
    seed_everything(42)
    initializer = Initializer(None)
    initializer.load_from_yaml(config_path)
    config = initializer.config
    train_loader = initializer.get_train_dataloader()
    val_loader = initializer.get_dev_dataloader()
    model = initializer.get_lightning_model()
    model_name = config.model['class'].split('.')[-1]
    logger = TensorBoardLogger(**config.logger_ckpt, name=model_name)
    # keep the {epoch}/{val_loss}/{val_mer} placeholders out of the f-string so ModelCheckpoint fills them in
    file_path = (f'{logger.save_dir}/{model_name}/version_{logger.version}/'
                 + '{epoch}-{val_loss: .4f}-{val_mer: .4f}')
    model_checkpoint = ModelCheckpoint(filepath=file_path, monitor='val_loss', verbose=True, save_top_k=2)
    trainer = Trainer(
        **config.trainer,
        checkpoint_callback=model_checkpoint,
        logger=logger,
        profiler=True,
    )
    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)
def test_no_val_on_train_epoch_loop_restart(tmpdir):
    """Test that training validation loop doesn't get triggered at the beginning of a restart."""
    trainer_kwargs = {
        "max_epochs": 1,
        "limit_train_batches": 1,
        "limit_val_batches": 1,
        "num_sanity_val_steps": 0,
        "enable_checkpointing": False,
    }
    trainer = Trainer(**trainer_kwargs)
    model = BoringModel()
    trainer.fit(model)
    ckpt_path = str(tmpdir / "last.ckpt")
    trainer.save_checkpoint(ckpt_path)

    trainer_kwargs["max_epochs"] = 2
    trainer = Trainer(**trainer_kwargs)

    with patch.object(
        trainer.fit_loop.epoch_loop.val_loop,
        "advance",
        wraps=trainer.fit_loop.epoch_loop.val_loop.advance,
    ) as advance_mocked:
        trainer.fit(model, ckpt_path=ckpt_path)
    assert advance_mocked.call_count == 1
        self.__dataroot = dataroot

    def get_dataset_name(self) -> str:
        return self.__identifier

    def get_dataset(self, train: bool) -> VisionDataset:
        return get_mnist_dataset(self.__dataroot, train, False)

    def get_eval_dataset(self) -> VisionDataset:
        return get_mnist_dataset(self.__dataroot, False, False)


parser = parse_program_args()
hparams: BaseArguments = parser.parse_args()  # type: ignore

checkpoint_callback = ModelCheckpoint(
    verbose=True,
    monitor='avg_acc',
    mode='max'
)

dataset_factory: DatasetFactory = ClassificationDatasetFactory(hparams.dataset, hparams.dataroot)  # type: ignore
trainer = Trainer(gpus=1, callbacks=[checkpoint_callback])
image_dataset_data_module = ImageDatasetDataModule(dataset_factory, hparams.batch_size, hparams.batch_size, hparams.workers)

model = ClassifierMNIST()
print('Starting training!')
trainer.fit(model, image_dataset_data_module)