def test_train_validate(self, cfg, trainer):
    for key in ["test"]:
        if key in cfg.dataset.keys():
            del cfg.dataset[key]
    model = HydraMixin.create_model(cfg)
    trainer.fit(model)
def test_schedule_length_correct(torch, cfg, hydra, gpus, gpu_count, accum_grad_batches, num_nodes):
    cfg["trainer"]["params"]["gpus"] = gpus
    cfg["trainer"]["params"]["accumulate_grad_batches"] = accum_grad_batches
    cfg["trainer"]["params"]["num_nodes"] = num_nodes
    cfg["trainer"]["params"]["fast_dev_run"] = False
    cfg["trainer"]["params"]["max_epochs"] = 10
    cfg["dataset"]["train"]["params"]["size"] = 10000
    cfg["dataset"]["validate"]["params"]["size"] = 10000
    cfg["dataset"]["test"]["params"]["size"] = 10000

    model = HydraMixin.create_model(cfg)
    model.get_datasets()
    _, schedulers = model.configure_optimizers()

    schedule = schedulers[0]
    assert schedule["interval"] == "step"
    assert schedule["frequency"] == 1

    scheduler = schedule["scheduler"]
    max_lr = cfg.schedule["params"]["max_lr"]
    div_factor = cfg.schedule["params"]["div_factor"]
    num_epochs = cfg.trainer["params"]["max_epochs"]

    # one scheduler step per optimizer step, accounting for devices, nodes,
    # and gradient accumulation
    expected_steps = (
        math.ceil(len(model.train_dataloader()) / (gpu_count * accum_grad_batches * num_nodes)) * num_epochs
    )

    # initial LR for OneCycleLR is max_lr / div_factor
    assert abs(scheduler.get_last_lr()[0] - max_lr / div_factor) <= 1e-5
    assert scheduler.total_steps == expected_steps
def test_configure_multiple_optimizers(torch, cfg, hydra, scheduled):
    num_optimizers = 2
    cfg["optimizer"] = [cfg["optimizer"]] * num_optimizers
    if not scheduled:
        del cfg["schedule"]
    else:
        cfg["schedule"] = [cfg["schedule"]] * num_optimizers

    model = HydraMixin.create_model(cfg)
    model.get_optimizer_parameters = lambda idx: model.parameters()
    model.get_datasets()

    if not scheduled:
        optims = model.configure_optimizers()
        assert len(optims) == 2
        assert isinstance(optims[0], torch.optim.Adam)
        assert isinstance(optims[1], torch.optim.Adam)
    else:
        optims, schedulers = model.configure_optimizers()
        assert len(optims) == 2
        assert len(schedulers) == 2
        assert isinstance(optims[0], torch.optim.Adam)
        assert isinstance(optims[1], torch.optim.Adam)
        assert isinstance(schedulers[0], dict)
        assert isinstance(schedulers[0]["scheduler"], torch.optim.lr_scheduler.OneCycleLR)
        assert isinstance(schedulers[1], dict)
        assert isinstance(schedulers[1]["scheduler"], torch.optim.lr_scheduler.OneCycleLR)
def test_get_lr(scheduled, cfg, hydra):
    if not scheduled:
        del cfg["schedule"]
    model = HydraMixin.create_model(cfg)
    model.get_datasets()
    model.configure_optimizers()
    assert model.get_lr() == cfg["optimizer"]["params"]["lr"]
def test_configure_optimizer_warn_no_monitor_key(torch, cfg, hydra):
    del cfg["schedule"]["monitor"]
    model = HydraMixin.create_model(cfg)
    model.get_datasets()
    with pytest.warns(UserWarning):
        optims, schedulers = model.configure_optimizers()
def test_configure_optimizer_missing_keys(torch, cfg, hydra, missing):
    del cfg["schedule"][missing]
    model = HydraMixin.create_model(cfg)
    model.get_datasets()
    with pytest.raises(pl.utilities.exceptions.MisconfigurationException):
        optims, schedulers = model.configure_optimizers()
def auto_lr_find(cfg: DictConfig, model: pl.LightningModule) -> Optional[float]:
    r"""Performs automatic learning rate selection using PyTorch Lightning.
    This is essentially a wrapper function that invokes PyTorch Lightning's
    auto LR selection using Hydra inputs. The model's learning rate is
    automatically set to the selected learning rate, and the selected learning
    rate is logged. If possible, a plot of the learning rate selection curve
    will also be produced.

    Args:
        cfg (DictConfig): The Hydra config

        model (LightningModule): The model to select a learning rate for.

    Returns:
        The learning rate if one was found, otherwise ``None``.
    """
    lr = None
    try:
        model.prepare_data()
        lr_trainer: pl.Trainer = HydraMixin.instantiate(cfg.trainer)
        if hasattr(lr_trainer, "tuner"):
            # pl >= 1.0.0
            lr_finder = lr_trainer.tuner.lr_find(model)
        else:
            # pl < 1.0.0
            lr_finder = lr_trainer.lr_find(model)  # type: ignore
        lr = lr_finder.suggestion()
        log.info("Found learning rate %f", lr)
        cfg.optimizer["params"]["lr"] = lr

        # save lr curve figure
        try:
            cwd = os.getcwd()
            path = os.path.join(cwd, "lr_curve.png")
            fig = lr_finder.plot(suggest=True)
            log.info("Saving LR curve to %s", path)
            fig.savefig(path)
            plt.close()
        except Exception as err:
            log.exception(err)
            log.info("No learning rate curve was saved")

    except Exception as err:
        log.exception(err)
        log.info("Learning rate auto-find failed, using learning rate specified in config")
        _exceptions.append(err)

    finally:
        cfg.trainer["params"]["auto_lr_find"] = False

    return lr
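# Hedged usage sketch for ``auto_lr_find`` (assumes ``import hydra`` and a
# typical Hydra entry point; the entry point name and config paths below are
# illustrative, not part of this module):
#
#   @hydra.main(config_path="./conf", config_name="config")
#   def train(cfg: DictConfig) -> None:
#       model = HydraMixin.create_model(cfg)
#       lr = auto_lr_find(cfg, model)  # sets cfg.optimizer["params"]["lr"] on success
#       if lr is None:
#           log.info("LR find failed; using LR from config")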
def test_get_datasets_forced(cfg, force, mocker):
    model = HydraMixin.create_model(cfg)
    mock = mocker.MagicMock(spec_set=bool, name="train_ds")
    model.get_datasets()
    model.train_ds = mock
    model.get_datasets(force=force)
    if force:
        assert model.train_ds != mock
    else:
        assert model.train_ds == mock
def test_test_dataloader(cfg, present):
    if not present:
        del cfg.dataset["test"]
    model = HydraMixin.create_model(cfg)
    model.get_datasets()
    dataloader = model.test_dataloader()
    if present:
        assert isinstance(dataloader, torch.utils.data.DataLoader)
    else:
        assert dataloader is None
def test_recursive_instantiate(cfg, params, key):
    cfg["model"]["params"]["criterion"] = {key: "torch.nn.BCELoss", "params": {}}
    if params:
        cfg["model"]["params"]["criterion"]["params"] = {"reduction": "none"}
    model = HydraMixin.create_model(cfg)
    assert isinstance(model.criterion, torch.nn.BCELoss)
    assert model.hparams.keys() == cfg.keys()
def test_configure_optimizer(torch, cfg, hydra, scheduled):
    if not scheduled:
        del cfg["schedule"]
    model = HydraMixin.create_model(cfg)
    model.get_datasets()
    if not scheduled:
        optim = model.configure_optimizers()
        assert isinstance(optim, torch.optim.Adam)
    else:
        optims, schedulers = model.configure_optimizers()
        assert isinstance(optims[0], torch.optim.Adam)
        assert isinstance(schedulers[0], dict)
        assert isinstance(schedulers[0]["scheduler"], torch.optim.lr_scheduler.OneCycleLR)
def test_dataloader_override_batch_size(cfg, subset):
    model_batch_size = cfg.dataset.train["batch_size"]
    new_batch_size = model_batch_size + 1
    cfg.dataset[subset]["batch_size"] = new_batch_size

    model = HydraMixin.create_model(cfg)
    model.get_datasets()

    if subset == "test":
        dataloader = model.test_dataloader()
    else:
        dataloader = model.val_dataloader()
    train_dl = model.train_dataloader()

    assert train_dl.batch_size == model_batch_size
    assert dataloader.batch_size == new_batch_size
def test_get_lr_multiple_optimizers(scheduled, cfg, hydra):
    num_optimizers = 2
    cfg["optimizer"] = [cfg["optimizer"]] * num_optimizers
    if not scheduled:
        del cfg["schedule"]
    else:
        cfg["schedule"] = [cfg["schedule"]] * num_optimizers

    model = HydraMixin.create_model(cfg)
    model.get_optimizer_parameters = lambda idx: model.parameters()
    model.configure_optimizers()

    lr1 = model.get_lr(0, 0)
    lr2 = model.get_lr(1, 0)
    assert lr1 == cfg["optimizer"][0]["params"]["lr"]
    assert lr2 == cfg["optimizer"][1]["params"]["lr"]
def test_dataloader_from_subset(cfg, subset, split):
    if subset == "test":
        if split:
            cfg.dataset["test"] = 10
        del cfg.dataset["validate"]
    else:
        if split:
            cfg.dataset["validate"] = 10
        del cfg.dataset["test"]

    model = HydraMixin.create_model(cfg)
    model.get_datasets()

    if subset == "test":
        dataloader = model.test_dataloader()
    else:
        dataloader = model.val_dataloader()
    train_dl = model.train_dataloader()

    # TODO should check shuffle=False, but this is hidden in dataloader.sampler
    assert isinstance(dataloader, torch.utils.data.DataLoader)
    for key in ["pin_memory", "batch_size", "num_workers", "drop_last"]:
        assert getattr(dataloader, key) == getattr(train_dl, key)
def test_recursive_instantiate_list(hydra, key):
    cfg = {
        key: "torch.nn.Sequential",
        "params": [
            {
                key: "torch.nn.Linear",
                "params": {"in_features": 10, "out_features": 10},
            },
            {
                key: "torch.nn.Linear",
                "params": {"in_features": 10, "out_features": 10},
            },
        ],
    }
    model = HydraMixin.instantiate(cfg)
    assert isinstance(model, torch.nn.Sequential)
def test_train_validate_test(self, cfg, trainer):
    model = HydraMixin.create_model(cfg)
    trainer.fit(model)
def test_recursive_instantiate_preserves_cfg(cfg):
    key = {"_target_": "torch.nn.BCELoss", "params": {"reduction": "none"}}
    cfg["model"]["params"]["criterion"] = key
    model = HydraMixin.create_model(cfg)
    assert "criterion" in model.hparams["model"]["params"].keys()
    assert model.hparams["model"]["params"]["criterion"] == key
def trainer(self, cfg):
    return HydraMixin.instantiate(cfg.trainer)
def main(
    cfg: DictConfig,
    process_results_fn: Optional[Callable[[Any, Any], Any]] = None,
) -> Any:
    r"""Main method for training/testing of a model using PyTorch Lightning and Hydra.

    This method is robust to exceptions (other than :class:`SystemExit` or
    :class:`KeyboardInterrupt`), making it useful when using Hydra's multirun
    feature. If one combination of hyperparameters results in an exception,
    other combinations will still be attempted. This behavior can be
    overridden by providing a ``catch_exceptions`` bool value under
    ``config.trainer``. Such an override is useful when writing tests.

    Automatic learning rate selection is handled using :func:`auto_lr_find`.

    Training / testing is automatically performed based on the configuration
    keys present in ``config.dataset``. Additionally, the following Hydra
    overrides are supported:

        * ``trainer.load_from_checkpoint`` - Load model weights (but not training state) from a checkpoint
        * ``trainer.test_only`` - Skip training even if a training dataset is given

    Args:
        cfg (DictConfig): The Hydra config

        process_results_fn (callable, optional):
            If given, call ``process_results_fn`` on the
            ``(train_results, test_results)`` tuple returned by this method.
            This is useful for processing training/testing results into a
            scalar return value when using an optimization sweeper (like Ax).

    Example::

        >>> # define main method as per Hydra that calls combustion.main()
        >>> @hydra.main(config_path="./conf", config_name="config")
        >>> def main(cfg):
        >>>     combustion.main(cfg)
        >>>
        >>> if __name__ == "__main__":
        >>>     main()

    Example (multirun from config file)::

        >>> combustion.initialize(config_path="./conf", config_name="config")
        >>>
        >>> @hydra.main(config_path="./conf", config_name="config")
        >>> def main(cfg):
        >>>     return combustion.main(cfg)
        >>>
        >>> if __name__ == "__main__":
        >>>     main()
        >>>     combustion.check_exceptions()

    Example (inference-time command)::

        ``python -m my_module trainer.load_from_checkpoint=foo.ckpt trainer.test_only=True``
    """
    train_results, test_results = None, None
    model: pl.LightningModule
    trainer: Optional[pl.Trainer] = None

    try:
        _log_versions()
        log.info("Configuration: \n%s", OmegaConf.to_yaml(cfg))

        deterministic = cfg.trainer.params.get("deterministic", False)
        if deterministic:
            seed_val = 42
            log.info("Deterministic training requested, seeding everything with %d", seed_val)
            pl.seed_everything(seed_val)

        # instantiate model (and optimizer) selected in yaml
        # see pytorch lightning docs: https://pytorch-lightning.rtfd.io/en/latest
        try:
            model = HydraMixin.create_model(cfg)
        except RuntimeError:
            log.error("Failed to instantiate model")
            log.error("Model Config:\n%s", OmegaConf.to_yaml(cfg.model))
            raise

        # preprocess data
        preprocess_training_path = cfg.trainer.get("preprocess_train_path", None)
        preprocess_training_epochs = cfg.trainer.get("preprocess_train_epochs", 1)
        if preprocess_training_path is not None:
            if not os.path.isdir(preprocess_training_path):
                raise NotADirectoryError(preprocess_training_path)

            # clean non-empty directory
            if os.listdir(preprocess_training_path):
                pattern = os.path.join(preprocess_training_path, "*example*.pth")
                files = glob(pattern)
                log.info("Cleaning destination directory")
                for f in files:
                    os.remove(f)

            log.info("Writing preprocessed training set to %s", preprocess_training_path)
            model.prepare_data()
            train_ds = model.train_dataloader().dataset  # type: ignore
            for i in range(preprocess_training_epochs):
                save_torch(train_ds, preprocess_training_path, f"epoch_{i}_example_")
            log.info("Finished writing preprocessed training set. Update Hydra config and rerun training.")
            return

        # load model checkpoint if requested and not resume_from_checkpoint
        load_from_checkpoint = cfg.trainer.get("load_from_checkpoint", None)
        resume_from_checkpoint = cfg.trainer.params.get("resume_from_checkpoint", None)
        if load_from_checkpoint is not None:
            if resume_from_checkpoint is not None:
                log.info("Skipping checkpoint loading because resume_from_checkpoint was given")
            else:
                log.info("Loading checkpoint %s", load_from_checkpoint)
                # TODO this still needs some work
                model = model.__class__.load_from_checkpoint(load_from_checkpoint)
                model._hparams = cfg

        # run auto learning rate find if requested
        lr_find = cfg.trainer.params.get("auto_lr_find", False)
        fast_dev_run = cfg.trainer.params.get("fast_dev_run", False)
        test_only = cfg.trainer.get("test_only", False)
        if lr_find:
            if fast_dev_run:
                log.info("Skipping auto learning rate find when fast_dev_run is true")
            elif test_only:
                log.info("Skipping auto learning rate find when test_only is true")
            else:
                auto_lr_find(cfg, model)

        # instantiate trainer with params as selected in yaml
        # handles tensorboard, checkpointing, etc
        trainer = HydraMixin.instantiate(cfg.trainer)
        trainer = typing.cast(pl.Trainer, trainer)

        # train
        if "train" in cfg.dataset:
            if test_only:
                log.info("test_only flag was set, skipping training")
            else:
                log.info("Starting training")
                train_results = trainer.fit(model)
                log.info("Train results: %s", train_results)
        else:
            log.info("No training dataset given")

        # test
        if "test" in cfg.dataset:
            log.info("Starting testing")
            test_results = trainer.test(model)
            log.info("Test results: %s", test_results)
        else:
            log.info("No test dataset given")

        log.info("Finished!")

    # guard to continue when using Hydra multiruns
    # SystemExit/KeyboardInterrupt are not caught and will trigger shutdown
    except Exception as err:
        catch_exceptions = cfg.trainer.get("catch_exceptions", True)
        if catch_exceptions:
            log.exception(err)
            _exceptions.append(err)
        else:
            raise err

    finally:
        # flush logger to ensure free memory for next run
        if trainer is not None:
            experiment = trainer.logger.experiment  # type: ignore
            if experiment is not None and hasattr(experiment, "flush"):
                experiment.flush()
                log.debug("Flushed experiment to disk")
            if experiment is not None and hasattr(experiment, "close"):
                experiment.close()
                log.debug("Closed experiment writer")

    # postprocess results if desired (e.g. to scalars for bayesian optimization)
    if process_results_fn is not None:
        log.debug("Running results postprocessing")
        output = process_results_fn(train_results, test_results)  # type: ignore
    else:
        output = train_results, test_results

    return output
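# Hedged sketch of a ``process_results_fn`` suitable for an optimization
# sweeper (e.g. Ax), as mentioned in ``main``'s docstring. The function name
# and the "test_loss" metric key are illustrative; actual keys depend on what
# the LightningModule logs during ``trainer.test()``.
def example_process_results(train_results: Any, test_results: Any) -> float:
    # trainer.test() typically returns a list of metric dicts (one per dataloader);
    # reduce them to a single scalar objective for the sweeper
    if isinstance(test_results, list) and test_results:
        return float(test_results[0].get("test_loss", 0.0))
    return 0.0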
def test_instantiate_report_error(hydra, target, exception):
    cfg = {"_target_": target}
    with pytest.raises(exception):
        HydraMixin.instantiate(cfg)
def test_get_datasets_missing_items(cfg, missing, present):
    for k in missing:
        del cfg.dataset[k]
    model = HydraMixin.create_model(cfg)
    model.get_datasets()
def test_get_datasets(cfg, check):
    model = HydraMixin.create_model(cfg)
    model.get_datasets()
    assert hasattr(model, check)
    assert isinstance(getattr(model, check), torch.utils.data.Dataset)
def test_create_model(cfg, hydra):
    model = HydraMixin.create_model(cfg)
    assert isinstance(model, Subclass)
    assert model.hparams.keys() == cfg.keys()