def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
    """Verify the logger forwards params/metrics to the mocked mlflow client."""
    # Freeze time so the millisecond timestamp passed to log_metric is 1000.
    time.return_value = 1

    mlf_logger = MLFlowLogger("test", save_dir=tmpdir, artifact_location="my_artifact_location")
    # Force the experiment-creation path instead of reusing an existing one.
    mlf_logger._mlflow_client.get_experiment_by_name.return_value = None

    # Hyperparameters are forwarded as individual log_param calls.
    mlf_logger.log_hyperparams({"test": "test_param"})
    mlf_logger.experiment.log_param.assert_called_once_with(
        mlf_logger.run_id, "test", "test_param")

    # Metrics are forwarded with a ms timestamp and no explicit step.
    mlf_logger.log_metrics({"some_metric": 10})
    mlf_logger.experiment.log_metric.assert_called_once_with(
        mlf_logger.run_id, "some_metric", 10, 1000, None)

    # A fresh experiment must be created with the given artifact location.
    mlf_logger._mlflow_client.create_experiment.assert_called_once_with(
        name="test", artifact_location="my_artifact_location")
def objective(trial, args):
    """Optuna objective: train LitLSTM with trial-sampled params, return val loss.

    Args:
        trial: Optuna trial used to sample hyperparameters and for pruning.
        args: argument namespace; mutated in place with the sampled params
            (keys are written directly into ``args.__dict__``).

    Returns:
        Mean validation loss over splits (``'split'`` mode) or the single
        validation loss (``'full'`` mode). Implicitly returns ``None`` for
        any other ``args.val_mode`` value.
    """
    params = get_trial_params(trial)
    # hidden_size / acc_grads are sampled as exponents; expand to powers of two.
    params['hidden_size'] = 2**params['hidden_size']
    params['acc_grads'] = 2**params['acc_grads']
    early_stopper = EarlyStopping(
        monitor='val_loss', min_delta=0.005, patience=3, mode='min')
    # NOTE(review): the same EarlyStopping / pruning callback instances are
    # reused across every Trainer.fit below — stateful callbacks may carry
    # state between fits; confirm this is intended.
    callbacks = [early_stopper, PyTorchLightningPruningCallback(
        trial, monitor="val_loss")]
    if args.model_type == 'attnlstm':
        params['attn_width'] = trial.suggest_int("attn_width", 3, 64)
    if 'split' in args.val_mode:
        # val_mode looks like e.g. 'split3': last char is the split count —
        # presumably; verify against the argument parser.
        dataset_hour = args.data.split('_')[-1]
        logger = MLFlowLogger(experiment_name=f'Optuna_{dataset_hour}h_{args.val_mode[-1]}_split')
        print(f'Optuna_{dataset_hour}_{args.val_mode[-1]}_split')
        val_losses = []
        for _split_id in range(int(args.val_mode[-1])):
            print(f"Split {_split_id} Trial {trial.number}")
            # NOTE(review): split_id is zeroed here and only set to the real
            # _split_id below, after model construction — looks like a
            # placeholder for LitLSTM(args); confirm.
            args.__dict__["split_id"] = 0
            # Copy all sampled hyperparameters onto the args namespace.
            for key in params:
                args.__dict__[str(key)] = params.get(key)
            model = LitLSTM(args)
            trainer = Trainer(
                logger=logger,
                callbacks=callbacks,
                **get_trainer_params(args),
            )
            logger.log_hyperparams(model.args)
            args.__dict__["val_mode"] = args.val_mode
            args.__dict__["split_id"] = _split_id
            # Re-initialize the data for the current split before fitting.
            model._get_data(args, data_mode='init')
            trainer.fit(model)
            trainer.test(model, test_dataloaders=model.test_dataloader())
            # logger.finalize()
            val_losses.append(model.metrics['val_loss'])
        # log mean val loss for later retrieval of best model
        mean_val_loss = torch.stack(val_losses).mean()
        logger.log_metrics({"mean_val_loss": mean_val_loss}, step=0)
        logger.finalize()
        return mean_val_loss
    elif args.val_mode == 'full':
        # Single training run on the full data, no cross-split averaging.
        logger = MLFlowLogger(experiment_name='Optuna_full')
        for key in params:
            args.__dict__[str(key)] = params.get(key)
        model = LitLSTM(args)
        trainer = Trainer(
            logger=logger,
            callbacks=callbacks,
            **get_trainer_params(args),
        )
        logger.log_hyperparams(model.args)
        trainer.fit(model)
        trainer.test(model, test_dataloaders=model.test_dataloader())
        model.save_preds_and_targets(to_disk=True)
        logger.finalize()
        return model.metrics['val_loss']
def train(args):
    """Train a ``LitLSTM`` model and log the run to MLflow.

    Args:
        args: parsed argument namespace; must provide ``seed``, ``verbose``,
            ``early`` and ``acc_grads``, plus whatever ``LitLSTM`` and
            ``get_trainer_params`` consume.

    Returns:
        The MLflow run id of the completed training run.
    """
    seed_everything(args.seed)
    model = LitLSTM(args)
    logger = MLFlowLogger(experiment_name='Default')
    # Early stopping is optional; None when disabled.
    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        min_delta=0.005,
        patience=3,
        verbose=args.verbose,
        mode='min') if args.early else None
    trainer = Trainer(
        log_gpu_memory='all' if args.verbose else None,
        track_grad_norm=2 if args.verbose else -1,
        logger=logger,
        weights_summary='full',
        # BUG FIX: the previous ``callbacks=[early_stop_callback]`` passed
        # ``[None]`` whenever ``args.early`` was falsy, which Lightning's
        # Trainer rejects (callbacks must be Callback instances).
        callbacks=[early_stop_callback] if early_stop_callback is not None else [],
        accumulate_grad_batches=args.acc_grads,
        profiler=args.verbose,
        **get_trainer_params(args),
    )
    logger.log_hyperparams(model.args)
    trainer.fit(model)
    trainer.test(model, test_dataloaders=model.test_dataloader())
    model.save_preds_and_targets(to_disk=True)
    logger.finalize()
    return logger.run_id
def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
    """Check that hyperparams, metrics and experiment creation hit the mock client."""
    # Fixed clock -> deterministic 1000 ms timestamp in the log_metric call.
    time.return_value = 1

    logger = MLFlowLogger('test', save_dir=tmpdir,
                          artifact_location='my_artifact_location')
    # Simulate a missing experiment so the logger has to create one.
    logger._mlflow_client.get_experiment_by_name.return_value = None

    hparams = {'test': 'test_param'}
    metrics = {'some_metric': 10}

    logger.log_hyperparams(hparams)
    logger.log_metrics(metrics)

    logger.experiment.log_param.assert_called_once_with(
        logger.run_id, 'test', 'test_param')
    logger.experiment.log_metric.assert_called_once_with(
        logger.run_id, 'some_metric', 10, 1000, None)
    logger._mlflow_client.create_experiment.assert_called_once_with(
        name='test',
        artifact_location='my_artifact_location',
    )
def test_mlflow_logger_with_long_param_value(client, mlflow, tmpdir):
    """Test that a RuntimeWarning is raised and the over-long param is discarded."""
    logger = MLFlowLogger("test", save_dir=tmpdir)

    param_key = "test_param"
    param_value = "test" * 100  # 400 chars, beyond MLflow's param value limit
    with pytest.warns(RuntimeWarning, match=f"Discard {param_key}={param_value}"):
        logger.log_hyperparams({param_key: param_value})
def _train(
    task: str,
    ov: List[str],
    do_sweep: bool,
):
    """
    Run training

    Args:
        task: task to run training for
        ov: overwrites for config manager
        do_sweep: determine best empirical parameters for run
    """
    print(f"Overwrites: {ov}")
    # Compose the Hydra/OmegaConf configuration for this task.
    initialize_config_module(config_module="nndet.conf")
    cfg = compose(task, "config.yaml", overrides=ov if ov is not None else [])

    assert cfg.host.parent_data is not None, 'Parent data can not be None'
    assert cfg.host.parent_results is not None, 'Output dir can not be None'

    train_dir = init_train_dir(cfg)

    # MLflow logger tagged with host/fold/task metadata for the run.
    pl_logger = MLFlowLogger(
        experiment_name=cfg["task"],
        tags={
            "host": socket.gethostname(),
            "fold": cfg["exp"]["fold"],
            # LSB_JOBID presumably comes from an LSF cluster scheduler —
            # falls back to 'no_id' when run locally.
            "job_id": os.getenv('LSB_JOBID', 'no_id'),
            "task": cfg["task"],
            "mlflow.runName": cfg["exp"]["id"],
        },
        save_dir=os.getenv("MLFLOW_TRACKING_URI", "./mlruns"),
    )
    # Log model and trainer configs as flattened key/value hyperparams.
    pl_logger.log_hyperparams(
        flatten_mapping(
            {"model": OmegaConf.to_container(cfg["model_cfg"], resolve=True)}))
    pl_logger.log_hyperparams(
        flatten_mapping({
            "trainer": OmegaConf.to_container(cfg["trainer_cfg"], resolve=True)
        }))

    # Reconfigure the (loguru-style) logger: stdout + a train.log file.
    logger.remove()
    logger.add(sys.stdout, format="{level} {message}", level="INFO")
    log_file = Path(os.getcwd()) / "train.log"
    logger.add(log_file, level="INFO")
    logger.info(f"Log file at {log_file}")

    # Persist reproducibility metadata (torch version, date, git state).
    meta_data = {}
    meta_data["torch_version"] = str(torch.__version__)
    meta_data["date"] = str(datetime.now())
    meta_data["git"] = log_git(nndet.__path__[0], repo_name="nndet")
    save_json(meta_data, "./meta.json")
    # Best-effort requirements dump; failure is logged, not fatal.
    try:
        write_requirements_to_file("requirements.txt")
    except Exception as e:
        logger.error(f"Could not log req: {e}")

    # Load the preprocessing plan and derive the training data directory.
    plan_path = Path(str(cfg.host["plan_path"]))
    plan = load_pickle(plan_path)
    save_json(create_debug_plan(plan), "./plan_debug.json")
    data_dir = Path(cfg.host["preprocessed_output_dir"]
                    ) / plan["data_identifier"] / "imagesTr"

    datamodule = Datamodule(
        augment_cfg=OmegaConf.to_container(cfg["augment_cfg"], resolve=True),
        plan=plan,
        data_dir=data_dir,
        fold=cfg["exp"]["fold"],
    )
    # Instantiate the Lightning module registered under cfg["module"].
    module = MODULE_REGISTRY[cfg["module"]](
        model_cfg=OmegaConf.to_container(cfg["model_cfg"], resolve=True),
        trainer_cfg=OmegaConf.to_container(cfg["trainer_cfg"], resolve=True),
        plan=plan,
    )
    callbacks = []
    # Keep the best checkpoint (by the configured monitor) plus the last one.
    checkpoint_cb = ModelCheckpoint(
        dirpath=train_dir,
        filename='model_best',
        save_last=True,
        save_top_k=1,
        monitor=cfg["trainer_cfg"]["monitor_key"],
        mode=cfg["trainer_cfg"]["monitor_mode"],
    )
    checkpoint_cb.CHECKPOINT_NAME_LAST = 'model_last'
    callbacks.append(checkpoint_cb)
    callbacks.append(LearningRateMonitor(logging_interval="epoch"))

    # Snapshot config (raw + resolved), plan and splits next to the run.
    OmegaConf.save(cfg, str(Path(os.getcwd()) / "config.yaml"))
    OmegaConf.save(cfg,
                   str(Path(os.getcwd()) / "config_resolved.yaml"),
                   resolve=True)
    save_pickle(plan, train_dir / "plan.pkl")  # backup plan
    splits = load_pickle(
        Path(cfg.host.preprocessed_output_dir) / datamodule.splits_file)
    save_pickle(splits, train_dir / "splits.pkl")

    trainer_kwargs = {}
    if cfg["train"]["mode"].lower() == "resume":
        # Resume from the last checkpoint written by ModelCheckpoint above.
        trainer_kwargs[
            "resume_from_checkpoint"] = train_dir / "model_last.ckpt"

    num_gpus = cfg["trainer_cfg"]["gpus"]
    logger.info(f"Using {num_gpus} GPUs for training")
    plugins = cfg["trainer_cfg"].get("plugins", None)
    logger.info(f"Using {plugins} plugins for training")

    trainer = pl.Trainer(
        gpus=list(range(num_gpus)) if num_gpus > 1 else num_gpus,
        accelerator=cfg["trainer_cfg"]["accelerator"],
        precision=cfg["trainer_cfg"]["precision"],
        amp_backend=cfg["trainer_cfg"]["amp_backend"],
        amp_level=cfg["trainer_cfg"]["amp_level"],
        benchmark=cfg["trainer_cfg"]["benchmark"],
        deterministic=cfg["trainer_cfg"]["deterministic"],
        callbacks=callbacks,
        logger=pl_logger,
        max_epochs=module.max_epochs,
        # det_verbose=0 disables the progress bar (e.g. for batch jobs).
        progress_bar_refresh_rate=None
        if bool(int(os.getenv("det_verbose", 1))) else 0,
        reload_dataloaders_every_epoch=False,
        num_sanity_val_steps=10,
        weights_summary='full',
        plugins=plugins,
        terminate_on_nan=True,  # TODO: make modular
        move_metrics_to_cpu=True,
        **trainer_kwargs)
    trainer.fit(module, datamodule=datamodule)

    if do_sweep:
        # Sweep inference parameters on this fold's validation cases.
        case_ids = splits[cfg["exp"]["fold"]]["val"]
        if "debug" in cfg and "num_cases_val" in cfg["debug"]:
            # Debug mode: cap the number of validation cases.
            case_ids = case_ids[:cfg["debug"]["num_cases_val"]]

        inference_plan = module.sweep(
            cfg=OmegaConf.to_container(cfg, resolve=True),
            save_dir=train_dir,
            train_data_dir=data_dir,
            case_ids=case_ids,
            run_prediction=True,
        )

        plan["inference_plan"] = inference_plan
        save_pickle(plan, train_dir / "plan_inference.pkl")

        ensembler_cls = module.get_ensembler_cls(
            key="boxes", dim=plan["network_dim"])  # TODO: make this configurable
        # Extract results twice: restored to original spacing and preprocessed.
        for restore in [True, False]:
            target_dir = train_dir / "val_predictions" if restore else \
                train_dir / "val_predictions_preprocessed"
            extract_results(
                source_dir=train_dir / "sweep_predictions",
                target_dir=target_dir,
                ensembler_cls=ensembler_cls,
                restore=restore,
                **inference_plan,
            )

        # Evaluate the swept predictions on the validation split.
        _evaluate(
            task=cfg["task"],
            model=cfg["exp"]["id"],
            fold=cfg["exp"]["fold"],
            test=False,
            do_boxes_eval=True,  # TODO: make this configurable
            do_analyze_boxes=True,  # TODO: make this configurable
        )