def cli_main():
    """Entry point: train and test a LitAutoEncoder on MNIST, logging to AzureML via MLflow."""
    pl.seed_everything(1234)

    # ---- argument parsing ----
    parser = ArgumentParser()
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--hidden_dim", type=int, default=128)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ---- data: 55k/5k train/val split of MNIST plus the official test set ----
    full_train = MNIST("", train=True, download=True, transform=transforms.ToTensor())
    mnist_test = MNIST("", train=False, download=True, transform=transforms.ToTensor())
    mnist_train, mnist_val = random_split(full_train, [55000, 5000])

    def make_loader(split):
        # all three loaders share the CLI batch size
        return DataLoader(split, batch_size=args.batch_size)

    train_loader = make_loader(mnist_train)
    val_loader = make_loader(mnist_val)
    test_loader = make_loader(mnist_test)

    # ---- model ----
    model = LitAutoEncoder()

    # ---- logging: wire MLFlowLogger to the AzureML run context ----
    run = Run.get_context()
    # tracking URI and experiment name come from the AzureML workspace
    mlflow_uri = run.experiment.workspace.get_mlflow_tracking_uri()
    exp_name = run.experiment.name
    mlf_logger = MLFlowLogger(experiment_name=exp_name, tracking_uri=mlflow_uri)
    # link the mlflow run ID to the AzureML run ID
    mlf_logger._run_id = run.id

    # ---- training ----
    trainer = pl.Trainer.from_argparse_args(args, logger=mlf_logger)
    trainer.fit(model, train_loader, val_loader)

    # ---- testing ----
    result = trainer.test(test_dataloaders=test_loader)
    print(result)
def test_mlflow_run_name_setting(client, mlflow, tmpdir):
    """Test that the run_name argument makes the MLFLOW_RUN_NAME tag."""
    expected_tags = resolve_tags({MLFLOW_RUN_NAME: "run-name-1"})

    # Case 1: run_name is appended to the tags.
    logger = MLFlowLogger("test", run_name="run-name-1", save_dir=tmpdir)
    logger = mock_mlflow_run_creation(logger, experiment_id="exp-id")
    _ = logger.experiment
    client.return_value.create_run.assert_called_with(experiment_id="exp-id", tags=expected_tags)

    # Case 2: run_name overrides an explicit tags[MLFLOW_RUN_NAME].
    logger = MLFlowLogger("test", run_name="run-name-1", tags={MLFLOW_RUN_NAME: "run-name-2"}, save_dir=tmpdir)
    logger = mock_mlflow_run_creation(logger, experiment_id="exp-id")
    _ = logger.experiment
    client.return_value.create_run.assert_called_with(experiment_id="exp-id", tags=expected_tags)

    # Case 3: default run_name (= None) does not append a new tag.
    logger = MLFlowLogger("test", save_dir=tmpdir)
    logger = mock_mlflow_run_creation(logger, experiment_id="exp-id")
    _ = logger.experiment
    client.return_value.create_run.assert_called_with(experiment_id="exp-id", tags=resolve_tags(None))
def test_mlflow_logger_with_unexpected_characters(client, mlflow, tmpdir):
    """Test that the logger raises warning with special characters not accepted by MLFlow."""
    logger = MLFlowLogger("test", save_dir=tmpdir)
    # square brackets are not valid in an MLflow metric name
    with pytest.warns(RuntimeWarning, match="special characters in metric name"):
        logger.log_metrics({"[some_metric]": 10})
def test_mlflow_logger(tmpdir):
    """Verify that basic functionality of mlflow logger works."""
    tutils.reset_seed()

    hparams = tutils.get_default_hparams()
    model = LightningTestModel(hparams)

    tracking_uri = f'file:{os.sep * 2}{os.path.join(tmpdir, "mlruns")}'
    logger = MLFlowLogger('test', tracking_uri=tracking_uri)

    # constructing a second logger against the same experiment must not fail
    logger_dup = MLFlowLogger('test', tracking_uri=tracking_uri)
    _ = logger_dup.run_id

    # logging a non-numeric metric value should be tolerated
    logger.log_metrics({'acc': 'test'})

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        train_percent_check=0.05,
        logger=logger,
    )
    result = trainer.fit(model)
    assert result == 1, 'Training failed'
def test_mlflow_logger_exists(tmpdir):
    """Test launching two independent loggers."""
    first = MLFlowLogger('test', save_dir=tmpdir)
    second = MLFlowLogger('test', save_dir=tmpdir)

    # identical experiment names map to one experiment id but distinct runs
    assert first.experiment_id == second.experiment_id
    assert first.run_id != second.run_id

    # a different experiment name gets its own experiment id
    third = MLFlowLogger('new', save_dir=tmpdir)
    assert third.experiment_id != first.experiment_id
def test_mlflow_logger_with_long_param_value(client, mlflow, tmpdir):
    """Test that the logger raises a warning and discards a hyperparameter whose value exceeds MLflow's length limit."""
    logger = MLFlowLogger("test", save_dir=tmpdir)
    value = "test" * 100  # 400 characters — longer than MLflow accepts for a param value
    key = "test_param"
    params = {key: value}
    # the oversized param must be dropped with a warning, not sent to the client
    with pytest.warns(RuntimeWarning, match=f"Discard {key}={value}"):
        logger.log_hyperparams(params)
def test_mlflow_logger_exists(client, mlflow, tmpdir):
    """Test launching three independent loggers with either same or different experiment name."""
    run1 = MagicMock()
    run1.info.run_id = "run-id-1"
    run1.info.experiment_id = "exp-id-1"
    run2 = MagicMock()
    run2.info.run_id = "run-id-2"
    run3 = MagicMock()
    run3.info.run_id = "run-id-3"

    # simulate non-existing experiment creation
    client.return_value.get_experiment_by_name = MagicMock(return_value=None)
    client.return_value.create_experiment = MagicMock(return_value="exp-id-1")  # experiment_id
    client.return_value.create_run = MagicMock(return_value=run1)

    logger = MLFlowLogger("test", save_dir=tmpdir)
    assert logger._experiment_id is None
    assert logger._run_id is None
    _ = logger.experiment
    assert logger.experiment_id == "exp-id-1"
    assert logger.run_id == "run-id-1"
    # BUG FIX: `asset_called_once` was a typo — accessing it on a MagicMock
    # returns a truthy child mock, so `assert mock.asset_called_once()` always
    # passed. `assert_called_once()` raises AssertionError on failure.
    logger.experiment.create_experiment.assert_called_once()
    client.reset_mock(return_value=True)

    # simulate existing experiment returns experiment id
    exp1 = MagicMock()
    exp1.experiment_id = "exp-id-1"
    client.return_value.get_experiment_by_name = MagicMock(return_value=exp1)
    client.return_value.create_run = MagicMock(return_value=run2)

    # same name leads to same experiment id, but different runs get recorded
    logger2 = MLFlowLogger("test", save_dir=tmpdir)
    assert logger2.experiment_id == logger.experiment_id
    assert logger2.run_id == "run-id-2"
    assert logger2.experiment.create_experiment.call_count == 0
    # BUG FIX: same `asset_called_once` typo as above.
    logger2.experiment.create_run.assert_called_once()
    client.reset_mock(return_value=True)

    # simulate a 3rd experiment with new name
    client.return_value.get_experiment_by_name = MagicMock(return_value=None)
    client.return_value.create_experiment = MagicMock(return_value="exp-id-3")
    client.return_value.create_run = MagicMock(return_value=run3)

    # logger with new experiment name causes new experiment id and new run id to be created
    logger3 = MLFlowLogger("new", save_dir=tmpdir)
    assert logger3.experiment_id == "exp-id-3" != logger.experiment_id
    assert logger3.run_id == "run-id-3"
def train_model(config, gpus, w2v, num_epochs=10):
    """Train a Wav2VecKWS keyword-spotting model under Ray Tune.

    Args:
        config: dict of hyperparameters supplied by Ray Tune.
        gpus: GPU specification forwarded to the Trainer.
        w2v: pretrained wav2vec backbone passed to the model.
        num_epochs: maximum number of training epochs.
    """
    # BUG FIX: val_Accuracy is a higher-is-better metric, but both callbacks
    # monitored it with mode="min" — that inverted early stopping and made
    # ModelCheckpoint keep the WORST checkpoint instead of the best.
    early_stop_callback = EarlyStopping(monitor="val_Accuracy", min_delta=0.0, patience=5, verbose=True, mode="max")
    checkpoint_callback = ModelCheckpoint(
        "models/wav2vec_kws/",
        save_top_k=1,
        verbose=True,
        monitor='val_Accuracy',
        mode='max',
    )
    # report validation accuracy back to Ray Tune after every validation epoch
    tune_callback = TuneReportCallback({"acc": "val_Accuracy"}, on="validation_end")
    logger = TensorBoardLogger("tb_logs", name="wav2vec_kws_tune")
    mlf_logger = MLFlowLogger(experiment_name="wav2vec_kws", tracking_uri="http://192.168.0.32")
    mlflow.pytorch.autolog()
    trainer = pl.Trainer(
        gpus=gpus,
        callbacks=[checkpoint_callback, early_stop_callback, tune_callback],
        logger=[logger, mlf_logger],
        accumulate_grad_batches=4,
        amp_level="O0",
        max_epochs=num_epochs,
        progress_bar_refresh_rate=1,
        log_every_n_steps=1,
        flush_logs_every_n_steps=1)
    config = argparse.Namespace(**config)
    model = Wav2VecKWS(config, w2v)
    trainer.fit(model)
def main(args: DictConfig):
    """Train a SuperGlueLightning model with DDP, logging to MLflow.

    Args:
        args: Hydra/OmegaConf config; the ``exp`` section supplies the MLflow
            URI, checkpoint path and trainer options.
    """
    # torch.autograd.set_detect_anomaly(True)
    model = SuperGlueLightning(args)
    mlf_logger = MLFlowLogger(experiment_name='SuperGlue', tracking_uri=args.exp.mlflow_uri)
    # NOTE(review): 'chekpoints_path' is misspelled, but it is the key under
    # which the param is recorded in MLflow — renaming it would change the
    # logged data, so it is left as-is.
    mlf_logger.experiment.log_param(
        mlf_logger.run_id, 'chekpoints_path',
        f'{os.getcwd()}/{args.exp.checkpoint_path}')
    checkpoint_callback = ModelCheckpoint(dirpath=args.exp.checkpoint_path, verbose=True, save_weights_only=True)
    trainer = Trainer(
        # NOTE(review): eval() on a config value is risky for untrusted
        # configs; presumably it converts strings like "[0,1]" into a list —
        # confirm before hardening.
        gpus=eval(str(args.exp.gpus)),
        logger=mlf_logger if args.exp.logging else None,
        max_epochs=args.exp.epochs,
        accumulate_grad_batches=args.exp.accumulate_grad_batches,
        checkpoint_callback=checkpoint_callback if args.exp.checkpoint else None,
        val_check_interval=args.exp.val_check_interval,
        # limit_val_batches=args.data.val_size,
        auto_lr_find=False,
        accelerator='ddp',
        log_every_n_steps=100,
        num_sanity_val_steps=0,
    )
    trainer.fit(model)
def test_mlflow_experiment_id_retrieved_once(client, mlflow, tmpdir):
    """Test that the logger experiment_id retrieved only once."""
    logger = MLFlowLogger("test", save_dir=tmpdir)
    # touch the experiment property several times; only the first access
    # should hit the client lookup
    for _ in range(3):
        _ = logger.experiment
    assert logger.experiment.get_experiment_by_name.call_count == 1
def test_mlflow_logger_dirs_creation(tmpdir):
    """Test that the logger creates the folders and files in the right place."""
    if not _MLFLOW_AVAILABLE:
        pytest.xfail("test for explicit file creation requires mlflow dependency to be installed.")

    assert not os.listdir(tmpdir)
    logger = MLFlowLogger('test', save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    # mlflow's file store creates a '.trash' folder in the root immediately
    assert set(os.listdir(tmpdir)) == {'.trash'}
    run_id = logger.run_id
    exp_id = logger.experiment_id

    # multiple experiment calls should not lead to new experiment folders
    for i in range(2):
        _ = logger.experiment
        assert set(os.listdir(tmpdir)) == {'.trash', exp_id}
        assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}

    model = EvalModelTemplate()
    trainer = Trainer(
        default_root_dir=tmpdir,
        logger=logger,
        max_epochs=1,
        limit_val_batches=3,
        log_gpu_memory=True,
    )
    trainer.fit(model)
    # training reuses the same run folder — no new run is created
    assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}
    assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics')
    assert set(os.listdir(tmpdir / exp_id / run_id / 'params')) == model.hparams.keys()
    # checkpoints land inside the run folder; the single expected file name
    # encodes the last global step of the 1-epoch run
    assert trainer.checkpoint_callback.dirpath == (tmpdir / exp_id / run_id / 'checkpoints')
    assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=9.ckpt'}
def __init__(self, experiment, name, max_epochs, min_epochs, patience, val_check_interval):
    """Configure the parent Trainer with MLflow logging, optional early
    stopping and val_loss-based checkpointing.

    Args:
        experiment: MLflow experiment name.
        name: run name; used for the mlflow.runName tag and the checkpoint folder.
        max_epochs: upper bound on training epochs.
        min_epochs: lower bound on training epochs.
        patience: early-stopping patience; 0 disables early stopping entirely.
        val_check_interval: how often validation is run, forwarded to Trainer.
    """
    # logger = TensorBoardLogger('../logs', name=name)
    tags = {'mlflow.runName': name}
    # NOTE(review): positional arguments — the second presumably binds to
    # tracking_uri and the third to tags in this pytorch-lightning version;
    # confirm, since later versions changed the positional parameter order.
    logger = MLFlowLogger(experiment, 'file:../logs/mlruns', tags)
    if patience == 0:
        # early_stop_callback=False turns the mechanism off in the Trainer
        early_stopping = False
    else:
        early_stopping = EarlyStopping(patience=patience, monitor='val_loss', mode='min')
    # filepath = pathlib.Path('../logs') / name / f'version_{logger.version}' / 'model'
    filepath = pathlib.Path('../models') / name / 'model'
    model_checkpoint = ModelCheckpoint(str(filepath), monitor='val_loss', mode='min')
    super().__init__(
        default_save_path='../logs',
        gpus=1,
        max_epochs=max_epochs,
        min_epochs=min_epochs,
        early_stop_callback=early_stopping,
        logger=logger,
        row_log_interval=100,
        checkpoint_callback=model_checkpoint,
        val_check_interval=val_check_interval,
    )
def test_mlflow_logger_dirs_creation(tmpdir):
    """Test that the logger creates the folders and files in the right place."""
    assert not os.listdir(tmpdir)
    logger = MLFlowLogger('test', save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    # mlflow's file store creates a '.trash' folder up front
    assert set(os.listdir(tmpdir)) == {'.trash'}
    run_id = logger.run_id
    exp_id = logger.experiment_id

    # multiple experiment calls should not lead to new experiment folders
    for i in range(2):
        _ = logger.experiment
        assert set(os.listdir(tmpdir)) == {'.trash', exp_id}
        assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}

    model = EvalModelTemplate()
    trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=1, limit_val_batches=3)
    trainer.fit(model)
    # training reuses the same run folder rather than creating a new one
    assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}
    assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics')
    assert set(os.listdir(tmpdir / exp_id / run_id / 'params')) == model.hparams.keys()
    # exactly one checkpoint from the single epoch
    assert trainer.checkpoint_callback.dirpath == (tmpdir / exp_id / run_id / 'checkpoints')
    assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'}
def main(hparams) -> None:
    """Train a ValueRL model, checkpointing locally and reporting via Telegram.

    Args:
        hparams: namespace of hyperparameters; ``algo_name`` names both the
            MLflow experiment and the checkpoint folder.
    """
    experiment_name = hparams.algo_name

    save_folder = 'model_weights/' + experiment_name
    # BUG FIX: os.mkdir fails with FileNotFoundError when the parent
    # 'model_weights' directory does not exist yet; makedirs creates all
    # intermediate directories, and exist_ok=True removes the racy
    # exists-then-create check.
    os.makedirs(save_folder, exist_ok=True)
    checkpoint_callback = ModelCheckpoint(
        filepath=save_folder+'/model_{epoch:02d}')

    mlf_logger = MLFlowLogger(
        experiment_name=experiment_name,
        tracking_uri="file:./mlruns"
    )

    # telegram progress reporting
    token = telegram_config['token']
    user_id = telegram_config['user_id']
    bot = RLBot(token=token, user_id=user_id)
    telegramCallback = TelegramRLCallback(bot)

    trainer = Trainer(checkpoint_callback=checkpoint_callback,
                      max_epochs=10000,
                      early_stop_callback=False,
                      val_check_interval=100,
                      logger=mlf_logger,
                      callbacks=[telegramCallback],
                      )
    model = ValueRL(hparams)
    trainer.fit(model)
def test_mlflow_log_dir(client, mlflow, tmpdir):
    """Test that the trainer saves checkpoints in the logger's save dir."""
    # stub out experiment/run creation on the mlflow client
    fake_run = MagicMock()
    fake_run.info.run_id = "run-id"
    client.return_value.get_experiment_by_name = MagicMock(return_value=None)
    client.return_value.create_experiment = MagicMock(return_value="exp-id")
    client.return_value.create_run = MagicMock(return_value=fake_run)

    # the default log dir path is derived from the mocked ids
    logger = MLFlowLogger("test", save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    assert logger.version == "run-id"
    assert logger.name == "exp-id"

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        logger=logger,
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=3,
    )
    assert trainer.log_dir == logger.save_dir
    trainer.fit(model)
    assert trainer.checkpoint_callback.dirpath == (tmpdir / "exp-id" / "run-id" / "checkpoints")
    assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=1.ckpt"}
    assert trainer.log_dir == logger.save_dir
def main(hparams):
    """Train a Module, storing checkpoints inside the MLflow run's artifact dir.

    Args:
        hparams: namespace with ``exp_name``, ``num_epochs`` and ``tags``.
    """
    engine = Module(hparams)
    # BUG FIX: the original passed `args.tags`, but the parameter is named
    # `hparams` — that raised NameError at call time (unless a stray
    # module-level `args` happened to exist and shadow the intent).
    mlf_logger = MLFlowLogger(experiment_name=hparams.exp_name, tracking_uri="./mlruns", tags=hparams.tags)
    # resolve the run's artifact directory so checkpoints are stored with the run
    exp = mlf_logger.experiment.get_experiment_by_name(hparams.exp_name)
    artifacts_dir = os.path.join(exp.artifact_location, mlf_logger.run_id, "artifacts")
    checkpoint_callback = ModelCheckpoint(
        filepath=artifacts_dir,
        save_top_k=-1,
        verbose=True,
        monitor="val_loss_avg",
        mode="min",
        prefix="",
    )
    trainer = Trainer(logger=mlf_logger,
                      checkpoint_callback=checkpoint_callback,
                      max_epochs=hparams.num_epochs,
                      gpus=[0])
    trainer.fit(engine)
def main():
    """Run stratified K-fold training of an EfficientNet classifier on the
    cassava dataset, with settings supplied via environment variables."""
    load_dotenv('cassava.env')
    seed_everything(SEED)

    root_path = os.getenv('ROOT_PATH')
    train_csv_path = root_path + 'train.csv'
    train_root_path = root_path + 'train_images'
    num_classes = int(os.getenv('NUM_CLASSES', 5))
    num_epoch = int(os.getenv('NUM_EPOCH', 10))
    num_folds = int(os.getenv('NUM_FOLDS', 5))
    # BUG FIX: was `int(os.getenv('BATCH_SIZE'), 16)` — that treats 16 as the
    # int() base (hex parsing) and raises TypeError when BATCH_SIZE is unset.
    # The default belongs inside getenv().
    batch_size = int(os.getenv('BATCH_SIZE', 16))
    grad_acc = int(os.getenv('GRAD_ACC', 8))
    # BUG FIX: os.getenv returns a *string* when RESIZE is set, which broke
    # the `resize * 1.5` arithmetic below; convert explicitly.
    resize = int(os.getenv('RESIZE', 224))

    normalize = A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_transform = A.Compose([
        A.HorizontalFlip(),
        A.ShiftScaleRotate(p=1.0),
        A.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.2, hue=0.0, p=1.0, always_apply=False),
        A.RandomResizedCrop(resize, resize, p=1.0, always_apply=True),
        normalize,
        ToTensorV2(p=1.0),
    ], p=1.0)
    test_transform = A.Compose([
        A.Resize(int(resize * 1.5), int(resize * 1.5)),
        normalize,
        ToTensorV2(p=1.0),
    ], p=1.0)
    tta_transform = tta.Compose([
        tta.FiveCrops(resize, resize),
    ])

    criterion = MixedLabelLoss(nn.CrossEntropyLoss(reduction='none'))
    augmentations = [snapmix, ]

    df = pd.read_csv(train_csv_path)
    folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED).split(df['image_id'], df['label'])
    for _fold, (train, test) in enumerate(folds):
        train = df.iloc[train]
        test = df.iloc[test]
        scheduler = optim.lr_scheduler.CosineAnnealingLR
        model = TimmNet('efficientnet_b3a', num_classes, criterion, learning_rate=1e-3,
                        scheduler=scheduler, n_epoch=num_epoch, eta_min=1e-6,
                        augmentations=augmentations, tta_transform=tta_transform)
        dm = DataFrameDataModule(train, train_root_path, test, batch_size=batch_size,
                                 train_transform=train_transform, test_transform=test_transform)
        mlf_logger = MLFlowLogger(
            experiment_name='cassava',
            tracking_uri='file:./cassava'
        )
        trainer = Trainer(gpus=-1, precision=32, deterministic=True,
                          accumulate_grad_batches=grad_acc, profiler='simple',
                          val_check_interval=1.0, logger=mlf_logger,
                          max_epochs=num_epoch)
        trainer.fit(model, datamodule=dm)
def test_mlflow_experiment_id_retrieved_once(tmpdir):
    """The experiment id must be looked up on the mlflow client only once."""
    logger = MLFlowLogger('test', save_dir=tmpdir)
    original_lookup = logger._mlflow_client.get_experiment_by_name
    # wrap the real lookup with a spy, then access the property repeatedly
    with mock.patch.object(MlflowClient, 'get_experiment_by_name', wraps=original_lookup) as spy:
        for _ in range(3):
            _ = logger.experiment
        assert spy.call_count == 1
def create_logger() -> Union[bool, LightningLoggerBase]:
    """
    Loosely imitate:
    https://github.com/Azure/azureml-examples/blob/main/tutorials/using-pytorch-lightning/3.log-with-mlflow.ipynb

    Returns an MLFlowLogger wired to the AzureML run when executing inside a
    submitted run, otherwise ``True`` (keep pytorch-lightning's default logging).
    """
    run = Run.get_context()
    if isinstance(run, _SubmittedRun):
        experiment = run.experiment
        tracking_uri = experiment.workspace.get_mlflow_tracking_uri()
        exp_name = run.experiment.name
        log.info(
            f"Using MLFlow logger with tracking URI {tracking_uri} and experiment name {exp_name}"
        )
        # BUG FIX: pass tracking_uri by keyword — in recent pytorch-lightning
        # versions the second positional parameter of MLFlowLogger is
        # run_name, so the positional call silently misbound the URI.
        rv = MLFlowLogger(exp_name, tracking_uri=tracking_uri)
        # link the MLflow run to the AzureML run id
        rv._run_id = run.id
    else:
        log.warning("Unable to get AML run context! Logging locally.")
        rv = True
    return rv
def main(config):
    """Train a DCGAN on images under the configured train directory, logging
    the run and the log-file artifacts to MLflow.

    Args:
        config: dataclass-like config with seed, paths, MLflow settings and
            trainer options (flattened into the GAN module via dc.asdict).
    """
    pl.seed_everything(config.seed)
    gpus = [0] if torch.cuda.is_available() else None

    filepath_list_train = generate_pathlist.make_datapath_list(
        config.project_dir + config.train_dir,
    )
    # the test split reuses the training file list
    dm = image_datamodule.ImageDataModule(
        filepath_list_train=filepath_list_train,
        filepath_list_test=filepath_list_train,
    )

    discriminator = dcgan.Discriminator()
    generator = dcgan.Generator()
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    model = GAN(
        discriminator=discriminator,
        generator=generator,
        criterion=criterion,
        **dc.asdict(config),
    )

    # standard mlflow.* tags shown in the MLflow UI
    mlflow_tags = {}
    mlflow_tags["mlflow.runName"] = config.run_name
    mlflow_tags["mlflow.user"] = config.user
    # NOTE(review): '/' is replaced with '\' in the script path — presumably a
    # Windows-path normalization for the tracking server; confirm.
    mlflow_tags["mlflow.source.name"] = str(os.path.abspath(__file__)).replace("/", '\\')
    mlf_logger = MLFlowLogger(
        experiment_name=config.experiment_name,
        tracking_uri=config.tracking_uri,
        tags=mlflow_tags
    )

    # timestamped checkpoint path, tied to this MLflow run id
    now = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    checkpoint_callback = ModelCheckpoint(
        filepath=f"{config.checkpoint_dir}{now}_{mlf_logger.run_id}",
        save_top_k=None,
        monitor=None,
    )

    trainer = pl.Trainer(
        max_epochs=config.max_epochs,
        logger=mlf_logger,
        gpus=gpus,
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=None,
    )
    trainer.fit(model, datamodule=dm)

    # save the normal/error log files to mlflow as run artifacts
    mlf_logger.experiment.log_artifact(mlf_logger.run_id, config.log_dir + "/" + config.log_normal)
    mlf_logger.experiment.log_artifact(mlf_logger.run_id, config.log_dir + "/" + config.log_error)
def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
    """Test that the logger calls methods on the mlflow experiment correctly."""
    time.return_value = 1

    logger = MLFlowLogger("test", save_dir=tmpdir, artifact_location="my_artifact_location")
    logger._mlflow_client.get_experiment_by_name.return_value = None

    # hyperparameters are forwarded as individual params
    logger.log_hyperparams({"test": "test_param"})
    logger.experiment.log_param.assert_called_once_with(logger.run_id, "test", "test_param")

    # metrics carry a millisecond timestamp (mocked time) and no step
    logger.log_metrics({"some_metric": 10})
    logger.experiment.log_metric.assert_called_once_with(logger.run_id, "some_metric", 10, 1000, None)

    # the experiment is created with the configured artifact location
    logger._mlflow_client.create_experiment.assert_called_once_with(
        name="test", artifact_location="my_artifact_location")
def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
    """Test that the logger calls methods on the mlflow experiment correctly."""
    time.return_value = 1
    logger = MLFlowLogger('test', save_dir=tmpdir, artifact_location='my_artifact_location')
    logger._mlflow_client.get_experiment_by_name.return_value = None

    # params are forwarded one key at a time
    logger.log_hyperparams({'test': 'test_param'})
    logger.experiment.log_param.assert_called_once_with(logger.run_id, 'test', 'test_param')

    # metrics carry a ms timestamp derived from the mocked time()
    logger.log_metrics({'some_metric': 10})
    logger.experiment.log_metric.assert_called_once_with(logger.run_id, 'some_metric', 10, 1000, None)

    logger._mlflow_client.create_experiment.assert_called_once_with(
        name='test',
        artifact_location='my_artifact_location',
    )
def train(args):
    """Train a LitLSTM, evaluate on the test set and persist predictions.

    Args:
        args: namespace of CLI options (seed, verbose, early, acc_grads, ...).

    Returns:
        The MLflow run id of the training run.
    """
    seed_everything(args.seed)
    model = LitLSTM(args)
    logger = MLFlowLogger(experiment_name='Default')
    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        min_delta=0.005,
        patience=3,
        verbose=args.verbose,
        mode='min') if args.early else None
    # BUG FIX: when --early is off the callback is None, and the original
    # passed callbacks=[None] to the Trainer, which is not a valid callback
    # list; only include the callback when it exists.
    trainer = Trainer(
        log_gpu_memory='all' if args.verbose else None,
        track_grad_norm=2 if args.verbose else -1,
        logger=logger,
        weights_summary='full',
        callbacks=[early_stop_callback] if early_stop_callback is not None else [],
        accumulate_grad_batches=args.acc_grads,
        profiler=args.verbose,
        **get_trainer_params(args),
    )
    logger.log_hyperparams(model.args)
    trainer.fit(model)
    trainer.test(model, test_dataloaders=model.test_dataloader())
    model.save_preds_and_targets(to_disk=True)
    logger.finalize()
    return logger.run_id
def main(args: Args):
    """Train an MnistEncoder for one epoch, logging to MLflow."""
    data = Mnist(
        32,
        0.9,
        Path(args.train_image),
        Path(args.train_label),
        Path(args.test_image),
        Path(args.test_label),
    )
    model = MnistEncoder(28, 64, 3)
    # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.loggers.mlflow.html
    mlflow_logger = MLFlowLogger()
    trainer = pl.Trainer(max_epochs=1, logger=mlflow_logger)
    trainer.fit(model, train_dataloader=data)
def test_mlflow_pickle(tmpdir):
    """Verify that pickling trainer with mlflow logger works."""
    tutils.reset_seed()

    tracking_uri = f'file:{os.sep * 2}{os.path.join(tmpdir, "mlruns")}'
    logger = MLFlowLogger('test', tracking_uri=tracking_uri)
    trainer = Trainer(default_save_path=tmpdir, max_epochs=1, logger=logger)

    # round-trip the whole trainer and make sure the restored logger still works
    restored = pickle.loads(pickle.dumps(trainer))
    restored.logger.log_metrics({'acc': 1.0})
def main(config):
    """Train and test an image classifier (CNN) and log artifacts to MLflow.

    Args:
        config: config object carrying seed, dataset paths, learning rate,
            MLflow settings and trainer options.
    """
    pl.seed_everything(config.seed)
    gpus = [0] if torch.cuda.is_available() else None

    # build (file path, label) lists for both splits
    filepath_list_train, label_list_train = generate_pathlist.make_datapath_list(
        config.project_dir + config.train_dir,
        config.project_dir + config.train_label_path)
    filepath_list_test, label_list_test = generate_pathlist.make_datapath_list(
        config.project_dir + config.test_dir,
        config.project_dir + config.test_label_path)
    dm = image_datamodule.ImageDataModule(
        filepath_list_train=filepath_list_train,
        filepath_list_test=filepath_list_test,
        label_list_train=label_list_train,
        label_list_test=label_list_test,
    )

    net = image_network.CNN()
    model = LitClassifier(
        model=net,
        learning_rate=config.learning_rate,
    )

    # standard mlflow.* tags shown in the MLflow UI
    mlflow_tags = {}
    mlflow_tags["mlflow.runName"] = config.run_name
    mlflow_tags["mlflow.user"] = config.user
    # NOTE(review): '/' is replaced with '\' in the source path — presumably a
    # Windows-path normalization for the tracking server; confirm.
    mlflow_tags["mlflow.source.name"] = str(os.path.abspath(__file__)).replace(
        "/", '\\')
    mlf_logger = MLFlowLogger(experiment_name=config.experiment_name,
                              tracking_uri=config.tracking_uri,
                              tags=mlflow_tags)

    trainer = pl.Trainer(
        max_epochs=config.max_epochs,
        logger=mlf_logger,
        gpus=gpus,
        resume_from_checkpoint=None,
    )
    trainer.fit(model, datamodule=dm)
    result = trainer.test(model, datamodule=dm)
    pprint(result)

    # upload the normal/error log files as run artifacts
    mlf_logger.experiment.log_artifact(
        mlf_logger.run_id, config.log_dir + "/" + config.log_normal)
    mlf_logger.experiment.log_artifact(mlf_logger.run_id,
                                       config.log_dir + "/" + config.log_error)
def test_mlflow_run_id_setting(client, mlflow, tmpdir):
    """Test that the run_id argument uses the provided run_id."""
    existing_run = MagicMock()
    existing_run.info.run_id = "run-id"
    existing_run.info.experiment_id = "experiment-id"

    # make the client pretend this run already exists
    client.return_value.get_run = MagicMock(return_value=existing_run)

    # a logger constructed with run_id must attach to that run
    logger = MLFlowLogger("test", run_id=existing_run.info.run_id, save_dir=tmpdir)
    _ = logger.experiment
    client.return_value.get_run.assert_called_with(existing_run.info.run_id)
    assert logger.experiment_id == existing_run.info.experiment_id
    assert logger.run_id == existing_run.info.run_id

    client.reset_mock(return_value=True)
def test_mlflow_logger_dirs_creation(tmpdir):
    """Test that the logger creates the folders and files in the right place."""
    if not _MLFLOW_AVAILABLE:
        pytest.xfail(
            "test for explicit file creation requires mlflow dependency to be installed."
        )

    assert not os.listdir(tmpdir)
    logger = MLFlowLogger("test", save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    # mlflow's file store creates a '.trash' folder immediately
    assert set(os.listdir(tmpdir)) == {".trash"}
    run_id = logger.run_id
    exp_id = logger.experiment_id

    # multiple experiment calls should not lead to new experiment folders
    for i in range(2):
        _ = logger.experiment
        assert set(os.listdir(tmpdir)) == {".trash", exp_id}
        assert set(os.listdir(tmpdir / exp_id)) == {run_id, "meta.yaml"}

    class CustomModel(BoringModel):
        # log the epoch so a 'metrics/epoch' file appears in the run folder
        def training_epoch_end(self, *args, **kwargs):
            super().training_epoch_end(*args, **kwargs)
            self.log("epoch", self.current_epoch)

    model = CustomModel()
    limit_batches = 5
    trainer = Trainer(
        default_root_dir=tmpdir,
        logger=logger,
        max_epochs=1,
        limit_train_batches=limit_batches,
        limit_val_batches=limit_batches,
        log_gpu_memory=True,
    )
    trainer.fit(model)
    # the run folder is reused; training must not create a new run
    assert set(os.listdir(tmpdir / exp_id)) == {run_id, "meta.yaml"}
    assert "epoch" in os.listdir(tmpdir / exp_id / run_id / "metrics")
    assert set(os.listdir(tmpdir / exp_id / run_id / "params")) == model.hparams.keys()
    # exactly one checkpoint, named after the last global step of the epoch
    assert trainer.checkpoint_callback.dirpath == (tmpdir / exp_id / run_id / "checkpoints")
    assert os.listdir(trainer.checkpoint_callback.dirpath) == [
        f"epoch=0-step={limit_batches - 1}.ckpt"
    ]
def test_mlflow_pickle(tmpdir):
    """Verify that pickling trainer with mlflow logger works."""
    tutils.reset_seed()

    uri = f"file:{os.sep * 2}{os.path.join(tmpdir, 'mlruns')}"
    logger = MLFlowLogger("test", tracking_uri=uri)
    trainer = Trainer(default_save_path=tmpdir, max_epochs=1, logger=logger)

    # pickle round-trip, then log through the restored logger
    clone = pickle.loads(pickle.dumps(trainer))
    clone.logger.log_metrics({"acc": 1.0})
def main():
    """Parse CLI args and train an MLP with MLflow logging, checkpointing and
    Telegram progress reporting."""
    parser = ArgumentParser()
    # using this will log all params in mlflow board automatically
    parser = Trainer.add_argparse_args(parser)
    parser = MLP.add_model_specific_args(parser)
    args = parser.parse_args()

    experiment_name = 'mlp'
    # tb_logger = loggers.TensorBoardLogger('logs')
    mlf_logger = MLFlowLogger(
        experiment_name=experiment_name,
        tracking_uri="file:./mlruns"
    )

    # BUG FIX: the two os.mkdir calls failed when the 'model_weights' parent
    # did not exist, and the exists-then-create checks were race-prone; one
    # makedirs call with exist_ok=True builds the run-specific path safely.
    save_folder = 'model_weights/' + experiment_name + '/' + mlf_logger.run_id + '/'
    os.makedirs(save_folder, exist_ok=True)

    early_stopping = EarlyStopping('val_loss')
    # saves checkpoints to 'save_folder' whenever 'val_loss' has a new min
    checkpoint_callback = ModelCheckpoint(
        filepath=save_folder+'/model_{epoch:02d}-{val_loss:.2f}')

    # telegram
    token = telegram_config['token']
    user_id = telegram_config['user_id']
    bot = DLBot(token=token, user_id=user_id)
    telegramCallback = TelegramBotCallback(bot)

    model = MLP(args)

    trainer = Trainer(checkpoint_callback=checkpoint_callback,
                      early_stop_callback=early_stopping,
                      fast_dev_run=False,  # make this as True only to check for bugs
                      max_epochs=1000,
                      resume_from_checkpoint=None,  # change this to model_path
                      logger=mlf_logger,  # mlflow logger
                      callbacks=[telegramCallback],  # telegrad
                      )
    trainer.fit(model)
    trainer.test()