def test_mlflow(ray_start_4_cpus, tmp_path):
    """Verify MLflowLoggerCallback records params and every reported metric.

    A 4-worker ``Trainer`` runs a training function that reports
    ``episode_reward_mean`` three times (4, 5, 6) while an
    ``MLflowLoggerCallback`` logs to the ``test_exp`` experiment under
    ``tmp_path``. The test then queries MLflow and checks that exactly one
    run exists, that its params equal the training config, and that the
    metric values and steps match what was reported.
    """
    backend_config = TestConfig()
    hyperparams = {"p1": "p1"}

    def train_func(config):
        # One report per training iteration; the last one is the final metric.
        for reward in (4, 5, 6):
            train.report(episode_reward_mean=reward)
        return 1

    callback = MLflowLoggerCallback(
        experiment_name="test_exp", logdir=tmp_path)

    trainer = Trainer(backend_config, num_workers=4)
    trainer.start()
    trainer.run(train_func, config=hyperparams, callbacks=[callback])

    from mlflow.tracking import MlflowClient

    # Query through the same mlflow module (and tracking URI) the callback
    # logged with.
    mlflow_module = callback.mlflow_util._mlflow
    client = MlflowClient(tracking_uri=mlflow_module.get_tracking_uri())

    experiment = client.get_experiment_by_name("test_exp")
    all_runs = mlflow_module.search_runs(
        experiment_ids=[experiment.experiment_id])
    assert len(all_runs) == 1

    # all_runs is a pandas dataframe.
    run_records = all_runs.to_dict(orient="records")
    run_id = run_records[0]["run_id"]
    run = client.get_run(run_id)

    assert run.data.params == hyperparams

    logged_metrics = run.data.metrics
    assert "episode_reward_mean" in logged_metrics
    assert logged_metrics["episode_reward_mean"] == 6.0
    assert TRAINING_ITERATION in logged_metrics
    assert logged_metrics[TRAINING_ITERATION] == 3.0

    reward_history = client.get_metric_history(
        run_id=run_id, key="episode_reward_mean")
    assert len(reward_history) == 3

    logged_steps = [entry.step for entry in reward_history]
    assert logged_steps == [1, 2, 3]

    logged_rewards = [entry.value for entry in reward_history]
    assert logged_rewards == [4, 5, 6]
def main(num_workers=2, use_gpu=False):
    """Run training on a torch-backend ``Trainer`` with MLflow logging.

    Starts the trainer, runs ``train_func`` with a fixed hyperparameter
    config while an ``MLflowLoggerCallback`` logs to the
    ``train_fashion_mnist`` experiment, prints the value returned by
    ``trainer.run``, and always shuts the trainer down afterwards.

    Args:
        num_workers: Number of training workers passed to ``Trainer``.
        use_gpu: Forwarded to ``Trainer(use_gpu=...)``.
    """
    trainer = Trainer(
        backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    try:
        final_results = trainer.run(
            train_func=train_func,
            config={
                "lr": 1e-3,
                "batch_size": 64,
                "epochs": 4,
            },
            callbacks=[
                MLflowLoggerCallback(experiment_name="train_fashion_mnist")
            ])
    finally:
        # Release the training workers even if the run raises; the original
        # never called shutdown(), leaving them alive after main() returned.
        trainer.shutdown()

    print("Full losses for rank 0 worker: ", final_results)
"num_layers": NUM_LAYERS, "dropout_every": DROPOUT_EVERY, "dropout_prob": DROPOUT_PROB, "num_features": num_features, } # Create 2 callbacks: one for Tensorboard Logging and one for MLflow # logging. Pass these into Trainer, and all results that are # reported by ``train.report()`` will be logged to these 2 places. # TODO: TBXLoggerCallback should create nonexistent logdir # and should also create 1 directory per file. tbx_logdir = "./runs" os.makedirs(tbx_logdir, exist_ok=True) callbacks = [ TBXLoggerCallback(logdir=tbx_logdir), MLflowLoggerCallback(experiment_name="cuj-big-data-training", save_artifact=True), ] # Remove CPU resource so Datasets can be scheduled. resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None trainer = Trainer( backend="torch", num_workers=num_workers, use_gpu=use_gpu, resources_per_worker=resources_per_worker, ) trainer.start() results = trainer.run(train_func=train_func, config=config, callbacks=callbacks,