def test_mlflow_bad_metric_name_handling(dirname):
    """An invalid metric name triggers a warning; the valid metric is still stored."""
    import mlflow
    from mlflow.tracking import MlflowClient

    tracking_uri = os.path.join(dirname, "mlruns")
    true_values = [123.0, 23.4, 333.4]

    with MLflowLogger(tracking_uri) as mlflow_logger:
        active_run = mlflow.active_run()

        handler = OutputHandler(tag="training", metric_names="all")
        engine = Engine(lambda e, b: None)
        initial_metrics = {"metric:0 in %": 123.0, "metric 0": 1000.0}
        engine.state = State(metrics=initial_metrics)

        # First epoch: the handler is expected to warn about the invalid name.
        with pytest.warns(
            UserWarning,
            match=r"MLflowLogger output_handler encountered an invalid metric name",
        ):
            engine.state.epoch = 1
            handler(engine, mlflow_logger, event_name=Events.EPOCH_COMPLETED)

        # Following epochs: overwrite the valid metric and log it again.
        for value in true_values:
            engine.state.epoch += 1
            engine.state.metrics['metric 0'] = value
            handler(engine, mlflow_logger, event_name=Events.EPOCH_COMPLETED)

    client = MlflowClient(tracking_uri=tracking_uri)
    stored_values = client.get_metric_history(
        active_run.info.run_id, "training metric 0"
    )

    expected_values = [1000.0] + true_values
    for expected, stored in zip(expected_values, stored_values):
        assert expected == stored.value
def test_autolog_early_stopping_callback():
    """Autologging records the EarlyStopping callback's params and metrics."""
    mlflow.paddle.autolog()

    early_stopping = paddle.callbacks.EarlyStopping(
        "loss", mode="min", patience=1, min_delta=0
    )
    with mlflow.start_run() as run:
        train_model(callbacks=[early_stopping])

    run_id = run.info.run_id
    client = MlflowClient()
    data = client.get_run(run_id).data

    # Callback configuration is logged as (stringified) params.
    for param_key in ["monitor", "patience", "min_delta", "baseline"]:
        assert param_key in data.params
        expected_param = str(getattr(early_stopping, param_key))
        assert data.params[param_key] == expected_param

    # Callback results are logged as metrics.
    for metric_key in ["stopped_epoch", "best_value"]:
        assert metric_key in data.metrics
        logged_value = float(data.metrics[metric_key])
        assert logged_value == getattr(early_stopping, metric_key)

    # Training metrics have one history entry per epoch.
    for metric_key in ["loss", "step"]:
        assert metric_key in data.metrics
        metric_history = client.get_metric_history(run_id, metric_key)
        assert len(metric_history) == NUM_EPOCHS
def test_autolog_logs_expected_data():
    """Autologging records the expected params, metrics and model summary."""
    mlflow.paddle.autolog()

    with mlflow.start_run() as run:
        train_model()

    run_id = run.info.run_id
    client = MlflowClient()
    data = client.get_run(run_id).data

    # Testing params are logged
    expected_params = {"optimizer_name": "Adam", "learning_rate": "0.01"}
    for param_key, expected_param_value in expected_params.items():
        assert param_key in data.params
        assert data.params[param_key] == expected_param_value

    # Testing metrics are logged
    expected_metric_keys = (
        "batch_size",
        "loss",
        "step",
        "eval_batch_size",
        "eval_loss",
        "eval_step",
    )
    for metric_key in expected_metric_keys:
        assert metric_key in data.metrics
        metric_history = client.get_metric_history(run_id, metric_key)
        assert len(metric_history) == NUM_EPOCHS

    # Testing model_summary.txt is saved
    artifact_paths = [artifact.path for artifact in client.list_artifacts(run_id)]
    assert "model_summary.txt" in artifact_paths
def test_mlflow(ray_start_4_cpus, tmp_path):
    """MLflowLoggerCallback logs params and every reported metric value."""
    from mlflow.tracking import MlflowClient

    config = TestConfig()
    params = {"p1": "p1"}
    temp_dir = tmp_path
    num_workers = 4

    def train_func(config):
        # Report three consecutive results: 4, 5, 6.
        for reward in (4, 5, 6):
            train.report(episode_reward_mean=reward)
        return 1

    callback = MLflowLoggerCallback(experiment_name="test_exp", logdir=temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, config=params, callbacks=[callback])

    mlflow_module = callback.mlflow_util._mlflow
    client = MlflowClient(tracking_uri=mlflow_module.get_tracking_uri())

    experiment_id = client.get_experiment_by_name("test_exp").experiment_id
    all_runs = mlflow_module.search_runs(experiment_ids=[experiment_id])
    assert len(all_runs) == 1

    # all_runs is a pandas dataframe.
    run_records = all_runs.to_dict(orient="records")
    run_id = run_records[0]["run_id"]
    run = client.get_run(run_id)

    assert run.data.params == params

    logged_metrics = run.data.metrics
    assert "episode_reward_mean" in logged_metrics
    assert logged_metrics["episode_reward_mean"] == 6.0
    assert TRAINING_ITERATION in logged_metrics
    assert logged_metrics[TRAINING_ITERATION] == 3.0

    metric_history = client.get_metric_history(
        run_id=run_id, key="episode_reward_mean"
    )
    assert len(metric_history) == 3

    iterations = []
    rewards = []
    for metric in metric_history:
        iterations.append(metric.step)
        rewards.append(metric.value)
    assert iterations == [1, 2, 3]
    assert rewards == [4, 5, 6]
def test_integration(dirname):
    """Values logged through an attached handler can be read back from MLflow."""
    import mlflow
    from mlflow.tracking import MlflowClient

    n_epochs = 5
    data = list(range(50))
    tracking_uri = os.path.join(dirname, "mlruns")

    # One random "loss" per iteration, consumed by the update function.
    losses = torch.rand(n_epochs * len(data))
    losses_iter = iter(losses)

    def update_fn(engine, batch):
        return next(losses_iter)

    trainer = Engine(update_fn)
    mlflow_logger = MLflowLogger(tracking_uri=tracking_uri)

    true_values = []

    def dummy_handler(engine, logger, event_name):
        global_step = engine.state.get_event_attrib_value(event_name)
        value = global_step * 0.1
        true_values.append(value)
        metric_name = "{}".format("test_value")
        logger.log_metrics({metric_name: value}, step=global_step)

    mlflow_logger.attach(
        trainer, log_handler=dummy_handler, event_name=Events.EPOCH_COMPLETED
    )

    active_run = mlflow.active_run()

    trainer.run(data, max_epochs=n_epochs)
    mlflow_logger.close()

    client = MlflowClient(tracking_uri=tracking_uri)
    stored_values = client.get_metric_history(active_run.info.run_id, "test_value")

    for expected, stored in zip(true_values, stored_values):
        assert pytest.approx(expected) == stored.value
def _load(self) -> MetricsDict:
    """Load MlflowMetricDataSet.

    For every metric of the run accepted by ``self._is_dataset_metric``, the
    full metric history is fetched and folded into the result with
    ``self._update_metric``.

    Returns:
        MetricsDict: Dictionary with MLflow metrics dataset.
    """
    client = MlflowClient()
    run_id = self.run_id
    latest_metrics = client._tracking_client.store.get_all_metrics(run_uuid=run_id)

    dataset = {}
    for metric in filter(self._is_dataset_metric, latest_metrics):
        # get_all_metrics returns last saved values per metric key.
        # All values are required here.
        full_history = client.get_metric_history(run_id, metric.key)
        dataset = self._update_metric(full_history, dataset)
    return dataset
def _load(self):
    """Return a single value of the metric ``self.key`` for the run.

    Without a ``"step"`` entry in ``self._load_args`` the most recently
    recorded value is returned; otherwise the last recorded value whose step
    equals the requested one.
    """
    self._validate_run_id()
    client = MlflowClient()
    # gets active run if no run_id was given
    history = client.get_metric_history(run_id=self.run_id, key=self.key)

    # The metric history is always a list of mlflow.entities.metric.Metric;
    # this dataset only deals with one single metric, so one value is returned.
    requested_step = self._load_args.get("step")
    if requested_step is None:
        # we take the last value recorded
        return history[-1].value

    # Several values may share the same step: walk the history backwards so
    # the last one recorded for that step wins.
    values_at_step = (
        entry.value for entry in reversed(history) if entry.step == requested_step
    )
    return next(values_at_step)
def _save(self, data: float):
    """Log ``data`` as a new value of the metric ``self.key``.

    Nothing happens when logging is deactivated. When ``save_args`` carries no
    ``"step"``, the step is derived from the existing history: the highest
    recorded step in "overwrite" mode (0 if none), the highest recorded step
    plus one in "append" mode (0 if none).

    Raises:
        ValueError: if no step is given and ``self.mode`` is neither
            "overwrite" nor "append".
    """
    if not self._logging_activated:
        return

    self._validate_run_id()
    # Read once instead of calling self.run_id everywhere, to avoid looking
    # for an active run each time.
    run_id = self.run_id
    mlflow_client = MlflowClient()

    # Fetch the metric history if it has been saved previously, to make sure
    # the right data is retrieved. Reminder: this also holds when no run_id
    # was originally specified but a run is active.
    if self._exists():
        metric_history = mlflow_client.get_metric_history(run_id=run_id, key=self.key)
    else:
        metric_history = []

    save_args = deepcopy(self._save_args)
    step = save_args.pop("step", None)

    if step is None:
        if self.mode not in ("overwrite", "append"):
            raise ValueError(
                f"save_args['mode'] must be one of {self.SUPPORTED_SAVE_MODES}, got '{self.mode}' instead."
            )
        recorded_steps = [metric.step for metric in metric_history]
        if self.mode == "overwrite":
            step = max(recorded_steps, default=0)
        else:
            # max([]) defaults to -1 so that the default "step" equals 0
            step = max(recorded_steps, default=-1) + 1

    mlflow_client.log_metric(
        run_id=run_id,
        key=self.key,
        value=data,
        step=step,
        **save_args,
    )
def assert_are_metrics_logged(
    data: Dict[str, Union[float, List[float]]],
    client: MlflowClient,
    run_id: str,
    prefix: Optional[str] = None,
) -> bool:
    """Check that every metric in ``data`` was logged to the given run.

    Args:
        data: Expected metrics keyed by metric name. As read by this helper,
            each entry is either a mapping with a ``"value"`` key or a list
            of such mappings (one per logged point).
        client: MLflow client used to fetch each metric's history.
        run_id: Id of the run where the data was logged.
        prefix: Optional prefix; when truthy, metrics are looked up under
            ``"<prefix>.<key>"`` instead of ``"<key>"``.

    Returns:
        ``True`` when every check passed, so the helper can itself be used
        inside an ``assert`` as its ``-> bool`` signature suggests.

    Raises:
        AssertionError: if the number of logged points, a logged value or a
            logged key does not match the expectation.
    """
    for key, expected in data.items():
        metric_key = f"{prefix}.{key}" if prefix else key
        history = client.get_metric_history(run_id, metric_key)

        # Treat a single expected point like a one-element list.
        expected_points = expected if isinstance(expected, list) else [expected]
        assert len(history) == len(expected_points)

        for item, point in zip(history, expected_points):
            assert item.value == point["value"] and item.key == metric_key
    return True
def _load(self):
    """Return the history of the metric ``self.key`` in the requested shape.

    The shape is selected by ``self._load_args["mode"]`` (default ``"list"``):

    * ``"list"``: list of values, in the order returned by MLflow;
    * ``"dict"``: mapping ``step -> value``;
    * ``"history"``: list of dicts with ``"step"``, ``"value"`` and
      ``"timestamp"`` keys.

    Raises:
        ValueError: if the mode is not one of the three above. Previously
            this case fell through all branches and failed with an
            ``UnboundLocalError`` on the return statement.
    """
    self._validate_run_id()
    mode = self._load_args.get("mode", "list")
    supported_modes = ("list", "dict", "history")
    if mode not in supported_modes:
        # Fail before contacting MLflow: no branch below could build a result.
        raise ValueError(
            f"load_args['mode'] must be one of {supported_modes}, got '{mode}' instead."
        )

    mlflow_client = MlflowClient()
    metric_history = mlflow_client.get_metric_history(self.run_id, key=self.key)

    if mode == "list":
        return [metric.value for metric in metric_history]
    if mode == "dict":
        return {metric.step: metric.value for metric in metric_history}

    # "history" mode: a list of dicts whose keys are "log_metric" arguments,
    # e.g. [{"step": 0, "value": 0.1, "timestamp": ...}, ...]
    return [
        {
            "step": metric.step,
            "value": metric.value,
            "timestamp": metric.timestamp,
        }
        for metric in metric_history
    ]
class MlflowHelper:
    """Collect the runs of one MLflow experiment and load their metric histories.

    Runs are flattened into ``self.run_df`` (one row per run, columns such as
    ``info_run_id``, ``info_status``, ``data_tags_*`` and ``data_params_*``).
    Metric histories are read from a local mlruns directory when it exists
    and from the tracking server otherwise.
    """

    def __init__(
        self,
        tracking_uri: str = "http://localhost:5000",
        local_mlflow_dir_prefix: str = "../gsim01/mlruns/",
        experiment_name: str = "Domain Guided Monitoring",
        experiment_id: Optional[str] = "1",
        pkl_file: Optional[Path] = None,
    ):
        """Create the client and optionally restore cached runs.

        Args:
            tracking_uri: URI of the MLflow tracking server.
            local_mlflow_dir_prefix: Directory prefix; the experiment id is
                appended to form the local directory searched for metrics.
            experiment_name: Used to look up the experiment id when
                ``experiment_id`` is ``None``.
            experiment_id: Experiment to query.
            pkl_file: Pickled ``run_df`` to restore, if the file exists.
        """
        self.mlflow_client = MlflowClient(tracking_uri=tracking_uri)
        if experiment_id is not None:
            self.experiment_id = experiment_id
        else:
            self.experiment_id = self.mlflow_client.get_experiment_by_name(
                experiment_name
            ).experiment_id
        self.local_mlflow_dir = local_mlflow_dir_prefix + str(self.experiment_id) + "/"
        if pkl_file is not None and pkl_file.exists():
            # Read the file that was actually passed in (the name used to be
            # hard-coded). NOTE(review): unpickling can execute arbitrary
            # code; only load cache files from a trusted source.
            self.run_df = pd.read_pickle(pkl_file)
            print("Initialized with", len(self.run_df), "MLFlow runs from pkl")
        else:
            self.run_df = pd.DataFrame(columns=["info_run_id"])
        self.metric_history_names: Set[str] = set()

    def query_valid_runs(
        self,
        pkl_file: Optional[Path] = None,
        valid_sequence_types: List[str] = ['mimic', 'huawei_logs'],
        filter_string_suffix: Optional[str] = " and params.ModelConfigrnn_type = 'gru'",
    ):
        """Query the runs of every given sequence type and optionally pickle ``run_df``."""
        for sequence_type in valid_sequence_types:
            filter_string = "tags.sequence_type = '" + sequence_type + "'"
            if filter_string_suffix is not None:
                filter_string = filter_string + filter_string_suffix
            self.query_runs(filter_string=filter_string)
            print("Queried", len(self.run_df), "runs from MLFlow for", sequence_type)
        if pkl_file is not None:
            self.run_df.to_pickle(pkl_file)

    def query_runs(
        self,
        filter_string: Optional[str] = None,
        pkl_file: Optional[Path] = None,
    ):
        """Search the experiment's runs, merge them into ``run_df`` and optionally pickle it."""
        runs = self.mlflow_client.search_runs(
            experiment_ids=[self.experiment_id],
            max_results=10000,
            filter_string=filter_string,
        )
        for run in tqdm(runs, desc="Querying data per run..."):
            self._handle_run(run)
        if pkl_file is not None:
            self.run_df.to_pickle(pkl_file)

    @staticmethod
    def _refine_model_type(run_record: Dict[str, Any]) -> None:
        """Rewrite ``data_tags_model_type`` in place based on the run's params.

        The checks run in sequence and each one sees the result of the
        previous one (e.g. "causal" -> "causal2" -> "causal2_logonly").
        """
        model_type_key = "data_tags_model_type"
        if (
            run_record.get(model_type_key, "") == "causal"
            and run_record.get(
                "data_params_KnowledgeConfigadd_causality_prefix", "False"
            ) == "True"
        ):
            run_record[model_type_key] = "causal2"
        if (
            run_record.get(model_type_key, "") in ("causal", "causal2")
            and run_record.get("data_tags_sequence_type", "") == "huawei_logs"
            and run_record.get(
                "data_params_HuaweiPreprocessorConfiglog_only_causality", ""
            ) == "True"
        ):
            run_record[model_type_key] = run_record[model_type_key] + "_logonly"
        if (
            run_record.get(model_type_key, "") == "text"
            and run_record.get(
                "data_params_KnowledgeConfigbuild_text_hierarchy", "False"
            ) == "True"
        ):
            run_record[model_type_key] = "text_hierarchy"
        if (
            run_record.get(model_type_key, "") == "gram"
            and run_record.get("data_tags_sequence_type", "") == "huawei_logs"
            and run_record.get("data_params_KnowledgeConfigadd_causality_prefix")
            and run_record.get(
                "data_params_HuaweiPreprocessorConfiguse_log_hierarchy", "False"
            ) == "True"
        ):
            run_record[model_type_key] = "gram_logs"

    def _handle_run(self, run):
        """Flatten one MLflow run and insert or refresh its row in ``run_df``.

        Finished runs that are already stored exactly once as finished are
        skipped; unfinished runs are only stored when a row for them already
        exists.
        """
        run_id = run.info.run_id
        status = run.info.status

        if (
            len(self.run_df) > 0
            and run_id in set(self.run_df["info_run_id"])
            and status == "FINISHED"
            and len(
                self.run_df[
                    (self.run_df["info_run_id"] == run_id)
                    & (self.run_df["info_status"] == status)
                ]
            ) == 1
        ):
            return
        if status != "FINISHED" and run_id not in set(self.run_df["info_run_id"]):
            return

        # Flatten two nesting levels: {"data": {"tags": {"x": 1}}} -> "data_tags_x".
        run_dict = {
            (k + "_" + sk): v
            for k, sd in run.to_dictionary().items()
            for sk, v in sd.items()
        }
        final_run_dict = {
            (k + "_" + sk): v
            for k, sd in run_dict.items()
            if isinstance(sd, dict)
            for sk, v in sd.items()
        }
        final_run_dict.update(
            {k: v for k, v in run_dict.items() if not isinstance(v, dict)}
        )
        self._refine_model_type(final_run_dict)

        # DataFrame.append was removed in pandas 2.0; pd.concat works on all versions.
        self.run_df = pd.concat(
            [self.run_df, pd.DataFrame([final_run_dict])], ignore_index=True
        ).drop_duplicates(subset=["info_run_id"], keep="last", ignore_index=True)

    def mimic_run_df(
        self,
        include_noise: bool = False,
        include_refinements: bool = False,
        risk_prediction: bool = False,
        valid_x_columns: List[str]=["level_0"],
        valid_y_columns: List[str]=["level_3"],
    ) -> pd.DataFrame:
        """Return the rows of ``run_df`` that match the fixed MIMIC experiment setup.

        Args:
            include_noise: Keep runs that carry a noise type tag.
            include_refinements: Keep runs that carry a refinement type tag.
            risk_prediction: Select sigmoid risk-prediction runs instead of
                softmax runs with flattened y.
            valid_x_columns: Allowed x sequence columns; empty disables the filter.
            valid_y_columns: Allowed y sequence columns; empty disables the filter.
        """
        runs = self.run_df
        mimic_run_df = runs[
            (runs["data_tags_sequence_type"] == "mimic")
            & (runs["data_params_ModelConfigrnn_type"] == "gru")
            & (runs["data_params_SequenceConfigtest_percentage"].fillna("").astype(str) == "0.2")
            & (runs["data_params_ModelConfigbest_model_metric"] == "val_loss")
            & (runs["info_status"] == "FINISHED")
            & (runs["data_params_ModelConfigrnn_dim"] == "200")
            & (runs["data_params_ModelConfigoptimizer"].fillna("adam") == "adam")
            & (runs["data_params_ModelConfigdropout_rate"].fillna("0.0").astype(str) == "0.5")
            & (runs["data_params_ModelConfigrnn_dropout"].fillna("0.0").astype(str) == "0.0")
            & (runs["data_params_ModelConfigkernel_regularizer_scope"].fillna("[]") == "[]")
            & (runs["data_params_SequenceConfigpredict_full_y_sequence_wide"].astype(str).fillna("") == "True")
            & (
                (
                    (runs["data_params_SequenceConfigy_sequence_column_name"].astype(str) == "level_3")
                    & (runs["data_params_ExperimentConfigbatch_size"].astype(str).fillna("") == "128")
                )
                | (
                    (runs["data_params_SequenceConfigy_sequence_column_name"].astype(str) == "level_2")
                    & (runs["data_params_ExperimentConfigbatch_size"].astype(str).fillna("") == "16")
                )
            )
            & (runs["data_params_MimicPreprocessorConfigreplace_keys"].fillna("[]") == "[]")
        ]
        if risk_prediction:
            mimic_run_df = mimic_run_df[
                (mimic_run_df["data_tags_task_type"] == "risk_prediction")
                & (mimic_run_df["data_params_ModelConfigfinal_activation_function"] == "sigmoid")
            ]
        else:
            mimic_run_df = mimic_run_df[
                (mimic_run_df["data_params_ModelConfigfinal_activation_function"] == "softmax")
                & (mimic_run_df["data_params_SequenceConfigflatten_y"] == "True")
            ]
        if len(valid_x_columns) > 0:
            mimic_run_df = mimic_run_df[
                mimic_run_df["data_params_SequenceConfigx_sequence_column_name"].apply(
                    lambda x: x in valid_x_columns
                )
            ]
        if len(valid_y_columns) > 0:
            mimic_run_df = mimic_run_df[
                mimic_run_df["data_params_SequenceConfigy_sequence_column_name"].apply(
                    lambda x: x in valid_y_columns
                )
            ]
        if not include_noise:
            mimic_run_df = mimic_run_df[
                (mimic_run_df["data_tags_noise_type"].fillna("").apply(len) == 0)
            ]
        if not include_refinements:
            mimic_run_df = mimic_run_df[
                (mimic_run_df["data_tags_refinement_type"].fillna("") == "")
            ]
        return mimic_run_df

    def huawei_run_df(
        self,
        include_noise: bool = False,
        include_refinements: bool = False,
        risk_prediction: bool = False,
        valid_x_columns: List[str]=["log_cluster_template", "fine_log_cluster_template"],
        valid_y_columns: List[str]=["attributes"],
        include_drain_hierarchy: bool=False,
    ) -> pd.DataFrame:
        """Return the rows of ``run_df`` that match the fixed Huawei-logs experiment setup.

        Args:
            include_noise: Keep runs that carry a noise type tag.
            include_refinements: Keep runs that carry a refinement type tag
                or a ``min_causality`` other than "0.01".
            risk_prediction: Select sigmoid risk-prediction runs instead of
                softmax runs with flattened y.
            valid_x_columns: Allowed x sequence columns; empty disables the filter.
            valid_y_columns: Allowed y sequence columns; empty disables the filter.
            include_drain_hierarchy: Keep runs whose ``drain_log_sts`` string
                is longer than two characters (i.e. not "[]").
        """
        runs = self.run_df
        huawei_run_df = runs[
            (runs["data_tags_sequence_type"] == "huawei_logs")
            & (runs["data_params_ModelConfigrnn_type"] == "gru")
            & (runs["data_params_SequenceConfigtest_percentage"].fillna("").astype(str) == "0.1")
            & (runs["data_params_ModelConfigbest_model_metric"] == "val_loss")
            & (runs["info_status"] == "FINISHED")
            & (runs["data_params_ModelConfigrnn_dim"] == "200")
            & (runs["data_params_ModelConfigoptimizer"].fillna("adam") == "adam")
            & (runs["data_params_ModelConfigdropout_rate"].fillna("0.0").astype(str) == "0.5")
            & (runs["data_params_ModelConfigrnn_dropout"].fillna("0.0").astype(str) == "0.0")
            & (runs["data_params_ModelConfigkernel_regularizer_scope"].fillna("[]") == "[]")
            & (runs["data_params_ExperimentConfigbatch_size"].astype(str).fillna("") == "128")
            & (
                (runs["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].astype(str).fillna("") == "0.75")
                | (runs["data_params_HuaweiPreprocessorConfigdrain_log_st"].astype(str).fillna("") == "0.75")
            )
            & (
                (runs["data_params_HuaweiPreprocessorConfigfine_drain_log_depth"].astype(str).fillna("") == "10")
                | (runs["data_params_HuaweiPreprocessorConfigdrain_log_depth"].astype(str).fillna("") == "10")
            )
            & (
                (
                    ~(
                        (runs["data_params_SequenceConfigx_sequence_column_name"].astype(str).fillna("") == "coarse_log_cluster_template")
                        | (runs["data_params_SequenceConfigy_sequence_column_name"].astype(str).fillna("") == "coarse_log_cluster_template")
                        | (runs["data_params_HuaweiPreprocessorConfigdrain_log_sts"].fillna("[]").astype(str).apply(len) > 2)
                    )
                )
                | (
                    (runs["data_params_HuaweiPreprocessorConfigcoarse_drain_log_st"].astype(str).fillna("") == "0.2")
                    & (runs["data_params_HuaweiPreprocessorConfigcoarse_drain_log_depth"].astype(str).fillna("") == "4")
                )
            )
        ]
        if risk_prediction:
            huawei_run_df = huawei_run_df[
                (huawei_run_df["data_tags_task_type"] == "risk_prediction")
                & (huawei_run_df["data_params_ModelConfigfinal_activation_function"] == "sigmoid")
            ]
        else:
            huawei_run_df = huawei_run_df[
                (huawei_run_df["data_params_ModelConfigfinal_activation_function"] == "softmax")
                & (huawei_run_df["data_params_SequenceConfigflatten_y"] == "True")
            ]
        if len(valid_x_columns) > 0:
            huawei_run_df = huawei_run_df[
                huawei_run_df["data_params_SequenceConfigx_sequence_column_name"].apply(
                    lambda x: x in valid_x_columns
                )
            ]
        if len(valid_y_columns) > 0:
            huawei_run_df = huawei_run_df[
                huawei_run_df["data_params_SequenceConfigy_sequence_column_name"].apply(
                    lambda x: x in valid_y_columns
                )
            ]
        if not include_noise:
            huawei_run_df = huawei_run_df[
                (huawei_run_df["data_tags_noise_type"].fillna("").apply(len) == 0)
            ]
        if not include_refinements:
            huawei_run_df = huawei_run_df[
                (huawei_run_df["data_tags_refinement_type"].fillna("") == "")
                & (huawei_run_df["data_params_HuaweiPreprocessorConfigmin_causality"].fillna(0.0).astype(str) == "0.01")
            ]
        if not include_drain_hierarchy:
            huawei_run_df = huawei_run_df[
                huawei_run_df["data_params_HuaweiPreprocessorConfigdrain_log_sts"].fillna("[]").astype(str).apply(len) <= 2
            ]
        return huawei_run_df

    def _load_metrics_from_local(self, run_id: str) -> Optional[Dict[str, List[float]]]:
        """Read all metric files of a run from the local mlruns directory.

        Each file is parsed as space-separated ``time value step`` rows.

        Returns:
            ``{"<metric>_history": values, "<metric>_times": times}``, both
            ordered by step, or ``None`` when the run has no local metrics
            directory.
        """
        local_run_dir = Path(self.local_mlflow_dir + "/" + run_id + "/metrics/")
        # is_dir() is already False for a path that does not exist.
        if not local_run_dir.is_dir():
            return None

        metric_dict: Dict[str, List[float]] = {}
        for metric_file in local_run_dir.iterdir():
            metric = metric_file.name
            metric_history = pd.read_csv(
                metric_file, sep=" ", names=["time", "value", "step"]
            ).to_dict(orient='index')
            ordered = sorted(metric_history.values(), key=lambda x: x["step"])
            metric_dict[metric + "_history"] = [x["value"] for x in ordered]
            metric_dict[metric + "_times"] = [x["time"] for x in ordered]
        return metric_dict

    def _load_metrics_from_remote(self, run_id: str) -> Dict[str, List[float]]:
        """Fetch all metric histories of a run through the MLflow client.

        Returns:
            ``{"<metric>_history": values, "<metric>_times": timestamps}``,
            both ordered by step.
        """
        run = self.mlflow_client.get_run(run_id)
        metric_dict: Dict[str, Any] = {}
        for metric_name in run.data.metrics.keys():
            metric_history = self.mlflow_client.get_metric_history(
                run.info.run_id, metric_name
            )
            ordered = sorted(metric_history, key=lambda x: x.step)
            metric_dict[metric_name + "_history"] = [entry.value for entry in ordered]
            # MLflow metric entities expose the logging time as `timestamp`
            # (there is no `time` attribute).
            metric_dict[metric_name + "_times"] = [entry.timestamp for entry in ordered]
        return metric_dict

    def load_metric_history_for_ids(
        self,
        run_ids: Set[str],
    ):
        """Return one row per (run, metric, epoch) joined with the run metadata.

        The epoch is the position of the value in the step-ordered history.
        """
        metric_records = []
        for run_id in tqdm(run_ids, desc="Querying metrics for runs"):
            metric_dict = self._load_metrics_from_local(run_id=run_id)
            if metric_dict is None:
                metric_dict = self._load_metrics_from_remote(run_id=run_id)

            for metric, metric_history in metric_dict.items():
                for epoch, value in enumerate(metric_history):
                    metric_records.append({
                        "run_id": run_id,
                        metric: value,
                        "epoch": epoch,
                    })

        return pd.merge(
            pd.DataFrame.from_records(metric_records),
            self.run_df,
            left_on="run_id",
            right_on="info_run_id",
            how="left",
        )

    def load_training_times_for_ids(
        self,
        run_ids: Set[str],
        reference_metric_name: str = "val_loss_times",
    ):
        """Return epoch count and duration statistics per run, joined with the run metadata.

        Durations are differences of the reference metric's timestamps. Runs
        without the reference metric are reported and skipped.
        """
        metric_records = []
        for run_id in tqdm(run_ids, desc="Querying metrics for runs"):
            metric_dict = self._load_metrics_from_local(run_id=run_id)
            if metric_dict is None or reference_metric_name not in metric_dict:
                metric_dict = self._load_metrics_from_remote(run_id=run_id)
            if reference_metric_name not in metric_dict:
                print("Error! Reference Metric not in metric_dict", reference_metric_name, run_id)
                continue

            times = [int(x) for x in metric_dict[reference_metric_name]]
            total_duration = max(times) - min(times)
            metric_records.append({
                "run_id": run_id,
                "num_epochs": len(times),
                "total_duration": total_duration,
                "avg_per_epoch": total_duration / len(times),
            })

        return pd.merge(
            pd.DataFrame.from_records(metric_records),
            self.run_df,
            left_on="run_id",
            right_on="info_run_id",
            how="inner",
        )

    def load_best_metrics_for_ids(
        self,
        run_ids: Set[str],
        best_metric_name: str = "val_loss_history",
    ):
        """Return every metric's value at the best epoch per run, joined with the run metadata.

        The best epoch is the position of the smallest ``best_metric_name``
        value (the first one on ties). Runs without that metric are reported
        and skipped.
        """
        metric_records = []
        for run_id in tqdm(run_ids, desc="Querying metrics for runs"):
            metric_dict = self._load_metrics_from_local(run_id=run_id)
            if metric_dict is None or best_metric_name not in metric_dict:
                metric_dict = self._load_metrics_from_remote(run_id=run_id)
            if best_metric_name not in metric_dict:
                print("Error! Best Metric not in metric_dict", best_metric_name, run_id)
                continue

            # Stable ascending sort: the first entry is the earliest minimum.
            best_epoch = sorted(
                enumerate(metric_dict[best_metric_name]),
                key=lambda x: x[1],
                reverse=False,
            )[0][0]
            best_metric_dict = {
                metric_name + "_best": values[best_epoch]
                for metric_name, values in metric_dict.items()
                if len(values) > best_epoch
            }
            best_metric_dict["run_id"] = run_id
            best_metric_dict["epoch"] = best_epoch
            metric_records.append(best_metric_dict)

        return pd.merge(
            pd.DataFrame.from_records(metric_records),
            self.run_df,
            left_on="run_id",
            right_on="info_run_id",
            how="inner",
        )
from mlflow.tracking import MlflowClient

if __name__ == "__main__":

    def print_metric_info(history):
        """Print key, value, step and timestamp of every metric in ``history``."""
        for m in history:
            print(f"name: {m.key}")
            print(f"value: {m.value}")
            print(f"step: {m.step}")
            print(f"timestamp: {m.timestamp}")
            print("--")

    # Create a run under the default experiment (whose id is "0"). Since this
    # is a low-level CRUD operation, the method only creates the run; to end
    # it, you have to terminate it explicitly.
    client = MlflowClient()
    experiment_id = "0"
    run = client.create_run(experiment_id)
    print(f"run_id: {run.info.run_id}")
    print("--")

    # Log a couple of metrics and update their initial value.
    metrics = [("m1", 1.5), ("m2", 2.5)]
    for k, v in metrics:
        client.log_metric(run.info.run_id, k, v, step=0)
        client.log_metric(run.info.run_id, k, v + 1, step=1)

    # Terminate the run exactly once, after everything has been logged.
    client.set_terminated(run.info.run_id)

    # The run is terminated, but the history of each logged metric can still
    # be fetched.
    for k, _ in metrics:
        print_metric_info(client.get_metric_history(run.info.run_id, k))