def get_or_create_experiment(experiment_name) -> Experiment:
    """Fetch an active MLflow experiment by name, creating it if needed.

    :param experiment_name: str. The name of the experiment to be set in MLFlow
    :return: the existing experiment if it is active, a freshly created one
        otherwise; ``None`` when the lookup/creation fails for any reason.
    """
    try:
        client = MlflowClient()
        experiment: Experiment = client.get_experiment_by_name(
            name=experiment_name)
        # Reuse only an active experiment: a deleted one keeps its name but
        # cannot accept new runs, so we create a fresh experiment instead.
        if experiment and experiment.lifecycle_stage != 'deleted':
            return experiment
        experiment_id = client.create_experiment(name=experiment_name)
        return client.get_experiment(experiment_id=experiment_id)
    except Exception as e:
        logger.error(
            f'Unable to get or create experiment {experiment_name}: {e}')
        # Fix: make the failure result explicit instead of silently falling
        # off the end of the function (implicit None).
        return None
class MLFlowLoggerCallback(LoggerCallback):
    """Ray Tune ``LoggerCallback`` that mirrors trial state into MLFlow.

    MLFlow (https://mlflow.org) Tracking is an open source library for
    recording and querying experiments. Each Tune trial is represented as one
    MLFlow run: config entries are logged as params, per-iteration results as
    metrics, and (optionally) the trial's local directory as run artifacts.

    Args:
        tracking_uri (str): Where experiments and runs are managed — a local
            file path or a remote server. Passed straight through to
            ``mlflow.tracking.MlflowClient``. In a multi-node Tune setting
            this must point at a remote server, not a local file path.
        registry_uri (str): Registry URI, passed straight through to
            ``mlflow.tracking.MlflowClient``.
        experiment_name (str): Experiment to log this Tune run under. When
            ``None``, the ``MLFLOW_EXPERIMENT_NAME`` and then the
            ``MLFLOW_EXPERIMENT_ID`` environment variables are consulted. An
            existing experiment with the resolved name is reused; otherwise a
            new one is created with that name.
        save_artifact (bool): When True, upload the entire contents of the
            Tune ``local_dir`` as artifacts of the corresponding run.

    Example:

    .. code-block:: python

        from ray.tune.integration.mlflow import MLFlowLoggerCallback

        tune.run(
            train_fn,
            config={
                # define search space here
                "parameter_1": tune.choice([1, 2, 3]),
                "parameter_2": tune.choice([4, 5, 6]),
            },
            callbacks=[MLFlowLoggerCallback(
                experiment_name="experiment1", save_artifact=True)])
    """

    def __init__(self,
                 tracking_uri: Optional[str] = None,
                 registry_uri: Optional[str] = None,
                 experiment_name: Optional[str] = None,
                 save_artifact: bool = False):
        mlflow = _import_mlflow()
        if mlflow is None:
            raise RuntimeError("MLFlow has not been installed. Please `pip "
                               "install mlflow` to use the MLFlowLogger.")
        from mlflow.tracking import MlflowClient

        self.client = MlflowClient(
            tracking_uri=tracking_uri, registry_uri=registry_uri)

        # No explicit name: fall back to the experiment-name env var first.
        if experiment_name is None:
            experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME")

        if experiment_name is not None:
            # Reuse an experiment with this name if one exists, otherwise
            # create it.
            existing = self.client.get_experiment_by_name(experiment_name)
            if existing is None:
                experiment_id = self.client.create_experiment(
                    name=experiment_name)
            else:
                experiment_id = existing.experiment_id
        else:
            # Last resort: an explicit experiment id from the environment,
            # which must reference an existing experiment.
            experiment_id = os.environ.get("MLFLOW_EXPERIMENT_ID")
            if experiment_id is None or self.client.get_experiment(
                    experiment_id) is None:
                raise ValueError("No experiment_name passed, "
                                 "MLFLOW_EXPERIMENT_NAME env var is not "
                                 "set, and MLFLOW_EXPERIMENT_ID either "
                                 "is not set or does not exist. Please "
                                 "set one of these to use the "
                                 "MLFlowLoggerCallback.")

        # Resolved one way or another by this point.
        self.experiment_id = experiment_id
        self.save_artifact = save_artifact
        self._trial_runs = {}

    def log_trial_start(self, trial: "Trial"):
        # Lazily create one MLFlow run per trial, tagged with the trial name.
        if trial not in self._trial_runs:
            new_run = self.client.create_run(
                experiment_id=self.experiment_id,
                tags={"trial_name": str(trial)})
            self._trial_runs[trial] = new_run.info.run_id
        run_id = self._trial_runs[trial]

        # Mirror the trial's config into the run's parameters.
        for param_name, param_value in trial.config.items():
            self.client.log_param(
                run_id=run_id, key=param_name, value=param_value)

    def log_trial_result(self, iteration: int, trial: "Trial", result: Dict):
        run_id = self._trial_runs[trial]
        for metric_name, raw_value in result.items():
            # MLFlow metrics must be numeric; skip anything that isn't.
            try:
                numeric_value = float(raw_value)
            except (ValueError, TypeError):
                logger.debug("Cannot log key {} with value {} since the "
                             "value cannot be converted to float.".format(
                                 metric_name, raw_value))
                continue
            self.client.log_metric(
                run_id=run_id, key=metric_name, value=numeric_value,
                step=iteration)

    def log_trial_end(self, trial: "Trial", failed: bool = False):
        run_id = self._trial_runs[trial]

        # Upload the trial directory as run artifacts when requested.
        if self.save_artifact:
            self.client.log_artifacts(run_id, local_dir=trial.logdir)

        # Close out the run with the trial's terminal status.
        self.client.set_terminated(
            run_id=run_id, status="FAILED" if failed else "FINISHED")
import warnings

from mlflow.tracking import MlflowClient

if __name__ == "__main__":
    # Keep the demo output clean of library warnings.
    warnings.filterwarnings("ignore")

    def print_experiment_info(experiment):
        # Dump the key metadata fields of an experiment to stdout.
        print("Name: {}".format(experiment.name))
        print("Experiment_id: {}".format(experiment.experiment_id))
        print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

    # Experiment names must be unique (and are case sensitive).
    client = MlflowClient()
    experiment_id = client.create_experiment("Social NLP Experiments")

    # Show the experiment's metadata right after creation...
    print_experiment_info(client.get_experiment(experiment_id))
    print("--")

    # ...then rename it and show the refreshed metadata.
    client.rename_experiment(experiment_id, "Social Media NLP Experiments")
    print_experiment_info(client.get_experiment(experiment_id))
class MlflowLogger(BaseTrainLogger):
    """A common mlflow logger for pipeline training

    Parameters
    ----------
    experiment_name
        The experiment name
    artifact_location
        The artifact location used for this experiment
    run_name
        If specified, set a name to created run
    tags
        Extra arguments used as tags to created experiment run
    """

    __LOGGER = logging.getLogger(__name__)

    def __init__(
        self,
        experiment_name: Optional[str] = None,
        artifact_location: Optional[str] = None,
        run_name: Optional[str] = None,
        **tags,
    ):
        self._client = MlflowClient()
        self._experiment = self._configure_experiment_with_retry(
            experiment_name, artifact_location
        )
        # Fix: fail loudly here instead of crashing below with an opaque
        # AttributeError on None when all retries were exhausted.
        if self._experiment is None:
            raise RuntimeError(
                f"Could not fetch or create mlflow experiment "
                f"'{experiment_name}' after retries"
            )

        tags = tags or {}
        if run_name:
            tags[mlflow_tags.MLFLOW_RUN_NAME] = run_name
        run = self._client.create_run(self._experiment.experiment_id, tags=tags)
        self._run_id = run.info.run_id
        # Metrics that are bookkeeping only and should not reach mlflow.
        self._skipped_metrics = ["training_duration"]

    def _configure_experiment_with_retry(
        self, experiment_name: Optional[str], artifact_location: Optional[str],
        retries: int = 5
    ) -> Optional[Experiment]:
        """Tries to configure (fetch or create) an mlflow experiment with
        retrying process on errors. Returns ``None`` when retries run out."""
        if retries <= 0:
            return None
        try:
            experiment = self._client.get_experiment_by_name(
                experiment_name or "default"
            )
            if experiment:
                return experiment
            return self._client.get_experiment(
                self._client.create_experiment(experiment_name, artifact_location)
            )
        except Exception as e:
            self.__LOGGER.debug(e)
            # Recurse with one fewer retry; depth is bounded by `retries`.
            return self._configure_experiment_with_retry(
                experiment_name, artifact_location, retries=retries - 1
            )

    def init_train(
        self,
        pipeline: "Pipeline",
        trainer_configuration: "TrainerConfiguration",
        training: InstancesDataset,
        validation: Optional[InstancesDataset] = None,
        test: Optional[InstancesDataset] = None,
    ):
        """Logs the flattened pipeline/trainer configuration and the model's
        parameter counts as run params before training starts."""
        from pandas import json_normalize

        for prefix, params_set in [
            ("pipeline", json_normalize(pipeline.config.as_dict())),
            ("trainer", json_normalize(dataclasses.asdict(trainer_configuration))),
        ]:
            for key, value in params_set.to_dict(orient="records")[0].items():
                # NOTE(review): this also skips legitimate falsy values
                # (0, False, "") — confirm it is meant only as a None/NaN
                # guard before tightening to `value is not None`.
                if value:
                    self._client.log_param(self._run_id, f"{prefix}.{key}", value)

        self._client.log_param(
            self._run_id, key="pipeline.num_parameters", value=pipeline.num_parameters
        )
        self._client.log_param(
            self._run_id,
            key="pipeline.num_trainable_parameters",
            value=pipeline.num_trainable_parameters,
        )

    def log_epoch_metrics(self, epoch: int, metrics: Dict[str, Any]):
        """Logs one epoch's metrics, skipping bookkeeping-only entries."""
        # Plain loop instead of a side-effect list comprehension (idiom fix).
        for name, value in metrics.items():
            if name not in self._skipped_metrics:
                self._client.log_metric(self._run_id, key=name, value=value,
                                        step=epoch)

    def end_train(self, results: TrainingResults):
        """Logs the trained model artifact and final metrics, then always
        terminates the mlflow run."""
        try:
            self._client.log_artifact(self._run_id, local_path=results.model_path)
            # Plain loop instead of a side-effect list comprehension.
            for name, value in results.metrics.items():
                if name not in self._skipped_metrics:
                    self._client.log_metric(self._run_id, key=name, value=value)
        finally:
            # Terminate the run even if artifact/metric logging failed.
            self._client.set_terminated(self._run_id)
class TianshouMLFlowLogger(tianshou.utils.BaseLogger):
    """Tianshou ``BaseLogger`` backed by an MLflow tracking server.

    The experiment and run are created lazily on first access of the
    ``experiment`` property. Train/test statistics are written as MLflow
    metrics and training checkpoints can be archived as run artifacts.
    """

    def __init__(
        self,
        train_interval=1000,
        test_interval=1,
        update_interval=1000,
        save_interval=1,
        experiment_name="Default",
        run_name=None,
        tracking_uri=None,
        tags=None,
        save_dir="./mlruns",
        prefix="",
        artifact_location=None,
        filename=None,
        info_logger=None,
    ):
        super().__init__(train_interval, test_interval, update_interval)
        self.last_save_step = -1
        self.save_interval = save_interval
        # Default to a local file store under save_dir when no URI is given.
        if not tracking_uri:
            tracking_uri = f"{LOCAL_FILE_URI_PREFIX}{save_dir}"
        self._experiment_name = experiment_name
        self._experiment_id = None
        self._tracking_uri = tracking_uri
        self._run_name = run_name
        self._run_id = None
        self.tags = self._get_mlflow_tags(filename=filename, manual_tags=tags)
        self._prefix = prefix
        self._artifact_location = artifact_location
        self.info_logger = info_logger
        self._mlflow_client = MlflowClient(tracking_uri)

    @property
    def experiment(self):
        """
        Actual MLflow object. Lazily creates the experiment and run on first
        access.

        Example::

            self.logger.experiment.some_mlflow_function()
        """
        if self._experiment_id is None:
            expt = self._mlflow_client.get_experiment_by_name(
                self._experiment_name)
            if expt is not None:
                self._experiment_id = expt.experiment_id
            else:
                self._experiment_id = self._mlflow_client.create_experiment(
                    name=self._experiment_name,
                    artifact_location=self._artifact_location,
                )
        if self._run_id is None:
            if self._run_name is not None:
                self.tags[MLFLOW_RUN_NAME] = self._run_name
            run = self._mlflow_client.create_run(
                experiment_id=self._experiment_id, tags=self.tags)
            self._run_id = run.info.run_id
        # NOTE(review): result intentionally discarded — presumably kept for
        # its raise-if-missing side effect; confirm it is still needed.
        self._mlflow_client.get_experiment(self._experiment_id)
        return self._mlflow_client

    @property
    def run_id(self):
        """Create the experiment if it does not exist to get the run id.

        Returns:
            The run id.
        """
        _ = self.experiment
        return self._run_id

    @property
    def experiment_id(self):
        """Create the experiment if it does not exist to get the experiment
        id.

        Returns:
            The experiment id.
        """
        _ = self.experiment
        return self._experiment_id

    def log_hyperparameters(self, params):
        """Log a (possibly nested) hyperparameter dict as MLflow params.

        Values whose string form exceeds MLflow's 250-character param limit
        are skipped with a ``RuntimeWarning``.
        """
        import warnings  # local import keeps this fix self-contained

        params_to_log = process_nested_dict(params)
        for k, v in params_to_log.items():
            if len(str(v)) > 250:
                # Fix: the original evaluated the message and RuntimeWarning
                # as a bare tuple expression, so no warning was ever emitted.
                warnings.warn(
                    f"Mlflow only allows parameters with up to 250 characters. Discard {k}={v}",
                    RuntimeWarning,
                )
                continue
            self.experiment.log_param(self.run_id, k, v)

    def write(self, step_type: str, step: int, data: LOG_DATA_TYPE) -> None:
        """Specify how the writer is used to log data.

        :param str step_type: namespace which the data dict belongs to.
        :param int step: stands for the ordinate of the data dict.
        :param dict data: the data to write with format ``{key: value}``.
        """
        for k, v in data.items():
            self.experiment.log_metric(self._run_id, k, v, step)

    def log_test_data(self, collect_result: dict, step: int) -> None:
        """Use writer to log statistics generated during evaluating.

        :param collect_result: a dict containing information of data
            collected in evaluating stage, i.e., returns of
            collector.collect().
        :param int step: stands for the timestep the collect_result being
            logged.

        .. note::

            ``collect_result`` will be modified in-place with "rew",
            "rew_std", "len", and "len_std" keys.
        """
        assert collect_result["n/ep"] > 0
        # "rews"/"lens" support .mean()/.std() — presumably numpy arrays.
        rews, lens = collect_result["rews"], collect_result["lens"]
        rew, rew_std, len_, len_std = rews.mean(), rews.std(), lens.mean(
        ), lens.std()
        collect_result.update(rew=rew, rew_std=rew_std, len=len_,
                              len_std=len_std)
        if step - self.last_log_test_step >= self.test_interval:
            log_data = {
                "test/env_step": step,
                "test/reward": rew,
                "test/length": len_,
                "test/reward_std": rew_std,
                "test/length_std": len_std,
            }
            # Supplement the data to be logged with stuff from info
            if self.info_logger:
                info_to_log = self.info_logger.report_for_logging()
                for k, v in info_to_log.items():
                    log_data[k] = v
            self.write("test/env_step", step, log_data)
            self.last_log_test_step = step

    def close(self) -> None:
        """Terminate the underlying MLflow run."""
        self.experiment.set_terminated(self._run_id)

    def save_data(self, epoch, env_step, gradient_step, save_checkpoint_fn):
        """Save a checkpoint (at most every ``save_interval`` epochs) and
        upload its directory plus a small metadata YAML as run artifacts."""
        if save_checkpoint_fn and epoch - self.last_save_step >= self.save_interval:
            self.last_save_step = epoch
            checkpoint_path = Path(
                save_checkpoint_fn(epoch, env_step, gradient_step))
            metadata = {
                "save/epoch": epoch,
                "save/env_step": env_step,
                "save/gradient_step": gradient_step,
                "checkpoint_path": str(checkpoint_path),
            }
            # Record the metadata next to the checkpoint so it is uploaded
            # together with it below.
            metadata_file_path = checkpoint_path.parent / "trainer_metadata.yaml"
            with open(str(metadata_file_path), "w") as f:
                yaml.dump(metadata, f)
            self.experiment.log_artifact(self.run_id, checkpoint_path.parent,
                                         "training_checkpoints")

    @staticmethod
    def _get_mlflow_tags(filename=None, manual_tags=None):
        """Build the default MLflow run tags (user, source, git info), then
        overlay any manually supplied tags."""
        # Can specify filename as string, for example for Jupyter where
        # os.path.basename(__file__) does not work.
        # Use specified filename if provided, otherwise resolve automatically.
        if filename:
            source_name = filename
        else:
            source_name = resolve_tags()["mlflow.source.name"]
        # Git information is derived from the current working directory.
        work_dir = os.getcwd()
        source_version = mlflow_utils._get_git_commit(work_dir)
        tags = {
            MLFLOW_USER: mlflow_utils._get_user(),
            MLFLOW_SOURCE_NAME: source_name,
        }
        if source_version is not None:
            tags[MLFLOW_GIT_COMMIT] = source_version
        repo_url = mlflow_utils._get_git_repo_url(work_dir)
        if repo_url is not None:
            tags[MLFLOW_GIT_REPO_URL] = repo_url
            tags[LEGACY_MLFLOW_GIT_REPO_URL] = repo_url
        # Manual tags win over the automatically derived ones.
        if manual_tags:
            for k, v in manual_tags.items():
                tags[k] = v
        return tags
# NOTE(review): this chunk opens mid-method — the `def` line and the start of
# this dict literal are outside the visible source, so the fragment below is
# preserved as-is (indentation is a best guess; confirm against the full file).
        'google_job_id': '',
        'ref': self.args.ref
    }
    print('callback')
    callback(self.args.callback_uri,data)
    return False


def __getattr__(name: str):
    # Module-level attribute hook: hands out mlflow's own log_metric/log_param
    # when reporting is enabled, otherwise a do-nothing stand-in.
    def dummy_method(*args, **kwargs):
        # No-op replacement used when mlflow reporting is disabled.
        return None
    if name == 'log_metric' or name == 'log_param':
        if not mlflow_disabled_reporting():
            return getattr(mlflow, name)
        else:
            return dummy_method
    # NOTE(review): any other attribute name falls through and implicitly
    # yields None; a module-level __getattr__ conventionally raises
    # AttributeError here — confirm callers tolerate the None.


if __name__ == '__main__':
    client = MlflowClient()

    # Examine the deleted experiment details.
    # NOTE(review): experiment 4 is examined here but experiment 2 is restored
    # below — verify both ids are intentional; also, MlflowClient experiment
    # ids are usually strings, not ints — confirm the int arguments work.
    experiment = client.get_experiment(4)
    print("--")
    print(experiment)

    # Restore the experiment and fetch its info
    client.restore_experiment(2)