def _update_status(self):
    api_response = self._kube_api.read_namespaced_job_status(
        name=self._job_name, namespace=self._job_namespace, pretty=True)
    status = api_response.status
    with self._status_lock:
        if RunStatus.is_terminated(self._status):
            return self._status
        if self._status == RunStatus.SCHEDULED:
            if api_response.status.start_time is None:
                _logger.info("Waiting for Job to start")
            else:
                _logger.info("Job started.")
                self._status = RunStatus.RUNNING
        if status.conditions is not None:
            for condition in status.conditions:
                if condition.status == "True":
                    _logger.info(condition.message)
                    if condition.type == "Failed":
                        self._status = RunStatus.FAILED
                    elif condition.type == "Complete":
                        self._status = RunStatus.FINISHED
    return self._status
def test_run(use_start_run):
    submitted_run = mlflow.projects.run(
        TEST_PROJECT_DIR,
        entry_point="test_tracking",
        parameters={"use_start_run": use_start_run},
        use_conda=False,
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID,
    )
    assert submitted_run.run_id is not None
    # Blocking runs should be finished when they return
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Test that we can call wait() on a synchronous run & that the run has the correct
    # status after calling wait().
    submitted_run.wait()
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # Validate run contents in the FileStore
    run_id = submitted_run.run_id
    mlflow_service = mlflow.tracking.MlflowClient()
    run_infos = mlflow_service.list_run_infos(
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID, run_view_type=ViewType.ACTIVE_ONLY)
    assert len(run_infos) == 1
    store_run_id = run_infos[0].run_id
    assert run_id == store_run_id
    run = mlflow_service.get_run(run_id)
    assert run.info.status == RunStatus.to_string(RunStatus.FINISHED)
    assert run.data.params == {
        "use_start_run": use_start_run,
    }
    assert run.data.metrics == {"some_key": 3}
    tags = run.data.tags
    assert tags[MLFLOW_USER] == MOCK_USER
    assert "file:" in tags[MLFLOW_SOURCE_NAME]
    assert tags[MLFLOW_SOURCE_TYPE] == SourceType.to_string(SourceType.PROJECT)
    assert tags[MLFLOW_PROJECT_ENTRY_POINT] == "test_tracking"
def to_mlflow_entity(self):
    """
    Convert DB model to corresponding MLflow entity.

    :return: :py:class:`mlflow.entities.Run`.
    """
    run_info = RunInfo(run_uuid=self.run_uuid,
                       run_id=self.run_uuid,
                       experiment_id=str(self.experiment_id),
                       name=self.name,
                       source_type=SourceType.from_string(self.source_type),
                       source_name=self.source_name,
                       entry_point_name=self.entry_point_name,
                       user_id=self.user_id,
                       status=RunStatus.from_string(self.status),
                       start_time=self.start_time,
                       end_time=self.end_time,
                       source_version=self.source_version,
                       lifecycle_stage=self.lifecycle_stage,
                       artifact_uri=self.artifact_uri)

    # only get latest recorded metrics per key
    all_metrics = [m.to_mlflow_entity() for m in self.metrics]
    metrics = {}
    for m in all_metrics:
        existing_metric = metrics.get(m.key)
        if (existing_metric is None) \
            or ((m.step, m.timestamp, m.value) >=
                (existing_metric.step, existing_metric.timestamp, existing_metric.value)):
            metrics[m.key] = m

    run_data = RunData(metrics=list(metrics.values()),
                       params=[p.to_mlflow_entity() for p in self.params],
                       tags=[t.to_mlflow_entity() for t in self.tags])
    return Run(run_info=run_info, run_data=run_data)
def create_run(self, experiment_id, user_id, run_name, source_type, source_name,
               entry_point_name, start_time, source_version, tags, parent_run_id):
    experiment = self.get_experiment(experiment_id)

    if experiment.lifecycle_stage != LifecycleStage.ACTIVE:
        raise MlflowException('Experiment id={} must be active'.format(experiment_id),
                              INVALID_STATE)

    run_uuid = uuid.uuid4().hex
    artifact_location = build_path(experiment.artifact_location, run_uuid,
                                   SqlAlchemyStore.ARTIFACTS_FOLDER_NAME)
    run = SqlRun(name=run_name or "", artifact_uri=artifact_location, run_uuid=run_uuid,
                 experiment_id=experiment_id,
                 source_type=SourceType.to_string(source_type),
                 source_name=source_name, entry_point_name=entry_point_name,
                 user_id=user_id, status=RunStatus.to_string(RunStatus.RUNNING),
                 start_time=start_time, end_time=None,
                 source_version=source_version,
                 lifecycle_stage=LifecycleStage.ACTIVE)

    for tag in tags:
        run.tags.append(SqlTag(key=tag.key, value=tag.value))
    if parent_run_id:
        run.tags.append(SqlTag(key=MLFLOW_PARENT_RUN_ID, value=parent_run_id))
    if run_name:
        run.tags.append(SqlTag(key=MLFLOW_RUN_NAME, value=run_name))

    self._save_to_db([run])

    return run.to_mlflow_entity()
def on_pipeline_error(
    self,
    error: Exception,
    run_params: Dict[str, Any],
    pipeline: Pipeline,
    catalog: DataCatalog,
):
    """Hook invoked when the pipeline execution fails.

    All the mlflow runs must be closed to avoid interference with further execution.

    Args:
        error: (Not used) The uncaught exception thrown during the pipeline run.
        run_params: (Not used) The params used to run the pipeline.
            Should be identical to the data logged by Journal with the following schema::

               {
                 "project_path": str,
                 "env": str,
                 "kedro_version": str,
                 "tags": Optional[List[str]],
                 "from_nodes": Optional[List[str]],
                 "to_nodes": Optional[List[str]],
                 "node_names": Optional[List[str]],
                 "from_inputs": Optional[List[str]],
                 "load_versions": Optional[List[str]],
                 "pipeline_name": str,
                 "extra_params": Optional[Dict[str, Any]]
               }

        pipeline: (Not used) The ``Pipeline`` that was run.
        catalog: (Not used) The ``DataCatalog`` used during the run.
    """
    if self._is_mlflow_enabled:
        while mlflow.active_run():
            mlflow.end_run(RunStatus.to_string(RunStatus.FAILED))
    else:  # pragma: no cover
        # the catalog is supposed to be reloaded each time with _get_catalog,
        # hence it should not be modified. this is only a safeguard
        switch_catalog_logging(catalog, True)
def create_run(self, experiment_id, user_id, start_time, tags):
    with self.ManagedSessionMaker() as session:
        experiment = self.get_experiment(experiment_id)
        self._check_experiment_is_active(experiment)

        run_id = uuid.uuid4().hex
        artifact_location = append_to_uri_path(experiment.artifact_location, run_id,
                                               SqlAlchemyStore.ARTIFACTS_FOLDER_NAME)
        run = SqlRun(name="", artifact_uri=artifact_location, run_uuid=run_id,
                     experiment_id=experiment_id,
                     source_type=SourceType.to_string(SourceType.UNKNOWN),
                     source_name="", entry_point_name="",
                     user_id=user_id, status=RunStatus.to_string(RunStatus.RUNNING),
                     start_time=start_time, end_time=None,
                     source_version="", lifecycle_stage=LifecycleStage.ACTIVE)

        tags_dict = {}
        for tag in tags:
            tags_dict[tag.key] = tag.value
        run.tags = [SqlTag(key=key, value=value) for key, value in tags_dict.items()]
        self._save_to_db(objs=run, session=session)

        return run.to_mlflow_entity()
def test_bad_comparators(entity_type, bad_comparators, key, entity_value):
    run = Run(
        run_info=RunInfo(
            run_uuid="hi",
            run_id="hi",
            experiment_id=0,
            user_id="user-id",
            status=RunStatus.to_string(RunStatus.FAILED),
            start_time=0,
            end_time=1,
            lifecycle_stage=LifecycleStage.ACTIVE,
        ),
        run_data=RunData(metrics=[], params=[], tags=[]),
    )
    for bad_comparator in bad_comparators:
        bad_filter = "{entity_type}.{key} {comparator} {value}".format(
            entity_type=entity_type, key=key, comparator=bad_comparator, value=entity_value)
        with pytest.raises(MlflowException) as e:
            SearchUtils.filter([run], bad_filter)
        assert "Invalid comparator" in str(e.value.message)
def get_infos(run_uuid, store=None):
    from mlflow.entities import RunStatus

    run = get_run(run_uuid, store=store)
    if run.info.end_time is None:
        duration = None
    else:
        duration = run.info.end_time - run.info.start_time
    return {
        ("run", "uuid"): run.info.run_uuid,
        ("run", "experiment_id"): run.info.experiment_id,
        ("run", "status"): RunStatus.to_string(run.info.status),
        ("run", "start_time"): run.info.start_time,
        ("run", "end_time"): run.info.end_time,
        ("run", "duration"): duration,
        **{("metric", m.key): m.value for m in get_all_metrics(run_uuid, store=store)},
        **{("param", p.key): p.value for p in get_all_params(run_uuid, store=store)},
    }
def create_run(self, experiment_id, user_id, start_time, tags):
    """
    Creates a run with the specified attributes.
    """
    experiment_id = FileStore.DEFAULT_EXPERIMENT_ID if experiment_id is None else experiment_id
    experiment = self.get_experiment(experiment_id)
    if experiment is None:
        raise MlflowException(
            "Could not create run under experiment with ID %s - no such experiment "
            "exists." % experiment_id,
            databricks_pb2.RESOURCE_DOES_NOT_EXIST)
    if experiment.lifecycle_stage != LifecycleStage.ACTIVE:
        raise MlflowException(
            "Could not create run under non-active experiment with ID "
            "%s." % experiment_id,
            databricks_pb2.INVALID_STATE)
    run_uuid = uuid.uuid4().hex
    artifact_uri = self._get_artifact_dir(experiment_id, run_uuid)
    run_info = RunInfo(run_uuid=run_uuid, run_id=run_uuid, experiment_id=experiment_id,
                       artifact_uri=artifact_uri, user_id=user_id,
                       status=RunStatus.to_string(RunStatus.RUNNING),
                       start_time=start_time, end_time=None,
                       lifecycle_stage=LifecycleStage.ACTIVE)
    # Persist run metadata and create directories for logging metrics, parameters, artifacts
    run_dir = self._get_run_dir(run_info.experiment_id, run_info.run_id)
    mkdir(run_dir)
    run_info_dict = _make_persisted_run_info_dict(run_info)
    write_yaml(run_dir, FileStore.META_DATA_FILE_NAME, run_info_dict)
    mkdir(run_dir, FileStore.METRICS_FOLDER_NAME)
    mkdir(run_dir, FileStore.PARAMS_FOLDER_NAME)
    mkdir(run_dir, FileStore.ARTIFACTS_FOLDER_NAME)
    for tag in tags:
        self.set_tag(run_uuid, tag)
    return self.get_run(run_id=run_uuid)
def end_run(status=RunStatus.to_string(RunStatus.FINISHED)):
    """End an active MLflow run (if there is one).

    .. code-block:: python
        :caption: Example

        import mlflow

        # Start run and get status
        mlflow.start_run()
        run = mlflow.active_run()
        print("run_id: {}; status: {}".format(run.info.run_id, run.info.status))

        # End run and get status
        mlflow.end_run()
        run = mlflow.get_run(run.info.run_id)
        print("run_id: {}; status: {}".format(run.info.run_id, run.info.status))
        print("--")

        # Check for any active runs
        print("Active run: {}".format(mlflow.active_run()))

    .. code-block:: text
        :caption: Output

        run_id: b47ee4563368419880b44ad8535f6371; status: RUNNING
        run_id: b47ee4563368419880b44ad8535f6371; status: FINISHED
        --
        Active run: None
    """
    global _active_run_stack
    if len(_active_run_stack) > 0:
        # Clear out the global existing run environment variable as well.
        env.unset_variable(_RUN_ID_ENV_VAR)
        run = _active_run_stack.pop()
        MlflowClient().set_terminated(run.info.run_id, status)
def test_order_by_metric_with_nans_and_infs():
    metric_vals_str = ["nan", "inf", "-inf", "-1000", "0", "1000"]
    runs = [
        Run(run_info=RunInfo(run_id=x, run_uuid=x, experiment_id=0, user_id="user",
                             status=RunStatus.to_string(RunStatus.FINISHED),
                             start_time=0, end_time=1,
                             lifecycle_stage=LifecycleStage.ACTIVE),
            run_data=RunData(metrics=[Metric("x", float(x), 1, 0)]))
        for x in metric_vals_str
    ]
    sorted_runs_asc = [x.info.run_id for x in SearchUtils.sort(runs, ["metrics.x asc"])]
    sorted_runs_desc = [x.info.run_id for x in SearchUtils.sort(runs, ["metrics.x desc"])]
    # asc
    assert ["-inf", "-1000", "0", "1000", "inf", "nan"] == sorted_runs_asc
    # desc
    assert ["inf", "1000", "0", "-1000", "-inf", "nan"] == sorted_runs_desc
def test_param_search_estimator(  # pylint: disable=unused-argument
        metric_name, param_search_estimator, spark_session, dataset_regression):
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    lrParamMaps = [
        {lr.maxIter: 1, lr.standardization: False},
        {lr.maxIter: 200, lr.standardization: True},
        {lr.maxIter: 2, lr.standardization: False},
    ]
    best_params = {"LinearRegression.maxIter": 200, "LinearRegression.standardization": True}
    eva = RegressionEvaluator(metricName=metric_name)
    estimator = param_search_estimator(estimator=lr, estimatorParamMaps=lrParamMaps, evaluator=eva)
    with mlflow.start_run() as run:
        model = estimator.fit(dataset_regression)
        estimator_info = load_json_artifact("estimator_info.json")
        metadata = _gen_estimator_metadata(estimator)
        assert metadata.hierarchy == estimator_info["hierarchy"]

        param_search_estimator_info = estimator_info[
            metadata.uid_to_indexed_name_map[estimator.uid]]
        assert param_search_estimator_info[
            "tuned_estimator_parameter_map"] == _get_instance_param_map_recursively(
                lr, 1, metadata.uid_to_indexed_name_map)
        assert param_search_estimator_info[
            "tuning_parameter_map_list"] == _get_tuning_param_maps(
                estimator, metadata.uid_to_indexed_name_map)

        assert best_params == load_json_artifact("best_parameters.json")
        search_results = load_json_csv("search_results.csv")

    uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values({
            **_get_instance_param_map(estimator, uid_to_indexed_name_map),
            **{f"best_{k}": v for k, v in best_params.items()},
        }))
    assert run_data.tags == get_expected_class_tags(estimator)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == model.uid
    loaded_best_model = load_model_by_run_id(run_id, "best_model")
    assert loaded_best_model.stages[0].uid == model.bestModel.uid
    assert run_data.artifacts == [
        "best_model",
        "best_parameters.json",
        "estimator_info.json",
        "model",
        "search_results.csv",
    ]

    client = mlflow.tracking.MlflowClient()
    child_runs = client.search_runs(
        run.info.experiment_id, "tags.`mlflow.parentRunId` = '{}'".format(run_id))
    assert len(child_runs) == len(search_results)

    for row_index, row in search_results.iterrows():
        row_params = json.loads(row.get("params", "{}"))
        for param_name, param_value in row_params.items():
            assert param_value == row.get(f"param.{param_name}")

        params_search_clause = " and ".join([
            "params.`{}` = '{}'".format(key.split(".")[1], value)
            for key, value in row_params.items()
        ])
        search_filter = "tags.`mlflow.parentRunId` = '{}' and {}".format(
            run_id, params_search_clause)
        child_runs = client.search_runs(run.info.experiment_id, search_filter)
        assert len(child_runs) == 1
        child_run = child_runs[0]
        assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
        run_data = get_run_data(child_run.info.run_id)
        child_estimator = estimator.getEstimator().copy(
            estimator.getEstimatorParamMaps()[row_index])
        assert run_data.tags == get_expected_class_tags(child_estimator)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values({
                **_get_instance_param_map(child_estimator, uid_to_indexed_name_map)
            }))
        assert (child_run.data.tags.get(MLFLOW_AUTOLOGGING) ==
                mlflow.pyspark.ml.AUTOLOGGING_INTEGRATION_NAME)

        metric_name = estimator.getEvaluator().getMetricName()
        if isinstance(estimator, CrossValidator):
            avg_metric_value = model.avgMetrics[row_index]
            avg_metric_name = f"avg_{metric_name}"
        else:
            avg_metric_value = model.validationMetrics[row_index]
            avg_metric_name = metric_name

        assert math.isclose(avg_metric_value, run_data.metrics[avg_metric_name], rel_tol=1e-6)
        assert math.isclose(avg_metric_value, float(row.get(avg_metric_name)), rel_tol=1e-6)

        if isinstance(estimator, CrossValidator) and Version(
                pyspark.__version__) >= Version("3.3"):
            std_metric_name = f"std_{metric_name}"
            std_metric_value = model.stdMetrics[row_index]
            assert math.isclose(std_metric_value, run_data.metrics[std_metric_name], rel_tol=1e-6)
            assert math.isclose(std_metric_value, float(row.get(std_metric_name)), rel_tol=1e-6)
def test_is_terminated(self):
    self.assertTrue(RunStatus.is_terminated(RunStatus.FAILED))
    self.assertTrue(RunStatus.is_terminated(RunStatus.FINISHED))
    self.assertTrue(RunStatus.is_terminated(RunStatus.KILLED))
    self.assertFalse(RunStatus.is_terminated(RunStatus.SCHEDULED))
    self.assertFalse(RunStatus.is_terminated(RunStatus.RUNNING))
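For quick reference alongside these snippets, here is a minimal sketch of how the `RunStatus` values round-trip to their persisted string form. It uses only the public helpers that already appear above (`to_string`, `from_string`, `all_status`, `is_terminated`); the assertions mirror the test directly above.

import mlflow  # noqa: F401 - only RunStatus is needed here
from mlflow.entities import RunStatus

# RunStatus values are enum ints; stores and the REST API exchange the string form.
assert RunStatus.to_string(RunStatus.FINISHED) == "FINISHED"
assert RunStatus.from_string("FINISHED") == RunStatus.FINISHED

# Terminal vs. non-terminal states, as asserted in test_is_terminated above.
terminal = {RunStatus.to_string(s) for s in RunStatus.all_status() if RunStatus.is_terminated(s)}
assert terminal == {"FINISHED", "FAILED", "KILLED"}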
from alembic import op
from packaging.version import Version
from sqlalchemy import CheckConstraint, Enum

from mlflow.entities import RunStatus, ViewType
from mlflow.entities.lifecycle_stage import LifecycleStage
from mlflow.store.tracking.dbmodels.models import SqlRun, SourceTypes

# revision identifiers, used by Alembic.
revision = "cfd24bdc0731"
down_revision = "2b4d017a5e9b"
branch_labels = None
depends_on = None

old_run_statuses = [
    RunStatus.to_string(RunStatus.SCHEDULED),
    RunStatus.to_string(RunStatus.FAILED),
    RunStatus.to_string(RunStatus.FINISHED),
    RunStatus.to_string(RunStatus.RUNNING),
]

new_run_statuses = [*old_run_statuses, RunStatus.to_string(RunStatus.KILLED)]

# Certain SQL backends (e.g., SQLite) do not preserve CHECK constraints during migrations.
# For these backends, CHECK constraints must be specified as table arguments. Here, we define
# the collection of CHECK constraints that should be preserved when performing the migration.
# The "status" constraint is excluded from this set because it is explicitly modified
# within the migration's `upgrade()` routine.
check_constraint_table_args = [
    CheckConstraint(SqlRun.source_type.in_(SourceTypes), name="source_type"),
    CheckConstraint(
def _create_root(self, root):
    self.test_root = os.path.join(root, "test_file_store_%d" % random_int())
    os.mkdir(self.test_root)
    self.experiments = [str(random_int(100, int(1e9))) for _ in range(3)]
    self.exp_data = {}
    self.run_data = {}
    # Include default experiment
    self.experiments.append(FileStore.DEFAULT_EXPERIMENT_ID)
    for exp in self.experiments:
        # create experiment
        exp_folder = os.path.join(self.test_root, str(exp))
        os.makedirs(exp_folder)
        d = {
            "experiment_id": exp,
            "name": random_str(),
            "artifact_location": exp_folder,
        }
        self.exp_data[exp] = d
        write_yaml(exp_folder, FileStore.META_DATA_FILE_NAME, d)
        # add runs
        self.exp_data[exp]["runs"] = []
        for _ in range(2):
            run_id = uuid.uuid4().hex
            self.exp_data[exp]["runs"].append(run_id)
            run_folder = os.path.join(exp_folder, run_id)
            os.makedirs(run_folder)
            run_info = {
                "run_uuid": run_id,
                "run_id": run_id,
                "experiment_id": exp,
                "user_id": random_str(random_int(10, 25)),
                "status": random.choice(RunStatus.all_status()),
                "start_time": random_int(1, 10),
                "end_time": random_int(20, 30),
                "tags": [],
                "artifact_uri": "%s/%s" % (run_folder, FileStore.ARTIFACTS_FOLDER_NAME),
            }
            write_yaml(run_folder, FileStore.META_DATA_FILE_NAME, run_info)
            self.run_data[run_id] = run_info
            # params
            params_folder = os.path.join(run_folder, FileStore.PARAMS_FOLDER_NAME)
            os.makedirs(params_folder)
            params = {}
            for _ in range(5):
                param_name = random_str(random_int(4, 12))
                param_value = random_str(random_int(10, 15))
                param_file = os.path.join(params_folder, param_name)
                with open(param_file, 'w') as f:
                    f.write(param_value)
                params[param_name] = param_value
            self.run_data[run_id]["params"] = params
            # metrics
            metrics_folder = os.path.join(run_folder, FileStore.METRICS_FOLDER_NAME)
            os.makedirs(metrics_folder)
            metrics = {}
            for _ in range(3):
                metric_name = random_str(random_int(6, 10))
                timestamp = int(time.time())
                metric_file = os.path.join(metrics_folder, metric_name)
                values = []
                for _ in range(10):
                    metric_value = random_int(100, 2000)
                    timestamp += random_int(10000, 2000000)
                    values.append((timestamp, metric_value))
                    with open(metric_file, 'a') as f:
                        f.write("%d %d\n" % (timestamp, metric_value))
                metrics[metric_name] = values
            self.run_data[run_id]["metrics"] = metrics
            # artifacts
            os.makedirs(os.path.join(run_folder, FileStore.ARTIFACTS_FOLDER_NAME))
def validate_exit_status(status_str, expected):
    assert RunStatus.from_string(status_str) == expected
def test_start_existing_run_status(empty_active_run_stack):  # pylint: disable=unused-argument
    run_id = mlflow.start_run().info.run_id
    mlflow.end_run()
    assert MlflowClient().get_run(run_id).info.status == RunStatus.to_string(RunStatus.FINISHED)
    restarted_run = mlflow.start_run(run_id)
    assert restarted_run.info.status == RunStatus.to_string(RunStatus.RUNNING)
    ViewType,
    ExperimentTag,
)
from mlflow.entities.lifecycle_stage import LifecycleStage
from mlflow.store.db.base_sql_model import Base

SourceTypes = [
    SourceType.to_string(SourceType.NOTEBOOK),
    SourceType.to_string(SourceType.JOB),
    SourceType.to_string(SourceType.LOCAL),
    SourceType.to_string(SourceType.UNKNOWN),
    SourceType.to_string(SourceType.PROJECT),
]

RunStatusTypes = [
    RunStatus.to_string(RunStatus.SCHEDULED),
    RunStatus.to_string(RunStatus.FAILED),
    RunStatus.to_string(RunStatus.FINISHED),
    RunStatus.to_string(RunStatus.RUNNING),
    RunStatus.to_string(RunStatus.KILLED),
]


class SqlExperiment(Base):
    """
    DB model for :py:class:`mlflow.entities.Experiment`.
    These are recorded in ``experiment`` table.
    """

    __tablename__ = "experiments"

    experiment_id = Column(Integer, autoincrement=True)
def get_status(self):
    status = self._status
    return status if RunStatus.is_terminated(status) else self._update_status()
def get_status(self):
    return RunStatus.to_string(self._get_status())
def wait(self):
    kube_api = kubernetes.client.BatchV1Api()
    while not RunStatus.is_terminated(self._update_status(kube_api)):
        time.sleep(self.POLL_STATUS_INTERVAL)
    return self._status == RunStatus.FINISHED
import mock

from mlflow.entities import (Experiment, Run, RunInfo, RunData, RunTag, Metric, Param,
                             ExperimentTag, RunStatus, LifecycleStage, ViewType)

experiment = Experiment(experiment_id="1", name="experiment_name",
                        artifact_location="artifact_location",
                        lifecycle_stage=LifecycleStage.ACTIVE, tags=[])
run_info = RunInfo(run_uuid="1", run_id="1", experiment_id="experiment_id", user_id="unknown",
                   status=RunStatus.to_string(RunStatus.RUNNING), start_time=1, end_time=None,
                   lifecycle_stage=LifecycleStage.ACTIVE, artifact_uri="artifact_uri")
run_data = RunData(metrics=[], params=[], tags=[])
run = Run(run_info=run_info, run_data=run_data)

metric = Metric(key="metric1", value=1, timestamp=1, step=1)
param = Param(key="param1", value="val1")
tag = RunTag(key="tag1", value="val1")
experiment_tag = ExperimentTag(key="tag1", value="val1")
def get_status(self, databricks_run_id):
    return RunStatus.to_string(self._get_status(databricks_run_id))
def test_parameter_search_estimators_produce_expected_outputs(cv_class, search_space, backend):
    mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

    svc = sklearn.svm.SVC()
    cv_model = cv_class(svc, search_space, n_jobs=5, return_train_score=True)
    X, y = get_iris()

    def train_cv_model():
        if backend is None:
            cv_model.fit(X, y)
        else:
            with sklearn.utils.parallel_backend(backend=backend):
                cv_model.fit(X, y)

    with mlflow.start_run() as run:
        train_cv_model()
        run_id = run.info.run_id

    params, metrics, tags, artifacts = get_run_data(run_id)
    expected_cv_params = truncate_dict(stringify_dict_values(cv_model.get_params(deep=False)))
    expected_cv_params.update({
        "best_{}".format(param_name): str(param_value)
        for param_name, param_value in cv_model.best_params_.items()
    })
    assert params == expected_cv_params
    assert {
        TRAINING_SCORE: cv_model.score(X, y),
        "best_cv_score": cv_model.best_score_,
    }.items() <= metrics.items()
    assert tags == get_expected_class_tags(cv_model)
    assert MODEL_DIR in artifacts
    assert "best_estimator" in artifacts
    assert "cv_results.csv" in artifacts

    best_estimator = mlflow.sklearn.load_model("runs:/{}/best_estimator".format(run_id))
    assert isinstance(best_estimator, sklearn.svm.SVC)
    cv_model = mlflow.sklearn.load_model("runs:/{}/{}".format(run_id, MODEL_DIR))
    assert isinstance(cv_model, cv_class)

    # Ensure that a signature and input example are produced for the best estimator
    best_estimator_conf = get_model_conf(run.info.artifact_uri, "best_estimator")
    assert best_estimator_conf.signature == infer_signature(X, best_estimator.predict(X[:5]))

    best_estimator_path = os.path.join(run.info.artifact_uri, "best_estimator")
    input_example = _read_example(best_estimator_conf, best_estimator_path)
    best_estimator.predict(input_example)  # Ensure that input example evaluation succeeds

    client = mlflow.tracking.MlflowClient()
    child_runs = client.search_runs(
        run.info.experiment_id, "tags.`mlflow.parentRunId` = '{}'".format(run_id))
    cv_results = pd.DataFrame.from_dict(cv_model.cv_results_)
    # We expect to have created a child run for each point in the parameter search space
    assert len(child_runs) == len(cv_results)

    # Verify that each set of parameter search results has a corresponding MLflow run
    # with the expected data
    for _, result in cv_results.iterrows():
        result_params = result.get("params", {})
        params_search_clause = " and ".join([
            "params.`{}` = '{}'".format(key, value) for key, value in result_params.items()
        ])
        search_filter = "tags.`mlflow.parentRunId` = '{}' and {}".format(
            run_id, params_search_clause)
        child_runs = client.search_runs(run.info.experiment_id, search_filter)
        assert len(child_runs) == 1
        child_run = child_runs[0]
        assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
        _, child_metrics, child_tags, _ = get_run_data(child_run.info.run_id)
        assert child_tags == get_expected_class_tags(svc)
        assert "mean_test_score" in child_metrics.keys()
        assert "std_test_score" in child_metrics.keys()
        # Ensure that we do not capture separate metrics for each cross validation split, which
        # would produce very noisy metrics results
        assert len([metric for metric in child_metrics.keys() if metric.startswith("split")]) == 0
class SqlRun(Base):
    """
    DB model for :py:class:`mlflow.entities.Run`. These are recorded in ``runs`` table.
    """

    __tablename__ = "runs"

    run_uuid = Column(String(32), nullable=False)
    """
    Run UUID: `String` (limit 32 characters). *Primary Key* for ``runs`` table.
    """
    name = Column(String(250))
    """
    Run name: `String` (limit 250 characters).
    """
    source_type = Column(String(20), default=SourceType.to_string(SourceType.LOCAL))
    """
    Source Type: `String` (limit 20 characters). Can be one of ``NOTEBOOK``, ``JOB``, ``PROJECT``,
    ``LOCAL`` (default), or ``UNKNOWN``.
    """
    source_name = Column(String(500))
    """
    Name of source recording the run: `String` (limit 500 characters).
    """
    entry_point_name = Column(String(50))
    """
    Entry-point name that launched the run: `String` (limit 50 characters).
    """
    user_id = Column(String(256), nullable=True, default=None)
    """
    User ID: `String` (limit 256 characters). Defaults to ``null``.
    """
    status = Column(String(20), default=RunStatus.to_string(RunStatus.SCHEDULED))
    """
    Run Status: `String` (limit 20 characters). Can be one of ``RUNNING``, ``SCHEDULED`` (default),
    ``FINISHED``, ``FAILED``.
    """
    start_time = Column(BigInteger, default=int(time.time()))
    """
    Run start time: `BigInteger`. Defaults to current system time.
    """
    end_time = Column(BigInteger, nullable=True, default=None)
    """
    Run end time: `BigInteger`.
    """
    source_version = Column(String(50))
    """
    Source version: `String` (limit 50 characters).
    """
    lifecycle_stage = Column(String(20), default=LifecycleStage.ACTIVE)
    """
    Lifecycle Stage of run: `String` (limit 20 characters).
    Can be either ``active`` (default) or ``deleted``.
    """
    artifact_uri = Column(String(200), default=None)
    """
    Default artifact location for this run: `String` (limit 200 characters).
    """
    experiment_id = Column(Integer, ForeignKey("experiments.experiment_id"))
    """
    Experiment ID to which this run belongs: *Foreign Key* into ``experiment`` table.
    """
    experiment = relationship("SqlExperiment", backref=backref("runs", cascade="all"))
    """
    SQLAlchemy relationship (many:one) with
    :py:class:`mlflow.store.dbmodels.models.SqlExperiment`.
    """

    __table_args__ = (
        CheckConstraint(source_type.in_(SourceTypes), name="source_type"),
        CheckConstraint(status.in_(RunStatusTypes), name="status"),
        CheckConstraint(
            lifecycle_stage.in_(LifecycleStage.view_type_to_stages(ViewType.ALL)),
            name="runs_lifecycle_stage",
        ),
        PrimaryKeyConstraint("run_uuid", name="run_pk"),
    )

    @staticmethod
    def get_attribute_name(mlflow_attribute_name):
        """
        Resolves an MLflow attribute name to a `SqlRun` attribute name.
        """
        # Currently, MLflow Search attributes defined in `SearchUtils.VALID_SEARCH_ATTRIBUTE_KEYS`
        # share the same names as their corresponding `SqlRun` attributes. Therefore, this function
        # returns the same attribute name
        return mlflow_attribute_name

    def to_mlflow_entity(self):
        """
        Convert DB model to corresponding MLflow entity.

        :return: :py:class:`mlflow.entities.Run`.
        """
        run_info = RunInfo(
            run_uuid=self.run_uuid,
            run_id=self.run_uuid,
            experiment_id=str(self.experiment_id),
            user_id=self.user_id,
            status=self.status,
            start_time=self.start_time,
            end_time=self.end_time,
            lifecycle_stage=self.lifecycle_stage,
            artifact_uri=self.artifact_uri,
        )
        run_data = RunData(
            metrics=[m.to_mlflow_entity() for m in self.latest_metrics],
            params=[p.to_mlflow_entity() for p in self.params],
            tags=[t.to_mlflow_entity() for t in self.tags],
        )
        return Run(run_info=run_info, run_data=run_data)
class SqlRun(Base):
    """
    DB model for :py:class:`mlflow.entities.Run`. These are recorded in ``runs`` table.
    """
    __tablename__ = 'runs'

    run_uuid = Column(String(32), nullable=False)
    """
    Run UUID: `String` (limit 32 characters). *Primary Key* for ``runs`` table.
    """
    name = Column(String(250))
    """
    Run name: `String` (limit 250 characters).
    """
    source_type = Column(String(20), default=SourceType.to_string(SourceType.LOCAL))
    """
    Source Type: `String` (limit 20 characters). Can be one of ``NOTEBOOK``, ``JOB``, ``PROJECT``,
    ``LOCAL`` (default), or ``UNKNOWN``.
    """
    source_name = Column(String(500))
    """
    Name of source recording the run: `String` (limit 500 characters).
    """
    entry_point_name = Column(String(50))
    """
    Entry-point name that launched the run: `String` (limit 50 characters).
    """
    user_id = Column(String(256), nullable=True, default=None)
    """
    User ID: `String` (limit 256 characters). Defaults to ``null``.
    """
    status = Column(String(20), default=RunStatus.to_string(RunStatus.SCHEDULED))
    """
    Run Status: `String` (limit 20 characters). Can be one of ``RUNNING``, ``SCHEDULED`` (default),
    ``FINISHED``, ``FAILED``.
    """
    start_time = Column(BigInteger, default=int(time.time()))
    """
    Run start time: `BigInteger`. Defaults to current system time.
    """
    end_time = Column(BigInteger, nullable=True, default=None)
    """
    Run end time: `BigInteger`.
    """
    source_version = Column(String(50))
    """
    Source version: `String` (limit 50 characters).
    """
    lifecycle_stage = Column(String(20), default=LifecycleStage.ACTIVE)
    """
    Lifecycle Stage of run: `String` (limit 20 characters).
    Can be either ``active`` (default) or ``deleted``.
    """
    artifact_uri = Column(String(200), default=None)
    """
    Default artifact location for this run: `String` (limit 200 characters).
    """
    experiment_id = Column(Integer, ForeignKey('experiments.experiment_id'))
    """
    Experiment ID to which this run belongs: *Foreign Key* into ``experiment`` table.
    """
    experiment = relationship('SqlExperiment', backref=backref('runs', cascade='all'))
    """
    SQLAlchemy relationship (many:one) with
    :py:class:`mlflow.store.dbmodels.models.SqlExperiment`.
    """

    __table_args__ = (
        CheckConstraint(source_type.in_(SourceTypes), name='source_type'),
        CheckConstraint(status.in_(RunStatusTypes), name='status'),
        CheckConstraint(lifecycle_stage.in_(LifecycleStage.view_type_to_stages(ViewType.ALL)),
                        name='lifecycle_stage'),
        PrimaryKeyConstraint('run_uuid', name='run_pk')
    )

    def to_mlflow_entity(self):
        """
        Convert DB model to corresponding MLflow entity.

        :return: :py:class:`mlflow.entities.Run`.
        """
        # run has diff parameter names in __init__ than in properties, so we do this manually
        info = _create_entity(RunInfo, self)
        data = _create_entity(RunData, self)
        return Run(run_info=info, run_data=data)
def _populate_tables(self, exp_count=3, run_count=2, param_count=5, metric_count=3,
                     values_count=10):
    print("populate tables")
    self.experiments = [str(random_int(100, int(1e9))) for _ in range(exp_count)]
    self.exp_data = {}
    self.run_data = {}
    self.experiments.append(TestDynamodbStore.DEFAULT_EXPERIMENT_ID)
    for exp in self.experiments:
        # create experiment
        exp_folder = os.path.join(self.table_prefix, exp)
        d = {
            "experiment_id": exp,
            "name": random_str(),
            "artifact_location": exp_folder,
            "lifecycle_stage": LifecycleStage.ACTIVE,  # Required for tests
        }
        self.exp_data[exp] = d
        self._write_table(DynamodbStore.EXPERIMENT_TABLE, d)
        # add runs
        self.exp_data[exp]["runs"] = []
        for _ in range(run_count):
            run_id = uuid.uuid4().hex
            self.exp_data[exp]["runs"].append(run_id)
            run_folder = os.path.join(exp_folder, run_id)
            run_info = {
                "run_uuid": run_id,
                "run_id": run_id,
                "experiment_id": exp,
                "user_id": random_str(random_int(10, 25)),
                "status": random.choice(RunStatus.all_status()),
                "start_time": random_int(1, 10),
                "end_time": random_int(20, 30),
                "tags": [],
                "artifact_uri": "{}/artifacts".format(run_folder),
                "lifecycle_stage": LifecycleStage.ACTIVE,  # Required for tests
            }
            self._write_table("run", run_info)
            self.run_data[run_id] = run_info
            # params
            params = {}
            for _ in range(param_count):
                param_name = random_str(random_int(4, 12))
                param_value = random_str(random_int(10, 15))
                self._write_table(
                    "run_param",
                    {"run_id": run_id, "key": param_name, "value": param_value},
                )
                params[param_name] = param_value
            self.run_data[run_id]["params"] = params
            # metrics
            metrics = {}
            for _ in range(metric_count):
                metric_name = random_str(random_int(6, 10))
                timestamp = int(time.time())
                values, values_map = [], []
                for i in range(values_count):
                    metric_value = random_int(i * 100, (i * 1) * 100)
                    timestamp += random_int(i * 1000, (i + 1) * 1000)
                    values.append((timestamp, metric_value))
                    values_map.insert(0, {"timestamp": timestamp, "value": metric_value})
                self._write_table(
                    "run_metric",
                    {"run_id": run_id, "key": metric_name, "metrics": values_map},
                )
                metrics[metric_name] = values
            self.run_data[run_id]["metrics"] = metrics
def flownet_ahm_run(x: list, args: argparse.Namespace):
    """
    Run individual ahm using the actual hyperparameter values for the run.

    Args:
        x: Actual values for the hyperparameters.
        args: The argparse namespace given by the user.

    Returns:
        A dict with the hyperopt loss and status.
    """
    config = create_ahm_config(
        base_config=args.config,
        hyperparameter_values=x,
        update_config=args.update_config,
    )

    mlflow.set_tracking_uri(str(args.output_folder))
    mlflow.set_experiment(f"{config.name}")
    mlflow.start_run(run_name=config.name)

    run_args = copy.deepcopy(args)
    run_args.output_folder = pathlib.Path(
        mlflow.get_artifact_uri().rsplit("artifacts")[0] + "flownet_run")

    try:
        parameters = list_hyperparameters_names(yaml.safe_load(args.config.read_text()), [])
        for (parameter, param_value) in zip(parameters, x):
            mlflow.log_param(key=parameter, value=param_value)

        run_flownet_history_matching(config, run_args)

        df_analytics = pd.read_csv(
            (run_args.output_folder / config.ert.analysis[0].outfile).with_suffix(".csv")
        ).drop_duplicates()

        hyperopt_loss = 0.0
        for _, row in df_analytics.iterrows():
            for i, metric in enumerate(df_analytics.columns[2:]):
                key = f"{row[0]}_{metric}"
                mlflow.log_metric(
                    key=key.replace(":", "."),
                    value=row[i + 2],
                    step=row[1],
                )
                if (
                    row[1] == df_analytics["iteration"].max()
                    and row[0] in config.flownet.hyperopt.loss.keys
                    and metric == config.flownet.hyperopt.loss.metric
                ):
                    hyperopt_loss += (
                        row[i + 2]
                        * config.flownet.hyperopt.loss.factors[
                            config.flownet.hyperopt.loss.keys.index(row[0])
                        ]
                    )

        mlflow.log_metric("hyperopt_loss", value=hyperopt_loss)
        mlflow.end_run(status=RunStatus.to_string(RunStatus.FINISHED))

        return {"loss": hyperopt_loss, "status": STATUS_OK}

    except Exception as exception:  # pylint: disable=broad-except
        print(exception)
        mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))
        return {"status": STATUS_FAIL}
def __exit__(self, exc_type, exc_val, exc_tb):
    status = RunStatus.FINISHED if exc_type is None else RunStatus.FAILED
    end_run(RunStatus.to_string(status))
    return exc_type is None
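For context, a minimal usage sketch (assuming the standard `mlflow` fluent API shown in the other snippets) of how this `__exit__` behavior surfaces: a run whose body raises inside the `with` block is terminated as FAILED, otherwise as FINISHED.

import mlflow
from mlflow.entities import RunStatus

try:
    with mlflow.start_run() as run:
        raise ValueError("boom")  # simulate a failing run body
except ValueError:
    pass  # __exit__ returns False, so the exception propagates to the caller

# __exit__ saw a non-None exc_type, so the run was ended with FAILED status.
assert mlflow.get_run(run.info.run_id).info.status == RunStatus.to_string(RunStatus.FAILED)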
class SqlRun(Base):
    """
    DB model for :py:class:`mlflow.entities.Run`. These are recorded in ``runs`` table.
    """
    __tablename__ = 'runs'

    run_uuid = Column(String(32), nullable=False)
    """
    Run UUID: `String` (limit 32 characters). *Primary Key* for ``runs`` table.
    """
    name = Column(String(250))
    """
    Run name: `String` (limit 250 characters).
    """
    source_type = Column(String(20), default=SourceType.to_string(SourceType.LOCAL))
    """
    Source Type: `String` (limit 20 characters). Can be one of ``NOTEBOOK``, ``JOB``, ``PROJECT``,
    ``LOCAL`` (default), or ``UNKNOWN``.
    """
    source_name = Column(String(500))
    """
    Name of source recording the run: `String` (limit 500 characters).
    """
    entry_point_name = Column(String(50))
    """
    Entry-point name that launched the run: `String` (limit 50 characters).
    """
    user_id = Column(String(256), nullable=True, default=None)
    """
    User ID: `String` (limit 256 characters). Defaults to ``null``.
    """
    status = Column(String(20), default=RunStatus.to_string(RunStatus.SCHEDULED))
    """
    Run Status: `String` (limit 20 characters). Can be one of ``RUNNING``, ``SCHEDULED`` (default),
    ``FINISHED``, ``FAILED``.
    """
    start_time = Column(BigInteger, default=int(time.time()))
    """
    Run start time: `BigInteger`. Defaults to current system time.
    """
    end_time = Column(BigInteger, nullable=True, default=None)
    """
    Run end time: `BigInteger`.
    """
    source_version = Column(String(50))
    """
    Source version: `String` (limit 50 characters).
    """
    lifecycle_stage = Column(String(20), default=LifecycleStage.ACTIVE)
    """
    Lifecycle Stage of run: `String` (limit 20 characters).
    Can be either ``active`` (default) or ``deleted``.
    """
    artifact_uri = Column(String(200), default=None)
    """
    Default artifact location for this run: `String` (limit 200 characters).
    """
    experiment_id = Column(Integer, ForeignKey('experiments.experiment_id'))
    """
    Experiment ID to which this run belongs: *Foreign Key* into ``experiment`` table.
    """
    experiment = relationship('SqlExperiment', backref=backref('runs', cascade='all'))
    """
    SQLAlchemy relationship (many:one) with
    :py:class:`mlflow.store.dbmodels.models.SqlExperiment`.
    """

    __table_args__ = (
        CheckConstraint(source_type.in_(SourceTypes), name='source_type'),
        CheckConstraint(status.in_(RunStatusTypes), name='status'),
        CheckConstraint(lifecycle_stage.in_(LifecycleStage.view_type_to_stages(ViewType.ALL)),
                        name='runs_lifecycle_stage'),
        PrimaryKeyConstraint('run_uuid', name='run_pk')
    )

    def to_mlflow_entity(self):
        """
        Convert DB model to corresponding MLflow entity.

        :return: :py:class:`mlflow.entities.Run`.
        """
        run_info = RunInfo(run_uuid=self.run_uuid,
                           run_id=self.run_uuid,
                           experiment_id=str(self.experiment_id),
                           user_id=self.user_id,
                           status=self.status,
                           start_time=self.start_time,
                           end_time=self.end_time,
                           lifecycle_stage=self.lifecycle_stage,
                           artifact_uri=self.artifact_uri)

        # only get latest recorded metrics per key
        all_metrics = [m.to_mlflow_entity() for m in self.metrics]
        metrics = {}
        for m in all_metrics:
            existing_metric = metrics.get(m.key)
            if (existing_metric is None) \
                or ((m.step, m.timestamp, m.value) >=
                    (existing_metric.step, existing_metric.timestamp, existing_metric.value)):
                metrics[m.key] = m

        run_data = RunData(metrics=list(metrics.values()),
                           params=[p.to_mlflow_entity() for p in self.params],
                           tags=[t.to_mlflow_entity() for t in self.tags])
        return Run(run_info=run_info, run_data=run_data)