def create_run(self, experiment_id, user_id, start_time, tags):
    """
    Create a run in the ``RUNNING`` status under the given experiment and save it.

    :param experiment_id: ID of the experiment the run is created under. The experiment is
                          fetched with ``self.get_experiment`` and validated with
                          ``self._check_experiment_is_active``.
    :param user_id: ID of the user stored on the run.
    :param start_time: Start time stored on the run.
    :param tags: Iterable of tag objects exposing ``key`` and ``value``. ``None`` is treated
                 as "no tags". When the same key appears more than once, the last value wins.
    :return: The result of ``to_mlflow_entity()`` on the newly stored ``SqlRun``.
    """
    with self.ManagedSessionMaker() as session:
        experiment = self.get_experiment(experiment_id)
        self._check_experiment_is_active(experiment)

        run_id = uuid.uuid4().hex
        artifact_location = append_to_uri_path(
            experiment.artifact_location, run_id, SqlAlchemyStore.ARTIFACTS_FOLDER_NAME)
        run = SqlRun(name="", artifact_uri=artifact_location, run_uuid=run_id,
                     experiment_id=experiment_id,
                     source_type=SourceType.to_string(SourceType.UNKNOWN),
                     source_name="", entry_point_name="",
                     user_id=user_id, status=RunStatus.to_string(RunStatus.RUNNING),
                     start_time=start_time, end_time=None,
                     source_version="", lifecycle_stage=LifecycleStage.ACTIVE)

        # Going through a dict de-duplicates keys (last value wins). `tags or []` keeps a
        # caller passing tags=None from failing with a TypeError on iteration.
        tags_dict = {tag.key: tag.value for tag in (tags or [])}
        run.tags = [SqlTag(key=key, value=value) for key, value in tags_dict.items()]
        self._save_to_db(objs=run, session=session)

        return run.to_mlflow_entity()
def end_run(status=RunStatus.to_string(RunStatus.FINISHED)):
    """
    Terminate the most recently started active run, if any.

    Does nothing when the active run stack is empty. Otherwise the run-id environment
    variable is unset, the top run is popped off the stack, and the run is marked as
    terminated with ``status`` through ``MlflowClient().set_terminated``.

    :param status: String run status to record on the terminated run.
    """
    if not _active_run_stack:
        return
    # The environment variable mirrors the active run, so clear it along with the stack entry.
    env.unset_variable(_RUN_ID_ENV_VAR)
    finished = _active_run_stack.pop()
    MlflowClient().set_terminated(finished.info.run_id, status)
def test_status_mappings(self):
    """Verify RunStatus enum <-> string conversions and the messages raised on bad input."""
    # Each known status must convert to its name and back again.
    for name in ("RUNNING", "SCHEDULED", "FINISHED", "FAILED", "KILLED"):
        status = getattr(RunStatus, name)
        self.assertEqual(name, RunStatus.to_string(status))
        self.assertEqual(status, RunStatus.from_string(name))

    # An unknown enum value is rejected with a descriptive message.
    with self.assertRaises(Exception) as bad_enum:
        RunStatus.to_string(-120)
    self.assertIn("Could not get string corresponding to run status -120",
                  str(bad_enum.exception))

    # An unknown status string is rejected with a descriptive message.
    with self.assertRaises(Exception) as bad_string:
        RunStatus.from_string("the IMPOSSIBLE status string")
    self.assertIn("Could not get run status corresponding to string the IMPO",
                  str(bad_string.exception))
def test_start_run_context_manager():
    """Check start_run() as a context manager: FINISHED on clean exit, FAILED on exception."""
    with start_run() as run_ok:
        ok_run_id = run_ok.info.run_id
        # The run must already be visible in the store while the context is still open.
        stored = tracking.MlflowClient().get_run(ok_run_id)
        assert stored is not None
        assert stored.info == run_ok.info
    after_ok = tracking.MlflowClient().get_run(ok_run_id)
    assert after_ok.info.status == RunStatus.to_string(RunStatus.FINISHED)

    # A second run whose body raises must get its own run ID and end up FAILED.
    with pytest.raises(Exception):
        with start_run() as run_bad:
            bad_run_id = run_bad.info.run_id
            raise Exception("Failing run!")
    assert bad_run_id != ok_run_id
    after_bad = tracking.MlflowClient().get_run(bad_run_id)
    assert after_bad.info.status == RunStatus.to_string(RunStatus.FAILED)
def update_run_info(self, run_id, run_status, end_time):
    """
    Update the status and end time of a stored run.

    :param run_id: ID used to look the run up via ``self._get_run``.
    :param run_status: Status value; it is converted with ``RunStatus.to_string`` before
                       being stored.
    :param end_time: End time to record on the run.
    :return: The ``info`` attribute of the run's MLflow entity after the update.
    """
    with self.ManagedSessionMaker() as session:
        sql_run = self._get_run(run_uuid=run_id, session=session)
        self._check_run_is_active(sql_run)

        sql_run.status = RunStatus.to_string(run_status)
        sql_run.end_time = end_time
        self._save_to_db(objs=sql_run, session=session)

        return sql_run.to_mlflow_entity().info
def set_terminated(self, run_id, status=None, end_time=None):
    """
    Set a run's status to terminated.

    :param run_id: ID of the run to update.
    :param status: A string value of :py:class:`mlflow.entities.RunStatus`. A falsy value
                   (including the default ``None``) is replaced by "FINISHED".
    :param end_time: End time for the run. A falsy value (including the default ``None``)
                     is replaced by the current time in milliseconds.
    """
    if not end_time:
        end_time = int(time.time() * 1000)
    if not status:
        status = RunStatus.to_string(RunStatus.FINISHED)
    # The store receives the enum form of the status, so parse the string back.
    self.store.update_run_info(
        run_id, run_status=RunStatus.from_string(status), end_time=end_time)
def test_run_local_git_repo(local_git_repo, local_git_repo_uri, use_start_run, version):
    """Run the test project from a local git repo and validate what the store recorded."""
    # With a version, address the project through the repo URI; otherwise use the plain path.
    uri = (local_git_repo_uri + "#" + TEST_PROJECT_NAME
           if version is not None
           else os.path.join("%s/" % local_git_repo, TEST_PROJECT_NAME))
    if version == "git-commit":
        version = _get_version_local_git_repo(local_git_repo)

    submitted_run = kiwi.projects.run(
        uri, entry_point="test_tracking", version=version,
        parameters={"use_start_run": use_start_run},
        use_conda=False, experiment_id=FileStore.DEFAULT_EXPERIMENT_ID)

    # Blocking runs should be finished when they return
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # wait() on a synchronous run must be callable and leave the status unchanged.
    submitted_run.wait()
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)

    # Validate run contents in the FileStore
    run_id = submitted_run.run_id
    client = kiwi.tracking.MlflowClient()
    active_infos = client.list_run_infos(
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID, run_view_type=ViewType.ACTIVE_ONLY)
    assert len(active_infos) == 1
    assert run_id == active_infos[0].run_id

    stored_run = client.get_run(run_id)
    assert stored_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
    assert stored_run.data.params == {"use_start_run": use_start_run}
    assert stored_run.data.metrics == {"some_key": 3}

    tags = stored_run.data.tags
    assert tags[MLFLOW_USER] == MOCK_USER
    assert "file:" in tags[MLFLOW_SOURCE_NAME]
    assert tags[MLFLOW_SOURCE_TYPE] == SourceType.to_string(SourceType.PROJECT)
    assert tags[MLFLOW_PROJECT_ENTRY_POINT] == "test_tracking"
    assert tags[MLFLOW_PROJECT_BACKEND] == "local"

    # Git tags are only checked when the run was launched from the master branch.
    if version == "master":
        assert tags[MLFLOW_GIT_BRANCH] == "master"
        assert tags[MLFLOW_GIT_REPO_URL] == local_git_repo_uri
        assert tags[LEGACY_MLFLOW_GIT_BRANCH_NAME] == "master"
        assert tags[LEGACY_MLFLOW_GIT_REPO_URL] == local_git_repo_uri
def _read_persisted_run_info_dict(run_info_dict):
    """
    Build a ``RunInfo`` from a persisted run-info dictionary.

    The input dictionary is not modified; a copy is normalized and handed to
    ``RunInfo.from_dictionary``.

    :param run_info_dict: Dictionary of persisted run-info fields.
    :return: The ``RunInfo`` created from the normalized copy.
    """
    normalized = run_info_dict.copy()
    # Entries without a lifecycle stage are treated as active.
    normalized.setdefault('lifecycle_stage', LifecycleStage.ACTIVE)
    # 'status' is stored as an integer enum in the meta file, while RunInfo.status is a
    # string, so convert it here. A missing status falls back to RUNNING.
    persisted_status = run_info_dict.get('status', RunStatus.RUNNING)
    normalized['status'] = RunStatus.to_string(persisted_status)
    # 'experiment_id' changed from int to string, so legacy integer IDs are cast to string.
    legacy_experiment_id = normalized["experiment_id"]
    if isinstance(legacy_experiment_id, int):
        normalized["experiment_id"] = str(legacy_experiment_id)
    return RunInfo.from_dictionary(normalized)
def _create():
    """
    Build a ``RunInfo`` from randomly generated field values.

    :return: A tuple of the ``RunInfo`` followed by the values it was built from, in the
             order: run ID, experiment ID, user ID, status, start time, end time,
             lifecycle stage, artifact URI.
    """
    # The random draws happen in a fixed order: experiment ID, user ID, status,
    # start time, end time offset, artifact URI.
    rid = str(uuid.uuid4())
    exp_id = str(random_int(10, 2000))
    user = random_str(random_int(10, 25))
    status_name = RunStatus.to_string(random.choice(RunStatus.all_status()))
    started = random_int(1, 10)
    ended = started + random_int(1, 10)
    stage = LifecycleStage.ACTIVE
    uri = random_str(random_int(10, 40))

    run_info = RunInfo(run_uuid=rid, run_id=rid, experiment_id=exp_id, user_id=user,
                       status=status_name, start_time=started, end_time=ended,
                       lifecycle_stage=stage, artifact_uri=uri)
    expected = (rid, exp_id, user, status_name, started, ended, stage, uri)
    return (run_info,) + expected
def create_run(self, experiment_id, user_id, start_time, tags):
    """
    Creates a run with the specified attributes.

    :param experiment_id: ID of the experiment to create the run under. ``None`` selects
                          ``FileStore.DEFAULT_EXPERIMENT_ID``.
    :param user_id: ID of the user stored on the run.
    :param start_time: Start time stored on the run.
    :param tags: Iterable of tags, each passed to ``self.set_tag``. ``None`` is treated as
                 "no tags".
    :return: The run as read back through ``self.get_run``.
    :raises MlflowException: With ``RESOURCE_DOES_NOT_EXIST`` when the experiment is not
                             found, or ``INVALID_STATE`` when it is not active.
    """
    experiment_id = FileStore.DEFAULT_EXPERIMENT_ID if experiment_id is None else experiment_id
    experiment = self.get_experiment(experiment_id)
    if experiment is None:
        raise MlflowException(
            "Could not create run under experiment with ID %s - no such experiment "
            "exists." % experiment_id,
            databricks_pb2.RESOURCE_DOES_NOT_EXIST)
    if experiment.lifecycle_stage != LifecycleStage.ACTIVE:
        raise MlflowException(
            "Could not create run under non-active experiment with ID "
            "%s." % experiment_id,
            databricks_pb2.INVALID_STATE)

    run_uuid = uuid.uuid4().hex
    artifact_uri = self._get_artifact_dir(experiment_id, run_uuid)
    run_info = RunInfo(run_uuid=run_uuid, run_id=run_uuid, experiment_id=experiment_id,
                       artifact_uri=artifact_uri, user_id=user_id,
                       status=RunStatus.to_string(RunStatus.RUNNING),
                       start_time=start_time, end_time=None,
                       lifecycle_stage=LifecycleStage.ACTIVE)

    # Persist run metadata and create directories for logging metrics, parameters, artifacts
    run_dir = self._get_run_dir(run_info.experiment_id, run_info.run_id)
    mkdir(run_dir)
    run_info_dict = _make_persisted_run_info_dict(run_info)
    write_yaml(run_dir, FileStore.META_DATA_FILE_NAME, run_info_dict)
    mkdir(run_dir, FileStore.METRICS_FOLDER_NAME)
    mkdir(run_dir, FileStore.PARAMS_FOLDER_NAME)
    mkdir(run_dir, FileStore.ARTIFACTS_FOLDER_NAME)

    # `tags or []` keeps tags=None from raising a TypeError after the run directory has
    # already been written, mirroring how a None experiment_id is tolerated above.
    for tag in tags or []:
        self.set_tag(run_uuid, tag)
    return self.get_run(run_id=run_uuid)
def test_run(use_start_run):
    """Run the test project from its directory and validate what the store recorded."""
    submitted_run = kiwi.projects.run(
        TEST_PROJECT_DIR, entry_point="test_tracking",
        parameters={"use_start_run": use_start_run},
        use_conda=False, experiment_id=FileStore.DEFAULT_EXPERIMENT_ID)
    assert submitted_run.run_id is not None

    # Blocking runs should be finished when they return
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)
    # wait() on a synchronous run must be callable and leave the status unchanged.
    submitted_run.wait()
    validate_exit_status(submitted_run.get_status(), RunStatus.FINISHED)

    # Validate run contents in the FileStore
    run_id = submitted_run.run_id
    client = kiwi.tracking.MlflowClient()
    active_infos = client.list_run_infos(
        experiment_id=FileStore.DEFAULT_EXPERIMENT_ID, run_view_type=ViewType.ACTIVE_ONLY)
    assert len(active_infos) == 1
    assert run_id == active_infos[0].run_id

    stored_run = client.get_run(run_id)
    assert stored_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
    assert stored_run.data.params == {"use_start_run": use_start_run}
    assert stored_run.data.metrics == {"some_key": 3}

    tags = stored_run.data.tags
    assert tags[MLFLOW_USER] == MOCK_USER
    assert "file:" in tags[MLFLOW_SOURCE_NAME]
    assert tags[MLFLOW_SOURCE_TYPE] == SourceType.to_string(SourceType.PROJECT)
    assert tags[MLFLOW_PROJECT_ENTRY_POINT] == "test_tracking"
def get_status(self):
    """Return the value of ``self._get_status()`` converted with ``RunStatus.to_string``."""
    current_status = self._get_status()
    return RunStatus.to_string(current_status)
class SqlRun(Base):
    """
    DB model for :py:class:`mlflow.entities.Run`. These are recorded in ``runs`` table.
    """
    __tablename__ = 'runs'

    run_uuid = Column(String(32), nullable=False)
    """
    Run UUID: `String` (limit 32 characters). *Primary Key* for ``runs`` table.
    """
    name = Column(String(250))
    """
    Run name: `String` (limit 250 characters).
    """
    source_type = Column(String(20), default=SourceType.to_string(SourceType.LOCAL))
    """
    Source Type: `String` (limit 20 characters). Can be one of ``NOTEBOOK``, ``JOB``, ``PROJECT``,
                 ``LOCAL`` (default), or ``UNKNOWN``.
    """
    source_name = Column(String(500))
    """
    Name of source recording the run: `String` (limit 500 characters).
    """
    entry_point_name = Column(String(50))
    """
    Entry-point name that launched the run: `String` (limit 50 characters).
    """
    user_id = Column(String(256), nullable=True, default=None)
    """
    User ID: `String` (limit 256 characters). Defaults to ``null``.
    """
    status = Column(String(20), default=RunStatus.to_string(RunStatus.SCHEDULED))
    """
    Run Status: `String` (limit 20 characters). Can be one of ``RUNNING``, ``SCHEDULED`` (default),
                ``FINISHED``, ``FAILED``, or ``KILLED``.
    """
    # A callable default is evaluated for each insert. A plain `int(time.time())` here would
    # be computed once at import time and reused for every row that relies on the default.
    # NOTE(review): the default is in whole seconds; confirm it matches the unit used by
    # callers that pass an explicit start_time.
    start_time = Column(BigInteger, default=lambda: int(time.time()))
    """
    Run start time: `BigInteger`. Defaults to current system time.
    """
    end_time = Column(BigInteger, nullable=True, default=None)
    """
    Run end time: `BigInteger`.
    """
    source_version = Column(String(50))
    """
    Source version: `String` (limit 50 characters).
    """
    lifecycle_stage = Column(String(20), default=LifecycleStage.ACTIVE)
    """
    Lifecycle Stage of run: `String` (limit 20 characters).
                            Can be either ``active`` (default) or ``deleted``.
    """
    artifact_uri = Column(String(200), default=None)
    """
    Default artifact location for this run: `String` (limit 200 characters).
    """
    experiment_id = Column(Integer, ForeignKey('experiments.experiment_id'))
    """
    Experiment ID to which this run belongs to: *Foreign Key* into ``experiment`` table.
    """
    experiment = relationship('SqlExperiment', backref=backref('runs', cascade='all'))
    """
    SQLAlchemy relationship (many:one) with :py:class:`mlflow.store.dbmodels.models.SqlExperiment`.
    """

    __table_args__ = (
        CheckConstraint(source_type.in_(SourceTypes), name='source_type'),
        CheckConstraint(status.in_(RunStatusTypes), name='status'),
        CheckConstraint(lifecycle_stage.in_(LifecycleStage.view_type_to_stages(ViewType.ALL)),
                        name='runs_lifecycle_stage'),
        PrimaryKeyConstraint('run_uuid', name='run_pk')
    )

    @staticmethod
    def get_attribute_name(mlflow_attribute_name):
        """
        Resolves an MLflow attribute name to a `SqlRun` attribute name.

        :param mlflow_attribute_name: MLflow attribute name to resolve.
        :return: The same name, unchanged.
        """
        # Currently, MLflow Search attributes defined in `SearchUtils.VALID_SEARCH_ATTRIBUTE_KEYS`
        # share the same names as their corresponding `SqlRun` attributes. Therefore, this function
        # returns the same attribute name
        return mlflow_attribute_name

    def to_mlflow_entity(self):
        """
        Convert DB model to corresponding MLflow entity.

        :return: :py:class:`mlflow.entities.Run`.
        """
        # The entity carries the experiment ID as a string; the column is an Integer.
        run_info = RunInfo(
            run_uuid=self.run_uuid,
            run_id=self.run_uuid,
            experiment_id=str(self.experiment_id),
            user_id=self.user_id,
            status=self.status,
            start_time=self.start_time,
            end_time=self.end_time,
            lifecycle_stage=self.lifecycle_stage,
            artifact_uri=self.artifact_uri)

        run_data = RunData(
            metrics=[m.to_mlflow_entity() for m in self.latest_metrics],
            params=[p.to_mlflow_entity() for p in self.params],
            tags=[t.to_mlflow_entity() for t in self.tags])

        return Run(run_info=run_info, run_data=run_data)
BigInteger, PrimaryKeyConstraint, Boolean) from kiwi.entities import (Experiment, RunTag, Metric, Param, RunData, RunInfo, SourceType, RunStatus, Run, ViewType, ExperimentTag) from kiwi.entities.lifecycle_stage import LifecycleStage from kiwi.store.db.base_sql_model import Base SourceTypes = [ SourceType.to_string(SourceType.NOTEBOOK), SourceType.to_string(SourceType.JOB), SourceType.to_string(SourceType.LOCAL), SourceType.to_string(SourceType.UNKNOWN), SourceType.to_string(SourceType.PROJECT) ] RunStatusTypes = [ RunStatus.to_string(RunStatus.SCHEDULED), RunStatus.to_string(RunStatus.FAILED), RunStatus.to_string(RunStatus.FINISHED), RunStatus.to_string(RunStatus.RUNNING), RunStatus.to_string(RunStatus.KILLED) ] class SqlExperiment(Base): """ DB model for :py:class:`mlflow.entities.Experiment`. These are recorded in ``experiment`` table. """ __tablename__ = 'experiments' experiment_id = Column(Integer, autoincrement=True) """
def __exit__(self, exc_type, exc_val, exc_tb):
    """
    End the run when the ``with`` block exits.

    The run is ended as FINISHED when no exception was raised in the block, and as FAILED
    otherwise.

    :return: ``True`` when no exception occurred, ``False`` otherwise (so an exception
             raised inside the block is not suppressed).
    """
    completed_cleanly = exc_type is None
    if completed_cleanly:
        final_status = RunStatus.FINISHED
    else:
        final_status = RunStatus.FAILED
    end_run(RunStatus.to_string(final_status))
    return completed_cleanly
def get_status(self, databricks_run_id):
    """
    Return the status of a run converted with ``RunStatus.to_string``.

    :param databricks_run_id: Run ID forwarded to ``self._get_status``.
    """
    current_status = self._get_status(databricks_run_id)
    return RunStatus.to_string(current_status)