Example #1
def ui(backend_store_uri, default_artifact_root, port):
    """
    Launch the MLflow tracking UI for local viewing of run results. To launch a production
    server, use the "mlflow server" command instead.

    The UI will be visible at http://localhost:5000 by default.
    """

    # Ensure that both backend_store_uri and default_artifact_root are set correctly.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    if not default_artifact_root:
        if is_local_uri(backend_store_uri):
            default_artifact_root = backend_store_uri
        else:
            default_artifact_root = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    try:
        initialize_backend_stores(backend_store_uri, default_artifact_root)
    except Exception as e:  # pylint: disable=broad-except
        _logger.error("Error initializing backend store")
        _logger.exception(e)
        sys.exit(1)

    # TODO: We eventually want to disable the write path in this version of the server.
    try:
        _run_server(backend_store_uri, default_artifact_root, "127.0.0.1", port, None, 1)
    except ShellCommandException:
        eprint(
            "Running the mlflow server failed. Please see the logs above for details."
        )
        sys.exit(1)
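
The fallback logic above is compact enough to demonstrate on its own. Below is a minimal, self-contained sketch of that branching, assuming DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH resolves to the local ./mlruns directory (as in MLflow) and substituting a crude stand-in for is_local_uri (MLflow's real check lives in mlflow.utils.uri):

def resolve_store_uris(backend_store_uri, default_artifact_root):
    # Crude stand-in for mlflow.utils.uri.is_local_uri, for illustration only.
    def is_local(uri):
        return uri != "databricks" and ("://" not in uri or uri.startswith("file:"))

    # An unset backend store falls back to the local default path.
    backend_store_uri = backend_store_uri or "./mlruns"
    if not default_artifact_root:
        # A local backend store doubles as the artifact root; a remote one
        # falls back to the local default instead.
        default_artifact_root = backend_store_uri if is_local(backend_store_uri) else "./mlruns"
    return backend_store_uri, default_artifact_root

assert resolve_store_uris(None, None) == ("./mlruns", "./mlruns")
assert resolve_store_uris("sqlite:///mlflow.db", None) == ("sqlite:///mlflow.db", "./mlruns")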
Example #2
def server(
    backend_store_uri,
    default_artifact_root,
    host,
    port,
    workers,
    static_prefix,
    gunicorn_opts,
    waitress_opts,
    expose_prometheus,
):
    """
    Run the MLflow tracking server.

    The server listens on http://localhost:5000 by default and only accepts connections
    from the local machine. To let the server accept connections from other machines, you will
    need to pass ``--host 0.0.0.0`` to listen on all network interfaces
    (or a specific interface address).
    """
    from mlflow.server import _run_server
    from mlflow.server.handlers import initialize_backend_stores

    _validate_server_args(gunicorn_opts=gunicorn_opts, workers=workers, waitress_opts=waitress_opts)

    # Ensure that both backend_store_uri and default_artifact_root are set correctly.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    if not default_artifact_root:
        if is_local_uri(backend_store_uri):
            default_artifact_root = backend_store_uri
        else:
            eprint(
                "Option 'default-artifact-root' is required, when backend store is not "
                "local file based."
            )
            sys.exit(1)

    try:
        initialize_backend_stores(backend_store_uri, default_artifact_root)
    except Exception as e:
        _logger.error("Error initializing backend store")
        _logger.exception(e)
        sys.exit(1)

    try:
        _run_server(
            backend_store_uri,
            default_artifact_root,
            host,
            port,
            static_prefix,
            workers,
            gunicorn_opts,
            waitress_opts,
            expose_prometheus,
        )
    except ShellCommandException:
        eprint("Running the mlflow server failed. Please see the logs above for details.")
        sys.exit(1)
Example #3
    def __init__(self, db_uri, default_artifact_root):
        """
        Create a database backed store.

        :param db_uri: The SQLAlchemy database URI string to connect to the database. See
                       the `SQLAlchemy docs
                       <https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>`_
                       for format specifications. MLflow supports the dialects ``mysql``,
                       ``mssql``, ``sqlite``, and ``postgresql``.
        :param default_artifact_root: Path/URI to location suitable for large data (such as a blob
                                      store object, DBFS path, or shared NFS file system).
        """
        super().__init__()
        self.db_uri = db_uri
        self.db_type = extract_db_type_from_uri(db_uri)
        self.artifact_root_uri = default_artifact_root
        # Quick check to see if the respective SQLAlchemy database engine has already been created.
        if db_uri not in SqlAlchemyStore._db_uri_sql_alchemy_engine_map:
            with SqlAlchemyStore._db_uri_sql_alchemy_engine_map_lock:
                # Repeat check to prevent race conditions where one thread checks for an existing
                # engine while another is creating the respective one, resulting in multiple
                # engines being created. It isn't combined with the above check to prevent
                # inefficiency from multiple threads waiting for the lock to check for engine
                # existence if it has already been created.
                if db_uri not in SqlAlchemyStore._db_uri_sql_alchemy_engine_map:
                    SqlAlchemyStore._db_uri_sql_alchemy_engine_map[db_uri] = (
                        mlflow.store.db.utils.create_sqlalchemy_engine_with_retry(db_uri)
                    )
        self.engine = SqlAlchemyStore._db_uri_sql_alchemy_engine_map[db_uri]
        # On a completely fresh MLflow installation against an empty database (verify database
        # emptiness by checking that 'experiments' etc aren't in the list of table names), run all
        # DB migrations
        expected_tables = [
            SqlExperiment.__tablename__,
            SqlRun.__tablename__,
            SqlMetric.__tablename__,
            SqlParam.__tablename__,
            SqlTag.__tablename__,
            SqlExperimentTag.__tablename__,
            SqlLatestMetric.__tablename__,
        ]
        inspected_tables = set(sqlalchemy.inspect(self.engine).get_table_names())
        if any(table not in inspected_tables for table in expected_tables):
            mlflow.store.db.utils._initialize_tables(self.engine)
        Base.metadata.bind = self.engine
        SessionMaker = sqlalchemy.orm.sessionmaker(bind=self.engine)
        self.ManagedSessionMaker = mlflow.store.db.utils._get_managed_session_maker(
            SessionMaker, self.db_type)
        mlflow.store.db.utils._verify_schema(self.engine)

        if is_local_uri(default_artifact_root):
            mkdir(local_file_uri_to_path(default_artifact_root))

        if len(self.list_experiments(view_type=ViewType.ALL)) == 0:
            with self.ManagedSessionMaker() as session:
                self._create_default_experiment(session)
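
The engine cache in this constructor is an instance of the double-checked locking pattern. Here is a generic sketch of the same idea, with a hypothetical make_engine factory standing in for mlflow.store.db.utils.create_sqlalchemy_engine_with_retry:

import threading

_engine_cache = {}
_engine_cache_lock = threading.Lock()

def get_engine(db_uri, make_engine):
    # The first check runs without the lock, keeping the common,
    # already-cached case cheap.
    if db_uri not in _engine_cache:
        with _engine_cache_lock:
            # Re-check under the lock: another thread may have created the
            # engine between the first check and lock acquisition.
            if db_uri not in _engine_cache:
                _engine_cache[db_uri] = make_engine(db_uri)
    return _engine_cache[db_uri]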
Example #4
def test_uri_types():
    assert is_local_uri("mlruns")
    assert is_local_uri("./mlruns")
    assert is_local_uri("file:///foo/mlruns")
    assert is_local_uri("file:foo/mlruns")
    assert not is_local_uri("https://whatever")
    assert not is_local_uri("http://whatever")
    assert not is_local_uri("databricks")
    assert not is_local_uri("databricks:whatever")
    assert not is_local_uri("databricks://whatever")

    assert is_databricks_uri("databricks")
    assert is_databricks_uri("databricks:whatever")
    assert is_databricks_uri("databricks://whatever")
    assert not is_databricks_uri("mlruns")
    assert not is_databricks_uri("http://whatever")

    assert is_http_uri("http://whatever")
    assert is_http_uri("https://whatever")
    assert not is_http_uri("file://whatever")
    assert not is_http_uri("databricks://whatever")
    assert not is_http_uri("mlruns")
Example #5
    def __init__(self, db_uri, default_artifact_root):
        """
        Create a database backed store.

        :param db_uri: The SQLAlchemy database URI string to connect to the database. See
                       the `SQLAlchemy docs
                       <https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>`_
                       for format specifications. MLflow supports the dialects ``mysql``,
                       ``mssql``, ``sqlite``, and ``postgresql``.
        :param default_artifact_root: Path/URI to location suitable for large data (such as a blob
                                      store object, DBFS path, or shared NFS file system).
        """
        super().__init__()
        self.db_uri = db_uri
        self.db_type = extract_db_type_from_uri(db_uri)
        self.artifact_root_uri = default_artifact_root
        self.engine = mlflow.store.db.utils.create_sqlalchemy_engine_with_retry(db_uri)
        # On a completely fresh MLflow installation against an empty database (verify database
        # emptiness by checking that 'experiments' etc aren't in the list of table names), run all
        # DB migrations
        expected_tables = [
            SqlExperiment.__tablename__,
            SqlRun.__tablename__,
            SqlMetric.__tablename__,
            SqlParam.__tablename__,
            SqlTag.__tablename__,
            SqlExperimentTag.__tablename__,
            SqlLatestMetric.__tablename__,
        ]
        inspected_tables = set(sqlalchemy.inspect(self.engine).get_table_names())
        if any(table not in inspected_tables for table in expected_tables):
            mlflow.store.db.utils._initialize_tables(self.engine)
        Base.metadata.bind = self.engine
        SessionMaker = sqlalchemy.orm.sessionmaker(bind=self.engine)
        self.ManagedSessionMaker = mlflow.store.db.utils._get_managed_session_maker(
            SessionMaker, self.db_type)
        mlflow.store.db.utils._verify_schema(self.engine)

        if is_local_uri(default_artifact_root):
            mkdir(local_file_uri_to_path(default_artifact_root))

        if len(self.list_experiments()) == 0:
            with self.ManagedSessionMaker() as session:
                self._create_default_experiment(session)
Example #6
def resolve_default_artifact_root(
    serve_artifacts: bool,
    default_artifact_root: str,
    backend_store_uri: str,
    resolve_to_local: bool = False,
) -> str:
    if serve_artifacts and not default_artifact_root:
        default_artifact_root = DEFAULT_ARTIFACTS_URI
    elif not serve_artifacts and not default_artifact_root:
        if is_local_uri(backend_store_uri):
            default_artifact_root = backend_store_uri
        elif resolve_to_local:
            default_artifact_root = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH
        else:
            msg = (
                "Option 'default-artifact-root' is required when the backend store is not "
                "local file based."
            )
            eprint(msg)
            raise click.UsageError(message=msg)
    return default_artifact_root
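
A quick walk through the four branches above, assuming (as in MLflow) that DEFAULT_ARTIFACTS_URI is the proxied mlflow-artifacts:/ scheme and DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH is ./mlruns:

# Proxied artifact serving enabled, no explicit root -> the proxy URI.
resolve_default_artifact_root(True, "", "sqlite:///mlflow.db")
# == DEFAULT_ARTIFACTS_URI (e.g. "mlflow-artifacts:/")

# No proxying, local backend store -> reuse the store path.
resolve_default_artifact_root(False, "", "./mlruns")
# == "./mlruns"

# No proxying, remote store, local fallback allowed -> local default.
resolve_default_artifact_root(False, "", "sqlite:///mlflow.db", resolve_to_local=True)
# == DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH (e.g. "./mlruns")

# No proxying, remote store, no fallback -> click.UsageError is raised.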
Example #7
def ui(backend_store_uri, default_artifact_root, port, host):
    """
    Launch the MLflow tracking UI for local viewing of run results. To launch a production
    server, use the "mlflow server" command instead.

    The UI is served at http://localhost:5000 by default and only accepts connections
    from the local machine. To let the UI server accept connections from other machines, you will
    need to pass ``--host 0.0.0.0`` to listen on all network interfaces (or a specific interface
    address).
    """
    from mlflow.server import _run_server
    from mlflow.server.handlers import initialize_backend_stores

    # Ensure that both backend_store_uri and default_artifact_root are set correctly.
    if not backend_store_uri:
        backend_store_uri = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    if not default_artifact_root:
        if is_local_uri(backend_store_uri):
            default_artifact_root = backend_store_uri
        else:
            default_artifact_root = DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH

    try:
        initialize_backend_stores(backend_store_uri, default_artifact_root)
    except Exception as e:  # pylint: disable=broad-except
        _logger.error("Error initializing backend store")
        _logger.exception(e)
        sys.exit(1)

    # TODO: We eventually want to disable the write path in this version of the server.
    try:
        _run_server(backend_store_uri, default_artifact_root, host, port, None, 1)
    except ShellCommandException:
        eprint(
            "Running the mlflow server failed. Please see the logs above for details."
        )
        sys.exit(1)
Example #8
def log_model(
    spark_model,
    artifact_path,
    conda_env=None,
    dfs_tmpdir=None,
    sample_input=None,
    registered_model_name=None,
    signature: ModelSignature = None,
    input_example: ModelInputExample = None,
    await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
    pip_requirements=None,
    extra_pip_requirements=None,
):
    """
    Log a Spark MLlib model as an MLflow artifact for the current run. This uses the
    MLlib persistence format and produces an MLflow Model with the Spark flavor.

    Note: If no run is active, it will instantiate a run to obtain a run_id.

    :param spark_model: Spark model to be saved - MLflow can only save descendants of
                        pyspark.ml.Model which implement MLReadable and MLWritable.
    :param artifact_path: Run relative artifact path.
    :param conda_env: Either a dictionary representation of a Conda environment or the path to a
                      Conda environment yaml file. If provided, this describes the environment
                      this model should be run in. At minimum, it should specify the dependencies
                      contained in :func:`get_default_conda_env()`. If `None`, the default
                      :func:`get_default_conda_env()` environment is added to the model.
                      The following is an *example* dictionary representation of a Conda
                      environment::

                        {
                            'name': 'mlflow-env',
                            'channels': ['defaults'],
                            'dependencies': [
                                'python=3.7.0',
                                'pyspark=2.3.0'
                            ]
                        }
    :param dfs_tmpdir: Temporary directory path on Distributed (Hadoop) File System (DFS) or local
                       filesystem if running in local mode. The model is written in this
                       destination and then copied into the model's artifact directory. This is
                       necessary as Spark ML models read from and write to DFS if running on a
                       cluster. If this operation completes successfully, all temporary files
                       created on the DFS are removed. Defaults to ``/tmp/mlflow``.
    :param sample_input: A sample input used to add the MLeap flavor to the model.
                         This must be a PySpark DataFrame that the model can evaluate. If
                         ``sample_input`` is ``None``, the MLeap flavor is not added.
    :param registered_model_name: If given, create a model version under
                                  ``registered_model_name``, also creating a registered model if one
                                  with the given name does not exist.

    :param signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>`
                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
                      from datasets with valid model input (e.g. the training dataset with target
                      column omitted) and valid model output (e.g. model predictions generated on
                      the training dataset), for example:

                      .. code-block:: python

                        from mlflow.models.signature import infer_signature
                        train = df.drop_column("target_label")
                        predictions = ... # compute model predictions
                        signature = infer_signature(train, predictions)
    :param input_example: Input example provides one or several instances of valid
                          model input. The example can be used as a hint of what data to feed the
                          model. The given example will be converted to a Pandas DataFrame and then
                          serialized to json using the Pandas split-oriented format. Bytes are
                          base64-encoded.
    :param await_registration_for: Number of seconds to wait for the model version to finish
                            being created and reach ``READY`` status. By default, the function
                            waits for five minutes. Specify 0 or None to skip waiting.
    :param pip_requirements: {{ pip_requirements }}
    :param extra_pip_requirements: {{ extra_pip_requirements }}
    :return: A :py:class:`ModelInfo <mlflow.models.model.ModelInfo>` instance that contains the
             metadata of the logged model.

    .. code-block:: python
        :caption: Example

        from pyspark.ml import Pipeline
        from pyspark.ml.classification import LogisticRegression
        from pyspark.ml.feature import HashingTF, Tokenizer
        training = spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"])
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
        model = pipeline.fit(training)
        mlflow.spark.log_model(model, "spark-model")
    """
    from py4j.protocol import Py4JError

    _validate_model(spark_model)
    from pyspark.ml import PipelineModel

    if not isinstance(spark_model, PipelineModel):
        spark_model = PipelineModel([spark_model])
    run_id = mlflow.tracking.fluent._get_or_start_run().info.run_id
    run_root_artifact_uri = mlflow.get_artifact_uri()
    # If the artifact URI is a local filesystem path, defer to Model.log() to persist the model,
    # since Spark may not be able to write directly to the driver's filesystem. For example,
    # writing to `file:/uri` will write to the local filesystem from each executor, which will
    # be incorrect on multi-node clusters - to avoid such issues we just use the Model.log() path
    # here.
    if is_local_uri(run_root_artifact_uri):
        return Model.log(
            artifact_path=artifact_path,
            flavor=mlflow.spark,
            spark_model=spark_model,
            conda_env=conda_env,
            dfs_tmpdir=dfs_tmpdir,
            sample_input=sample_input,
            registered_model_name=registered_model_name,
            signature=signature,
            input_example=input_example,
            await_registration_for=await_registration_for,
            pip_requirements=pip_requirements,
            extra_pip_requirements=extra_pip_requirements,
        )
    model_dir = os.path.join(run_root_artifact_uri, artifact_path)
    # Try to write directly to the artifact repo via Spark. If this fails, defer to Model.log()
    # to persist the model
    try:
        spark_model.save(posixpath.join(model_dir, _SPARK_MODEL_PATH_SUB))
    except Py4JError:
        return Model.log(
            artifact_path=artifact_path,
            flavor=mlflow.spark,
            spark_model=spark_model,
            conda_env=conda_env,
            dfs_tmpdir=dfs_tmpdir,
            sample_input=sample_input,
            registered_model_name=registered_model_name,
            signature=signature,
            input_example=input_example,
            await_registration_for=await_registration_for,
            pip_requirements=pip_requirements,
            extra_pip_requirements=extra_pip_requirements,
        )

    # Otherwise, override the default model log behavior and save model directly to artifact repo
    mlflow_model = Model(artifact_path=artifact_path, run_id=run_id)
    with TempDir() as tmp:
        tmp_model_metadata_dir = tmp.path()
        _save_model_metadata(
            tmp_model_metadata_dir,
            spark_model,
            mlflow_model,
            sample_input,
            conda_env,
            signature=signature,
            input_example=input_example,
        )
        mlflow.tracking.fluent.log_artifacts(tmp_model_metadata_dir, artifact_path)
        if registered_model_name is not None:
            mlflow.register_model(
                "runs:/%s/%s" % (run_id, artifact_path),
                registered_model_name,
                await_registration_for,
            )
        return mlflow_model.get_model_info()
Example #9
def log_model(spark_model,
              artifact_path,
              conda_env=None,
              dfs_tmpdir=None,
              sample_input=None,
              registered_model_name=None):
    """
    Log a Spark MLlib model as an MLflow artifact for the current run. This uses the
    MLlib persistence format and produces an MLflow Model with the Spark flavor.

    :param spark_model: Spark model to be saved - MLflow can only save descendants of
                        pyspark.ml.Model which implement MLReadable and MLWritable.
    :param artifact_path: Run relative artifact path.
    :param conda_env: Either a dictionary representation of a Conda environment or the path to a
                      Conda environment yaml file. If provided, this describes the environment
                      this model should be run in. At minimum, it should specify the dependencies
                      contained in :func:`get_default_conda_env()`. If `None`, the default
                      :func:`get_default_conda_env()` environment is added to the model.
                      The following is an *example* dictionary representation of a Conda
                      environment::

                        {
                            'name': 'mlflow-env',
                            'channels': ['defaults'],
                            'dependencies': [
                                'python=3.7.0',
                                'pyspark=2.3.0'
                            ]
                        }
    :param dfs_tmpdir: Temporary directory path on Distributed (Hadoop) File System (DFS) or local
                       filesystem if running in local mode. The model is written in this
                       destination and then copied into the model's artifact directory. This is
                       necessary as Spark ML models read from and write to DFS if running on a
                       cluster. If this operation completes successfully, all temporary files
                       created on the DFS are removed. Defaults to ``/tmp/mlflow``.
    :param sample_input: A sample input used to add the MLeap flavor to the model.
                         This must be a PySpark DataFrame that the model can evaluate. If
                         ``sample_input`` is ``None``, the MLeap flavor is not added.
    :param registered_model_name: Note: Experimental: This argument may change or be removed in a
                                  future release without warning. If given, create a model
                                  version under ``registered_model_name``, also creating a
                                  registered model if one with the given name does not exist.

    >>> from pyspark.ml import Pipeline
    >>> from pyspark.ml.classification import LogisticRegression
    >>> from pyspark.ml.feature import HashingTF, Tokenizer
    >>> training = spark.createDataFrame([
    ...   (0, "a b c d e spark", 1.0),
    ...   (1, "b d", 0.0),
    ...   (2, "spark f g h", 1.0),
    ...   (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"])
    >>> tokenizer = Tokenizer(inputCol="text", outputCol="words")
    >>> hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    >>> lr = LogisticRegression(maxIter=10, regParam=0.001)
    >>> pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    >>> model = pipeline.fit(training)
    >>> mlflow.spark.log_model(model, "spark-model")
    """
    from py4j.protocol import Py4JJavaError

    _validate_model(spark_model)
    from pyspark.ml import PipelineModel
    if not isinstance(spark_model, PipelineModel):
        spark_model = PipelineModel([spark_model])
    run_id = mlflow.tracking.fluent._get_or_start_run().info.run_id
    run_root_artifact_uri = mlflow.get_artifact_uri()
    # If the artifact URI is a local filesystem path, defer to Model.log() to persist the model,
    # since Spark may not be able to write directly to the driver's filesystem. For example,
    # writing to `file:/uri` will write to the local filesystem from each executor, which will
    # be incorrect on multi-node clusters - to avoid such issues we just use the Model.log() path
    # here.
    if is_local_uri(run_root_artifact_uri):
        return Model.log(artifact_path=artifact_path,
                         flavor=mlflow.spark,
                         spark_model=spark_model,
                         conda_env=conda_env,
                         dfs_tmpdir=dfs_tmpdir,
                         sample_input=sample_input,
                         registered_model_name=registered_model_name)
    # If Spark cannot write directly to the artifact repo, defer to Model.log() to persist the
    # model
    model_dir = os.path.join(run_root_artifact_uri, artifact_path)
    try:
        spark_model.save(os.path.join(model_dir, _SPARK_MODEL_PATH_SUB))
    except Py4JJavaError:
        return Model.log(artifact_path=artifact_path,
                         flavor=mlflow.spark,
                         spark_model=spark_model,
                         conda_env=conda_env,
                         dfs_tmpdir=dfs_tmpdir,
                         sample_input=sample_input,
                         registered_model_name=registered_model_name)

    # Otherwise, override the default model log behavior and save model directly to artifact repo
    mlflow_model = Model(artifact_path=artifact_path, run_id=run_id)
    with TempDir() as tmp:
        tmp_model_metadata_dir = tmp.path()
        _save_model_metadata(tmp_model_metadata_dir, spark_model, mlflow_model,
                             sample_input, conda_env)
        mlflow.tracking.fluent.log_artifacts(tmp_model_metadata_dir,
                                             artifact_path)
        if registered_model_name is not None:
            mlflow.register_model("runs:/%s/%s" % (run_id, artifact_path),
                                  registered_model_name)