Example #1
def is_flavor_supported_for_associated_package_versions(flavor_name):
    """
    :return: True if the specified flavor is supported for the currently-installed versions of its
             associated packages
    """
    module_name, module_key = FLAVOR_TO_MODULE_NAME_AND_VERSION_INFO_KEY[flavor_name]
    actual_version = importlib.import_module(module_name).__version__

    # In Databricks, treat 'pyspark 3.x.y.dev0' as 'pyspark 3.x.y'
    if module_name == "pyspark" and is_in_databricks_runtime():
        actual_version = _strip_dev_version_suffix(actual_version)

    if _violates_pep_440(actual_version) or _is_pre_or_dev_release(actual_version):
        return False
    min_version, max_version, _ = get_min_max_version_and_pip_release(module_key)

    if module_name == "pyspark" and is_in_databricks_runtime():
        return _check_spark_version_in_range(actual_version, min_version, max_version)
    else:
        return _check_version_in_range(actual_version, min_version, max_version)
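Note: `_strip_dev_version_suffix` is not shown in this example. A minimal sketch of what such a helper might look like, assuming it only needs to drop a trailing `.devN` segment (the regex below is an assumption, not MLflow's actual implementation):

import re


def _strip_dev_version_suffix(version):
    # Assumed behavior: '3.1.2.dev0' -> '3.1.2'; release versions pass through unchanged.
    return re.sub(r"\.dev\d*$", "", version)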
Example #2
def test_is_in_databricks_runtime():
    with mock.patch(
        "sys.modules",
        new={**sys.modules, "pyspark": mock.MagicMock(), "pyspark.databricks": mock.MagicMock()},
    ):
        # pylint: disable=unused-import,import-error,no-name-in-module,unused-variable
        import pyspark.databricks

        assert databricks_utils.is_in_databricks_runtime()

    with mock.patch("sys.modules", new={**sys.modules, "pyspark": mock.MagicMock()}):
        with pytest.raises(ModuleNotFoundError, match="No module named 'pyspark.databricks'"):
            # pylint: disable=unused-import,import-error,no-name-in-module,unused-variable
            import pyspark.databricks
        assert not databricks_utils.is_in_databricks_runtime()
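This test suggests that `is_in_databricks_runtime` probes for the Databricks-only `pyspark.databricks` module. A minimal sketch consistent with that behavior (an assumption, not necessarily MLflow's exact implementation):

def is_in_databricks_runtime():
    try:
        # `pyspark.databricks` is only importable inside the Databricks runtime.
        import pyspark.databricks  # pylint: disable=unused-import

        return True
    except ModuleNotFoundError:
        return False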
Example #3
def get_or_create_nfs_tmp_dir():
    """
    Get or create a temporary NFS directory which will be removed once the Python process exits.
    """
    from mlflow.utils.databricks_utils import is_in_databricks_runtime, get_repl_id
    from mlflow.utils.nfs_on_spark import get_nfs_cache_root_dir

    nfs_root_dir = get_nfs_cache_root_dir()

    if is_in_databricks_runtime() and get_repl_id() is not None:
        # Note: In Databricks, the atexit hook does not work.
        # The {nfs_root_dir}/repl_tmp_data/{repl_id} directory will be removed once the Databricks
        # notebook detaches.
        # The repl_tmp_data directory is designed to be used by all kinds of applications,
        # so create a child directory "mlflow" for storing mlflow temp data.
        tmp_nfs_dir = os.path.join(nfs_root_dir, "repl_tmp_data", get_repl_id(), "mlflow")
        os.makedirs(tmp_nfs_dir, exist_ok=True)
    else:
        tmp_nfs_dir = tempfile.mkdtemp(dir=nfs_root_dir)
        # mkdtemp creates a directory with permission 0o700;
        # change it to 0o777 so it can be accessed from Spark UDFs.
        os.chmod(tmp_nfs_dir, 0o777)
        atexit.register(shutil.rmtree, tmp_nfs_dir, ignore_errors=True)

    return tmp_nfs_dir
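A hedged usage sketch: the returned directory lives on an NFS mount that Spark executors also see, so files written there can be read from UDF workers without broadcasting (the file name and payload below are illustrative):

import os

tmp_dir = get_or_create_nfs_tmp_dir()
model_file = os.path.join(tmp_dir, "model.pkl")
with open(model_file, "wb") as f:
    # A real caller would write serialized model bytes here.
    f.write(b"placeholder")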
Example #4
def _get_installed_version(package, module=None):
    """
    Obtains the installed package version using `importlib_metadata.version`. If that fails, falls
    back to `__import__(module or package).__version__`.
    """
    try:
        version = importlib_metadata.version(package)
    except importlib_metadata.PackageNotFoundError:
        # Note `importlib_metadata.version(package)` is not necessarily equal to
        # `__import__(package).__version__`. See the example for pytorch below.
        #
        # Example
        # -------
        # $ pip install torch==1.9.0
        # $ python -c "import torch; print(torch.__version__)"
        # 1.9.0+cu102
        # $ python -c "import importlib_metadata; print(importlib_metadata.version('torch'))"
        # 1.9.0
        version = __import__(module or package).__version__

    # In Databricks, strip the dev version suffix for pyspark (e.g. '3.1.2.dev0' -> '3.1.2')
    # to make it installable from PyPI.
    if package == "pyspark" and is_in_databricks_runtime():
        version = _strip_dev_version_suffix(version)

    return version
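A brief illustration of why the optional `module` argument exists: the pip distribution name and the importable module name can differ, so both spellings are accepted (the calls below are illustrative):

# Distribution and module names match, so the second argument can be omitted.
_get_installed_version("numpy")

# The distribution 'scikit-learn' is imported as 'sklearn', so the module name
# is passed explicitly for the `__version__` fallback path.
_get_installed_version("scikit-learn", module="sklearn")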
Example #5
def _is_pyenv_available():
    """
    Returns True if pyenv is available, otherwise False.
    """
    if is_in_databricks_runtime():
        return os.path.exists(_DATABRICKS_PYENV_BIN_PATH)
    else:
        return shutil.which("pyenv") is not None
Example #6
def test_no_throw():
    """
    Outside of Databricks, the databricks_utils methods should never throw and should only return
    None.
    """
    assert not databricks_utils.is_in_databricks_notebook()
    assert not databricks_utils.is_in_databricks_job()
    assert not databricks_utils.is_dbfs_fuse_available()
    assert not databricks_utils.is_in_databricks_runtime()
Example #7
def main():
    args = parse_args()
    model_path = args.model_path
    flavor = args.flavor
    # Mirror `sys.path` of the parent process
    sys.path = json.loads(args.sys_path)

    if flavor == mlflow.spark.FLAVOR_NAME and is_in_databricks_runtime():
        # Clear 'PYSPARK_GATEWAY_PORT' and 'PYSPARK_GATEWAY_SECRET' to force launching a new JVM
        # gateway before calling `mlflow.spark._load_pyfunc`, which creates a new Spark session
        # if one doesn't exist.
        os.environ.pop("PYSPARK_GATEWAY_PORT", None)
        os.environ.pop("PYSPARK_GATEWAY_SECRET", None)
        os.environ["SPARK_DIST_CLASSPATH"] = "/databricks/jars/*"

    cap_cm = _CaptureImportedModules()

    # If `model_path` refers to an MLflow model directory, load the model using
    # `mlflow.pyfunc.load_model`
    if os.path.isdir(model_path) and MLMODEL_FILE_NAME in os.listdir(model_path):
        pyfunc_conf = Model.load(model_path).flavors.get(mlflow.pyfunc.FLAVOR_NAME)
        loader_module = importlib.import_module(pyfunc_conf[MAIN])
        original = loader_module._load_pyfunc

        @functools.wraps(original)
        def _load_pyfunc_patch(*args, **kwargs):
            with cap_cm:
                return original(*args, **kwargs)

        loader_module._load_pyfunc = _load_pyfunc_patch
        mlflow.pyfunc.load_model(model_path)
    # Otherwise, load the model using `mlflow.<flavor>._load_pyfunc`. For models that don't contain
    # the pyfunc flavor (e.g. a scikit-learn estimator that doesn't implement a `predict` method),
    # the model data path must be passed to this script directly.
    else:
        with cap_cm:
            importlib.import_module(f"mlflow.{flavor}")._load_pyfunc(
                model_path)

    # Store the imported modules in `output_file`
    write_to(args.output_file, "\n".join(cap_cm.imported_modules))

    # Clean up the Spark session created by `mlflow.spark._load_pyfunc`
    if flavor == mlflow.spark.FLAVOR_NAME:
        from pyspark.sql import SparkSession

        spark = SparkSession._instantiatedSession
        if spark:
            try:
                spark.stop()
            except Exception:
                # Swallow unexpected exceptions
                pass
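`_CaptureImportedModules` is not shown in this example. A simplified sketch of the underlying idea, diffing `sys.modules` around the load (MLflow's real implementation uses import hooks and is more precise; this is only an approximation):

import sys


class CaptureImportedModulesSketch:
    """Records top-level modules that newly appear in sys.modules inside the `with` block."""

    def __enter__(self):
        self._modules_before = set(sys.modules)
        self.imported_modules = set()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        newly_imported = set(sys.modules) - self._modules_before
        # Reduce 'sklearn.tree._classes' to its top-level package 'sklearn'.
        self.imported_modules = {name.split(".")[0] for name in newly_imported}
        return False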
Example #8
def _init_modules_to_packages_map():
    global _MODULES_TO_PACKAGES
    if _MODULES_TO_PACKAGES is None and _PACKAGES_TO_MODULES is None:
        # Note `importlib_metadata.packages_distributions` only captures packages installed into
        # Python's site-packages directory via tools such as pip:
        # https://importlib-metadata.readthedocs.io/en/latest/using.html#using-importlib-metadata
        _MODULES_TO_PACKAGES = importlib_metadata.packages_distributions()

        # In Databricks, `_MODULES_TO_PACKAGES` doesn't contain pyspark since it's not installed
        # via pip or conda. To work around this issue, manually add pyspark.
        if is_in_databricks_runtime():
            _MODULES_TO_PACKAGES.update({"pyspark": ["pyspark"]})
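For reference, `importlib_metadata.packages_distributions()` maps importable module names to the pip distributions that provide them, which is exactly why a non-pip-installed pyspark is missing from the result. A quick illustration (output depends on what is installed in the current environment):

import importlib_metadata

mapping = importlib_metadata.packages_distributions()
# Module names map to distribution names, which may differ, e.g. (if installed):
#   mapping.get("sklearn") == ["scikit-learn"]
#   mapping.get("yaml") == ["PyYAML"]
print(mapping.get("sklearn"), mapping.get("yaml"))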
Example #9
def _infer_requirements(model_uri, flavor):
    """
    Infers the pip requirements of the specified model by creating a subprocess and loading
    the model in it to determine which packages are imported.

    :param model_uri: The URI of the model.
    :param flavor: The flavor name of the model.
    :return: A list of inferred pip requirements.
    """
    global _MODULES_TO_PACKAGES
    if _MODULES_TO_PACKAGES is None:
        # Note `importlib_metadata.packages_distributions` only captures packages installed into
        # Python's site-packages directory via tools such as pip:
        # https://importlib-metadata.readthedocs.io/en/latest/using.html#using-importlib-metadata
        _MODULES_TO_PACKAGES = importlib_metadata.packages_distributions()

        # In Databricks, `_MODULES_TO_PACKAGES` doesn't contain pyspark since it's not installed
        # via pip or conda. To work around this issue, manually add pyspark.
        if is_in_databricks_runtime():
            _MODULES_TO_PACKAGES.update({"pyspark": ["pyspark"]})

    global _PYPI_PACKAGE_INDEX
    if _PYPI_PACKAGE_INDEX is None:
        _PYPI_PACKAGE_INDEX = _load_pypi_package_index()

    modules = _capture_imported_modules(model_uri, flavor)
    packages = _flatten([_MODULES_TO_PACKAGES.get(module, []) for module in modules])
    packages = map(_normalize_package_name, packages)
    packages = _prune_packages(packages)
    excluded_packages = [
        # Certain packages (e.g. scikit-learn 0.24.2) import `setuptools` or `pkg_resources`
        # (a module provided by `setuptools`) to process or interact with package metadata.
        # It should be safe to exclude `setuptools` because it's rare to encounter a Python
        # environment where `setuptools` is not pre-installed.
        "setuptools",
        # Exclude a package that provides the mlflow module (e.g. mlflow, mlflow-skinny).
        # Certain flavors (e.g. pytorch) import mlflow while loading a model, but mlflow should
        # not be counted as a model requirement.
        *_MODULES_TO_PACKAGES.get("mlflow", []),
    ]
    packages = packages - set(excluded_packages)
    unrecognized_packages = packages - _PYPI_PACKAGE_INDEX.package_names
    if unrecognized_packages:
        _logger.warning(
            "The following packages were not found in the public PyPI package index as of"
            " %s; if these packages are not present in the public PyPI index, you must install"
            " them manually before loading your model: %s",
            _PYPI_PACKAGE_INDEX.date,
            unrecognized_packages,
        )
    return sorted(map(_get_pinned_requirement, packages))
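`_get_pinned_requirement` is not reproduced here. A minimal hedged sketch of the pinning idea, reusing `_get_installed_version` from Example #4 (the real helper is more involved):

def _get_pinned_requirement_sketch(package):
    # Assumed behavior: pin the requirement to the currently installed version,
    # e.g. 'scikit-learn' -> 'scikit-learn==1.3.2'.
    return f"{package}=={_get_installed_version(package)}"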
Example #10
def is_flavor_supported_for_associated_package_versions(flavor_name):
    """
    :return: True if the specified flavor is supported for the currently-installed versions of its
             associated packages
    """
    module_name, module_key = FLAVOR_TO_MODULE_NAME_AND_VERSION_INFO_KEY[flavor_name]
    actual_version = importlib.import_module(module_name).__version__

    # In Databricks, treat 'pyspark 3.x.y.dev0' as 'pyspark 3.x.y'
    if module_name == "pyspark" and is_in_databricks_runtime():
        actual_version = _strip_dev_version_suffix(actual_version)

    if _violates_pep_440(actual_version) or _is_pre_or_dev_release(actual_version):
        return False
    min_version, max_version, _ = get_min_max_version_and_pip_release(module_key)

    if module_name == "pyspark" and is_in_databricks_runtime():
        # MLflow 1.25.0 is known to be compatible with PySpark 3.3.0 on Databricks, despite the
        # fact that PySpark 3.3.0 was not available in PyPI at the time of the MLflow 1.25.0 release
        if Version(max_version) < Version("3.3.0"):
            max_version = "3.3.0"
        return _check_spark_version_in_range(actual_version, min_version, max_version)
    else:
        return _check_version_in_range(actual_version, min_version, max_version)
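The `Version` comparison above presumably comes from `packaging.version.Version`, which compares release segments numerically rather than lexicographically; a quick check of why that matters when bumping a maximum version:

from packaging.version import Version

# As strings, '3.10.0' < '3.3.0'; as versions, 3.10.0 is newer.
assert "3.10.0" < "3.3.0"
assert Version("3.10.0") > Version("3.3.0")
assert Version("3.2.1") < Version("3.3.0")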
Example #11
def main():
    args = parse_args()
    model_path = args.model_path
    flavor = args.flavor
    # Mirror `sys.path` of the parent process
    sys.path = json.loads(args.sys_path)

    if flavor == mlflow.spark.FLAVOR_NAME and is_in_databricks_runtime():
        try:
            # pylint: disable=import-error
            from dbruntime.spark_connection import initialize_spark_connection

            initialize_spark_connection()
        except Exception as e:
            raise Exception(
                "Attempted to initialize a spark session to load the spark model, but failed"
            ) from e

    cap_cm = _CaptureImportedModules()

    # If `model_path` refers to an MLflow model directory, load the model using
    # `mlflow.pyfunc.load_model`
    if os.path.isdir(model_path) and MLMODEL_FILE_NAME in os.listdir(model_path):
        pyfunc_conf = Model.load(model_path).flavors.get(mlflow.pyfunc.FLAVOR_NAME)
        loader_module = importlib.import_module(pyfunc_conf[MAIN])
        original = loader_module._load_pyfunc

        @functools.wraps(original)
        def _load_pyfunc_patch(*args, **kwargs):
            with cap_cm:
                return original(*args, **kwargs)

        loader_module._load_pyfunc = _load_pyfunc_patch
        mlflow.pyfunc.load_model(model_path)
    # Otherwise, load the model using `mlflow.<flavor>._load_pyfunc`. For models that don't contain
    # the pyfunc flavor (e.g. a scikit-learn estimator that doesn't implement a `predict` method),
    # the model data path must be passed to this script directly.
    else:
        with cap_cm:
            importlib.import_module(f"mlflow.{flavor}")._load_pyfunc(
                model_path)

    # Store the imported modules in `output_file`
    write_to(args.output_file, "\n".join(cap_cm.imported_modules))
Example #12
def get_nfs_cache_root_dir():
    if is_in_databricks_runtime():
        nfs_enabled = (_get_active_spark_session().conf.get(
            "spark.databricks.mlflow.nfs.enabled", "true").lower() == "true")
        if nfs_enabled:
            nfs_root_dir = "/local_disk0/.ephemeral_nfs"
            # Test whether the NFS directory is writable.
            test_path = os.path.join(nfs_root_dir, uuid.uuid4().hex)
            try:
                os.makedirs(test_path)
                return nfs_root_dir
            except Exception:
                # On Databricks clusters with Table ACLs enabled, we have no permission to
                # access the NFS directory; in this case, return None to indicate that NFS
                # is not available.
                return None
            finally:
                shutil.rmtree(test_path, ignore_errors=True)
        else:
            return None
    else:
        return _get_active_spark_session().conf.get("spark.mlflow.nfs.rootDir", None)
Example #13
def test_is_in_databricks_runtime():
    with mock.patch.dict(os.environ, {"DATABRICKS_RUNTIME_VERSION": "11.x"}):
        assert databricks_utils.is_in_databricks_runtime()

    assert not databricks_utils.is_in_databricks_runtime()
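This newer test implies an environment-variable check rather than the import probe in Example #2. A minimal sketch consistent with it (an assumption, not necessarily MLflow's exact implementation):

import os


def is_in_databricks_runtime():
    # Databricks sets DATABRICKS_RUNTIME_VERSION (e.g. '11.x') inside its runtime.
    return "DATABRICKS_RUNTIME_VERSION" in os.environ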
Example #14
def _get_pyenv_bin_path():
    return _DATABRICKS_PYENV_BIN_PATH if is_in_databricks_runtime() else "pyenv"
Example #15
def get_nfs_cache_root_dir():
    # TODO: create isolated path for each user
    if is_in_databricks_runtime():
        return "/local_disk0/.ephemeral_nfs/mlflow/cache"
    else:
        return _get_active_spark_session().conf.get("spark.mlflow.nfs.rootDir", None)
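Outside Databricks, the NFS root comes from the `spark.mlflow.nfs.rootDir` Spark conf. A hedged sketch of how a caller might set it when building the session (the path is illustrative):

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.mlflow.nfs.rootDir", "/mnt/shared_nfs/mlflow_cache")  # illustrative path
    .getOrCreate()
)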