def is_flavor_supported_for_associated_package_versions(flavor_name):
    """
    :return: True if the specified flavor is supported for the currently-installed versions of
             its associated packages
    """
    module_name, module_key = FLAVOR_TO_MODULE_NAME_AND_VERSION_INFO_KEY[flavor_name]
    actual_version = importlib.import_module(module_name).__version__

    # In Databricks, treat 'pyspark 3.x.y.dev0' as 'pyspark 3.x.y'
    if module_name == "pyspark" and is_in_databricks_runtime():
        actual_version = _strip_dev_version_suffix(actual_version)

    if _violates_pep_440(actual_version) or _is_pre_or_dev_release(actual_version):
        return False
    min_version, max_version, _ = get_min_max_version_and_pip_release(module_key)
    if module_name == "pyspark" and is_in_databricks_runtime():
        return _check_spark_version_in_range(actual_version, min_version, max_version)
    else:
        return _check_version_in_range(actual_version, min_version, max_version)
def test_is_in_databricks_runtime():
    with mock.patch(
        "sys.modules",
        new={**sys.modules, "pyspark": mock.MagicMock(), "pyspark.databricks": mock.MagicMock()},
    ):
        # pylint: disable=unused-import,import-error,no-name-in-module,unused-variable
        import pyspark.databricks

        assert databricks_utils.is_in_databricks_runtime()

    with mock.patch("sys.modules", new={**sys.modules, "pyspark": mock.MagicMock()}):
        with pytest.raises(ModuleNotFoundError, match="No module named 'pyspark.databricks'"):
            # pylint: disable=unused-import,import-error,no-name-in-module,unused-variable
            import pyspark.databricks
        assert not databricks_utils.is_in_databricks_runtime()
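# The test above mocks `sys.modules` so that `import pyspark.databricks` succeeds or fails on
# demand, which implies an implementation that probes for that module. A minimal sketch of such
# an implementation, assuming this import-probe strategy (the actual `databricks_utils`
# internals are not shown in these snippets and may differ):
def is_in_databricks_runtime():
    try:
        # The `pyspark.databricks` module is only present in the Databricks runtime
        import pyspark.databricks  # pylint: disable=unused-import

        return True
    except ModuleNotFoundError:
        return False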
def get_or_create_nfs_tmp_dir():
    """
    Get or create a temporary NFS directory which will be removed once the Python process exits.
    """
    from mlflow.utils.databricks_utils import is_in_databricks_runtime, get_repl_id
    from mlflow.utils.nfs_on_spark import get_nfs_cache_root_dir

    nfs_root_dir = get_nfs_cache_root_dir()

    if is_in_databricks_runtime() and get_repl_id() is not None:
        # Note: In Databricks, the atexit hook does not work.
        # The {nfs_root_dir}/repl_tmp_data/{repl_id} directory will be removed once the
        # Databricks notebook detaches.
        # The repl_tmp_data directory is designed to be used by all kinds of applications,
        # so create a child directory "mlflow" for storing mlflow temp data.
        tmp_nfs_dir = os.path.join(nfs_root_dir, "repl_tmp_data", get_repl_id(), "mlflow")
        os.makedirs(tmp_nfs_dir, exist_ok=True)
    else:
        tmp_nfs_dir = tempfile.mkdtemp(dir=nfs_root_dir)
        # mkdtemp creates a directory with permission 0o700;
        # change it to 0o777 to ensure it can be seen in a Spark UDF.
        os.chmod(tmp_nfs_dir, 0o777)
        atexit.register(shutil.rmtree, tmp_nfs_dir, ignore_errors=True)

    return tmp_nfs_dir
def _get_installed_version(package, module=None):
    """
    Obtains the installed package version using `importlib_metadata.version`. If that fails,
    falls back to `__import__(module or package).__version__`.
    """
    try:
        version = importlib_metadata.version(package)
    except importlib_metadata.PackageNotFoundError:
        # Note `importlib_metadata.version(package)` is not necessarily equal to
        # `__import__(package).__version__`. See the example for pytorch below.
        #
        # Example
        # -------
        # $ pip install torch==1.9.0
        # $ python -c "import torch; print(torch.__version__)"
        # 1.9.0+cu102
        # $ python -c "import importlib_metadata; print(importlib_metadata.version('torch'))"
        # 1.9.0
        version = __import__(module or package).__version__

    # In Databricks, strip the dev version suffix for pyspark (e.g. '3.1.2.dev0' -> '3.1.2')
    # so that the resulting version is installable from PyPI.
    if package == "pyspark" and is_in_databricks_runtime():
        version = _strip_dev_version_suffix(version)

    return version
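# `_strip_dev_version_suffix` is referenced above but not shown in these snippets. Based on the
# comment's example ('3.1.2.dev0' -> '3.1.2'), a minimal regex-based sketch of what such a
# helper might look like (hypothetical implementation; the real one may differ):
import re


def _strip_dev_version_suffix(version):
    # Drop a trailing '.devN' segment, e.g. '3.1.2.dev0' -> '3.1.2'
    return re.sub(r"\.dev\d*$", "", version)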
def _is_pyenv_available():
    """
    Returns True if pyenv is available, otherwise False.
    """
    if is_in_databricks_runtime():
        return os.path.exists(_DATABRICKS_PYENV_BIN_PATH)
    else:
        return shutil.which("pyenv") is not None
def test_no_throw():
    """
    Outside of Databricks, the databricks_utils methods should never throw; they should
    simply return falsy values.
    """
    assert not databricks_utils.is_in_databricks_notebook()
    assert not databricks_utils.is_in_databricks_job()
    assert not databricks_utils.is_dbfs_fuse_available()
    assert not databricks_utils.is_in_databricks_runtime()
def main():
    args = parse_args()
    model_path = args.model_path
    flavor = args.flavor
    # Mirror `sys.path` of the parent process
    sys.path = json.loads(args.sys_path)

    if flavor == mlflow.spark.FLAVOR_NAME and is_in_databricks_runtime():
        # Clear 'PYSPARK_GATEWAY_PORT' and 'PYSPARK_GATEWAY_SECRET' to enforce launching a new
        # JVM gateway before calling `mlflow.spark._load_pyfunc`, which creates a new Spark
        # session if one doesn't exist.
        os.environ.pop("PYSPARK_GATEWAY_PORT", None)
        os.environ.pop("PYSPARK_GATEWAY_SECRET", None)
        os.environ["SPARK_DIST_CLASSPATH"] = "/databricks/jars/*"

    cap_cm = _CaptureImportedModules()

    # If `model_path` refers to an MLflow model directory, load the model using
    # `mlflow.pyfunc.load_model`
    if os.path.isdir(model_path) and MLMODEL_FILE_NAME in os.listdir(model_path):
        pyfunc_conf = Model.load(model_path).flavors.get(mlflow.pyfunc.FLAVOR_NAME)
        loader_module = importlib.import_module(pyfunc_conf[MAIN])
        original = loader_module._load_pyfunc

        @functools.wraps(original)
        def _load_pyfunc_patch(*args, **kwargs):
            with cap_cm:
                return original(*args, **kwargs)

        loader_module._load_pyfunc = _load_pyfunc_patch
        mlflow.pyfunc.load_model(model_path)
    # Otherwise, load the model using `mlflow.<flavor>._load_pyfunc`. For models that don't
    # contain the pyfunc flavor (e.g. a scikit-learn estimator that doesn't implement a
    # `predict` method), we need to directly pass a model data path to this script.
    else:
        with cap_cm:
            importlib.import_module(f"mlflow.{flavor}")._load_pyfunc(model_path)

    # Store the imported modules in `output_file`
    write_to(args.output_file, "\n".join(cap_cm.imported_modules))

    # Clean up the Spark session created by `mlflow.spark._load_pyfunc`
    if flavor == mlflow.spark.FLAVOR_NAME:
        from pyspark.sql import SparkSession

        spark = SparkSession._instantiatedSession
        if spark:
            try:
                spark.stop()
            except Exception:
                # Swallow unexpected exceptions
                pass
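# `_CaptureImportedModules` is used above as a context manager whose `imported_modules`
# attribute holds the recorded module names, but its definition is not shown in these snippets.
# A minimal, hypothetical sketch of the general technique (hooking `builtins.__import__` to
# record top-level module names); the class name and details here are assumptions:
import builtins


class CaptureImportedModulesSketch:
    def __init__(self):
        self.imported_modules = set()
        self._original_import = None

    def __enter__(self):
        self._original_import = builtins.__import__

        def _wrapped_import(name, *args, **kwargs):
            # Record only the top-level package name, e.g. "sklearn.tree" -> "sklearn"
            self.imported_modules.add(name.split(".")[0])
            return self._original_import(name, *args, **kwargs)

        builtins.__import__ = _wrapped_import
        return self

    def __exit__(self, *exc_info):
        # Restore the original import hook
        builtins.__import__ = self._original_import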
def _init_modules_to_packages_map():
    global _MODULES_TO_PACKAGES
    if _MODULES_TO_PACKAGES is None and _PACKAGES_TO_MODULES is None:
        # Note `importlib_metadata.packages_distributions` only captures packages installed into
        # Python's site-packages directory via tools such as pip:
        # https://importlib-metadata.readthedocs.io/en/latest/using.html#using-importlib-metadata
        _MODULES_TO_PACKAGES = importlib_metadata.packages_distributions()

        # In Databricks, `_MODULES_TO_PACKAGES` doesn't contain pyspark since it's not installed
        # via pip or conda. To work around this issue, manually add pyspark.
        if is_in_databricks_runtime():
            _MODULES_TO_PACKAGES.update({"pyspark": ["pyspark"]})
def _infer_requirements(model_uri, flavor):
    """
    Infers the pip requirements of the specified model by creating a subprocess and loading
    the model in it to determine which packages are imported.

    :param model_uri: The URI of the model.
    :param flavor: The flavor name of the model.
    :return: A list of inferred pip requirements.
    """
    global _MODULES_TO_PACKAGES
    if _MODULES_TO_PACKAGES is None:
        # Note `importlib_metadata.packages_distributions` only captures packages installed into
        # Python's site-packages directory via tools such as pip:
        # https://importlib-metadata.readthedocs.io/en/latest/using.html#using-importlib-metadata
        _MODULES_TO_PACKAGES = importlib_metadata.packages_distributions()

        # In Databricks, `_MODULES_TO_PACKAGES` doesn't contain pyspark since it's not installed
        # via pip or conda. To work around this issue, manually add pyspark.
        if is_in_databricks_runtime():
            _MODULES_TO_PACKAGES.update({"pyspark": ["pyspark"]})

    global _PYPI_PACKAGE_INDEX
    if _PYPI_PACKAGE_INDEX is None:
        _PYPI_PACKAGE_INDEX = _load_pypi_package_index()

    modules = _capture_imported_modules(model_uri, flavor)
    packages = _flatten([_MODULES_TO_PACKAGES.get(module, []) for module in modules])
    packages = map(_normalize_package_name, packages)
    packages = _prune_packages(packages)
    excluded_packages = [
        # Certain packages (e.g. scikit-learn 0.24.2) import `setuptools` or `pkg_resources`
        # (a module provided by `setuptools`) to process or interact with package metadata.
        # It should be safe to exclude `setuptools` because it's rare to encounter a Python
        # environment where `setuptools` is not pre-installed.
        "setuptools",
        # Exclude the package that provides the mlflow module (e.g. mlflow, mlflow-skinny).
        # Certain flavors (e.g. pytorch) import mlflow while loading a model, but mlflow should
        # not be counted as a model requirement.
        *_MODULES_TO_PACKAGES.get("mlflow", []),
    ]
    packages = packages - set(excluded_packages)

    unrecognized_packages = packages - _PYPI_PACKAGE_INDEX.package_names
    if unrecognized_packages:
        _logger.warning(
            "The following packages were not found in the public PyPI package index as of"
            " %s; if these packages are not present in the public PyPI index, you must install"
            " them manually before loading your model: %s",
            _PYPI_PACKAGE_INDEX.date,
            unrecognized_packages,
        )
    return sorted(map(_get_pinned_requirement, packages))
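# Two small helpers used above, `_flatten` and `_normalize_package_name`, are not shown in
# these snippets. Minimal sketches of what they might look like, assuming PEP 503-style name
# normalization (the actual helpers may differ):
import re


def _flatten(list_of_lists):
    # e.g. [["scikit-learn"], ["numpy", "scipy"]] -> ["scikit-learn", "numpy", "scipy"]
    return [item for sublist in list_of_lists for item in sublist]


def _normalize_package_name(name):
    # PEP 503 normalization: lowercase and collapse runs of '-', '_', '.' into '-',
    # e.g. "Scikit_Learn" -> "scikit-learn"
    return re.sub(r"[-_.]+", "-", name).lower()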
def is_flavor_supported_for_associated_package_versions(flavor_name):
    """
    :return: True if the specified flavor is supported for the currently-installed versions of
             its associated packages
    """
    module_name, module_key = FLAVOR_TO_MODULE_NAME_AND_VERSION_INFO_KEY[flavor_name]
    actual_version = importlib.import_module(module_name).__version__

    # In Databricks, treat 'pyspark 3.x.y.dev0' as 'pyspark 3.x.y'
    if module_name == "pyspark" and is_in_databricks_runtime():
        actual_version = _strip_dev_version_suffix(actual_version)

    if _violates_pep_440(actual_version) or _is_pre_or_dev_release(actual_version):
        return False
    min_version, max_version, _ = get_min_max_version_and_pip_release(module_key)
    if module_name == "pyspark" and is_in_databricks_runtime():
        # MLflow 1.25.0 is known to be compatible with PySpark 3.3.0 on Databricks, despite
        # the fact that PySpark 3.3.0 was not available on PyPI at the time of the MLflow
        # 1.25.0 release.
        if Version(max_version) < Version("3.3.0"):
            max_version = "3.3.0"
        return _check_spark_version_in_range(actual_version, min_version, max_version)
    else:
        return _check_version_in_range(actual_version, min_version, max_version)
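# `_check_version_in_range` is referenced above but not shown in these snippets. A minimal
# sketch of an inclusive range check using `packaging.version.Version` (already imported in
# the snippet above); this is a simplified assumption, and the real helper may treat the
# maximum version differently:
from packaging.version import Version


def _check_version_in_range(version, min_version, max_version):
    # Inclusive range check, e.g. ("3.1.2", "3.0.0", "3.3.0") -> True
    return Version(min_version) <= Version(version) <= Version(max_version)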
def main():
    args = parse_args()
    model_path = args.model_path
    flavor = args.flavor
    # Mirror `sys.path` of the parent process
    sys.path = json.loads(args.sys_path)

    if flavor == mlflow.spark.FLAVOR_NAME and is_in_databricks_runtime():
        try:
            # pylint: disable=import-error
            from dbruntime.spark_connection import initialize_spark_connection

            initialize_spark_connection()
        except Exception as e:
            raise Exception(
                "Attempted to initialize a spark session to load the spark model, but failed"
            ) from e

    cap_cm = _CaptureImportedModules()

    # If `model_path` refers to an MLflow model directory, load the model using
    # `mlflow.pyfunc.load_model`
    if os.path.isdir(model_path) and MLMODEL_FILE_NAME in os.listdir(model_path):
        pyfunc_conf = Model.load(model_path).flavors.get(mlflow.pyfunc.FLAVOR_NAME)
        loader_module = importlib.import_module(pyfunc_conf[MAIN])
        original = loader_module._load_pyfunc

        @functools.wraps(original)
        def _load_pyfunc_patch(*args, **kwargs):
            with cap_cm:
                return original(*args, **kwargs)

        loader_module._load_pyfunc = _load_pyfunc_patch
        mlflow.pyfunc.load_model(model_path)
    # Otherwise, load the model using `mlflow.<flavor>._load_pyfunc`. For models that don't
    # contain the pyfunc flavor (e.g. a scikit-learn estimator that doesn't implement a
    # `predict` method), we need to directly pass a model data path to this script.
    else:
        with cap_cm:
            importlib.import_module(f"mlflow.{flavor}")._load_pyfunc(model_path)

    # Store the imported modules in `output_file`
    write_to(args.output_file, "\n".join(cap_cm.imported_modules))
def get_nfs_cache_root_dir():
    if is_in_databricks_runtime():
        nfs_enabled = (
            _get_active_spark_session()
            .conf.get("spark.databricks.mlflow.nfs.enabled", "true")
            .lower()
            == "true"
        )
        if nfs_enabled:
            nfs_root_dir = "/local_disk0/.ephemeral_nfs"
            # Test whether the NFS directory is writable.
            test_path = os.path.join(nfs_root_dir, uuid.uuid4().hex)
            try:
                os.makedirs(test_path)
                return nfs_root_dir
            except Exception:
                # On Databricks clusters with Table ACLs enabled, we have no permission to
                # access the NFS directory; in this case, return None to indicate that NFS
                # is not available.
                return None
            finally:
                shutil.rmtree(test_path, ignore_errors=True)
        else:
            return None
    else:
        return _get_active_spark_session().conf.get("spark.mlflow.nfs.rootDir", None)
def test_is_in_databricks_runtime():
    with mock.patch.dict(os.environ, {"DATABRICKS_RUNTIME_VERSION": "11.x"}):
        assert databricks_utils.is_in_databricks_runtime()

    assert not databricks_utils.is_in_databricks_runtime()
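# This version of the test patches the `DATABRICKS_RUNTIME_VERSION` environment variable rather
# than `sys.modules`, suggesting the check is an environment-variable probe here. A minimal
# sketch consistent with the test (hypothetical; the actual implementation may differ):
import os


def is_in_databricks_runtime():
    # The Databricks runtime sets DATABRICKS_RUNTIME_VERSION (e.g. "11.x")
    return "DATABRICKS_RUNTIME_VERSION" in os.environ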
def _get_pyenv_bin_path():
    return _DATABRICKS_PYENV_BIN_PATH if is_in_databricks_runtime() else "pyenv"
def get_nfs_cache_root_dir():
    # TODO: create an isolated path for each user
    if is_in_databricks_runtime():
        return "/local_disk0/.ephemeral_nfs/mlflow/cache"
    else:
        return _get_active_spark_session().conf.get("spark.mlflow.nfs.rootDir", None)