Esempio n. 1
0
    def get_dependencies_from_conda_yaml(path):
        """
        Extract the Python version and package requirements from a conda environment YAML file.

        String entries under ``dependencies`` are parsed with ``_CONDA_DEPENDENCY_REGEX``:

        - The first ``python`` entry supplies the Python version. Once a version has been
          captured, any further ``python`` entry is handled like a regular package.
        - Every other matching entry is turned into a requirement string, with the conda
          operator ``=`` rewritten to ``==``.
        - Entries that do not match the regex are skipped and reported in a single warning.

        Entries accepted by ``_is_pip_deps`` supply the pip dependencies (their ``"pip"`` value);
        if several such entries exist, the last one wins.

        :param path: Path to the conda environment YAML file.
        :return: A dictionary with the keys ``python`` (the extracted version string),
                 ``build_dependencies`` (list of requirement strings, or ``None`` if no
                 non-python package was listed) and ``dependencies`` (the ``"pip"`` value, or
                 ``None`` if no pip entry was found).
        :raises MlflowException: If the python entry has no version operator, if it uses one of
                                 ``<``, ``>`` or ``!=``, if an entry is neither a string nor a
                                 pip entry, or if no python version could be extracted (this
                                 includes an empty file or an empty ``dependencies`` section).
        """
        with open(path) as f:
            conda_env = yaml.safe_load(f)

        # An empty YAML document loads as ``None`` and a bare ``dependencies:`` key maps to
        # ``None``. Treat both as "no dependencies" so the missing python version is reported
        # below as an MlflowException rather than surfacing as an AttributeError/TypeError.
        conda_dependencies = (conda_env or {}).get("dependencies") or []

        python = None
        build_dependencies = None
        unmatched_dependencies = []
        dependencies = None
        for dep in conda_dependencies:
            if isinstance(dep, str):
                match = _CONDA_DEPENDENCY_REGEX.match(dep)
                if not match:
                    # Remember the entry so that it can be reported once the loop is done.
                    unmatched_dependencies.append(dep)
                    continue
                package = match.group("package")
                operator = match.group("operator")
                version = match.group("version")

                # Python
                if not python and package == "python":
                    if operator is None:
                        raise MlflowException.invalid_parameter_value(
                            f"Invalid dependency for python: {dep}. "
                            "It must be pinned (e.g. python=3.8.13)."
                        )

                    if operator in ("<", ">", "!="):
                        raise MlflowException(
                            f"Invalid version comparator for python: '{operator}'. "
                            "Must be one of ['<=', '>=', '=', '=='].",
                            error_code=INVALID_PARAMETER_VALUE,
                        )
                    python = version
                    continue

                # Build packages
                if build_dependencies is None:
                    build_dependencies = []
                # "=" is an invalid operator for pip
                operator = "==" if operator == "=" else operator
                build_dependencies.append(package + (operator or "") + (version or ""))
            elif _is_pip_deps(dep):
                dependencies = dep["pip"]
            else:
                raise MlflowException(
                    f"Invalid conda dependency: {dep}. Must be str or dict in the form of "
                    '{"pip": [...]}',
                    error_code=INVALID_PARAMETER_VALUE,
                )

        if python is None:
            raise MlflowException(
                f"Could not extract python version from {path}",
                error_code=INVALID_PARAMETER_VALUE,
            )

        if unmatched_dependencies:
            _logger.warning(
                "The following conda dependencies will not be installed in the resulting "
                "environment: %s",
                unmatched_dependencies,
            )

        return dict(python=python, build_dependencies=build_dependencies, dependencies=dependencies)
Esempio n. 2
0
def run(
    uri,
    entry_point="main",
    version=None,
    parameters=None,
    docker_args=None,
    experiment_name=None,
    experiment_id=None,
    backend="local",
    backend_config=None,
    use_conda=None,
    storage_dir=None,
    synchronous=True,
    run_id=None,
    run_name=None,
    env_manager=None,
):
    """
    Run an MLflow project. The project can be local or stored at a Git URI.

    MLflow provides built-in support for running projects locally or remotely on a Databricks or
    Kubernetes cluster. You can also run projects against other targets by installing an appropriate
    third-party plugin. See `Community Plugins <../plugins.html#community-plugins>`_ for more
    information.

    For information on using this method in chained workflows, see `Building Multistep Workflows
    <../projects.html#building-multistep-workflows>`_.

    :raises: :py:class:`mlflow.exceptions.ExecutionException` If a run launched in blocking mode
             is unsuccessful.

    :param uri: URI of project to run. A local filesystem path
                or a Git repository URI (e.g. https://github.com/mlflow/mlflow-example)
                pointing to a project directory containing an MLproject file.
    :param entry_point: Entry point to run within the project. If no entry point with the specified
                        name is found, runs the project file ``entry_point`` as a script,
                        using "python" to run ``.py`` files and the default shell (specified by
                        environment variable ``$SHELL``) to run ``.sh`` files.
    :param version: For Git-based projects, either a commit hash or a branch name.
    :param parameters: Parameters (dictionary) for the entry point command.
    :param docker_args: Arguments (dictionary) for the docker command.
    :param experiment_name: Name of experiment under which to launch the run.
    :param experiment_id: ID of experiment under which to launch the run.
    :param backend: Execution backend for the run: MLflow provides built-in support for "local",
                    "databricks", and "kubernetes" (experimental) backends. If running against
                    Databricks, will run against a Databricks workspace determined as follows:
                    if a Databricks tracking URI of the form ``databricks://profile`` has been set
                    (e.g. by setting the MLFLOW_TRACKING_URI environment variable), will run
                    against the workspace specified by <profile>. Otherwise, runs against the
                    workspace specified by the default Databricks CLI profile.
    :param backend_config: A dictionary, or a path to a JSON file (must end in '.json'), which will
                           be passed as config to the backend. The exact content which should be
                           provided is different for each execution backend and is documented
                           at https://www.mlflow.org/docs/latest/projects.html. A dictionary
                           passed here is copied and is not modified by this function.
    :param use_conda: This argument is deprecated. Use `env_manager='local'` instead.
                      If True, create a new Conda environment for the run and
                      install project dependencies within that environment. If False, run the
                      project in the current environment without installing any project
                      dependencies. Cannot be combined with ``env_manager``.
    :param storage_dir: Used only if ``backend`` is "local". MLflow downloads artifacts from
                        distributed URIs passed to parameters of type ``path`` to subdirectories of
                        ``storage_dir``.
    :param synchronous: Whether to block while waiting for a run to complete. Defaults to True.
                        Note that if ``synchronous`` is False and ``backend`` is "local", this
                        method will return, but the current process will block when exiting until
                        the local run completes. If the current process is interrupted, any
                        asynchronous runs launched via this method will be terminated. If
                        ``synchronous`` is True and the run fails, the current process will
                        error out as well.
    :param run_id: Note: this argument is used internally by the MLflow project APIs and should
                   not be specified. If specified, the run ID will be used instead of
                   creating a new run.
    :param run_name: The name to give the MLflow Run associated with the project execution.
                     If ``None``, the MLflow Run name is left unset.
    :param env_manager: Specify an environment manager to create a new environment for the run and
                        install project dependencies within that environment. The following values
                        are supported:

                        - local: use the local environment
                        - conda: use conda
                        - virtualenv: use virtualenv (and pyenv for Python version management)

                        If unspecified (and ``use_conda`` is unspecified too), ``None`` is
                        forwarded to the execution backend instead of a fixed manager.
    :return: :py:class:`mlflow.projects.SubmittedRun` exposing information (e.g. run ID)
             about the launched run.

    .. code-block:: python
        :caption: Example

        import mlflow

        project_uri = "https://github.com/mlflow/mlflow-example"
        params = {"alpha": 0.5, "l1_ratio": 0.01}

        # Run MLflow project and create a reproducible conda environment
        # on a local host
        mlflow.run(project_uri, parameters=params)

    .. code-block:: text
        :caption: Output

        ...
        ...
        Elasticnet model (alpha=0.500000, l1_ratio=0.010000):
        RMSE: 0.788347345611717
        MAE: 0.6155576449938276
        R2: 0.19729662005412607
        ... mlflow.projects: === Run (ID '6a5109febe5e4a549461e149590d0a7c') succeeded ===
    """
    if backend_config is None:
        backend_config_dict = {}
    elif isinstance(backend_config, dict):
        # Shallow-copy so that the run ID injected below for the local backend does not end up
        # in the dictionary object owned by the caller.
        backend_config_dict = dict(backend_config)
    else:
        backend_config_dict = backend_config
        # Anything that is not a dictionary is treated as a path; only '.json' files are loaded.
        if backend_config and os.path.splitext(backend_config)[-1] == ".json":
            with open(backend_config, "r") as handle:
                try:
                    backend_config_dict = json.load(handle)
                except ValueError:
                    _logger.error(
                        "Error when attempting to load and parse JSON cluster spec from file %s",
                        backend_config,
                    )
                    raise

    if use_conda is not None and env_manager is not None:
        raise MlflowException.invalid_parameter_value(
            "`use_conda` cannot be used with `env_manager`"
        )
    elif use_conda is not None:
        warnings.warn(
            "`use_conda` is deprecated and will be removed in a future release. "
            "Use `env_manager=local` instead",
            FutureWarning,
            stacklevel=2,
        )
        # Map the deprecated boolean flag onto the equivalent environment manager.
        env_manager = _EnvManager.CONDA if use_conda else _EnvManager.LOCAL
    elif env_manager is not None:
        _EnvManager.validate(env_manager)

    if backend == "databricks":
        # The Databricks validations receive the backend config exactly as the caller passed it.
        mlflow.projects.databricks.before_run_validations(
            mlflow.get_tracking_uri(), backend_config
        )
    elif backend == "local" and run_id is not None:
        backend_config_dict[MLFLOW_LOCAL_BACKEND_RUN_ID_CONFIG] = run_id

    experiment_id = _resolve_experiment_id(
        experiment_name=experiment_name, experiment_id=experiment_id
    )

    submitted_run_obj = _run(
        uri=uri,
        experiment_id=experiment_id,
        entry_point=entry_point,
        version=version,
        parameters=parameters,
        docker_args=docker_args,
        backend_name=backend,
        backend_config=backend_config_dict,
        env_manager=env_manager,
        storage_dir=storage_dir,
        synchronous=synchronous,
        run_name=run_name,
    )
    if synchronous:
        _wait_for(submitted_run_obj)
    return submitted_run_obj
Esempio n. 3
0
 def test_invalid_parameter_value(self):
     """Check the error code set by ``MlflowException.invalid_parameter_value``."""
     expected_code = "INVALID_PARAMETER_VALUE"
     error = MlflowException.invalid_parameter_value("test")
     assert error.error_code == expected_code
Esempio n. 4
0
    def run(
        self, project_uri, entry_point, params, version, backend_config, tracking_uri, experiment_id
    ):
        """
        Fetch a project, prepare its execution environment and launch the entry point.

        The project is fetched and validated, a run is obtained via ``get_or_create_run`` (using
        the run ID stored in ``backend_config`` when one is present), and the command prefix for
        the selected environment (docker, virtualenv, conda, or none) is assembled. The entry
        point is then executed in-process when the backend config requests synchronous
        execution, or handed off to a subprocess otherwise.

        :param project_uri: URI of the project, forwarded to ``fetch_and_validate_project``.
        :param entry_point: Name of the entry point to execute.
        :param params: Parameters for the entry point command.
        :param version: Project version, forwarded to ``fetch_and_validate_project``.
        :param backend_config: Mapping that must contain the ``PROJECT_ENV_MANAGER``,
                               ``PROJECT_SYNCHRONOUS``, ``PROJECT_DOCKER_ARGS`` and
                               ``PROJECT_STORAGE_DIR`` keys, and may contain
                               ``MLFLOW_LOCAL_BACKEND_RUN_ID_CONFIG``.
        :param tracking_uri: Not used by this method.
        :param experiment_id: ID of the experiment the run belongs to.
        :return: The result of ``_run_entry_point`` in synchronous mode, otherwise the result
                 of ``_invoke_mlflow_run_subprocess``.
        :raises MlflowException: If a ``python_env`` project is requested to run with conda.
        """
        project_dir = fetch_and_validate_project(project_uri, version, entry_point, params)
        project = load_project(project_dir)

        # Reuse a run ID handed over through the backend config, if there is one.
        requested_run_id = (
            backend_config[MLFLOW_LOCAL_BACKEND_RUN_ID_CONFIG]
            if MLFLOW_LOCAL_BACKEND_RUN_ID_CONFIG in backend_config
            else None
        )
        active_run = get_or_create_run(
            requested_run_id,
            project_uri,
            experiment_id,
            project_dir,
            version,
            entry_point,
            params,
        )
        active_run_id = active_run.info.run_id

        env_manager = backend_config[PROJECT_ENV_MANAGER]
        synchronous = backend_config[PROJECT_SYNCHRONOUS]
        docker_args = backend_config[PROJECT_DOCKER_ARGS]
        storage_dir = backend_config[PROJECT_STORAGE_DIR]

        command_parts = []
        joiner = " "

        def _tag_project_env(label):
            # Record on the run which kind of environment executes the project.
            tracking.MlflowClient().set_tag(active_run_id, MLFLOW_PROJECT_ENV, label)

        # Pick an env manager matching the project env type unless one was requested explicitly.
        if env_manager is None:
            env_manager = _env_type_to_env_manager(project.env_type)
        elif project.env_type == env_type.PYTHON and env_manager == _EnvManager.CONDA:
            raise MlflowException.invalid_parameter_value(
                "python_env project cannot be executed using conda. Set `--env-manager` to "
                "'virtualenv' or 'local' to execute this project."
            )

        if project.docker_env:
            # A docker_env attribute in MLproject wins over conda yaml environments: the project
            # is executed inside a docker container.
            from mlflow.projects.docker import (
                validate_docker_env,
                validate_docker_installation,
                build_docker_image,
            )

            _tag_project_env("docker")
            validate_docker_env(project)
            validate_docker_installation()
            docker_image = build_docker_image(
                work_dir=project_dir,
                repository_uri=project.name,
                base_image=project.docker_env.get("image"),
                run_id=active_run_id,
            )
            command_parts.extend(
                _get_docker_command(
                    image=docker_image,
                    active_run=active_run,
                    docker_args=docker_args,
                    volumes=project.docker_env.get("volumes"),
                    user_env_vars=project.docker_env.get("environment"),
                )
            )
        elif env_manager == _EnvManager.VIRTUALENV:
            _tag_project_env("virtualenv")
            joiner = " && "
            # A conda-typed project is converted from its conda yaml; otherwise the env config
            # file is loaded directly.
            load_python_env = (
                _PythonEnv.from_conda_yaml
                if project.env_type == env_type.CONDA
                else _PythonEnv.from_yaml
            )
            python_env = load_python_env(project.env_config_path)
            python_bin_path = _install_python(python_env.python)
            virtualenv_root = _get_mlflow_virtualenv_root()
            project_path = Path(project_dir)
            virtualenv_name = _get_virtualenv_name(python_env, project_path)
            virtualenv_dir = Path(virtualenv_root) / virtualenv_name
            command_parts.append(
                _create_virtualenv(project_path, python_bin_path, virtualenv_dir, python_env)
            )
        elif env_manager == _EnvManager.CONDA:
            # The conda environment is created synchronously (even though this may take some
            # time) to avoid failures due to multiple concurrent attempts to create the same
            # conda env.
            _tag_project_env("conda")
            joiner = " && "
            conda_env_name = get_or_create_conda_env(project.env_config_path)
            command_parts.extend(get_conda_command(conda_env_name))

        if not synchronous:
            # Asynchronous mode: invoke `mlflow run` in a subprocess.
            return _invoke_mlflow_run_subprocess(
                work_dir=project_dir,
                entry_point=entry_point,
                parameters=params,
                experiment_id=experiment_id,
                env_manager=env_manager,
                docker_args=docker_args,
                storage_dir=storage_dir,
                run_id=active_run_id,
            )

        # Synchronous mode: run the entry point command in a blocking fashion, sending status
        # updates to the tracking server when finished. Note that the run state may not be
        # persisted to the tracking server if interrupted.
        command_parts.extend(get_entry_point_command(project, entry_point, params, storage_dir))
        full_command = joiner.join(command_parts)
        return _run_entry_point(full_command, project_dir, experiment_id, run_id=active_run_id)