Example #1
def _parse_kubernetes_config(backend_config):
    """
    Creates build context tarfile containing Dockerfile and project code, returning path to tarfile
    """
    if not backend_config:
        raise ExecutionException("Backend_config file not found.")
    kube_config = backend_config.copy()
    if 'kube-job-template-path' not in backend_config.keys():
        raise ExecutionException(
            "'kube-job-template-path' attribute must be specified in "
            "backend_config.")
    kube_job_template = backend_config['kube-job-template-path']
    if os.path.exists(kube_job_template):
        with open(kube_job_template, 'r') as job_template:
            yaml_obj = yaml.safe_load(job_template.read())
        kube_job_template = yaml_obj
        kube_config['kube-job-template'] = kube_job_template
    else:
        raise ExecutionException(
            "Could not find 'kube-job-template-path': {}".format(
                kube_job_template))
    if 'kube-context' not in backend_config.keys():
        _logger.debug("Could not find kube-context in backend_config."
                      " Using current context or in-cluster config.")
    if 'repository-uri' not in backend_config.keys():
        raise ExecutionException(
            "Could not find 'repository-uri' in backend_config.")
    return kube_config
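A hedged sketch of a backend_config this parser accepts. The values below are illustrative placeholders; 'kube-job-template-path' must point to an existing Kubernetes Job YAML file, which is loaded and stored under 'kube-job-template' in the returned config.

# Illustrative config; the keys marked required are enforced by the parser above.
backend_config = {
    "kube-context": "my-cluster",                              # optional
    "kube-job-template-path": "kubernetes_job_template.yaml",  # required, file must exist
    "repository-uri": "registry.example.com/mlflow-project",   # required
}
kube_config = _parse_kubernetes_config(backend_config)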
Example #2
def _fetch_project(uri, version=None):
    """
    Fetch a project into a local directory, returning the path to the local project directory.
    """
    parsed_uri, subdirectory = _parse_subdirectory(uri)
    use_temp_dst_dir = _is_zip_uri(parsed_uri) or not _is_local_uri(parsed_uri)
    dst_dir = tempfile.mkdtemp() if use_temp_dst_dir else parsed_uri
    if use_temp_dst_dir:
        _logger.info("=== Fetching project from %s into %s ===", uri, dst_dir)
    if _is_zip_uri(parsed_uri):
        if _is_file_uri(parsed_uri):
            parsed_file_uri = urllib.parse.urlparse(
                urllib.parse.unquote(parsed_uri))
            parsed_uri = os.path.join(parsed_file_uri.netloc,
                                      parsed_file_uri.path)
        _unzip_repo(zip_file=(parsed_uri if _is_local_uri(parsed_uri) else
                              _fetch_zip_repo(parsed_uri)),
                    dst_dir=dst_dir)
    elif _is_local_uri(uri):
        if version is not None:
            raise ExecutionException(
                "Setting a version is only supported for Git project URIs")
        if use_temp_dst_dir:
            dir_util.copy_tree(src=parsed_uri, dst=dst_dir)
    else:
        assert _GIT_URI_REGEX.match(
            parsed_uri), "Non-local URI %s should be a Git URI" % parsed_uri
        _fetch_git_repo(parsed_uri, version, dst_dir)
    res = os.path.abspath(os.path.join(dst_dir, subdirectory))
    if not os.path.exists(res):
        raise ExecutionException("Could not find subdirectory %s of %s" %
                                 (subdirectory, dst_dir))
    return res
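A hedged usage sketch of the URI forms this helper handles; the URIs below are illustrative placeholders, and a '#' suffix selects a subdirectory within the fetched project.

# Git URI with a subdirectory selected via '#' (cloned into a temp dir):
work_dir = _fetch_project("https://github.com/org/repo#examples/sklearn")
# Local zip archive, extracted into a temp dir:
work_dir = _fetch_project("file:///tmp/project.zip")
# Note: passing a version together with a non-Git (local) URI raises ExecutionException.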
Example #3
def _validate_docker_env(project):
    if not project.name:
        raise ExecutionException(
            "Project name in MLProject must be specified when using docker "
            "for image tagging.")
    if not project.docker_env.get('image'):
        raise ExecutionException(
            "Project with docker environment must specify the docker image "
            "to use via an 'image' field under the 'docker_env' field.")
Example #4
def _fetch_git_repo(uri, version, dst_dir):
    """
    Clone the git repo at ``uri`` into ``dst_dir``, checking out commit ``version`` (or defaulting
    to the head commit of the repository's master branch if version is unspecified).
    Assumes authentication parameters are specified by the environment, e.g. by a Git credential
    helper.
    """
    # We defer importing git until the last moment, because the import requires that the git
    # executable is available on the PATH, so we only want to fail if we actually need it.
    import git
    repo = git.Repo.init(dst_dir)
    origin = repo.create_remote("origin", uri)
    origin.fetch()
    if version is not None:
        try:
            repo.git.checkout(version)
        except git.exc.GitCommandError as e:
            raise ExecutionException(
                "Unable to checkout version '%s' of git repo %s"
                "- please ensure that the version exists in the repo. "
                "Error: %s" % (version, uri, e))
    else:
        repo.create_head("master", origin.refs.master)
        repo.heads.master.checkout()
    repo.submodule_update(init=True, recursive=True)
Example #5
def _dbfs_path_exists(self, dbfs_path):
    """
    Return True if the passed-in path exists in DBFS for the workspace corresponding to the
    default Databricks CLI profile. The path is expected to be a relative path to the DBFS root
    directory, e.g. 'path/to/file'.
    """
    host_creds = databricks_utils.get_databricks_host_creds(
        self.databricks_profile_uri)
    response = rest_utils.http_request(host_creds=host_creds,
                                       endpoint="/api/2.0/dbfs/get-status",
                                       method="GET",
                                       json={"path": "/%s" % dbfs_path})
    try:
        json_response_obj = json.loads(response.text)
    except Exception:  # pylint: disable=broad-except
        raise MlflowException(
            "API request to check existence of file at DBFS path %s failed with status code "
            "%s. Response body: %s" %
            (dbfs_path, response.status_code, response.text))
    # If request fails with a RESOURCE_DOES_NOT_EXIST error, the file does not exist on DBFS
    error_code_field = "error_code"
    if error_code_field in json_response_obj:
        if json_response_obj[error_code_field] == "RESOURCE_DOES_NOT_EXIST":
            return False
        raise ExecutionException(
            "Got unexpected error response when checking whether file %s "
            "exists in DBFS: %s" % (dbfs_path, json_response_obj))
    return True
Example #6
def _validate_parameters(self, user_parameters):
    missing_params = []
    for name in self.parameters:
        if (name not in user_parameters
                and self.parameters[name].default is None):
            missing_params.append(name)
    if missing_params:
        raise ExecutionException(
            "No value given for missing parameters: %s" %
            ", ".join(["'%s'" % name for name in missing_params]))
Example #7
def push_image_to_registry(image_tag):
    client = docker.from_env()
    _logger.info("=== Pushing docker image %s ===", image_tag)
    for line in client.images.push(repository=image_tag,
                                   stream=True,
                                   decode=True):
        if 'error' in line and line['error']:
            raise ExecutionException("Error while pushing to docker registry: "
                                     "{error}".format(error=line['error']))
    return client.images.get_registry_data(image_tag).id
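A hedged usage sketch; the repository and tag below are illustrative placeholders, and the image must already exist locally (e.g. built beforehand) for the push to succeed.

image_digest = push_image_to_registry("registry.example.com/mlflow-project:abc123")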
Example #8
def _validate_docker_installation():
    """
    Verify if Docker is installed on host machine.
    """
    try:
        docker_path = "docker"
        process.exec_cmd([docker_path, "--help"], throw_on_error=False)
    except EnvironmentError:
        raise ExecutionException(
            "Could not find Docker executable. "
            "Ensure Docker is installed as per the instructions "
            "at https://docs.docker.com/install/overview/.")
Example #9
def before_run_validations(tracking_uri, backend_config):
    """Validations to perform before running a project on Databricks."""
    if backend_config is None:
        raise ExecutionException(
            "Backend spec must be provided when launching MLflow project "
            "runs on Databricks.")
    elif "existing_cluster_id" in backend_config:
        raise MlflowException(message=(
            "MLflow Project runs on Databricks must provide a *new cluster* specification."
            " Project execution against existing clusters is not currently supported. For more"
            " information, see https://mlflow.org/docs/latest/projects.html"
            "#run-an-mlflow-project-on-databricks"),
                              error_code=INVALID_PARAMETER_VALUE)
    if not is_databricks_uri(tracking_uri) and \
            not is_http_uri(tracking_uri):
        raise ExecutionException(
            "When running on Databricks, the MLflow tracking URI must be of the form "
            "'databricks' or 'databricks://profile', or a remote HTTP URI accessible to both the "
            "current client and code running on Databricks. Got local tracking URI %s. "
            "Please specify a valid tracking URI via mlflow.set_tracking_uri or by setting the "
            "MLFLOW_TRACKING_URI environment variable." % tracking_uri)
Example #10
def _parse_subdirectory(uri):
    # Parses a uri and returns the uri and subdirectory as separate values.
    # Uses '#' as a delimiter.
    subdirectory = ''
    parsed_uri = uri
    if '#' in uri:
        subdirectory = uri[uri.find('#') + 1:]
        parsed_uri = uri[:uri.find('#')]
    if subdirectory and '.' in subdirectory:
        raise ExecutionException(
            "'.' is not allowed in project subdirectory paths.")
    return parsed_uri, subdirectory
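A brief usage sketch; the URIs are illustrative placeholders.

# '#' splits the URI proper from the project subdirectory:
assert _parse_subdirectory("https://github.com/org/repo#examples/sklearn") == \
    ("https://github.com/org/repo", "examples/sklearn")
assert _parse_subdirectory("/local/project") == ("/local/project", "")
# A subdirectory containing '.' raises ExecutionException.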
Example #11
def _compute_path_value(self, user_param_value, storage_dir, key_position):
    local_path = get_local_path_or_none(user_param_value)
    if local_path:
        if not os.path.exists(local_path):
            raise ExecutionException(
                "Got value %s for parameter %s, but no such file or "
                "directory was found." % (user_param_value, self.name))
        return os.path.abspath(local_path)
    target_sub_dir = 'param_{}'.format(key_position)
    download_dir = os.path.join(storage_dir, target_sub_dir)
    os.mkdir(download_dir)
    return artifact_utils._download_artifact_from_uri(
        artifact_uri=user_param_value, output_path=download_dir)
Example #12
def _fetch_zip_repo(uri):
    import requests
    from io import BytesIO
    # TODO (dbczumar): Replace HTTP resolution via ``requests.get`` with an invocation of
    # ```mlflow.data.download_uri()`` when the API supports the same set of available stores as
    # the artifact repository (Azure, FTP, etc). See the following issue:
    # https://github.com/mlflow/mlflow/issues/763.
    response = requests.get(uri)
    try:
        response.raise_for_status()
    except requests.HTTPError as error:
        raise ExecutionException("Unable to retrieve ZIP file. Reason: %s" %
                                 str(error))
    return BytesIO(response.content)
Example #13
def get_entry_point(self, entry_point):
    if entry_point in self._entry_points:
        return self._entry_points[entry_point]
    _, file_extension = os.path.splitext(entry_point)
    ext_to_cmd = {".py": "python", ".sh": os.environ.get("SHELL", "bash")}
    if file_extension in ext_to_cmd:
        command = "%s %s" % (ext_to_cmd[file_extension],
                             shlex_quote(entry_point))
        if not is_string_type(command):
            command = command.encode("utf-8")
        return EntryPoint(name=entry_point, parameters={}, command=command)
    elif file_extension == ".R":
        command = "Rscript -e \"mlflow::mlflow_source('%s')\" --args" % shlex_quote(
            entry_point)
        return EntryPoint(name=entry_point, parameters={}, command=command)
    raise ExecutionException(
        "Could not find {0} among entry points {1} or interpret {0} as a "
        "runnable script. Supported script file extensions: "
        "{2}".format(entry_point, list(self._entry_points.keys()),
                     list(ext_to_cmd.keys())))
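A hedged usage sketch, assuming `project` is a loaded Project with a "main" entry point declared in its MLproject file; the script paths are illustrative.

ep = project.get_entry_point("main")          # declared entry point, returned as-is
ep = project.get_entry_point("etl/clean.py")  # falls back to "python etl/clean.py"
ep = project.get_entry_point("report.R")      # falls back to an Rscript invocation
project.get_entry_point("data.csv")           # raises ExecutionException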
Example #14
def _wait_for(submitted_run_obj):
    """Wait on the passed-in submitted run, reporting its status to the tracking server."""
    run_id = submitted_run_obj.run_id
    active_run = None
    # Note: there's a small chance we fail to report the run's status to the tracking server if
    # we're interrupted before we reach the try block below
    try:
        active_run = tracking.MlflowClient().get_run(
            run_id) if run_id is not None else None
        if submitted_run_obj.wait():
            _logger.info("=== Run (ID '%s') succeeded ===", run_id)
            _maybe_set_run_terminated(active_run, "FINISHED")
        else:
            _maybe_set_run_terminated(active_run, "FAILED")
            raise ExecutionException("Run (ID '%s') failed" % run_id)
    except KeyboardInterrupt:
        _logger.error("=== Run (ID '%s') interrupted, cancelling run ===",
                      run_id)
        submitted_run_obj.cancel()
        _maybe_set_run_terminated(active_run, "FAILED")
        raise
Example #15
def _get_or_create_conda_env(conda_env_path, env_id=None):
    """
    Given a `Project`, creates a conda environment containing the project's dependencies if such a
    conda environment doesn't already exist. Returns the name of the conda environment.
    :param conda_env_path: Path to a conda yaml file.
    :param env_id: Optional string that is added to the contents of the yaml file before
                   calculating the hash. It can be used to distinguish environments that have the
                   same conda dependencies but are supposed to be different based on the context.
                   For example, when serving the model we may install additional dependencies to the
                   environment after the environment has been activated.
    """
    conda_path = _get_conda_bin_executable("conda")
    try:
        process.exec_cmd([conda_path, "--help"], throw_on_error=False)
    except EnvironmentError:
        raise ExecutionException(
            "Could not find Conda executable at {0}. "
            "Ensure Conda is installed as per the instructions at "
            "https://conda.io/projects/conda/en/latest/"
            "user-guide/install/index.html. "
            "You can also configure MLflow to look for a specific "
            "Conda executable by setting the {1} environment variable "
            "to the path of the Conda executable".format(
                conda_path, MLFLOW_CONDA_HOME))
    (_, stdout, _) = process.exec_cmd([conda_path, "env", "list", "--json"])
    env_names = [os.path.basename(env) for env in json.loads(stdout)['envs']]
    project_env_name = _get_conda_env_name(conda_env_path, env_id)
    if project_env_name not in env_names:
        _logger.info('=== Creating conda environment %s ===', project_env_name)
        if conda_env_path:
            process.exec_cmd([
                conda_path, "env", "create", "-n", project_env_name, "--file",
                conda_env_path
            ],
                             stream_output=True)
        else:
            process.exec_cmd(
                [conda_path, "create", "-n", project_env_name, "python"],
                stream_output=True)
    return project_env_name
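A minimal sketch of how a name like the one `_get_conda_env_name` returns could be derived, consistent with the docstring above (the env_id is appended to the yaml contents before hashing); the "mlflow-" prefix and the use of SHA-1 here are assumptions, not necessarily what the real helper does.

import hashlib

def _sketch_conda_env_name(conda_env_path, env_id=None):
    # Hash the conda yaml contents (plus any env_id suffix) into a stable env name.
    contents = ""
    if conda_env_path:
        with open(conda_env_path) as f:
            contents = f.read()
    if env_id:
        contents += env_id
    return "mlflow-%s" % hashlib.sha1(contents.encode("utf-8")).hexdigest()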
Example #16
def _validate_execution_environment(project, backend):
    if project.docker_env and backend == "databricks":
        raise ExecutionException(
            "Running docker-based projects on Databricks is not yet supported."
        )
Example #17
def _run(uri, experiment_id, entry_point, version, parameters, docker_args,
         backend_name, backend_config, use_conda, storage_dir, synchronous):
    """
    Helper that delegates to the project-running method corresponding to the passed-in backend.
    Returns a ``SubmittedRun`` corresponding to the project run.
    """
    tracking_store_uri = tracking.get_tracking_uri()
    # TODO: remove this check once local, databricks, kubernetes execution have been refactored
    # into their own built-in project execution backends.
    if backend_name not in {"local", "databricks", "kubernetes"}:
        backend = loader.load_backend(backend_name)
        if backend:
            submitted_run = backend.run(uri, entry_point, parameters, version,
                                        backend_config, experiment_id,
                                        tracking_store_uri)
            tracking.MlflowClient().set_tag(submitted_run.run_id,
                                            MLFLOW_PROJECT_BACKEND,
                                            backend_name)
            return submitted_run

    work_dir = fetch_and_validate_project(uri, version, entry_point,
                                          parameters)
    project = load_project(work_dir)
    _validate_execution_environment(project, backend_name)

    existing_run_id = None
    if backend_name == "local" and _MLFLOW_LOCAL_BACKEND_RUN_ID_CONFIG in backend_config:
        existing_run_id = backend_config[_MLFLOW_LOCAL_BACKEND_RUN_ID_CONFIG]
    active_run = get_or_create_run(existing_run_id, uri, experiment_id,
                                   work_dir, version, entry_point, parameters)

    if backend_name == "databricks":
        tracking.MlflowClient().set_tag(active_run.info.run_id,
                                        MLFLOW_PROJECT_BACKEND, "databricks")
        from kiwi.projects.databricks import run_databricks
        return run_databricks(remote_run=active_run,
                              uri=uri,
                              entry_point=entry_point,
                              work_dir=work_dir,
                              parameters=parameters,
                              experiment_id=experiment_id,
                              cluster_spec=backend_config)

    elif backend_name == "local":
        tracking.MlflowClient().set_tag(active_run.info.run_id,
                                        MLFLOW_PROJECT_BACKEND, "local")
        command_args = []
        command_separator = " "
        # If a docker_env attribute is defined in MLproject then it takes precedence over conda yaml
        # environments, so the project will be executed inside a docker container.
        if project.docker_env:
            tracking.MlflowClient().set_tag(active_run.info.run_id,
                                            MLFLOW_PROJECT_ENV, "docker")
            _validate_docker_env(project)
            _validate_docker_installation()
            image = _build_docker_image(
                work_dir=work_dir,
                repository_uri=project.name,
                base_image=project.docker_env.get('image'),
                run_id=active_run.info.run_id)
            command_args += _get_docker_command(
                image=image,
                active_run=active_run,
                docker_args=docker_args,
                volumes=project.docker_env.get("volumes"),
                user_env_vars=project.docker_env.get("environment"))
        # Synchronously create a conda environment (even though this may take some time)
        # to avoid failures due to multiple concurrent attempts to create the same conda env.
        elif use_conda:
            tracking.MlflowClient().set_tag(active_run.info.run_id,
                                            MLFLOW_PROJECT_ENV, "conda")
            command_separator = " && "
            conda_env_name = _get_or_create_conda_env(project.conda_env_path)
            command_args += _get_conda_command(conda_env_name)
        # In synchronous mode, run the entry point command in a blocking fashion, sending status
        # updates to the tracking server when finished. Note that the run state may not be
        # persisted to the tracking server if interrupted
        if synchronous:
            command_args += _get_entry_point_command(project, entry_point,
                                                     parameters, storage_dir)
            command_str = command_separator.join(command_args)
            return _run_entry_point(command_str,
                                    work_dir,
                                    experiment_id,
                                    run_id=active_run.info.run_id)
        # Otherwise, invoke `mlflow run` in a subprocess
        return _invoke_mlflow_run_subprocess(work_dir=work_dir,
                                             entry_point=entry_point,
                                             parameters=parameters,
                                             experiment_id=experiment_id,
                                             use_conda=use_conda,
                                             storage_dir=storage_dir,
                                             run_id=active_run.info.run_id)
    elif backend_name == "kubernetes":
        from kiwi.projects import kubernetes as kb
        tracking.MlflowClient().set_tag(active_run.info.run_id,
                                        MLFLOW_PROJECT_ENV, "docker")
        tracking.MlflowClient().set_tag(active_run.info.run_id,
                                        MLFLOW_PROJECT_BACKEND, "kubernetes")
        _validate_docker_env(project)
        _validate_docker_installation()
        kube_config = _parse_kubernetes_config(backend_config)
        image = _build_docker_image(
            work_dir=work_dir,
            repository_uri=kube_config["repository-uri"],
            base_image=project.docker_env.get('image'),
            run_id=active_run.info.run_id)
        image_digest = kb.push_image_to_registry(image.tags[0])
        submitted_run = kb.run_kubernetes_job(
            project.name, active_run, image.tags[0], image_digest,
            _get_entry_point_command(project, entry_point, parameters,
                                     storage_dir),
            _get_run_env_vars(run_id=active_run.info.run_uuid,
                              experiment_id=active_run.info.experiment_id),
            kube_config.get('kube-context', None),
            kube_config['kube-job-template'])
        return submitted_run

    supported_backends = ["local", "databricks", "kubernetes"]
    raise ExecutionException("Got unsupported execution mode %s. Supported "
                             "values: %s" % (backend_name, supported_backends))
Example #18
def load_project(directory):
    mlproject_path = _find_mlproject(directory)

    # TODO: Validate structure of YAML loaded from the file
    yaml_obj = {}
    if mlproject_path is not None:
        with open(mlproject_path) as mlproject_file:
            yaml_obj = yaml.safe_load(mlproject_file)

    project_name = yaml_obj.get("name")

    # Validate config if docker_env parameter is present
    docker_env = yaml_obj.get("docker_env")
    if docker_env:
        if not docker_env.get("image"):
            raise ExecutionException(
                "Project configuration (MLproject file) was invalid: Docker "
                "environment specified but no image attribute found.")
        if docker_env.get("volumes"):
            if not (isinstance(docker_env["volumes"], list) and all(
                [isinstance(i, str) for i in docker_env["volumes"]])):
                raise ExecutionException(
                    "Project configuration (MLproject file) was invalid: "
                    "Docker volumes must be a list of strings, "
                    """e.g.: '["/path1/:/path1", "/path2/:/path2"])""")
        if docker_env.get("environment"):
            if not (isinstance(docker_env["environment"], list) and all([
                    isinstance(i, list) or isinstance(i, str)
                    for i in docker_env["environment"]
            ])):
                raise ExecutionException(
                    "Project configuration (MLproject file) was invalid: "
                    "environment must be a list containing either strings (to copy environment "
                    "variables from host system) or lists of string pairs (to define new "
                    "environment variables)."
                    """E.g.: '[["NEW_VAR", "new_value"], "VAR_TO_COPY_FROM_HOST"])"""
                )

    # Validate config if conda_env parameter is present
    conda_path = yaml_obj.get("conda_env")
    if conda_path and docker_env:
        raise ExecutionException("Project cannot contain both a docker and "
                                 "conda environment.")

    # Parse entry points
    entry_points = {}
    for name, entry_point_yaml in yaml_obj.get("entry_points", {}).items():
        parameters = entry_point_yaml.get("parameters", {})
        command = entry_point_yaml.get("command")
        entry_points[name] = EntryPoint(name, parameters, command)

    if conda_path:
        conda_env_path = os.path.join(directory, conda_path)
        if not os.path.exists(conda_env_path):
            raise ExecutionException(
                "Project specified conda environment file %s, but no such "
                "file was found." % conda_env_path)
        return Project(
            conda_env_path=conda_env_path,
            entry_points=entry_points,
            docker_env=docker_env,
            name=project_name,
        )

    default_conda_path = os.path.join(directory, DEFAULT_CONDA_FILE_NAME)
    if os.path.exists(default_conda_path):
        return Project(conda_env_path=default_conda_path,
                       entry_points=entry_points,
                       docker_env=docker_env,
                       name=project_name)

    return Project(conda_env_path=None,
                   entry_points=entry_points,
                   docker_env=docker_env,
                   name=project_name)
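A hedged example of the parsed MLproject contents (yaml_obj) that load_project accepts; the project name, file names, and entry-point details are illustrative, and docker_env/conda_env are mutually exclusive per the check above.

yaml_obj = {
    "name": "example-project",
    "conda_env": "conda.yaml",  # resolved relative to the project directory
    "entry_points": {
        "main": {
            "parameters": {"alpha": {"type": "float", "default": 0.5}},
            "command": "python train.py --alpha {alpha}",
        }
    },
}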
Example #19
def _compute_uri_value(self, user_param_value):
    if not data.is_uri(user_param_value):
        raise ExecutionException("Expected URI for parameter %s but got "
                                 "%s" % (self.name, user_param_value))
    return user_param_value
Example #20
def test_execution_exception_string_repr():
    exc = ExecutionException("Uh oh")
    assert str(exc) == "Uh oh"
    json.loads(exc.serialize_as_json())