def _store_pipeline(yaml_file_content: AnyStr, name=None, description=None):
    """Register a pipeline from its YAML template and store the template file.

    :param yaml_file_content: YAML text (or bytes) of the pipeline template
    :param name: optional pipeline name, defaults to ``metadata.name`` of the template
    :param description: optional description, defaults to the ``description`` found in
        the ``pipelines.kubeflow.org/pipeline_spec`` annotation of the template
    :return: the ``ApiPipeline`` whose ``id`` is the value returned by ``store_data``
    :raises KeyError: if no ``name`` is given and the template has no ``metadata.name``,
        or if the template has no ``spec`` section
    """
    # NOTE(review): FullLoader can construct more than plain data types; consider
    #   yaml.safe_load if templates can come from untrusted sources
    yaml_dict = yaml.load(yaml_file_content, Loader=yaml.FullLoader)

    # keys that are present but empty are parsed as None, hence "or" instead of .get defaults
    template_metadata = yaml_dict.get("metadata") or dict()
    annotations = template_metadata.get("annotations") or dict()
    pipeline_spec = json.loads(annotations.get("pipelines.kubeflow.org/pipeline_spec") or "{}")

    name = name or template_metadata["name"]
    # an explicitly passed description takes precedence over the one from the template
    description = (description or pipeline_spec.get("description") or "").strip()
    namespace = (pipeline_spec.get("namespace") or "").strip()
    # five generated segments joined by dashes (8-4-4-4-12 characters)
    pipeline_id = "-".join(generate_id(length=n) for n in (8, 4, 4, 4, 12))
    created_at = datetime.now()

    parameters = [ApiParameter(name=p.get("name"), description=p.get("description"),
                               default=p.get("default"), value=p.get("value"))
                  for p in yaml_dict["spec"].get("params") or []]

    api_pipeline = ApiPipeline(id=pipeline_id,
                               created_at=created_at,
                               name=name,
                               description=description,
                               parameters=parameters,
                               namespace=namespace)

    uuid = store_data(api_pipeline)

    api_pipeline.id = uuid

    # NOTE(review): the file is stored under the generated pipeline_id, not under the
    #   uuid returned by store_data -- confirm the two are always identical
    store_file(bucket_name="mlpipeline", prefix="pipelines/",
               file_name=pipeline_id, file_content=yaml_file_content)

    enable_anonymous_read_access(bucket_name="mlpipeline", prefix="pipelines/*")

    return api_pipeline
def _upload_model_yaml(yaml_file_content: AnyStr, name=None, existing_id=None):
    """Create an ``ApiModel`` from a model YAML template and store both.

    :param yaml_file_content: YAML text (or bytes) of the model template
    :param name: optional model name, defaults to the ``name`` of the template
    :param existing_id: optional id to use instead of the template's
        ``model_identifier`` or an id generated from the name
    :return: tuple of the ``ApiModel`` (with the id returned by ``store_data``)
        and the status code 201
    :raises KeyError: if the template lacks ``description`` or ``framework``, or
        lacks ``name`` while no ``name`` argument is given
    """

    def _as_list(platforms):
        # convert comma-separated strings to lists, tolerating irregular
        # whitespace and dropping empty entries; other values pass through
        if isinstance(platforms, str):
            return [p.strip() for p in platforms.split(",") if p.strip()]
        return platforms

    # NOTE(review): FullLoader can construct more than plain data types; consider
    #   yaml.safe_load if templates can come from untrusted sources
    model_def = yaml.load(yaml_file_content, Loader=yaml.FullLoader)

    model_name = name or model_def["name"]

    # an empty "train:" or "serve:" key is parsed as None, so .get("train", {})
    # would hand back None; fall back to an empty dict explicitly
    train = model_def.get("train") or {}
    serve = model_def.get("serve") or {}

    api_model = ApiModel(
        id=existing_id or model_def.get("model_identifier")
        or generate_id(name=model_name),
        created_at=datetime.now(),
        name=model_name,
        description=model_def["description"].strip(),
        domain=model_def.get("domain") or "",
        labels=model_def.get("labels") or dict(),
        framework=model_def["framework"],
        filter_categories=model_def.get("filter_categories") or dict(),
        trainable=train.get("trainable") or False,
        trainable_tested_platforms=_as_list(
            train.get("tested_platforms") or []),
        trainable_credentials_required=train.get(
            "credentials_required") or False,
        trainable_parameters=train.get("input_params") or [],
        servable=serve.get("servable") or False,
        servable_tested_platforms=_as_list(
            serve.get("tested_platforms") or []),
        servable_credentials_required=serve.get(
            "credentials_required") or False,
        servable_parameters=serve.get("input_params") or [])

    uuid = store_data(api_model)

    api_model.id = uuid

    store_file(bucket_name="mlpipeline",
               prefix=f"models/{api_model.id}/",
               file_name="template.yaml",
               file_content=yaml_file_content,
               content_type="text/yaml")

    enable_anonymous_read_access(bucket_name="mlpipeline", prefix="models/*")

    return api_model, 201
def _upload_component_yaml(yaml_file_content: AnyStr,
                           name=None,
                           existing_id=None):
    """Create an ``ApiComponent`` from a component YAML template and store both.

    :param yaml_file_content: YAML text (or bytes) of the component template
    :param name: optional component name, defaults to the ``name`` of the template
    :param existing_id: optional id to use instead of one generated from the name
    :return: tuple of the ``ApiComponent`` (with the id returned by ``store_data``)
        and the status code 201
    """
    component_def = yaml.load(yaml_file_content, Loader=yaml.FullLoader)
    template_metadata = component_def.get("metadata") or {}

    # resolve the name once, it also seeds the generated id
    name = name or component_def["name"]
    component_id = existing_id or generate_id(name=name)
    created_at = datetime.now()

    # the description falls back to the name and is capped at 255 characters
    raw_description = component_def.get("description") or name
    description = raw_description.strip()[:255]

    metadata = ApiMetadata(
        annotations=template_metadata.get("annotations"),
        labels=template_metadata.get("labels"),
        tags=template_metadata.get("tags"),
    )

    # one ApiParameter per declared component input
    parameters = []
    for component_input in component_def.get("inputs", []):
        parameters.append(
            ApiParameter(name=component_input.get("name"),
                         description=component_input.get("description"),
                         default=component_input.get("default"),
                         value=component_input.get("value")))

    api_component = ApiComponent(
        id=component_id,
        created_at=created_at,
        name=name,
        description=description,
        metadata=metadata,
        parameters=parameters,
        filter_categories=component_def.get("filter_categories") or {},
    )

    api_component.id = store_data(api_component)

    # the template is filed under the id determined before store_data was called
    store_file(bucket_name="mlpipeline",
               prefix=f"components/{component_id}/",
               file_name="template.yaml",
               file_content=yaml_file_content,
               content_type="text/yaml")

    enable_anonymous_read_access(bucket_name="mlpipeline",
                                 prefix="components/*")

    return api_component, 201
def run_component(id, parameters, run_name=None):  # noqa: E501
    """Run a registered component as a pipeline run.

    :param id: id of the component to run
    :type id: str
    :param parameters: run parameters; replaced by the JSON request body when the request is JSON
    :type parameters: List[ApiParameter]
    :param run_name: name to identify the run on the Kubeflow Pipelines UI, defaults to component name
    :type run_name: str

    :return: tuple of (ApiRunCodeResponse, 200) on success, otherwise (error, status code)
    :rtype: ApiRunCodeResponse
    """
    if KFP_HOST == "UNAVAILABLE":
        return "Kubeflow Pipeline host is 'UNAVAILABLE'", 503

    if connexion.request.is_json:
        request_body = connexion.request.get_json()
        parameters = [ApiParameter.from_dict(entry) for entry in request_body]  # noqa: E501

    # only keep parameters that carry a non-blank value
    parameter_dict = {}
    for param in parameters:
        if not param.value or param.value.strip() == "":
            continue
        parameter_dict[param.name] = param.value

    api_component, lookup_status = get_component(id)

    if lookup_status > 200:
        return f"Component with id '{id}' does not exist", 404

    parameter_errors, validation_status = validate_parameters(
        api_component.parameters, parameter_dict)

    if parameter_errors:
        return parameter_errors, validation_status

    template, _ = get_component_template(id)

    enable_anonymous_read_access(bucket_name="mlpipeline",
                                 prefix="components/*")

    try:
        run_id = run_component_in_experiment(api_component, template.url,
                                             parameter_dict, run_name)
        response = ApiRunCodeResponse(run_url=f"/runs/details/{run_id}")
        return response, 200

    except Exception as err:
        return f"Error while trying to run component {id}: {err}", 500
def run_custom_pipeline(run_custom_pipeline_payload, run_name=None):  # noqa: E501
    """run_custom_pipeline

    Run a complex pipeline defined by a directed acyclic graph (DAG)

    :param run_custom_pipeline_payload: A custom pipeline defined by a directed acyclic graph (DAG) and input parameters
    :type run_custom_pipeline_payload: dict | bytes
    :param run_name: Name to identify the run on the Kubeflow Pipelines UI
    :type run_name: str

    :raises AssertionError: on duplicate task names, on a dependency that names an
        unknown task, or on required input parameters without a run parameter
    :rtype: ApiRunCodeResponse
    """
    if connexion.request.is_json:
        run_custom_pipeline_payload = ApiPipelineCustomRunPayload.from_dict(connexion.request.get_json())  # noqa: E501

    run_parameters = run_custom_pipeline_payload.run_parameters or {}
    custom_pipeline = run_custom_pipeline_payload.custom_pipeline
    tasks = custom_pipeline.dag.tasks

    # The checks below raise AssertionError explicitly instead of using "assert"
    # statements, which are stripped (and would silently skip validation) under "python -O"

    # ensure unique task names
    task_names = [t.name for t in tasks]
    duplicate_task_names = [name for name, count in Counter(task_names).items() if count > 1]
    if duplicate_task_names:
        raise AssertionError(f"duplicate task names: {duplicate_task_names}")

    # validate pipeline dependencies
    pipeline_tasks_by_name: typing.Dict[str, ApiPipelineTask] = {t.name: t for t in tasks}
    for t in pipeline_tasks_by_name.values():
        for required_task_name in t.dependencies or []:
            if required_task_name not in pipeline_tasks_by_name:
                raise AssertionError(
                    f"missing task '{required_task_name}', as dependency for task '{t.name}'")

    # validate input parameters; a pipeline without declared inputs requires none
    input_parameters = getattr(custom_pipeline.inputs, "parameters", None) or []
    missing_run_parameters = {p.name for p in input_parameters
                              if p.default is None and p.value is None} - run_parameters.keys()
    if missing_run_parameters:
        raise AssertionError(f"missing parameters to run pipeline: {missing_run_parameters}")

    # make sure we enable anonymous read access to pipeline task components
    for artifact_type in {t.artifact_type for t in pipeline_tasks_by_name.values()}:
        enable_anonymous_read_access(bucket_name="mlpipeline", prefix=f"{artifact_type}s/*")

    try:
        run_id = run_custom_pipeline_in_experiment(custom_pipeline, run_name, run_parameters)
        return ApiRunCodeResponse(run_url=f"/runs/details/{run_id}"), 200

    except Exception as e:
        # TODO: remove traceback?
        import traceback
        print(traceback.format_exc())
        return f"Error while trying to run custom pipeline '{run_name}': {e}", 500
# Example #6
# 0
def _upload_notebook_yaml(yaml_file_content: AnyStr,
                          name=None,
                          access_token=None,
                          existing_id=None):
    """Register a notebook described by a YAML template and upload its files.

    Parses the template, downloads the notebook referenced by
    ``implementation.github.source``, stores the ``ApiNotebook`` via ``store_data``
    and uploads the template, the notebook and -- when the template declares
    ``requirements`` -- a combined ``requirements.txt`` to the ``mlpipeline``
    bucket under ``notebooks/<notebook_id>/``.

    :param yaml_file_content: YAML text (or bytes) of the notebook template
    :param name: optional notebook name, defaults to the ``name`` of the template
    :param access_token: passed to ``_download_notebook`` as ``enterprise_github_api_token``
    :param existing_id: optional id to use instead of one generated from the name
    :return: tuple of the ``ApiNotebook`` and the status code 201
    :raises KeyError: if the template lacks ``description`` or
        ``implementation.github.source``, or lacks ``name`` while no ``name`` is given
    """

    yaml_dict = yaml.load(yaml_file_content, Loader=yaml.FullLoader)

    template_metadata = yaml_dict.get("metadata") or dict()

    notebook_id = existing_id or generate_id(name=name or yaml_dict["name"])
    created_at = datetime.now()
    name = name or yaml_dict["name"]
    description = yaml_dict["description"].strip()
    url = yaml_dict["implementation"]["github"]["source"]
    # optional: either a URL or a comma-separated list of packages (see below)
    requirements = yaml_dict["implementation"]["github"].get("requirements")

    metadata = ApiMetadata(annotations=template_metadata.get("annotations"),
                           labels=template_metadata.get("labels"),
                           tags=template_metadata.get("tags"))

    notebook_content = _download_notebook(
        url, enterprise_github_api_token=access_token)

    # parameters = _extract_notebook_parameters(notebook_content)
    # TODO: not using Papermill any longer, notebook parameters no longer valid?
    #  kfp-notebook  has inputs and outputs ?
    parameters = dict()

    api_notebook = ApiNotebook(id=notebook_id,
                               created_at=created_at,
                               name=name,
                               description=description,
                               url=url,
                               metadata=metadata,
                               parameters=parameters)

    uuid = store_data(api_notebook)

    api_notebook.id = uuid

    # NOTE(review): the files below are stored under notebook_id, not under the
    #   uuid returned by store_data -- confirm the two are always identical
    store_file(bucket_name="mlpipeline",
               prefix=f"notebooks/{notebook_id}/",
               file_name="template.yaml",
               file_content=yaml_file_content)

    # the file name is the last path segment of the source url without its query string
    s3_url = store_file(bucket_name="mlpipeline",
                        prefix=f"notebooks/{notebook_id}/",
                        file_name=url.split("/")[-1].split("?")[0],
                        file_content=json.dumps(notebook_content).encode())

    if requirements:

        # requirements given as a URL are downloaded, otherwise the value is
        # treated as a comma-separated list with one package per output line
        if _is_url(requirements):
            requirements_url = requirements
            requirements_txt = download_file_content_from_url(
                requirements_url).decode()
        else:
            requirements_txt = "\n".join(requirements.split(","))

        # TODO: remove this after fixing the Elyra-AI/KFP-Notebook runner so that
        #   Elyra should install its own requirements in addition to the provided requirements
        requirements_elyra_url = "https://github.com/elyra-ai/kfp-notebook/blob/master/etc/requirements-elyra.txt"
        requirements_elyra_txt = download_file_content_from_url(
            requirements_elyra_url).decode()
        # drop comment lines from the Elyra requirements
        requirements_elyra = "\n".join([
            line for line in requirements_elyra_txt.split("\n")
            if not line.startswith("#")
        ])

        # notebook requirements first, followed by the Elyra requirements
        requirements_all = f"# Required packages for {api_notebook.name}:\n" \
                           f"{requirements_txt}\n" \
                           f"# Requirements from {requirements_elyra_url}:\n" \
                           f"{requirements_elyra}"

        store_file(bucket_name="mlpipeline",
                   prefix=f"notebooks/{notebook_id}/",
                   file_name="requirements.txt",
                   file_content=requirements_all.encode())

    # if the url included an access token, replace the original url with the s3 url
    if "?token=" in url or "github.ibm.com" in url:
        api_notebook.url = s3_url
        update_multiple(ApiNotebook, [notebook_id], "url", s3_url)
        enable_anonymous_read_access(bucket_name="mlpipeline",
                                     prefix="notebooks/*")

    return api_notebook, 201
def _upload_dataset_yaml(yaml_file_content: AnyStr,
                         name=None,
                         existing_id=None):
    """Create an ``ApiDataset`` from a dataset YAML template and store both.

    :param yaml_file_content: YAML text (or bytes) of the dataset template
    :param name: optional dataset name, defaults to the ``name`` of the template
    :param existing_id: optional id to use instead of one generated from the
        template's ``id`` (or, when absent, from the name)
    :return: tuple of the ``ApiDataset`` (with the id returned by ``store_data``)
        and the status code 201
    :raises KeyError: if a required template key is missing (``description``,
        ``license.name``, ``domain``, ``format``, ``content``, ``version``)
    :raises ValueError: if ``updated``/``created`` is not formatted as ``%Y-%m-%d``
    """
    dataset_def = yaml.load(yaml_file_content, Loader=yaml.FullLoader)

    name = name or dataset_def["name"]
    description = dataset_def["description"]
    dataset_id = existing_id or generate_id(name=dataset_def.get("id", name))

    # TODO: re-evaluate if we should use dataset update time as our MLX "created_at" time
    # "updated" wins over "created"; without either one the current time is used
    timestamp_key = next(
        (key for key in ("updated", "created") if key in dataset_def), None)
    if timestamp_key is None:
        created_at = datetime.now()
    else:
        created_at = datetime.strptime(str(dataset_def[timestamp_key]),
                                       "%Y-%m-%d")

    license_name = dataset_def["license"]["name"]
    domain = dataset_def["domain"]
    format_type = dataset_def["format"][0]["type"]

    # size and record count are taken from the first "content" entry only
    first_content = dataset_def["content"][0]
    size = first_content.get("size")
    # TODO: add "version" to ApiDataset; until then the lookup only enforces the key
    version = dataset_def["version"]
    number_of_records = first_content.get("records", 0)

    # collect the asset ids of related assets that belong to an "MLX" application
    related_assets = []
    for asset in dataset_def.get("related_assets", []):
        application = asset.get("application", {})
        if "MLX" in application.get("name", "") and "asset_id" in application:
            related_assets.append(application.get("asset_id"))

    template_metadata = dataset_def.get("metadata") or {}
    # the template's "seo_tags" serve as fallback when the metadata has no tags
    tags = template_metadata.get("tags") or dataset_def.get("seo_tags")
    metadata = ApiMetadata(annotations=template_metadata.get("annotations"),
                           labels=template_metadata.get("labels"),
                           tags=tags)

    api_dataset = ApiDataset(
        id=dataset_id,
        created_at=created_at,
        name=name,
        description=description,
        domain=domain,
        format=format_type,
        size=size,
        number_of_records=number_of_records,
        license=license_name,
        metadata=metadata,
        related_assets=related_assets,
    )

    api_dataset.id = store_data(api_dataset)

    store_file(bucket_name="mlpipeline",
               prefix=f"datasets/{api_dataset.id}/",
               file_name="template.yaml",
               file_content=yaml_file_content)

    enable_anonymous_read_access(bucket_name="mlpipeline", prefix="datasets/*")

    return api_dataset, 201
def run_notebook(id, run_name=None, parameters: dict = None):  # noqa: E501
    """run_notebook

    :param id: id of the notebook to run
    :type id: str
    :param run_name: name to identify the run on the Kubeflow Pipelines UI, defaults to notebook name
    :type run_name: str
    :param parameters: optional run parameters, may be required based on pipeline definition;
        when empty and the request is JSON, the request body is used instead
    :type parameters: dict

    :return: tuple of (ApiRunCodeResponse, 200) on success, otherwise (error message, status code)
    :rtype: ApiRunCodeResponse
    """
    if KFP_HOST == "UNAVAILABLE":
        return "Kubeflow Pipeline host is 'UNAVAILABLE'", 503

    if not parameters and connexion.request.is_json:
        parameter_dict = dict(connexion.request.get_json())  # noqa: E501
    else:
        parameter_dict = parameters

    api_notebook, status_code = get_notebook(id)

    if status_code > 200:
        return f"Notebook with id '{id}' does not exist", 404

    # # TODO: Elyra kfp-notebook currently does not pass parameters on to papermill
    # if parameters:
    #     raise ApiError("The 'elyra-ai/kfp-notebook' executor does not support parameters", 422)

    # parameter_errors, status_code = validate_parameters(api_notebook.parameters, parameter_dict)
    #
    # if parameter_errors:
    #     return parameter_errors, status_code

    # Elyra pulls the requirements.txt from Minio, requiring anonymous read access
    enable_anonymous_read_access(bucket_name="mlpipeline",
                                 prefix="notebooks/*")

    try:
        run_id = run_notebook_in_experiment(notebook=api_notebook,
                                            parameters=parameter_dict,
                                            run_name=run_name)

        # expected output notebook based on:
        #   https://github.com/elyra-ai/kfp-notebook/blob/c8f1298/etc/docker-scripts/bootstrapper.py#L188-L190
        notebook_url = get_object_url(bucket_name="mlpipeline",
                                      prefix=f"notebooks/{api_notebook.id}/",
                                      file_extensions=[".ipynb"])

        # TODO: create a "sandboxed" notebook in a subfolder since Elyra overwrites
        #   the original notebook instead of creating an "-output.ipynb" file:
        #   https://github.com/elyra-ai/kfp-notebook/blob/c8f1298/etc/docker-scripts/bootstrapper.py#L205
        # instead return link to the generated output .html for the time being;
        # without a notebook url the run is still reported, just without an output location
        # NOTE(review): assumes get_object_url may return None when nothing matches -- confirm
        notebook_output_html = (notebook_url.replace(".ipynb", ".html")
                                if notebook_url else None)

        return ApiRunCodeResponse(
            run_url=f"/runs/details/{run_id}",
            run_output_location=notebook_output_html), 200
    except Exception as e:

        return f"Error while trying to run notebook {id}: {e}", 500