Example #1
def create_katib_experiment(request, pipeline_id, pipeline_metadata,
                            output_path):
    """Create and launch a new Katib experiment.

    The Katib metadata must include all the information required to create an
    Experiment CRD (algorithm, objective, search parameters, ...). This
    information is sanitized and a new YAML definition is written to file.
    This YAML is then submitted to the K8s API server to create the
    Experiment CR.

    Args:
        request: RPC request object
        pipeline_id: The id of the KFP pipeline that will be run by the Trials
        pipeline_metadata: The Kale notebook metadata
        output_path: The directory to store the YAML definition

    Returns (dict): a dictionary describing the status of the experiment
    """
    try:
        namespace = podutils.get_namespace()
    except Exception:
        # XXX: When not running from within a pod, get_namespace() fails
        # XXX: If that's the case, use the 'kubeflow-user' one
        # XXX: This should probably change. It works for local/MiniKF dev
        namespace = "kubeflow-user"

    katib_name = pipeline_metadata.get("experiment_name")
    katib_spec = pipeline_metadata.get("katib_metadata", None)
    if not katib_spec:
        raise RPCNotFoundError(details=("Could not find Katib specification in"
                                        " notebook's metadata"),
                               trans_id=request.trans_id)
    # Perform a sanitization of the Katib specification, making sure all the
    # required first-layer-fields are set
    katib_spec = _sanitize_katib_spec(request, katib_spec)

    trial_parameters = {
        "image": KATIB_TRIAL_IMAGE,
        "pipeline_id": pipeline_id,
        "experiment_name": pipeline_metadata.get("experiment_name")
    }

    katib_experiment = _define_katib_experiment(katib_name, katib_spec,
                                                trial_parameters)
    definition_path = os.path.abspath(
        os.path.join(output_path, "%s.katib.yaml" % katib_name))
    request.log.info("Saving Katib experiment definition at %s",
                     definition_path)
    import yaml
    with open(definition_path, "w") as yaml_file:
        yaml_file.write(yaml.dump(katib_experiment))
    _launch_katib_experiment(request, katib_experiment, namespace)

    return {
        "name": katib_experiment["metadata"]["name"],
        "namespace": namespace,
        "status": None,
        "trials": 0,
        "maxTrialCount": katib_experiment["spec"]["maxTrialCount"]
    }
Example #2
def is_kfp_step() -> bool:
    """Detect if running inside a KFP step.

    The detection involves two steps:

      1. Auto-detect if the current Pod is part of an Argo workflow
      2. Read one of the annotations that the KFP API Server sets in the
         workflow object (one-off runs and recurring ones have different
         annotations).
    """
    log.info("Checking if running inside a KFP step...")
    try:
        namespace = podutils.get_namespace()
        workflow = workflowutils.get_workflow(
            workflowutils.get_workflow_name(podutils.get_pod_name(),
                                            namespace), namespace)
        annotations = workflow["metadata"]["annotations"]
        try:
            _ = annotations[KFP_RUN_NAME_ANNOTATION_KEY]
        except KeyError:
            _ = annotations[KFP_SWF_NAME_ANNOTATION_KEY]
    except Exception:
        log.info("Not in a KFP step.")
        return False
    log.info("Running in a KFP step.")
    return True
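
A minimal usage sketch for the detection helper above: guard KFP-only side effects behind it so the same code runs both inside and outside a pipeline. The `record_artifacts` wrapper and its `outputs` list are hypothetical; `update_uimetadata` refers to the helper shown in Examples #5 and #6.

def record_artifacts(outputs):
    # Outside a KFP step there is no Argo workflow to attach artifacts to,
    # so skip the UI metadata update entirely.
    if not is_kfp_step():
        log.info("Not running inside KFP; skipping artifact recording.")
        return
    for artifact_name in outputs:
        update_uimetadata(artifact_name)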
Example #3
def snapshot_pod(bucket=DEFAULT_BUCKET, wait=False, interactive=False):
    """Take a Rok snapshot of the current Pod."""
    rok = get_client()
    pod_name = podutils.get_pod_name()
    namespace = podutils.get_namespace()
    log.info("Taking a snapshot of pod %s in namespace %s ..." %
             (pod_name, namespace))
    commit_title = "Snapshot of pod {}".format(pod_name)
    commit_message = NOTEBOOK_SNAPSHOT_COMMIT_MESSAGE.format(
        pod_name, namespace)
    params = {
        "pod": pod_name,
        "default_container": podutils.get_container_name(),
        "namespace": namespace,
        "commit_title": commit_title,
        "commit_message": commit_message
    }

    # Create the bucket in case it does not exist
    create_rok_bucket(bucket)

    task_info = rok.version_register(bucket,
                                     pod_name,
                                     "pod",
                                     params,
                                     wait=wait and not interactive)
    if wait:
        if interactive:
            task_id = task_info["task"]["id"]
            return monitor_snapshot_task(task_id)
        else:
            log.info("Successfully took Rok snapshot")
    return task_info
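
A hedged usage sketch: with `wait=True` and the default `interactive=False`, the call above blocks until Rok completes the snapshot task, so the returned task info can be inspected right away (the nested keys mirror the task structure used in Example #28):

task_info = snapshot_pod(wait=True)
# Version of the snapshot that was just taken
version = task_info["task"]["result"]["event"]["version"]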
Example #4
def snapshot_pvc(pvc_name,
                 bucket=DEFAULT_BUCKET,
                 wait=False,
                 interactive=False):
    """Perform a snapshot over a PVC."""
    rok = get_client()
    namespace = podutils.get_namespace()
    log.info("Taking a snapshot of PVC %s in namespace %s ..." %
             (pvc_name, namespace))
    commit_title = "Snapshot PVC '{}'".format(pvc_name)
    commit_message = "This is a snapshot of PVC '{}' triggered by Kale".format(
        pvc_name)
    params = {
        "dataset": pvc_name,
        "namespace": namespace,
        "commit_title": commit_title,
        "commit_message": commit_message
    }

    # Create the bucket in case it does not exist
    create_rok_bucket(bucket)

    task_info = rok.version_register(bucket,
                                     pvc_name,
                                     "dataset",
                                     params,
                                     wait=wait and not interactive)
    if wait:
        if interactive:
            task_id = task_info["task"]["id"]
            return monitor_snapshot_task(task_id, bucket=bucket)
        else:
            log.info("Successfully took Rok snapshot")
    return task_info
Example #5
def update_uimetadata(artifact_name,
                      uimetadata_path='/mlpipeline-ui-metadata.json'):
    """Update ui-metadata dictionary with a new web-app entry.

    Args:
        artifact_name: Name of the artifact
        uimetadata_path: path to mlpipeline-ui-metadata.json
    """
    # Default empty ui-metadata dict
    outputs = {"outputs": []}
    if os.path.exists(uimetadata_path):
        try:
            with open(uimetadata_path, 'r') as f:
                outputs = json.loads(f.read())
            if not outputs.get('outputs', None):
                outputs['outputs'] = []
        except json.JSONDecodeError as e:
            print("Failed to parse json file {}: {}\n"
                  "This step will not be able to visualize artifacts in the"
                  " KFP UI".format(uimetadata_path, e))

    pod_name = podutils.get_pod_name()
    namespace = podutils.get_namespace()
    workflow_name = workflowutils.get_workflow_name(pod_name, namespace)
    html_artifact_entry = [{
        'type':
        'web-app',
        'storage':
        'minio',
        'source':
        'minio://mlpipeline/artifacts/{}/{}/{}'.format(workflow_name, pod_name,
                                                       artifact_name + '.tgz')
    }]
    outputs['outputs'] += html_artifact_entry
    with open(uimetadata_path, "w") as f:
        json.dump(outputs, f)
Example #6
def update_uimetadata(artifact_name,
                      uimetadata_path=KFP_UI_METADATA_FILE_PATH):
    """Update ui-metadata dictionary with a new web-app entry.

    Args:
        artifact_name: Name of the artifact
        uimetadata_path: path to mlpipeline-ui-metadata.json
    """
    try:
        outputs = get_current_uimetadata(uimetadata_path,
                                         default_if_not_exist=True)
    except json.JSONDecodeError:
        log.error("This step will not be able to visualize artifacts in the"
                  " KFP UI")
        return

    pod_name = podutils.get_pod_name()
    namespace = podutils.get_namespace()
    workflow_name = workflowutils.get_workflow_name(pod_name, namespace)
    html_artifact_entry = [{
        'type':
        'web-app',
        'storage':
        'minio',
        'source':
        'minio://mlpipeline/artifacts/{}/{}/{}'.format(workflow_name, pod_name,
                                                       artifact_name + '.tgz')
    }]
    outputs['outputs'] += html_artifact_entry
    with open(uimetadata_path, "w") as f:
        json.dump(outputs, f)
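
For reference, a hedged sketch of calling the function above from inside a step; the artifact name is illustrative, and the MinIO source path follows the template hard-coded in the entry:

# Appends a 'web-app' entry pointing at
# minio://mlpipeline/artifacts/<workflow>/<pod>/report.tgz
update_uimetadata("report")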
Example #7
    def __init__(self):
        log.info("%s Initializing MLMD context... %s", "-" * 10, "-" * 10)
        log.info("Connecting to MLMD...")
        self.store = self._connect()
        log.info("Successfully connected to MLMD")
        log.info("Getting step details...")
        log.info("Getting pod name...")
        self.pod_name = podutils.get_pod_name()
        log.info("Successfully retrieved pod name: %s", self.pod_name)
        log.info("Getting pod namespace...")
        self.pod_namespace = podutils.get_namespace()
        log.info("Successfully retrieved pod namespace: %s",
                 self.pod_namespace)
        log.info("Getting pod...")
        self.pod = podutils.get_pod(self.pod_name, self.pod_namespace)
        log.info("Successfully retrieved pod")
        log.info("Getting workflow name from pod...")
        self.workflow_name = self.pod.metadata.labels.get(
            workflowutils.ARGO_WORKFLOW_LABEL_KEY)
        log.info("Successfully retrieved workflow name: %s",
                 self.workflow_name)
        log.info("Getting workflow...")
        self.workflow = workflowutils.get_workflow(self.workflow_name,
                                                   self.pod_namespace)
        log.info("Successfully retrieved workflow")

        workflow_labels = self.workflow["metadata"].get("labels", {})
        self.run_uuid = workflow_labels.get(podutils.KFP_RUN_ID_LABEL_KEY,
                                            self.workflow_name)
        log.info("Successfully retrieved KFP run ID: %s", self.run_uuid)

        workflow_annotations = self.workflow["metadata"].get("annotations", {})
        pipeline_spec = json.loads(
            workflow_annotations.get("pipelines.kubeflow.org/pipeline_spec",
                                     "{}"))
        self.pipeline_name = pipeline_spec.get("name", self.workflow_name)
        if self.pipeline_name:
            log.info("Successfully retrieved KFP pipeline_name: %s",
                     self.pipeline_name)
        else:
            log.info("Could not retrieve KFP pipeline name")

        self.component_id = podutils.compute_component_id(self.pod)
        self.execution_hash = self.pod.metadata.annotations.get(
            MLMD_EXECUTION_HASH_PROPERTY_KEY)
        if self.execution_hash:
            log.info("Successfully retrieved execution hash: %s",
                     self.execution_hash)
        else:
            self.execution_hash = utils.random_string(10)
            log.info(
                "Failed to retrieve execution hash."
                " Generating random string...: %s", self.execution_hash)

        self.run_context = self._get_or_create_run_context()
        self.execution = self._create_execution_in_run_context()
        self._label_with_context_and_execution()
        log.info("%s Successfully initialized MLMD context %s", "-" * 10,
                 "-" * 10)
Example #8
def _add_owner_references(infs_name: str, pvc_name: str):
    # add owner reference to the PVC
    log.info("Adding owner references to PVC '%s' for InferenceService '%s'",
             pvc_name, infs_name)
    client = k8sutils.get_v1_client()
    infs = get_inference_service(infs_name)
    pvc = client.read_namespaced_persistent_volume_claim(
        pvc_name, podutils.get_namespace())
    ref = kubernetes.client.V1OwnerReference(api_version=API_VERSION,
                                             kind="InferenceService",
                                             name=infs_name,
                                             uid=infs["metadata"]["uid"])
    if not pvc.metadata.owner_references:
        pvc.metadata.owner_references = [ref]
    else:
        pvc.metadata.owner_references.append(ref)
    client.patch_namespaced_persistent_volume_claim(
        name=pvc_name, namespace=podutils.get_namespace(), body=pvc)
Example #9
    def delete(self):
        """Delete the InferenceService CR."""
        namespace = podutils.get_namespace()
        log.info("Deleting InferenceService '%s/%s'...", namespace, self.name)
        k8s_co_client = k8sutils.get_co_client()
        k8s_co_client.delete_namespaced_custom_object(CO_GROUP, CO_VERSION,
                                                      namespace, CO_PLURAL,
                                                      self.name)
        log.info("Successfully deleted InferenceService.")
Example #10
def get_user_namespace():
    """Get the namespace that the jupyterlab pod is running in."""
    client = _get_client()
    if client.get_user_namespace() == '':
        if not os.path.exists('.config/kfp/'):
            os.makedirs('.config/kfp/')
        namespace = podutils.get_namespace()
        client.set_user_namespace(namespace)
    else:
        namespace = client.get_user_namespace()
    return namespace
Example #11
def detect_run_uuid() -> str:
    """Get the workflow's UUID form inside a pipeline step."""
    namespace = podutils.get_namespace()
    workflow = workflowutils.get_workflow(
        workflowutils.get_workflow_name(podutils.get_pod_name(), namespace),
        namespace)
    run_uuid = (workflow["metadata"].get("labels",
                                         {}).get(KFP_RUN_ID_LABEL_KEY, None))

    # KFP api-server adds run UUID as label to workflows for KFP>=0.1.26.
    # Return run UUID if available. Else return workflow UUID to maintain
    # backwards compatibility.
    return run_uuid or workflow["metadata"]["uid"]
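
A short usage sketch, assuming execution inside a pipeline step where the pod and its Argo workflow can be resolved:

# Correlate locally produced results with the current KFP run.
run_uuid = detect_run_uuid()
log.info("Current KFP run UUID: %s", run_uuid)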
Example #12
def find_poddefault_labels_on_server(request):
    """Find server's labels that correspond to poddefaults applied."""
    request.log.info("Retrieving PodDefaults applied to server...")
    applied_poddefaults = kfutils.find_applied_poddefaults(
        podutils.get_pod(podutils.get_pod_name(), podutils.get_namespace()),
        kfutils.list_poddefaults())
    pd_names = [pd["metadata"]["name"] for pd in applied_poddefaults]
    request.log.info("Retrieved applied PodDefaults: %s", pd_names)

    labels = kfutils.get_poddefault_labels(applied_poddefaults)
    request.log.info("PodDefault labels applied on server: %s",
                     ", ".join(["%s: %s" % (k, v) for k, v in labels.items()]))
    return labels
Example #13
    def __repr__(self):
        """Show an interactive text in notebooks."""
        if utils.is_ipython():
            import IPython
            html = ('InferenceService <pre>%s</pre> serving requests at host'
                    ' <pre>%s</pre><br>'
                    'View model <a href="/models/details/%s/%s"'
                    ' target="_blank" >here</a>' %
                    (self.name, get_inference_service_host(
                        self.name), podutils.get_namespace(), self.name))
            IPython.display.display(IPython.display.HTML(html))
            return ""
        else:
            return super(KFServer, self).__repr__()
Example #14
def _get_kfp_client(host=None):
    user_namespace = podutils.get_namespace()
    host = 'http://ml-pipeline.kubeflow.svc.cluster.local:8888'
    log.info("hbseo _get_kfp_client() 'host:%s', namespace:%s", host,
             user_namespace)
    token_path = os.environ.get('KF_PIPELINES_SA_TOKEN_PATH')
    if token_path:
        with open(token_path, 'r') as f:
            token = f.read()
        log.info("hbseo _get_kfp_client() 'token:%s'", token)
        return Client(host=host,
                      namespace=user_namespace,
                      existing_token=token)
    else:
        return Client(host=host, namespace=user_namespace)
Example #15
def list_poddefaults(namespace: str = None):
    """List PodDefaults in requested namespace.

    If namespace is not provided, list PodDefaults in pod's namespace.
    """
    if not namespace:
        try:
            namespace = podutils.get_namespace()
        except Exception:
            raise ValueError("'namespace' cannot be empty when not inside a"
                             " pod")
    api_group = "kubeflow.org"
    api_version = "v1alpha1"
    co_name = "poddefaults"
    co_client = podutils._get_k8s_custom_objects_client()
    return co_client.list_namespaced_custom_object(api_group, api_version,
                                                   namespace, co_name)["items"]
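
A hedged usage sketch: inside a pod the namespace is auto-detected, so listing and printing the applied PodDefaults reduces to:

for pd in list_poddefaults():
    # Each item is a PodDefault custom object (a plain dict).
    print(pd["metadata"]["name"])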
Example #16
def snapshot_notebook(request, bucket=DEFAULT_BUCKET, obj=None):
    """Perform a snapshot over the notebook's pod."""
    rok = _get_client()
    hostname = os.getenv("HOSTNAME")
    namespace = podutils.get_namespace()
    commit_title = "Snapshot of notebook {}".format(hostname)
    commit_message = NOTEBOOK_SNAPSHOT_COMMIT_MESSAGE.format(
        hostname, namespace)
    params = {
        "namespace": namespace,
        "commit_title": commit_title,
        "commit_message": commit_message
    }

    obj = obj or podutils.get_pod_name()
    # Create the bucket in case it does not exist
    podutils.create_rok_bucket(bucket, client=rok)
    return rok.version_register(bucket, obj, "jupyter", params)
Example #17
def _annotate_trial(trial_name, annotation_key, annotation_value):
    """Add annotation to a trial in the pod's namespace."""
    group = "kubeflow.org"
    version = "v1alpha3"
    plural = "trials"
    patch = {
        "apiVersion": "%s/%s" % (group, version),
        "metadata": {
            "name": trial_name,
            "annotations": {
                annotation_key: annotation_value
            }
        }
    }
    k8s_client = podutils._get_k8s_custom_objects_client()
    k8s_client.patch_namespaced_custom_object(group, version,
                                              podutils.get_namespace(), plural,
                                              trial_name, patch)
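
A hedged call sketch for the helper above. The patch body only carries the new annotation, so existing annotations on the Trial are preserved; the name, key and value below are illustrative:

_annotate_trial(trial_name="katib-trial-abc123",
                annotation_key="kale.kubeflow.org/kfp-run-id",
                annotation_value="f1e2d3c4-0000-1111-2222-333344445555")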
Example #18
def snapshot_notebook(bucket=DEFAULT_BUCKET, obj=None):
    """Take a Rok snapshot of the current Notebook."""
    rok = get_client()
    pod_name = podutils.get_pod_name()
    namespace = podutils.get_namespace()
    log.info("Taking a snapshot of notebook %s in namespace %s ..."
             % (pod_name, namespace))
    commit_title = "Snapshot of notebook {}".format(pod_name)
    commit_message = NOTEBOOK_SNAPSHOT_COMMIT_MESSAGE.format(pod_name,
                                                             namespace)
    params = {"namespace": namespace,
              "commit_title": commit_title,
              "commit_message": commit_message}

    obj = obj or pod_name
    # Create the bucket in case it does not exist
    create_rok_bucket(bucket)
    return rok.version_register(bucket, obj, "jupyter", params)
Example #19
def check_rok_availability(request):
    """Check if Rok is available."""
    log = request.log if hasattr(request, "log") else logger
    try:
        rok = _get_client()
    except ImportError:
        log.exception("Failed to import RokClient")
        raise RPCNotFoundError(details="Rok Gateway Client module not found",
                               trans_id=request.trans_id)
    except Exception:
        log.exception("Failed to initialize RokClient")
        raise RPCServiceUnavailableError(details=("Failed to initialize"
                                                  " RokClient"),
                                         trans_id=request.trans_id)

    try:
        rok.account_info()
    except Exception:
        log.exception("Failed to retrieve account information")
        raise RPCServiceUnavailableError(details="Failed to access Rok",
                                         trans_id=request.trans_id)

    name = podutils.get_pod_name()
    namespace = podutils.get_namespace()
    try:
        suggestions = rok.version_register_suggest(DEFAULT_BUCKET,
                                                   name,
                                                   "jupyter",
                                                   "params:lab",
                                                   {"namespace": namespace},
                                                   ignore_env=True)
    except Exception as e:
        log.exception("Failed to list lab suggestions")
        message = "%s: %s" % (e.__class__.__name__, e)
        raise RPCServiceUnavailableError(message=message,
                                         details=("Rok cannot list notebooks"
                                                  " in this namespace"),
                                         trans_id=request.trans_id)

    if not any(s["value"] == name for s in suggestions):
        log.error("Could not find notebook '%s' in list of suggestions", name)
        raise RPCNotFoundError(details=("Could not find this notebook in"
                                        " notebooks listed by Rok"),
                               trans_id=request.trans_id)
Example #20
def run_pipeline(experiment_name: str,
                 pipeline_id: str,
                 run_name: str = None,
                 version_id: str = None,
                 host: str = None,
                 **kwargs) -> Any:
    """Run pipeline (without uploading) in kfp.

    Args:
        run_name: The name of the kfp run (autogenerated if not provided)
        experiment_name: The name of the kfp experiment
        pipeline_id: The ID of the uploaded pipeline to be run
        version_id: The ID of the pipeline version to be run
        host: custom host when executing outside of the cluster

    Returns:
        Pipeline run metadata
    """
    client = _get_kfp_client(host)

    log.info("Creating KFP experiment '%s'...", experiment_name)
    experiment = client.create_experiment(experiment_name)
    pipeline = client.pipelines.get_pipeline(pipeline_id)
    pipeline_name = pipeline.name
    _version_id = version_id if version_id else pipeline.default_version.id
    version_name = client.pipelines.get_pipeline_version(_version_id).name
    if not run_name:
        run_name = ("%s-%s-%s" %
                    (pipeline_name, version_name, utils.random_string()))
    display_version = ("(%sversion: '%s')" %
                       ("" if version_id else "default ", version_name))
    log.info("Submitting new pipeline run '%s' for pipeline '%s' %s ...",
             run_name, pipeline_name, display_version)
    run = client.run_pipeline(experiment.id,
                              run_name,
                              pipeline_id=pipeline_id,
                              version_id=_version_id,
                              params=kwargs)
    run_url = ("%s/?ns=%s#/runs/details/%s" %
               (client._get_url_prefix(), podutils.get_namespace(), run.id))
    log.info("Successfully submitted pipeline run.")
    log.info("Run URL: <host>%s", run_url)
    return run
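
A usage sketch for the function above; the pipeline ID is a placeholder, and any extra keyword arguments are forwarded to the run as pipeline parameters:

run = run_pipeline(experiment_name="Default",
                   pipeline_id="<pipeline-uuid>",  # placeholder
                   learning_rate=0.1)              # pipeline parameter
log.info("Submitted run with ID: %s", run.id)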
Example #21
def update_uimetadata(artifact_name,
                      uimetadata_path=KFP_UI_METADATA_FILE_PATH):
    """Update ui-metadata dictionary with a new web-app entry.

    Args:
        artifact_name: Name of the artifact
        uimetadata_path: path to mlpipeline-ui-metadata.json
    """
    log.info("Adding artifact '%s' to KFP UI metadata...", artifact_name)
    try:
        outputs = get_current_uimetadata(uimetadata_path,
                                         default_if_not_exist=True)
    except json.JSONDecodeError:
        log.error("This step will not be able to visualize artifacts in the"
                  " KFP UI")
        return

    pod_name = podutils.get_pod_name()
    namespace = podutils.get_namespace()
    workflow_name = workflowutils.get_workflow_name(pod_name, namespace)
    html_artifact_entry = [{
        'type':
        'web-app',
        'storage':
        'minio',
        'source':
        'minio://mlpipeline/artifacts/{}/{}/{}'.format(workflow_name, pod_name,
                                                       artifact_name + '.tgz')
    }]
    outputs['outputs'] += html_artifact_entry

    try:
        utils.ensure_or_create_dir(uimetadata_path)
    except RuntimeError:
        log.exception(
            "Writing to '%s' failed. This step will not be able to"
            " visualize artifacts in the KFP UI.", uimetadata_path)
        return
    with open(uimetadata_path, "w") as f:
        json.dump(outputs, f)
    log.info("Artifact successfully added")
Example #22
def discover_katib_version():
    """Retrieve the installed Katib version."""
    log.info("Discovering Katib version...")
    ns = podutils.get_namespace()
    api_version = KATIB_API_VERSION_V1BETA1
    try:
        list_experiments(ns, api_version)
    except ApiException as e:
        if e.status != 403:
            api_version = KATIB_API_VERSION_V1ALPHA3
            try:
                list_experiments(ns, api_version)
            except ApiException as e:
                if e.status != 403:
                    raise RuntimeError("Katib is not installed or has an"
                                       " unsupported CRD version. Supported"
                                       " CRD versions are 'v1alpha3' and"
                                       " 'v1beta1'.")

    log.info("Found Katib version %s", api_version)
    return api_version
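
A usage sketch: discover the CRD version once and reuse it in subsequent Katib calls, such as the `get_trial` helper referenced in Example #27 (the trial name is a placeholder):

api_version = discover_katib_version()  # 'v1beta1' or 'v1alpha3'
trial = get_trial("katib-trial-abc123", podutils.get_namespace(), api_version)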
Example #23
def hydrate_pvc_from_snapshot(obj, version, new_pvc_name,
                              bucket=DEFAULT_BUCKET):
    """Create a new PVC out of a Rok snapshot."""
    log.info("Creating new PVC '%s' from Rok version %s ..." %
             (new_pvc_name, version))
    rok = get_client()
    version_info = rok.version_info(bucket, obj, version)
    # size of the snapshot
    size = int(version_info['content_length'])
    units = ["", "Ki", "Mi", "Gi"]
    unit = 0
    while size > 1024 and unit < 3:
        size = math.ceil(size / 1024)
        unit += 1
    size_repr = "%s%s" % (size, units[unit])
    rok_url = version_info['rok_url']
    log.info("Using Rok url: %s" % rok_url)

    # TODO: kubernetes python client v11 has a
    #  kubernetes.utils.create_from_dict that would make this much nicer
    #  here. (KFP supports kubernetes <= 10)
    pvc = kubernetes.client.V1PersistentVolumeClaim(
        api_version="v1",
        metadata=kubernetes.client.V1ObjectMeta(
            annotations={"rok/origin": rok_url},
            name=new_pvc_name
        ),
        spec=kubernetes.client.V1PersistentVolumeClaimSpec(
            storage_class_name="rok",
            access_modes=["ReadWriteOnce"],
            resources=kubernetes.client.V1ResourceRequirements(
                requests={"storage": size_repr}
            )
        )
    )
    k8s_client = k8sutils.get_v1_client()
    ns = podutils.get_namespace()
    ns_pvc = k8s_client.create_namespaced_persistent_volume_claim(ns, pvc)
    log.info("Successfully submitted PVC.")
    return {"name": ns_pvc.metadata.name}
Example #24
def create_inference_service(name: str,
                             predictor: str,
                             pvc_name: str,
                             model_path: str,
                             image: str = None,
                             port: int = None,
                             transformer: bool = False,
                             submit: bool = True) -> KFServer:
    """Create and submit an InferenceService.

    Args:
        name (str): Name of the InferenceService CR
        predictor (str): One of serveutils.PREDICTORS
        pvc_name (str): Name of the PVC which contains the model
        model_path (str): Absolute path to the dump of the model
        image (optional): Image to run the InferenceService
        port (optional): To be used in conjunction with `image`. The port where
            the custom endpoint is exposed.
        transformer (bool): True if the InferenceService is to be deployed with
            a transformer.
        submit (bool): Set to False to just create the YAML and not submit
            the CR to K8s.

    Returns (KFServer): A KFServer object describing the created
        InferenceService
    """
    if predictor not in PREDICTORS:
        raise ValueError("Invalid predictor: %s. Choose one of %s" %
                         (predictor, PREDICTORS))

    if predictor == "custom":
        if not image:
            raise ValueError("You must specify an image when using a custom"
                             " predictor.")
        if not port:
            raise ValueError("You must specify a port when using a custom"
                             " predictor.")
        predictor_spec = CUSTOM_PREDICTOR_TEMPLATE.format(
            image=image, port=port, pvc_name=pvc_name, model_path=model_path)
    else:
        if image is not None:
            log.info(
                "Creating an InferenceService with predictor '%s'."
                " Ignoring image...", predictor)
        if port is not None:
            log.info(
                "Creating an InferenceService with predictor '%s'."
                " Ignoring port...", predictor)
        predictor_spec = PVC_PREDICTOR_TEMPLATE.format(predictor=predictor,
                                                       pvc_name=pvc_name,
                                                       model_path=model_path)

    infs_spec = yaml.safe_load(RAW_TEMPLATE.format(name=name))
    predictor_spec = yaml.safe_load(predictor_spec)
    if predictor == "tensorflow":
        # XXX: TF Server is the only predictor being pulled from an external
        # repository. TFServer containers are tagged using the library's
        # version number. All the other predictors are built by the KFServing
        # community and are tagged following KFServing's version number.
        # Default values for these can be set in the
        # `inferenceservice-config` ConfigMap.
        _version = _get_runtime_version(predictor)
        predictor_spec["tensorflow"]["runtimeVersion"] = _version
    infs_spec["spec"]["default"]["predictor"] = predictor_spec

    if transformer:
        transformer_spec = yaml.safe_load(
            TRANSFORMER_CUSTOM_TEMPLATE.format(
                image=podutils.get_docker_base_image(),
                pvc_name=pvc_name,
                pvc_mount_point=PVC_ROOT))
        infs_spec["spec"]["default"]["transformer"] = transformer_spec

    yaml_filename = "%s.kfserving.yaml" % name
    yaml_contents = yaml.dump(infs_spec)
    log.info("Saving InferenceService definition at '%s'", yaml_filename)
    with open(yaml_filename, "w") as yaml_file:
        yaml_file.write(yaml_contents)

    if submit:
        _submit_inference_service(infs_spec, podutils.get_namespace())
        _add_owner_references(name, pvc_name)
    return KFServer(name=name, spec=yaml_contents)
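
A hedged usage sketch for the function above, assuming "sklearn" is one of `serveutils.PREDICTORS` and that a PVC holding a model dump exists; the names and path are placeholders:

server = create_inference_service(name="my-model",
                                  predictor="sklearn",
                                  pvc_name="workspace-my-notebook",
                                  model_path="/home/jovyan/model")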
Example #25
def get_inference_service(name: str):
    """Get an InferenceService object."""
    k8s_co_client = k8sutils.get_co_client()
    ns = podutils.get_namespace()
    return k8s_co_client.get_namespaced_custom_object(CO_GROUP, CO_VERSION, ns,
                                                      CO_PLURAL, name)
Example #26
def get_namespace(request):
    """Retrieve the namespace of the notebook."""
    request.log.info("Retrieving notebook's namespace...")
    namespace = podutils.get_namespace()
    request.log.info("Notebook's namespace is '%s'", namespace)
    return namespace
Example #27
def create_and_wait_kfp_run(pipeline_id: str,
                            version_id: str,
                            run_name: str,
                            experiment_name: str = "Default",
                            api_version: str = KATIB_API_VERSION_V1BETA1,
                            **kwargs):
    """Create a KFP run, wait for it to complete and retrieve its metrics.

    Create a KFP run from a KFP pipeline with custom arguments and wait for
    it to finish. If it succeeds, return its metrics, logging them in a format
    that can be parsed by Katib's metrics collector.

    Also, annotate the parent Trial with the run UUID of the KFP run and
    annotate the KFP workflow with the Katib experiment and trial names and
    IDs.

    Args:
        pipeline_id: KFP pipeline
        version_id: KFP pipeline's version
        run_name: The name of the new run
        experiment_name: KFP experiment to create run in. (default: "Default")
        api_version: The version of the Katib CRD (`v1alpha3` or `v1beta1`)
        kwargs: All the parameters the pipeline will be fed with

    Returns:
        metrics: Dict of metrics along with their values
    """
    pod_namespace = podutils.get_namespace()
    run = kfputils.run_pipeline(experiment_name=experiment_name,
                                pipeline_id=pipeline_id,
                                version_id=version_id,
                                run_name=run_name,
                                **kwargs)
    run_id = run.id

    log.info("Annotating Trial '%s' with the KFP Run UUID '%s'...",
             run_name, run_id)
    try:
        # Katib Trial name == KFP Run name by design (see rpc.katib)
        annotate_trial(run_name, pod_namespace,
                       {KALE_KATIB_KFP_ANNOTATION_KEY: run_id}, api_version)
    except Exception:
        log.exception("Failed to annotate Trial '%s' with the KFP Run UUID"
                      " '%s'", run_name, run_id)

    log.info("Getting Workflow name for run '%s'...", run_id)
    workflow_name = kfputils.get_workflow_from_run(
        kfputils.get_run(run_id))["metadata"]["name"]
    log.info("Workflow name: %s", workflow_name)
    log.info("Getting the Katib trial...")
    trial = get_trial(run_name, pod_namespace, api_version)
    log.info("Trial name: %s, UID: %s", trial["metadata"]["name"],
             trial["metadata"]["uid"])
    log.info("Getting owner Katib experiment of trial...")
    exp_name, exp_id = get_owner_experiment_from_trial(trial)
    log.info("Experiment name: %s, UID: %s", exp_name, exp_id)
    wf_annotations = {
        EXPERIMENT_NAME_ANNOTATION_KEY: exp_name,
        EXPERIMENT_ID_ANNOTATION_KEY: exp_id,
        TRIAL_NAME_ANNOTATION_KEY: trial["metadata"]["name"],
        TRIAL_ID_ANNOTATION_KEY: trial["metadata"]["uid"],
    }
    try:
        workflowutils.annotate_workflow(workflow_name, pod_namespace,
                                        wf_annotations)
    except Exception:
        log.exception("Failed to annotate Workflow '%s' with the Katib"
                      " details", workflow_name)

    status = kfputils.wait_kfp_run(run_id)

    # If run has not succeeded, return no metrics
    if status != "Succeeded":
        log.warning("KFP run did not run successfully. No metrics to"
                    " return.")
        # exit gracefully with error
        sys.exit(-1)

    # Retrieve metrics
    run_metrics = kfputils.get_kfp_run_metrics(run_id)
    for name, value in run_metrics.items():
        log.info("%s=%s", name, value)

    return run_metrics
Example #28
def snapshot_pipeline_step(pipeline, step, nb_path, before=True):
    """Take a snapshot of a pipeline step with Rok."""
    # Mark the start of the snapshotting procedure
    log.info("%s Starting Rok snapshot procedure... (%s) %s", "-" * 10,
             "before" if before else "after", "-" * 10)

    log.info("Retrieving KFP run ID...")
    run_uuid = podutils.get_run_uuid()
    log.info("Retrieved KFP run ID: %s", run_uuid)
    bucket = kfputils.get_experiment_from_run_id(run_uuid).name
    obj = "{}-{}".format(pipeline, run_uuid)
    commit_title = "Step: {} ({})".format(step, "start" if before else "end")
    commit_message = "Autosnapshot {} step '{}' of pipeline run '{}'".format(
        "before" if before else "after", step, run_uuid)
    environment = json.dumps({
        "KALE_PIPELINE_STEP": step,
        "KALE_NOTEBOOK_PATH": nb_path,
        "KALE_SNAPSHOT_FINAL": not before
    })
    metadata = json.dumps({
        "environment": environment,
        "kfp_runid": kfputils.format_kfp_run_id_uri(run_uuid),
        "state": "initial" if before else "final"
    })
    params = {
        "pod": podutils.get_pod_name(),
        "metadata": metadata,
        "default_container": "main",
        "commit_title": commit_title,
        "commit_message": commit_message
    }
    rok = get_client()
    # Create the bucket in case it does not exist
    create_rok_bucket(bucket)
    log.info("Registering Rok version for '%s/%s'...", bucket, obj)
    task_info = rok.version_register(bucket, obj, "pod", params, wait=True)
    # FIXME: How do we retrieve the base URL of the ROK UI?
    version = task_info["task"]["result"]["event"]["version"]
    url_path = (
        "/rok/buckets/%s/files/%s/versions/%s?ns=%s" %
        (utils.encode_url_component(bucket), utils.encode_url_component(obj),
         utils.encode_url_component(version),
         utils.encode_url_component(podutils.get_namespace())))
    log.info("Successfully registered Rok version '%s'", version)

    log.info("Successfully created snapshot for step '%s'", step)
    if before:
        log.info("You can explore the state of the notebook at the beginning"
                 " of this step by spawning a new notebook from the following"
                 " Rok snapshot:")
    log.info("%s", url_path)

    reproduce_steps = ("To **explore the execution state** at the **%s** of"
                       " this step follow the instructions below:\n\n"
                       "1\\. View the [snapshot in the Rok UI](%s).\n\n"
                       "2\\. Copy the Rok URL.\n\n"
                       "3\\. Create a new Notebook Server by using this Rok"
                       " URL to autofill the form.")

    if before:
        md_source = (("# Rok autosnapshot\n"
                      "Rok has successfully created a snapshot for step `%s`."
                      "\n\n" + reproduce_steps) %
                     (step, "beginning", url_path))
    else:
        md_source = (("# Rok final autosnapshot\n"
                      "Rok has successfully created a snapshot **after** the"
                      " execution of step `%s`.\n\n" + reproduce_steps) %
                     (step, "end", url_path))

    try:
        metadataui = kfputils.get_current_uimetadata(default_if_not_exist=True)
    except json.JSONDecodeError:
        log.error("This step will not create a Rok markdown artifact.")
    else:
        metadataui["outputs"].append({
            "storage": "inline",
            "source": md_source,
            "type": "markdown"
        })
        with open(kfputils.KFP_UI_METADATA_FILE_PATH, "w") as f:
            json.dump(metadataui, f)
    # Mark the end of the snapshotting procedure
    log.info("%s Successfully ran Rok snapshot procedure (%s) %s", "-" * 10,
             "before" if before else "after", "-" * 10)

    return task_info
Example #29
def create_and_wait_kfp_run(pipeline_id: str,
                            run_name: str,
                            version_id: str = None,
                            experiment_name: str = "Default",
                            namespace: str = "kubeflow",
                            **kwargs):
    """Create a KFP run, wait for it to complete and retrieve its metrics.

    Create a KFP run from a KFP pipeline with custom arguments and wait for
    it to finish. If it succeeds, return its metrics.

    Args:
        pipeline_id: KFP pipeline
        run_name: The name of the new run
        version_id: KFP pipeline's version (optional, not supported yet)
        experiment_name: KFP experiment to create run in. (default: "Default")
        namespace: Namespace of KFP deployment
        kwargs: All the parameters the pipeline will be fed with

    Returns:
        metrics: Dict of metrics along with their values
    """
    logger = _get_logger()

    pod_namespace = podutils.get_namespace()

    run_id = _create_kfp_run(pipeline_id, run_name, version_id,
                             experiment_name, namespace, **kwargs)

    logger.info("Annotating Trial '%s' with the KFP Run UUID '%s'...",
                run_name, run_id)
    try:
        # Katib Trial name == KFP Run name by design (see rpc.katib)
        katibutils.annotate_trial(run_name, pod_namespace,
                                  {KALE_KATIB_KFP_ANNOTATION: run_id})
    except Exception:
        logger.exception(
            "Failed to annotate Trial '%s' with the KFP Run UUID"
            " '%s'", run_name, run_id)

    logger.info("Getting Workflow name for run '%s'...", run_id)
    workflow_name = _get_workflow_from_run(get_run(run_id))["metadata"]["name"]
    logger.info("Workflow name: %s", workflow_name)
    logger.info("Getting the Katib trial...")
    trial = katibutils.get_trial(run_name, pod_namespace)
    logger.info("Trial name: %s, UID: %s", trial["metadata"]["name"],
                trial["metadata"]["uid"])
    logger.info("Getting owner Katib experiment of trial...")
    exp_name, exp_id = katibutils.get_owner_experiment_from_trial(trial)
    logger.info("Experiment name: %s, UID: %s", exp_name, exp_id)
    wf_annotations = {
        katibutils.EXPERIMENT_NAME_ANNOTATION_KEY: exp_name,
        katibutils.EXPERIMENT_ID_ANNOTATION_KEY: exp_id,
        katibutils.TRIAL_NAME_ANNOTATION_KEY: trial["metadata"]["name"],
        katibutils.TRIAL_ID_ANNOTATION_KEY: trial["metadata"]["uid"],
    }
    try:
        workflowutils.annotate_workflow(workflow_name, pod_namespace,
                                        wf_annotations)
    except Exception:
        logger.exception(
            "Failed to annotate Workflow '%s' with the Katib"
            " details", workflow_name)

    status = _wait_kfp_run(run_id)

    # If run has not succeeded, return no metrics
    if status != "Succeeded":
        logger.warning("KFP run did not run successfully. No metrics to"
                       " return.")
        # exit gracefully with error
        sys.exit(-1)

    # Retrieve metrics
    run_metrics = _get_kfp_run_metrics(run_id, namespace)
    for name, value in run_metrics.items():
        logger.info("%s=%s", name, value)

    return run_metrics
Example #30
def create_katib_experiment(request, pipeline_id, version_id,
                            pipeline_metadata, output_path):
    """Create and launch a new Katib experiment.

    The Katib metadata must include all the information required to create an
    Experiment CRD (algorithm, objective, search parameters, ...). This
    information is sanitized and a new YAML definition is written to file.
    This YAML is then submitted to the K8s API server to create the
    Experiment CR.

    Args:
        request: RPC request object
        pipeline_id: The id of the KFP pipeline that will be run by the Trials
        version_id: The id of the KFP pipeline version run by the Trials
        pipeline_metadata: The Kale notebook metadata
        output_path: The directory to store the YAML definition

    Returns (dict): a dictionary describing the status of the experiment
    """
    old_katibutils_logger = katibutils.log
    katibutils.log = request.log

    try:
        namespace = podutils.get_namespace()
    except Exception:
        # XXX: When not running from within a pod, get_namespace() fails
        # XXX: If that's the case, use the 'kubeflow-user' one
        # XXX: This should probably change. It works for local/MiniKF dev
        namespace = "kubeflow-user"

    katib_name = pipeline_metadata.get("experiment_name")
    katib_spec = pipeline_metadata.get("katib_metadata", None)
    if not katib_spec:
        raise RPCNotFoundError(details=("Could not find Katib specification in"
                                        " notebook's metadata"),
                               trans_id=request.trans_id)
    # Perform a sanitization of the Katib specification, making sure all the
    # required first-layer-fields are set
    katib_spec = _sanitize_katib_spec(request, katib_spec)

    katib_experiment = katibutils.construct_experiment_cr(
        name=katib_name,
        experiment_spec=katib_spec,
        pipeline_id=pipeline_id,
        version_id=version_id,
        experiment_name=pipeline_metadata.get("experiment_name"),
        api_version=katibutils.discover_katib_version())
    definition_path = os.path.abspath(
        os.path.join(output_path, "%s.katib.yaml" % katib_name))
    request.log.info("Saving Katib experiment definition at %s",
                     definition_path)
    import yaml
    with open(definition_path, "w") as yaml_file:
        yaml_file.write(yaml.dump(katib_experiment))

    try:
        _launch_katib_experiment(request, katib_experiment, namespace)
    except Exception:
        katibutils.log = old_katibutils_logger
        raise

    katibutils.log = old_katibutils_logger

    debugyaml = _construct_experiment_return_base(katib_experiment, namespace)

    request.log.info("hbseo %s", yaml.dump(debugyaml,
                                           default_flow_style=False))

    return debugyaml