def notebook_pipeline():
    """A pipeline to run a Jupyter notebook with elyra-ai/kfp-notebook and Papermill."""

    from elyra.kfp.operator import ExecuteFileOp as NotebookOp

    notebook_op = NotebookOp(name="${name}",
                             pipeline_name="${name}",
                             experiment_name="NOTEBOOK_RUNS",
                             notebook="${notebook}",
                             cos_endpoint="${cos_endpoint}",
                             cos_bucket="${cos_bucket}",
                             cos_directory="${cos_directory}",
                             cos_dependencies_archive="${cos_dependencies_archive}",
                             requirements_url="${requirements_url}",
                             image="${image}")

    from kubernetes.client.models import V1EnvVar

    # Supply object storage credentials to the notebook container.
    notebook_op.container.add_env_variable(V1EnvVar(name='AWS_ACCESS_KEY_ID', value="${cos_username}"))
    notebook_op.container.add_env_variable(V1EnvVar(name='AWS_SECRET_ACCESS_KEY', value="${cos_password}"))

    from kfp import onprem

    # Mount an existing PVC into the container so the notebook can access its dataset.
    notebook_op.container.add_env_variable(V1EnvVar(name='DATA_DIR', value="${mount_path}"))
    notebook_op.apply(onprem.mount_pvc(pvc_name='${dataset_pvc}',
                                       volume_name='${dataset_pvc}',
                                       volume_mount_path='${mount_path}'))
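
The function above is a template: Elyra substitutes the ${...} placeholders before handing the pipeline function to the KFP SDK. As a minimal sketch (assuming the KFP v1 SDK and an already-substituted template; the output path is illustrative), compiling it might look like:

from kfp.compiler import Compiler

# Compile the pipeline function into an archive that can be uploaded to a
# Kubeflow Pipelines endpoint.
Compiler().compile(notebook_pipeline, "notebook_pipeline.tar.gz")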
Example #2
def test_construct_with_both_pipeline_inputs_and_outputs():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        pipeline_inputs=["test_input1.txt", "test_input2.txt"],
        pipeline_outputs=["test_output1.txt", "test_output2.txt"],
        image="test/image:dev",
    )
    assert notebook_op.pipeline_inputs == ["test_input1.txt", "test_input2.txt"]
    assert notebook_op.pipeline_outputs == ["test_output1.txt", "test_output2.txt"]

    assert '--inputs "test_input1.txt;test_input2.txt"' in notebook_op.container.args[0]
    assert '--outputs "test_output1.txt;test_output2.txt"' in notebook_op.container.args[0]
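
The assertions above imply that ExecuteFileOp serializes its pipeline_inputs and pipeline_outputs lists into semicolon-joined CLI arguments, which is also why Example #8 below rejects filenames containing ';'. Conceptually (a hypothetical helper, not the actual implementation):

def join_filenames(filenames):
    # ["a.txt", "b.txt"] -> "a.txt;b.txt"; a ';' inside a filename would
    # corrupt this encoding, hence the ValueError raised in Example #8.
    return ";".join(filenames)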
Example #3
def test_fail_without_notebook():
    with pytest.raises(TypeError):
        ExecuteFileOp(
            name="test",
            pipeline_name="test-pipeline",
            experiment_name="experiment-name",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
            image="test/image:dev",
        )
Example #4
def test_fail_without_runtime_image():
    with pytest.raises(ValueError) as error_info:
        ExecuteFileOp(
            name="test",
            pipeline_name="test-pipeline",
            experiment_name="experiment-name",
            notebook="test_notebook.ipynb",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
        )
    assert "You need to provide an image." == str(error_info.value)
Example #5
def test_properly_set_python_script_name_when_in_subdirectory():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="foo/test.py",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        image="test/image:dev",
    )
    assert "test.py" == notebook_op.notebook_name
Example #6
def test_fail_with_empty_string_as_name():
    with pytest.raises(ValueError):
        ExecuteFileOp(
            name="",
            pipeline_name="test-pipeline",
            experiment_name="experiment-name",
            notebook="test_notebook.ipynb",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
            image="test/image:dev",
        )
Example #7
def test_override_requirements_url():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        requirements_url="https://test.server.com/requirements.py",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        image="test/image:dev",
    )
    assert notebook_op.requirements_url == "https://test.server.com/requirements.py"
Example #8
def test_construct_with_bad_pipeline_outputs():
    with pytest.raises(ValueError) as error_info:
        ExecuteFileOp(
            name="test",
            pipeline_name="test-pipeline",
            experiment_name="experiment-name",
            notebook="test_notebook.ipynb",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
            pipeline_outputs=["test_output1.txt", "test;output2.txt"],
            image="test/image:dev",
        )
    assert "Illegal character (;) found in filename 'test;output2.txt'." == str(
        error_info.value)
Example #9
def test_user_crio_volume_creation():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        image="test/image:dev",
        emptydir_volume_size="20Gi",
    )
    assert notebook_op.emptydir_volume_size == "20Gi"
    assert notebook_op.container_work_dir_root_path == "/opt/app-root/src/"
    assert len(notebook_op.container.volume_mounts) == 1
    # Environment variables: PYTHONPATH, ELYRA_RUN_NAME
    assert len(notebook_op.container.env) == 2, notebook_op.container.env
Example #10
def test_construct_with_env_variables_tekton():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        pipeline_envs={
            "ENV_VAR_ONE": "1",
            "ENV_VAR_TWO": "2",
            "ENV_VAR_THREE": "3"
        },
        image="test/image:dev",
        workflow_engine="Tekton",
    )

    confirmation_names = [
        "ENV_VAR_ONE", "ENV_VAR_TWO", "ENV_VAR_THREE", "ELYRA_RUN_NAME"
    ]
    confirmation_values = ["1", "2", "3"]
    field_path = "metadata.annotations['pipelines.kubeflow.org/run_name']"
    for env_val in notebook_op.container.env:
        assert env_val.name in confirmation_names
        confirmation_names.remove(env_val.name)
        if env_val.name == "ELYRA_RUN_NAME":
            assert env_val.value_from.field_ref.field_path == field_path, env_val.value_from.field_ref
        else:
            assert env_val.value in confirmation_values
            confirmation_values.remove(env_val.value)

    # Verify confirmation values have been drained.
    assert len(confirmation_names) == 0
    assert len(confirmation_values) == 0
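
For Tekton, the test asserts that ELYRA_RUN_NAME is populated through the Kubernetes downward API rather than a literal value. A minimal sketch of building such an env var with the kubernetes client models, matching the field_path checked above:

from kubernetes.client.models import V1EnvVar, V1EnvVarSource, V1ObjectFieldSelector

# The env var's value is resolved at runtime from a pod annotation.
run_name_env = V1EnvVar(
    name="ELYRA_RUN_NAME",
    value_from=V1EnvVarSource(
        field_ref=V1ObjectFieldSelector(
            field_path="metadata.annotations['pipelines.kubeflow.org/run_name']"
        )
    ),
)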
Example #11
    def _cc_pipeline(
        self, pipeline, pipeline_name, pipeline_version="", experiment_name="", cos_directory=None, export=False
    ):

        runtime_configuration = self._get_metadata_configuration(
            schemaspace=Runtimes.RUNTIMES_SCHEMASPACE_ID, name=pipeline.runtime_config
        )

        cos_endpoint = runtime_configuration.metadata["cos_endpoint"]
        cos_username = runtime_configuration.metadata.get("cos_username")
        cos_password = runtime_configuration.metadata.get("cos_password")
        cos_secret = runtime_configuration.metadata.get("cos_secret")
        cos_bucket = runtime_configuration.metadata.get("cos_bucket")
        if cos_directory is None:
            cos_directory = pipeline_name

        engine = runtime_configuration.metadata["engine"]

        self.log_pipeline_info(
            pipeline_name,
            f"processing pipeline dependencies to: {cos_endpoint} " f"bucket: {cos_bucket} folder: {cos_directory}",
        )
        t0_all = time.time()

        emptydir_volume_size = ""
        container_runtime = os.getenv("CRIO_RUNTIME", "False").lower() == "true"

        # Create a dictionary that maps each component id to its ContainerOp instance
        target_ops = {}

        # Sort operations based on dependency graph (topological order)
        sorted_operations = PipelineProcessor._sort_operations(pipeline.operations)

        # Determine whether access to cloud storage is required
        for operation in sorted_operations:
            if isinstance(operation, GenericOperation):
                self._verify_cos_connectivity(runtime_configuration)
                break

        # All previous operation outputs should be propagated throughout the pipeline.
        # In order to process this recursively, the current operation's inputs should be combined
        # from its parent's inputs (which, themselves are derived from the outputs of their parent)
        # and its parent's outputs.

        PipelineProcessor._propagate_operation_inputs_outputs(pipeline, sorted_operations)

        for operation in sorted_operations:

            if container_runtime:
                # Volume size to create when using CRI-O; note: the IBM Cloud minimum is 20Gi
                emptydir_volume_size = "20Gi"

            sanitized_operation_name = self._sanitize_operation_name(operation.name)

            # Create pipeline operation
            # If operation is one of the "generic" set of NBs or scripts, construct custom ExecuteFileOp
            if isinstance(operation, GenericOperation):

                # Collect env variables
                pipeline_envs = self._collect_envs(
                    operation, cos_secret=cos_secret, cos_username=cos_username, cos_password=cos_password
                )

                operation_artifact_archive = self._get_dependency_archive_name(operation)

                self.log.debug(f"Creating pipeline component:\n {operation} archive : {operation_artifact_archive}")

                target_ops[operation.id] = ExecuteFileOp(
                    name=sanitized_operation_name,
                    pipeline_name=pipeline_name,
                    experiment_name=experiment_name,
                    notebook=operation.filename,
                    cos_endpoint=cos_endpoint,
                    cos_bucket=cos_bucket,
                    cos_directory=cos_directory,
                    cos_dependencies_archive=operation_artifact_archive,
                    pipeline_version=pipeline_version,
                    pipeline_source=pipeline.source,
                    pipeline_inputs=operation.inputs,
                    pipeline_outputs=operation.outputs,
                    pipeline_envs=pipeline_envs,
                    emptydir_volume_size=emptydir_volume_size,
                    cpu_request=operation.cpu,
                    mem_request=operation.memory,
                    gpu_limit=operation.gpu,
                    workflow_engine=engine,
                    image=operation.runtime_image,
                    file_outputs={
                        "mlpipeline-metrics": f"{pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR']}/mlpipeline-metrics.json",  # noqa
                        "mlpipeline-ui-metadata": f"{pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR']}/mlpipeline-ui-metadata.json",  # noqa
                    },
                )

                if operation.doc:
                    target_ops[operation.id].add_pod_annotation("elyra/node-user-doc", operation.doc)

                # TODO Can we move all of this to apply to non-standard components as well? Test when servers are up
                if cos_secret and not export:
                    target_ops[operation.id].apply(use_aws_secret(cos_secret))

                image_namespace = self._get_metadata_configuration(RuntimeImages.RUNTIME_IMAGES_SCHEMASPACE_ID)
                for image_instance in image_namespace:
                    if image_instance.metadata["image_name"] == operation.runtime_image and image_instance.metadata.get(
                        "pull_policy"
                    ):
                        target_ops[operation.id].container.set_image_pull_policy(image_instance.metadata["pull_policy"])

                self.log_pipeline_info(
                    pipeline_name,
                    f"processing operation dependencies for id: {operation.id}",
                    operation_name=operation.name,
                )

                self._upload_dependencies_to_object_store(runtime_configuration, cos_directory, operation)

            # If operation is a "non-standard" component, load it's spec and create operation with factory function
            else:
                # Retrieve component from cache
                component = ComponentCache.instance().get_component(self._type, operation.classifier)

                # Convert the user-entered value of certain properties according to their type
                for component_property in component.properties:
                    # Get corresponding property's value from parsed pipeline
                    property_value = operation.component_params.get(component_property.ref)

                    self.log.debug(
                        f"Processing component parameter '{component_property.name}' "
                        f"of type '{component_property.data_type}'"
                    )

                    if component_property.data_type == "inputpath":
                        output_node_id = property_value["value"]
                        output_node_parameter_key = property_value["option"].replace("elyra_output_", "")
                        operation.component_params[component_property.ref] = target_ops[output_node_id].outputs[
                            output_node_parameter_key
                        ]
                    elif component_property.data_type == "inputvalue":
                        active_property = property_value["activeControl"]
                        active_property_value = property_value.get(active_property, None)

                        # If the value is not found, assign it the default value assigned in parser
                        if active_property_value is None:
                            active_property_value = component_property.value

                        if isinstance(active_property_value, dict) and set(active_property_value.keys()) == {
                            "value",
                            "option",
                        }:
                            output_node_id = active_property_value["value"]
                            output_node_parameter_key = active_property_value["option"].replace("elyra_output_", "")
                            operation.component_params[component_property.ref] = target_ops[output_node_id].outputs[
                                output_node_parameter_key
                            ]
                        elif component_property.default_data_type == "dictionary":
                            processed_value = self._process_dictionary_value(active_property_value)
                            operation.component_params[component_property.ref] = processed_value
                        elif component_property.default_data_type == "list":
                            processed_value = self._process_list_value(active_property_value)
                            operation.component_params[component_property.ref] = processed_value
                        else:
                            operation.component_params[component_property.ref] = active_property_value

                # Build component task factory
                try:
                    factory_function = components.load_component_from_text(component.definition)
                except Exception as e:
                    # TODO Fix error messaging and break exceptions down into categories
                    self.log.error(f"Error loading component spec for {operation.name}: {str(e)}")
                    raise RuntimeError(f"Error loading component spec for {operation.name}.")

                # Add factory function, which returns a ContainerOp task instance, to pipeline operation dict
                try:
                    comp_spec_inputs = [
                        inputs.name.lower().replace(" ", "_") for inputs in factory_function.component_spec.inputs
                    ]

                    # Remove inputs and outputs from params dict
                    # TODO: need to have way to retrieve only required params
                    parameter_removal_list = ["inputs", "outputs"]
                    for component_param in operation.component_params_as_dict.keys():
                        if component_param not in comp_spec_inputs:
                            parameter_removal_list.append(component_param)

                    for parameter in parameter_removal_list:
                        operation.component_params_as_dict.pop(parameter, None)

                    # Create ContainerOp instance and assign appropriate user-provided name
                    sanitized_component_params = {
                        self._sanitize_param_name(name): value
                        for name, value in operation.component_params_as_dict.items()
                    }
                    container_op = factory_function(**sanitized_component_params)
                    container_op.set_display_name(operation.name)

                    if operation.doc:
                        container_op.add_pod_annotation("elyra/node-user-doc", operation.doc)

                    target_ops[operation.id] = container_op
                except Exception as e:
                    # TODO Fix error messaging and break exceptions down into categories
                    self.log.error(f"Error constructing component {operation.name}: {str(e)}")
                    raise RuntimeError(f"Error constructing component {operation.name}.")

        # Process dependencies after all the operations have been created
        for operation in pipeline.operations.values():
            op = target_ops[operation.id]
            for parent_operation_id in operation.parent_operation_ids:
                parent_op = target_ops[parent_operation_id]  # Parent Operation
                op.after(parent_op)

        self.log_pipeline_info(pipeline_name, "pipeline dependencies processed", duration=(time.time() - t0_all))

        return target_ops
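
The cos_secret branch above applies use_aws_secret, the kfp.aws helper in the v1 SDK that injects object storage credentials from a Kubernetes secret. A hedged sketch of the same pattern in isolation (the secret name is illustrative):

from kfp.aws import use_aws_secret

# Adds AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, sourced from
# the named Kubernetes secret, to the op's container.
container_op.apply(use_aws_secret(secret_name="cos-credentials"))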
Example #12
def test_normalize_label_value():
    valid_middle_chars = "-_."

    # test min length
    assert ExecuteFileOp._normalize_label_value(None) == ""
    assert ExecuteFileOp._normalize_label_value("") == ""
    # test max length (63)
    assert ExecuteFileOp._normalize_label_value("a" * 63) == "a" * 63
    assert ExecuteFileOp._normalize_label_value("a" *
                                                64) == "a" * 63  # truncated
    # test first and last char
    assert ExecuteFileOp._normalize_label_value("1") == "1"
    assert ExecuteFileOp._normalize_label_value("22") == "22"
    assert ExecuteFileOp._normalize_label_value("3_3") == "3_3"
    assert ExecuteFileOp._normalize_label_value("4u4") == "4u4"
    assert ExecuteFileOp._normalize_label_value("5$5") == "5_5"

    # test first char
    for c in string.printable:
        if c in string.ascii_letters + string.digits:
            # first char is valid
            # no length violation
            assert ExecuteFileOp._normalize_label_value(c) == c
            assert ExecuteFileOp._normalize_label_value(c + "B") == c + "B"
            # max length
            assert ExecuteFileOp._normalize_label_value(c + "B" * 62) == (c + "B" * 62)
            # max length exceeded
            assert ExecuteFileOp._normalize_label_value(c + "B" * 63) == (c + "B" * 62)  # truncated
        else:
            # first char is invalid, e.g. '#a', and becomes the
            # second char, which might require replacement
            rv = c
            if c not in valid_middle_chars:
                rv = "_"
            # no length violation
            assert ExecuteFileOp._normalize_label_value(c) == "a" + rv + "a"
            assert ExecuteFileOp._normalize_label_value(c + "B") == "a" + rv + "B"
            # max length
            assert ExecuteFileOp._normalize_label_value(c + "B" * 62) == ("a" + rv + "B" * 61)  # truncated
            # max length exceeded
            assert ExecuteFileOp._normalize_label_value(c + "B" * 63) == ("a" + rv + "B" * 61)  # truncated

    # test last char
    for c in string.printable:
        if c in string.ascii_letters + string.digits:
            # no length violation
            assert ExecuteFileOp._normalize_label_value("b" + c) == "b" + c
            # max length
            assert ExecuteFileOp._normalize_label_value("b" * 62 +
                                                        c) == ("b" * 62 + c)
            # max length exceeded
            assert ExecuteFileOp._normalize_label_value("b" * 63 + c) == ("b" *
                                                                          63)
        else:
            # last char is invalid, e.g. 'a#', and requires
            # patching
            rv = c
            if c not in valid_middle_chars:
                rv = "_"
            # no length violation (char is appended)
            assert ExecuteFileOp._normalize_label_value("b" +
                                                        c) == "b" + rv + "a"
            # max length (char is replaced)
            assert ExecuteFileOp._normalize_label_value("b" * 62 +
                                                        c) == ("b" * 62 + "a")
            # max length exceeded (no action required)
            assert ExecuteFileOp._normalize_label_value("b" * 63 + c) == ("b" *
                                                                          63)

    # test first and last char
    for c in string.printable:
        if c in string.ascii_letters + string.digits:
            # no length violation
            assert ExecuteFileOp._normalize_label_value(c + "b" + c) == c + "b" + c  # nothing is modified
            # max length
            assert ExecuteFileOp._normalize_label_value(c + "b" * 61 + c) == (c + "b" * 61 + c)  # nothing is modified
            # max length exceeded
            assert ExecuteFileOp._normalize_label_value(c + "b" * 62 + c) == c + "b" * 62  # truncate only
        else:
            # first and last characters are invalid, e.g. '#a#'
            rv = c
            if c not in valid_middle_chars:
                rv = "_"
            # no length violation
            assert ExecuteFileOp._normalize_label_value(c + "b" + c) == "a" + rv + "b" + rv + "a"
            # max length
            assert ExecuteFileOp._normalize_label_value(c + "b" * 59 + c) == ("a" + rv + "b" * 59 + rv + "a")
            # max length exceeded after processing, scenario 1
            # resolved by adding char before first, replacing last
            assert ExecuteFileOp._normalize_label_value(c + "b" * 60 + c) == ("a" + rv + "b" * 60 + "a")
            # max length exceeded after processing, scenario 2
            # resolved by adding char before first, appending after last
            assert ExecuteFileOp._normalize_label_value(c + "b" * 59 + c) == ("a" + rv + "b" * 59 + rv + "a")
            # max length exceeded before processing, scenario 1
            # resolved by adding char before first, truncating last
            assert ExecuteFileOp._normalize_label_value(c + "b" * 62 + c) == ("a" + rv + "b" * 61)
            # max length exceeded before processing, scenario 2
            # resolved by adding char before first, replacing last
            assert ExecuteFileOp._normalize_label_value(c + "b" * 60 + c * 3) == ("a" + rv + "b" * 60 + "a")

    # test char in a position other than first and last
    # if invalid, the char is replaced with '_'
    for c in string.printable:
        if c in string.ascii_letters + string.digits + "-_.":
            assert ExecuteFileOp._normalize_label_value("A" + c +
                                                        "Z") == "A" + c + "Z"
        else:
            assert ExecuteFileOp._normalize_label_value("A" + c + "Z") == "A_Z"

    # encore
    assert ExecuteFileOp._normalize_label_value(r"¯\_(ツ)_/¯") == "a_________a"
Example #13
def test_construct_with_env_variables_argo():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        pipeline_envs={
            "ENV_VAR_ONE": "1",
            "ENV_VAR_TWO": "2",
            "ENV_VAR_THREE": "3"
        },
        image="test/image:dev",
    )

    confirmation_names = [
        "ENV_VAR_ONE", "ENV_VAR_TWO", "ENV_VAR_THREE", "ELYRA_RUN_NAME"
    ]
    confirmation_values = ["1", "2", "3", RUN_ID_PLACEHOLDER]
    for env_val in notebook_op.container.env:
        assert env_val.name in confirmation_names
        assert env_val.value in confirmation_values
        confirmation_names.remove(env_val.name)
        confirmation_values.remove(env_val.value)

    # Verify confirmation values have been drained.
    assert len(confirmation_names) == 0
    assert len(confirmation_values) == 0

    # same as above, but explicitly specifying Argo as the workflow engine
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        pipeline_envs={
            "ENV_VAR_ONE": "1",
            "ENV_VAR_TWO": "2",
            "ENV_VAR_THREE": "3"
        },
        image="test/image:dev",
        workflow_engine="Argo",
    )

    confirmation_names = [
        "ENV_VAR_ONE", "ENV_VAR_TWO", "ENV_VAR_THREE", "ELYRA_RUN_NAME"
    ]
    confirmation_values = ["1", "2", "3", RUN_ID_PLACEHOLDER]
    for env_val in notebook_op.container.env:
        assert env_val.name in confirmation_names
        assert env_val.value in confirmation_values
        confirmation_names.remove(env_val.name)
        confirmation_values.remove(env_val.value)

    # Verify confirmation values have been drained.
    assert len(confirmation_names) == 0
    assert len(confirmation_values) == 0
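
Both variants check the env var values against RUN_ID_PLACEHOLDER, which is assumed here to be the KFP v1 SDK constant that Argo resolves to the run ID when the run starts:

from kfp.dsl import RUN_ID_PLACEHOLDER  # resolved by Argo at execution time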