def notebook_pipeline():
    """A pipeline to run a Jupyter notebook with elyra-ai/kfp-notebook and Papermill."""

    from elyra.kfp.operator import ExecuteFileOp as NotebookOp

    notebook_op = NotebookOp(name="${name}",
                             pipeline_name="${name}",
                             experiment_name="NOTEBOOK_RUNS",
                             notebook="${notebook}",
                             cos_endpoint="${cos_endpoint}",
                             cos_bucket="${cos_bucket}",
                             cos_directory="${cos_directory}",
                             cos_dependencies_archive="${cos_dependencies_archive}",
                             requirements_url="${requirements_url}",
                             image="${image}")

    from kubernetes.client.models import V1EnvVar

    notebook_op.container.add_env_variable(V1EnvVar(name='AWS_ACCESS_KEY_ID', value="${cos_username}"))
    notebook_op.container.add_env_variable(V1EnvVar(name='AWS_SECRET_ACCESS_KEY', value="${cos_password}"))

    from kfp import onprem

    notebook_op.container.add_env_variable(V1EnvVar(name='DATA_DIR', value="${mount_path}"))
    notebook_op.apply(onprem.mount_pvc(pvc_name='${dataset_pvc}',
                                       volume_name='${dataset_pvc}',
                                       volume_mount_path='${mount_path}'))
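# --- Hedged usage sketch (illustrative; not part of the original template) -------------------
# The ${...} placeholders above suggest notebook_pipeline() is rendered from a string template
# before being handed to the KFP SDK. Assuming the placeholders have been substituted with
# concrete values, a similar pipeline could be compiled with the KFP v1 SDK roughly as follows.
# All literal values (notebook path, endpoint, bucket, image, output file) are assumptions.
import kfp.dsl as dsl
from kfp.compiler import Compiler

from elyra.kfp.operator import ExecuteFileOp


@dsl.pipeline(name="notebook-pipeline", description="Run a notebook via ExecuteFileOp")
def _example_notebook_pipeline():
    # Minimal ExecuteFileOp construction, mirroring the required arguments used in the tests below.
    ExecuteFileOp(
        name="analyze",                              # assumed node name
        pipeline_name="notebook-pipeline",
        experiment_name="NOTEBOOK_RUNS",
        notebook="analyze.ipynb",                    # assumed notebook path
        cos_endpoint="http://minio-service:9000",    # assumed object-storage endpoint
        cos_bucket="pipelines",
        cos_directory="notebook-pipeline",
        cos_dependencies_archive="analyze-deps.tar.gz",
        image="tensorflow/tensorflow:2.8.0",         # assumed runtime image
    )


if __name__ == "__main__":
    # Produce a package that can be uploaded to Kubeflow Pipelines.
    Compiler().compile(_example_notebook_pipeline, "notebook_pipeline.tar.gz")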
def test_construct_with_both_pipeline_inputs_and_outputs():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        pipeline_inputs=["test_input1.txt", "test_input2.txt"],
        pipeline_outputs=["test_output1.txt", "test_output2.txt"],
        image="test/image:dev",
    )
    assert notebook_op.pipeline_inputs == ["test_input1.txt", "test_input2.txt"]
    assert notebook_op.pipeline_outputs == ["test_output1.txt", "test_output2.txt"]

    assert '--inputs "test_input1.txt;test_input2.txt"' in notebook_op.container.args[0]
    assert '--outputs "test_output1.txt;test_output2.txt"' in notebook_op.container.args[0]
def test_fail_without_notebook():
    with pytest.raises(TypeError):
        ExecuteFileOp(
            name="test",
            pipeline_name="test-pipeline",
            experiment_name="experiment-name",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
            image="test/image:dev",
        )
def test_fail_without_runtime_image():
    with pytest.raises(ValueError) as error_info:
        ExecuteFileOp(
            name="test",
            pipeline_name="test-pipeline",
            experiment_name="experiment-name",
            notebook="test_notebook.ipynb",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
        )
    assert "You need to provide an image." == str(error_info.value)
def test_properly_set_python_script_name_when_in_subdirectory():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="foo/test.py",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        image="test/image:dev",
    )
    assert "test.py" == notebook_op.notebook_name
def test_fail_with_empty_string_as_name():
    with pytest.raises(ValueError):
        ExecuteFileOp(
            name="",
            pipeline_name="test-pipeline",
            experiment_name="experiment-name",
            notebook="test_notebook.ipynb",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
            image="test/image:dev",
        )
def test_override_requirements_url():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        requirements_url="https://test.server.com/requirements.py",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        image="test/image:dev",
    )
    assert notebook_op.requirements_url == "https://test.server.com/requirements.py"
def test_construct_with_bad_pipeline_outputs():
    with pytest.raises(ValueError) as error_info:
        ExecuteFileOp(
            name="test",
            pipeline_name="test-pipeline",
            experiment_name="experiment-name",
            notebook="test_notebook.ipynb",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
            pipeline_outputs=["test_output1.txt", "test;output2.txt"],
            image="test/image:dev",
        )
    assert "Illegal character (;) found in filename 'test;output2.txt'." == str(error_info.value)
def test_user_crio_volume_creation():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        image="test/image:dev",
        emptydir_volume_size="20Gi",
    )
    assert notebook_op.emptydir_volume_size == "20Gi"
    assert notebook_op.container_work_dir_root_path == "/opt/app-root/src/"
    assert len(notebook_op.container.volume_mounts) == 1
    # Environment variables: PYTHONPATH, ELYRA_RUN_NAME
    assert len(notebook_op.container.env) == 2, notebook_op.container.env
def test_construct_with_env_variables_tekton():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        pipeline_envs={"ENV_VAR_ONE": "1", "ENV_VAR_TWO": "2", "ENV_VAR_THREE": "3"},
        image="test/image:dev",
        workflow_engine="Tekton",
    )

    confirmation_names = ["ENV_VAR_ONE", "ENV_VAR_TWO", "ENV_VAR_THREE", "ELYRA_RUN_NAME"]
    confirmation_values = ["1", "2", "3"]
    field_path = "metadata.annotations['pipelines.kubeflow.org/run_name']"
    for env_val in notebook_op.container.env:
        assert env_val.name in confirmation_names
        confirmation_names.remove(env_val.name)
        if env_val.name == "ELYRA_RUN_NAME":
            assert env_val.value_from.field_ref.field_path == field_path, env_val.value_from.field_ref
        else:
            assert env_val.value in confirmation_values
            confirmation_values.remove(env_val.value)

    # Verify confirmation values have been drained.
    assert len(confirmation_names) == 0
    assert len(confirmation_values) == 0
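# --- Hedged sketch (illustrative; not from the original source) -------------------------------
# The Tekton test above expects ELYRA_RUN_NAME to be resolved at runtime through the Kubernetes
# downward API rather than set to a literal value. Building such an env var with the kubernetes
# client models could look roughly like this; only the variable name and annotation path come
# from the test, the rest is a plain kubernetes-client usage sketch.
from kubernetes.client.models import V1EnvVar, V1EnvVarSource, V1ObjectFieldSelector

elyra_run_name_env = V1EnvVar(
    name="ELYRA_RUN_NAME",
    value_from=V1EnvVarSource(
        field_ref=V1ObjectFieldSelector(
            field_path="metadata.annotations['pipelines.kubeflow.org/run_name']"
        )
    ),
)
# e.g. some_container_op.container.add_env_variable(elyra_run_name_env)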
def _cc_pipeline(
    self, pipeline, pipeline_name, pipeline_version="", experiment_name="", cos_directory=None, export=False
):

    runtime_configuration = self._get_metadata_configuration(
        schemaspace=Runtimes.RUNTIMES_SCHEMASPACE_ID, name=pipeline.runtime_config
    )

    cos_endpoint = runtime_configuration.metadata["cos_endpoint"]
    cos_username = runtime_configuration.metadata.get("cos_username")
    cos_password = runtime_configuration.metadata.get("cos_password")
    cos_secret = runtime_configuration.metadata.get("cos_secret")
    cos_bucket = runtime_configuration.metadata.get("cos_bucket")
    if cos_directory is None:
        cos_directory = pipeline_name
    engine = runtime_configuration.metadata["engine"]

    self.log_pipeline_info(
        pipeline_name,
        f"processing pipeline dependencies to: {cos_endpoint} "
        f"bucket: {cos_bucket} folder: {cos_directory}",
    )
    t0_all = time.time()

    emptydir_volume_size = ""
    container_runtime = bool(os.getenv("CRIO_RUNTIME", "False").lower() == "true")

    # Create dictionary that maps component Id to its ContainerOp instance
    target_ops = {}

    # Sort operations based on dependency graph (topological order)
    sorted_operations = PipelineProcessor._sort_operations(pipeline.operations)

    # Determine whether access to cloud storage is required
    for operation in sorted_operations:
        if isinstance(operation, GenericOperation):
            self._verify_cos_connectivity(runtime_configuration)
            break

    # All previous operation outputs should be propagated throughout the pipeline.
    # In order to process this recursively, the current operation's inputs should be combined
    # from its parent's inputs (which, themselves are derived from the outputs of their parent)
    # and its parent's outputs.
    PipelineProcessor._propagate_operation_inputs_outputs(pipeline, sorted_operations)

    for operation in sorted_operations:

        if container_runtime:
            # Volume size to create when using CRI-o, NOTE: IBM Cloud minimum is 20Gi
            emptydir_volume_size = "20Gi"

        sanitized_operation_name = self._sanitize_operation_name(operation.name)

        # Create pipeline operation
        # If operation is one of the "generic" set of NBs or scripts, construct custom ExecuteFileOp
        if isinstance(operation, GenericOperation):

            # Collect env variables
            pipeline_envs = self._collect_envs(
                operation, cos_secret=cos_secret, cos_username=cos_username, cos_password=cos_password
            )

            operation_artifact_archive = self._get_dependency_archive_name(operation)

            self.log.debug(f"Creating pipeline component:\n {operation} archive : {operation_artifact_archive}")

            target_ops[operation.id] = ExecuteFileOp(
                name=sanitized_operation_name,
                pipeline_name=pipeline_name,
                experiment_name=experiment_name,
                notebook=operation.filename,
                cos_endpoint=cos_endpoint,
                cos_bucket=cos_bucket,
                cos_directory=cos_directory,
                cos_dependencies_archive=operation_artifact_archive,
                pipeline_version=pipeline_version,
                pipeline_source=pipeline.source,
                pipeline_inputs=operation.inputs,
                pipeline_outputs=operation.outputs,
                pipeline_envs=pipeline_envs,
                emptydir_volume_size=emptydir_volume_size,
                cpu_request=operation.cpu,
                mem_request=operation.memory,
                gpu_limit=operation.gpu,
                workflow_engine=engine,
                image=operation.runtime_image,
                file_outputs={
                    "mlpipeline-metrics": f"{pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR']}/mlpipeline-metrics.json",  # noqa
                    "mlpipeline-ui-metadata": f"{pipeline_envs['ELYRA_WRITABLE_CONTAINER_DIR']}/mlpipeline-ui-metadata.json",  # noqa
                },
            )

            if operation.doc:
                target_ops[operation.id].add_pod_annotation("elyra/node-user-doc", operation.doc)

            # TODO Can we move all of this to apply to non-standard components as well? Test when servers are up
            if cos_secret and not export:
                target_ops[operation.id].apply(use_aws_secret(cos_secret))

            image_namespace = self._get_metadata_configuration(RuntimeImages.RUNTIME_IMAGES_SCHEMASPACE_ID)
            for image_instance in image_namespace:
                if image_instance.metadata["image_name"] == operation.runtime_image and image_instance.metadata.get(
                    "pull_policy"
                ):
                    target_ops[operation.id].container.set_image_pull_policy(image_instance.metadata["pull_policy"])

            self.log_pipeline_info(
                pipeline_name,
                f"processing operation dependencies for id: {operation.id}",
                operation_name=operation.name,
            )

            self._upload_dependencies_to_object_store(runtime_configuration, cos_directory, operation)

        # If operation is a "non-standard" component, load its spec and create operation with factory function
        else:
            # Retrieve component from cache
            component = ComponentCache.instance().get_component(self._type, operation.classifier)

            # Convert the user-entered value of certain properties according to their type
            for component_property in component.properties:
                # Get corresponding property's value from parsed pipeline
                property_value = operation.component_params.get(component_property.ref)

                self.log.debug(
                    f"Processing component parameter '{component_property.name}' "
                    f"of type '{component_property.data_type}'"
                )

                if component_property.data_type == "inputpath":
                    output_node_id = property_value["value"]
                    output_node_parameter_key = property_value["option"].replace("elyra_output_", "")
                    operation.component_params[component_property.ref] = target_ops[output_node_id].outputs[
                        output_node_parameter_key
                    ]
                elif component_property.data_type == "inputvalue":
                    active_property = property_value["activeControl"]
                    active_property_value = property_value.get(active_property, None)

                    # If the value is not found, assign it the default value assigned in parser
                    if active_property_value is None:
                        active_property_value = component_property.value

                    if isinstance(active_property_value, dict) and set(active_property_value.keys()) == {
                        "value",
                        "option",
                    }:
                        output_node_id = active_property_value["value"]
                        output_node_parameter_key = active_property_value["option"].replace("elyra_output_", "")
                        operation.component_params[component_property.ref] = target_ops[output_node_id].outputs[
                            output_node_parameter_key
                        ]
                    elif component_property.default_data_type == "dictionary":
                        processed_value = self._process_dictionary_value(active_property_value)
                        operation.component_params[component_property.ref] = processed_value
                    elif component_property.default_data_type == "list":
                        processed_value = self._process_list_value(active_property_value)
                        operation.component_params[component_property.ref] = processed_value
                    else:
                        operation.component_params[component_property.ref] = active_property_value

            # Build component task factory
            try:
                factory_function = components.load_component_from_text(component.definition)
            except Exception as e:
                # TODO Fix error messaging and break exceptions down into categories
                self.log.error(f"Error loading component spec for {operation.name}: {str(e)}")
                raise RuntimeError(f"Error loading component spec for {operation.name}.")

            # Add factory function, which returns a ContainerOp task instance, to pipeline operation dict
            try:
                comp_spec_inputs = [
                    inputs.name.lower().replace(" ", "_") for inputs in factory_function.component_spec.inputs
                ]

                # Remove inputs and outputs from params dict
                # TODO: need to have way to retrieve only required params
                parameter_removal_list = ["inputs", "outputs"]
                for component_param in operation.component_params_as_dict.keys():
                    if component_param not in comp_spec_inputs:
                        parameter_removal_list.append(component_param)

                for parameter in parameter_removal_list:
                    operation.component_params_as_dict.pop(parameter, None)

                # Create ContainerOp instance and assign appropriate user-provided name
                sanitized_component_params = {
                    self._sanitize_param_name(name): value
                    for name, value in operation.component_params_as_dict.items()
                }
                container_op = factory_function(**sanitized_component_params)
                container_op.set_display_name(operation.name)

                if operation.doc:
                    container_op.add_pod_annotation("elyra/node-user-doc", operation.doc)

                target_ops[operation.id] = container_op
            except Exception as e:
                # TODO Fix error messaging and break exceptions down into categories
                self.log.error(f"Error constructing component {operation.name}: {str(e)}")
                raise RuntimeError(f"Error constructing component {operation.name}.")

    # Process dependencies after all the operations have been created
    for operation in pipeline.operations.values():
        op = target_ops[operation.id]
        for parent_operation_id in operation.parent_operation_ids:
            parent_op = target_ops[parent_operation_id]  # Parent Operation
            op.after(parent_op)

    self.log_pipeline_info(pipeline_name, "pipeline dependencies processed", duration=(time.time() - t0_all))

    return target_ops
def test_normalize_label_value():
    valid_middle_chars = "-_."

    # test min length
    assert ExecuteFileOp._normalize_label_value(None) == ""
    assert ExecuteFileOp._normalize_label_value("") == ""
    # test max length (63)
    assert ExecuteFileOp._normalize_label_value("a" * 63) == "a" * 63
    assert ExecuteFileOp._normalize_label_value("a" * 64) == "a" * 63  # truncated
    # test first and last char
    assert ExecuteFileOp._normalize_label_value("1") == "1"
    assert ExecuteFileOp._normalize_label_value("22") == "22"
    assert ExecuteFileOp._normalize_label_value("3_3") == "3_3"
    assert ExecuteFileOp._normalize_label_value("4u4") == "4u4"
    assert ExecuteFileOp._normalize_label_value("5$5") == "5_5"

    # test first char
    for c in string.printable:
        if c in string.ascii_letters + string.digits:
            # first char is valid
            # no length violation
            assert ExecuteFileOp._normalize_label_value(c) == c
            assert ExecuteFileOp._normalize_label_value(c + "B") == c + "B"
            # max length
            assert ExecuteFileOp._normalize_label_value(c + "B" * 62) == (c + "B" * 62)
            # max length exceeded
            assert ExecuteFileOp._normalize_label_value(c + "B" * 63) == (c + "B" * 62)  # truncated
        else:
            # first char is invalid, e.g. '#a', and becomes the
            # second char, which might require replacement
            rv = c
            if c not in valid_middle_chars:
                rv = "_"
            # no length violation
            assert ExecuteFileOp._normalize_label_value(c) == "a" + rv + "a"
            assert ExecuteFileOp._normalize_label_value(c + "B") == "a" + rv + "B"
            # max length
            assert ExecuteFileOp._normalize_label_value(c + "B" * 62) == ("a" + rv + "B" * 61)  # truncated
            # max length exceeded
            assert ExecuteFileOp._normalize_label_value(c + "B" * 63) == ("a" + rv + "B" * 61)  # truncated

    # test last char
    for c in string.printable:
        if c in string.ascii_letters + string.digits:
            # no length violation
            assert ExecuteFileOp._normalize_label_value("b" + c) == "b" + c
            # max length
            assert ExecuteFileOp._normalize_label_value("b" * 62 + c) == ("b" * 62 + c)
            # max length exceeded
            assert ExecuteFileOp._normalize_label_value("b" * 63 + c) == ("b" * 63)
        else:
            # last char is invalid, e.g. 'a#', and requires
            # patching
            rv = c
            if c not in valid_middle_chars:
                rv = "_"
            # no length violation (char is appended)
            assert ExecuteFileOp._normalize_label_value("b" + c) == "b" + rv + "a"
            # max length (char is replaced)
            assert ExecuteFileOp._normalize_label_value("b" * 62 + c) == ("b" * 62 + "a")
            # max length exceeded (no action required)
            assert ExecuteFileOp._normalize_label_value("b" * 63 + c) == ("b" * 63)

    # test first and last char
    for c in string.printable:
        if c in string.ascii_letters + string.digits:
            # no length violation
            assert ExecuteFileOp._normalize_label_value(c + "b" + c) == c + "b" + c  # nothing is modified
            # max length
            assert ExecuteFileOp._normalize_label_value(c + "b" * 61 + c) == (c + "b" * 61 + c)  # nothing is modified
            # max length exceeded
            assert ExecuteFileOp._normalize_label_value(c + "b" * 62 + c) == c + "b" * 62  # truncate only
        else:
            # first and last characters are invalid, e.g. '#a#'
            rv = c
            if c not in valid_middle_chars:
                rv = "_"
            # no length violation
            assert ExecuteFileOp._normalize_label_value(c + "b" + c) == "a" + rv + "b" + rv + "a"
            # max length
            assert ExecuteFileOp._normalize_label_value(c + "b" * 59 + c) == ("a" + rv + "b" * 59 + rv + "a")
            # max length exceeded after processing, scenario 1
            # resolved by adding char before first, replace last
            assert ExecuteFileOp._normalize_label_value(c + "b" * 60 + c) == ("a" + rv + "b" * 60 + "a")
            # max length exceeded after processing, scenario 2
            # resolved by adding char before first, appending after last
            assert ExecuteFileOp._normalize_label_value(c + "b" * 59 + c) == ("a" + rv + "b" * 59 + rv + "a")
            # max length exceeded before processing, scenario 1
            # resolved by adding char before first, truncating last
            assert ExecuteFileOp._normalize_label_value(c + "b" * 62 + c) == ("a" + rv + "b" * 61)
            # max length exceeded before processing, scenario 2
            # resolved by adding char before first, replacing last
            assert ExecuteFileOp._normalize_label_value(c + "b" * 60 + c * 3) == ("a" + rv + "b" * 60 + "a")

    # test char in a position other than first and last
    # if invalid, the char is replaced with '_'
    for c in string.printable:
        if c in string.ascii_letters + string.digits + "-_.":
            assert ExecuteFileOp._normalize_label_value("A" + c + "Z") == "A" + c + "Z"
        else:
            assert ExecuteFileOp._normalize_label_value("A" + c + "Z") == "A_Z"

    # encore
    assert ExecuteFileOp._normalize_label_value(r"¯\_(ツ)_/¯") == "a_________a"
def test_construct_with_env_variables_argo():
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        pipeline_envs={"ENV_VAR_ONE": "1", "ENV_VAR_TWO": "2", "ENV_VAR_THREE": "3"},
        image="test/image:dev",
    )

    confirmation_names = ["ENV_VAR_ONE", "ENV_VAR_TWO", "ENV_VAR_THREE", "ELYRA_RUN_NAME"]
    confirmation_values = ["1", "2", "3", RUN_ID_PLACEHOLDER]
    for env_val in notebook_op.container.env:
        assert env_val.name in confirmation_names
        assert env_val.value in confirmation_values
        confirmation_names.remove(env_val.name)
        confirmation_values.remove(env_val.value)

    # Verify confirmation values have been drained.
    assert len(confirmation_names) == 0
    assert len(confirmation_values) == 0

    # same as before but explicitly specify the workflow engine type as Argo
    notebook_op = ExecuteFileOp(
        name="test",
        pipeline_name="test-pipeline",
        experiment_name="experiment-name",
        notebook="test_notebook.ipynb",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        pipeline_envs={"ENV_VAR_ONE": "1", "ENV_VAR_TWO": "2", "ENV_VAR_THREE": "3"},
        image="test/image:dev",
        workflow_engine="Argo",
    )

    confirmation_names = ["ENV_VAR_ONE", "ENV_VAR_TWO", "ENV_VAR_THREE", "ELYRA_RUN_NAME"]
    confirmation_values = ["1", "2", "3", RUN_ID_PLACEHOLDER]
    for env_val in notebook_op.container.env:
        assert env_val.name in confirmation_names
        assert env_val.value in confirmation_values
        confirmation_names.remove(env_val.name)
        confirmation_values.remove(env_val.value)

    # Verify confirmation values have been drained.
    assert len(confirmation_names) == 0
    assert len(confirmation_values) == 0