def test_build_cmd_with_inputs_and_outputs():
    pipeline_inputs = ["test.txt", "test2.txt"]
    pipeline_outputs = ["test3.txt", "test4.txt"]
    boot_build = BootscriptBuilder(
        filename="test_notebook.ipynb",
        pipeline_name="test-pipeline",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        inputs=pipeline_inputs,
        outputs=pipeline_outputs,
    )

    assert boot_build.inputs == pipeline_inputs
    assert boot_build.outputs == pipeline_outputs

    boot_arg_list = boot_build.container_cmd.split("--")
    for arg in boot_arg_list:
        arg_value = arg.split(" ")[1]
        if "outputs" in arg:
            assert arg_value == f"'{';'.join(pipeline_outputs)}'"
        if "inputs" in arg:
            assert arg_value == f"'{';'.join(pipeline_inputs)}'"
def test_fail_without_filename():
    with pytest.raises(TypeError):
        BootscriptBuilder(
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
        )
def test_fail_without_cos_endpoint():
    with pytest.raises(TypeError):
        BootscriptBuilder(
            filename="test_notebook.ipynb",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
        )
def test_fail_with_empty_string_as_filename():
    with pytest.raises(ValueError) as error_info:
        BootscriptBuilder(
            filename="",
            cos_endpoint="http://testserver:32525",
            cos_bucket="test_bucket",
            cos_directory="test_directory",
            cos_dependencies_archive="test_archive.tgz",
        )

    assert "You need to provide a filename for the operation." == str(error_info.value)
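# Illustrative sketch (not part of the test suite): driving BootscriptBuilder
# directly to inspect the generated container command. Only the constructor
# arguments and attributes exercised by the tests above are assumed to exist;
# the exact content of container_cmd is whatever the builder produces.
def _example_inspect_container_cmd():
    boot_build = BootscriptBuilder(
        filename="test_notebook.ipynb",
        pipeline_name="test-pipeline",
        cos_endpoint="http://testserver:32525",
        cos_bucket="test_bucket",
        cos_directory="test_directory",
        cos_dependencies_archive="test_archive.tgz",
        inputs=["data.csv"],
        outputs=["results.csv"],
    )
    # The test above assumes each option is rendered as "--<name> <value>",
    # hence the split on "--" followed by a split on a single space.
    for arg in boot_build.container_cmd.split("--"):
        print(arg)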
def _cc_pipeline(self, pipeline, pipeline_name):
    runtime_configuration = self._get_metadata_configuration(
        schemaspace=Runtimes.RUNTIMES_SCHEMASPACE_ID, name=pipeline.runtime_config
    )
    image_namespace = self._get_metadata_configuration(
        schemaspace=RuntimeImages.RUNTIME_IMAGES_SCHEMASPACE_ID
    )

    cos_endpoint = runtime_configuration.metadata.get("cos_endpoint")
    cos_username = runtime_configuration.metadata.get("cos_username")
    cos_password = runtime_configuration.metadata.get("cos_password")
    cos_secret = runtime_configuration.metadata.get("cos_secret")
    cos_directory = pipeline_name
    cos_bucket = runtime_configuration.metadata.get("cos_bucket")

    # Collect target operations; they are ordered into an OrderedDict keyed by operation id below
    target_ops = []

    self.log_pipeline_info(
        pipeline_name,
        f"processing pipeline dependencies to: {cos_endpoint} "
        f"bucket: {cos_bucket} folder: {pipeline_name}",
    )
    t0_all = time.time()

    # Sort operations based on dependency graph (topological order)
    sorted_operations = PipelineProcessor._sort_operations(pipeline.operations)

    # Determine whether access to cloud storage is required and check connectivity
    for operation in sorted_operations:
        if isinstance(operation, GenericOperation):
            self._verify_cos_connectivity(runtime_configuration)
            break

    # All previous operation outputs should be propagated throughout the pipeline.
    # In order to process this recursively, the current operation's inputs should be combined
    # from its parent's inputs (which themselves are derived from the outputs of their parent)
    # and its parent's outputs.
    PipelineProcessor._propagate_operation_inputs_outputs(pipeline, sorted_operations)

    # Scrub all node labels of invalid characters
    scrubbed_operations = self._scrub_invalid_characters_from_list(sorted_operations)
    # Generate unique names for all operations
    unique_operations = self._create_unique_node_names(scrubbed_operations)

    for operation in unique_operations:

        if isinstance(operation, GenericOperation):
            operation_artifact_archive = self._get_dependency_archive_name(operation)

            self.log.debug(
                f"Creating pipeline component:\n {operation} archive : {operation_artifact_archive}"
            )

            # Collect env variables
            pipeline_envs = self._collect_envs(
                operation, cos_secret=cos_secret, cos_username=cos_username, cos_password=cos_password
            )

            # Generate unique ELYRA_RUN_NAME value and expose it as an
            # environment variable in the container.
            # Notebook | script nodes are implemented using the kubernetes_pod_operator
            # (https://airflow.apache.org/docs/apache-airflow/1.10.12/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html).
            # Environment variables that are passed to this operator are
            # pre-processed by Airflow at runtime, and placeholder values (expressed as '{{ xyz }}'
            # - see https://airflow.apache.org/docs/apache-airflow/1.10.12/macros-ref#default-variables)
            # are replaced.
            if pipeline_envs is None:
                pipeline_envs = {}
            pipeline_envs["ELYRA_RUN_NAME"] = f"{pipeline_name}-{{{{ ts_nodash }}}}"

            image_pull_policy = None
            runtime_image_pull_secret = None
            for image_instance in image_namespace:
                if image_instance.metadata["image_name"] == operation.runtime_image:
                    if image_instance.metadata.get("pull_policy"):
                        image_pull_policy = image_instance.metadata["pull_policy"]
                    if image_instance.metadata.get("pull_secret"):
                        runtime_image_pull_secret = image_instance.metadata["pull_secret"]
                    break

            bootscript = BootscriptBuilder(
                filename=operation.filename,
                cos_endpoint=cos_endpoint,
                cos_bucket=cos_bucket,
                cos_directory=cos_directory,
                cos_dependencies_archive=operation_artifact_archive,
                inputs=operation.inputs,
                outputs=operation.outputs,
            )

            target_op = {
                "notebook": operation.name,
                "id": operation.id,
                "argument_list": bootscript.container_cmd,
                "runtime_image": operation.runtime_image,
                "pipeline_envs": pipeline_envs,
                "parent_operation_ids": operation.parent_operation_ids,
                "image_pull_policy": image_pull_policy,
                "cpu_request": operation.cpu,
                "mem_request": operation.memory,
                "gpu_limit": operation.gpu,
                "operator_source": operation.component_params["filename"],
                "is_generic_operator": True,
                "doc": operation.doc,
            }

            if runtime_image_pull_secret is not None:
                target_op["runtime_image_pull_secret"] = runtime_image_pull_secret

            target_ops.append(target_op)

            self.log_pipeline_info(
                pipeline_name,
                f"processing operation dependencies for id: {operation.id}",
                operation_name=operation.name,
            )

            self._upload_dependencies_to_object_store(runtime_configuration, pipeline_name, operation)

        else:
            # Retrieve component from cache
            component = ComponentCache.instance().get_component(self._type, operation.classifier)

            # Convert the user-entered value of certain properties according to their type
            for component_property in component.properties:
                # Skip properties for which no value was given
                if component_property.ref not in operation.component_params.keys():
                    continue

                # Get corresponding property's value from parsed pipeline
                property_value_dict = operation.component_params.get(component_property.ref)

                # The type and value of this property can vary depending on what the user chooses
                # in the pipeline editor. So we get the current active parameter (e.g. StringControl)
                # from the activeControl value
                active_property_name = property_value_dict["activeControl"]

                # Once we have the active control name (e.g. StringControl), we can retrieve the
                # value assigned to it
                property_value = property_value_dict.get(active_property_name, None)

                # If the value is not found, assign it the default value assigned in parser
                if property_value is None:
                    property_value = component_property.value

                self.log.debug(
                    f"Active property name : {active_property_name}, value : {property_value}"
                )
                self.log.debug(
                    f"Processing component parameter '{component_property.name}' "
                    f"of type '{component_property.data_type}'"
                )

                if (
                    property_value
                    and str(property_value)[0] == "{"
                    and str(property_value)[-1] == "}"
                    and isinstance(json.loads(json.dumps(property_value)), dict)
                    and set(json.loads(json.dumps(property_value)).keys()) == {"value", "option"}
                ):
                    # The property references the output of an upstream node; render it as an
                    # Airflow XCom pull from the parent task
                    parent_node_name = self._get_node_name(
                        target_ops, json.loads(json.dumps(property_value))["value"]
                    )
                    processed_value = "\"{{ ti.xcom_pull(task_ids='" + parent_node_name + "') }}\""
                    operation.component_params[component_property.ref] = processed_value
                elif component_property.data_type == "boolean":
                    operation.component_params[component_property.ref] = property_value
                elif component_property.data_type == "string":
                    # Add surrounding quotation marks to string value for correct rendering
                    # in jinja DAG template
                    operation.component_params[component_property.ref] = json.dumps(property_value)
                elif component_property.data_type == "dictionary":
                    processed_value = self._process_dictionary_value(property_value)
                    operation.component_params[component_property.ref] = processed_value
                elif component_property.data_type == "list":
                    processed_value = self._process_list_value(property_value)
                    operation.component_params[component_property.ref] = processed_value

            # Remove inputs and outputs from params dict until support for data exchange is provided
            operation.component_params_as_dict.pop("inputs")
            operation.component_params_as_dict.pop("outputs")

            # Locate the import statement. If not found, raise.
            import_stmts = []
            # Check for import statement on Component object, otherwise get from class_import_map
            import_stmt = component.import_statement or self.class_import_map.get(component.name)
            if import_stmt:
                import_stmts.append(import_stmt)
            else:
                # If we didn't find a mapping to the import statement, check whether the component
                # name includes a package prefix. If it does, log a warning but proceed; otherwise
                # raise an exception.
                if len(component.name.split(".")) > 1:  # We (presumably) have a package prefix
                    self.log.warning(
                        f"Operator '{component.name}' of node '{operation.name}' is not configured "
                        f"in the list of available Airflow operators but appears to include a "
                        f"package prefix and processing will proceed."
                    )
                else:
                    raise ValueError(
                        f"Operator '{component.name}' of node '{operation.name}' is not configured "
                        f"in the list of available operators. Please add the fully-qualified "
                        f"package name for '{component.name}' to the "
                        f"AirflowPipelineProcessor.available_airflow_operators configuration."
                    )

            target_op = {
                "notebook": operation.name,
                "id": operation.id,
                "imports": import_stmts,
                "class_name": component.name,
                "parent_operation_ids": operation.parent_operation_ids,
                "component_params": operation.component_params_as_dict,
                "operator_source": component.component_source,
                "is_generic_operator": False,
                "doc": operation.doc,
            }
            target_ops.append(target_op)

    ordered_target_ops = OrderedDict()

    # Repeatedly move operations whose parents have already been placed; this
    # yields a dependency-respecting ordering of the target operations
    while target_ops:
        for i in range(len(target_ops)):
            target_op = target_ops.pop(0)
            if not target_op["parent_operation_ids"]:
                ordered_target_ops[target_op["id"]] = target_op
                self.log.debug("Root Node added : %s", ordered_target_ops[target_op["id"]])
            elif all(deps in ordered_target_ops.keys() for deps in target_op["parent_operation_ids"]):
                ordered_target_ops[target_op["id"]] = target_op
                self.log.debug("Dependent Node added : %s", ordered_target_ops[target_op["id"]])
            else:
                target_ops.append(target_op)

    self.log_pipeline_info(pipeline_name, "pipeline dependencies processed", duration=(time.time() - t0_all))

    return ordered_target_ops
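# Standalone sketch (hypothetical helper, not part of the processor) of the
# ordering pass at the end of _cc_pipeline: an operation is emitted only once
# all of its "parent_operation_ids" are already present, producing a
# dependency-respecting (topological) order keyed by operation id.
from collections import OrderedDict


def order_target_ops_sketch(target_ops):
    ordered = OrderedDict()
    pending = list(target_ops)
    while pending:
        for _ in range(len(pending)):
            op = pending.pop(0)
            # all() over an empty parent list is True, so root nodes are placed first
            if all(parent in ordered for parent in op["parent_operation_ids"]):
                ordered[op["id"]] = op
            else:
                pending.append(op)
    return ordered


# Example: the parent "a" is placed before its dependent "b" regardless of input order
assert list(
    order_target_ops_sketch(
        [
            {"id": "b", "parent_operation_ids": ["a"]},
            {"id": "a", "parent_operation_ids": []},
        ]
    )
) == ["a", "b"]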