Example #1
def test_processing_url_runtime_specific_component(monkeypatch, processor,
                                                   sample_metadata, tmpdir):
    # Define the appropriate reader for a URL-type component definition
    kfp_supported_file_types = [".yaml"]
    reader = UrlComponentCatalogConnector(kfp_supported_file_types)

    # Assign test resource location
    url = ("https://raw.githubusercontent.com/elyra-ai/elyra/master/"
           "elyra/tests/pipeline/resources/components/filter_text.yaml")

    # Read the contents of the given URL -- get_entry_data() returns an
    # EntryData object whose 'definition' attribute holds the component definition content
    entry_data = reader.get_entry_data({"url": url}, {})
    component_definition = entry_data.definition

    # Instantiate a url-based component
    component_id = "test_component"
    component = Component(
        id=component_id,
        name="Filter text",
        description="",
        op="filter-text",
        catalog_type="url-catalog",
        component_reference={"url": url},
        definition=component_definition,
        categories=[],
        properties=[],
    )

    # Fabricate the component cache to include a single URL-based component for testing
    ComponentCache.instance()._component_cache[processor._type.name] = {
        "spoofed_catalog": {
            "components": {
                component_id: component
            }
        }
    }

    # Construct hypothetical operation for component
    operation_name = "Filter text test"
    operation_params = {"text": "path/to/text.txt", "pattern": "hello"}
    operation = Operation(
        id="filter-text-id",
        type="execution_node",
        classifier=component_id,
        name=operation_name,
        parent_operation_ids=[],
        component_params=operation_params,
    )

    # Build a mock runtime config for use in _cc_pipeline
    mocked_runtime = Metadata(name="test-metadata",
                              display_name="test",
                              schema_name="kfp",
                              metadata=sample_metadata)

    mocked_func = mock.Mock(return_value="default",
                            side_effect=[mocked_runtime, sample_metadata])
    monkeypatch.setattr(processor, "_get_metadata_configuration", mocked_func)

    # Construct single-operation pipeline
    pipeline = Pipeline(id="pipeline-id",
                        name="kfp_test",
                        runtime="kfp",
                        runtime_config="test",
                        source="filter_text.pipeline")
    pipeline.operations[operation.id] = operation

    # Establish path and function to construct pipeline
    pipeline_path = os.path.join(tmpdir, "kfp_test.yaml")
    constructed_pipeline_function = lambda: processor._cc_pipeline(
        pipeline=pipeline, pipeline_name="test_pipeline")

    # TODO Check against both argo and tekton compilations
    # Compile pipeline and save into pipeline_path
    kfp_argo_compiler.Compiler().compile(constructed_pipeline_function,
                                         pipeline_path)

    # Read contents of pipeline YAML
    with open(pipeline_path) as f:
        pipeline_yaml = yaml.safe_load(f.read())

    # Check the pipeline file contents for correctness
    pipeline_template = pipeline_yaml["spec"]["templates"][0]
    assert pipeline_template["metadata"]["annotations"][
        "pipelines.kubeflow.org/task_display_name"] == operation_name
    assert pipeline_template["inputs"]["artifacts"][0]["raw"][
        "data"] == operation_params["text"]
Example #2
    def _cc_pipeline(self, pipeline: Pipeline, pipeline_name: str, pipeline_instance_id: str = None) -> OrderedDict:
        """
        Compile the pipeline in preparation for DAG generation
        """

        runtime_configuration = self._get_metadata_configuration(
            schemaspace=Runtimes.RUNTIMES_SCHEMASPACE_ID, name=pipeline.runtime_config
        )
        image_namespace = self._get_metadata_configuration(
            schemaspace=RuntimeImages.RUNTIME_IMAGES_SCHEMASPACE_ID, name=None
        )

        cos_endpoint = runtime_configuration.metadata.get("cos_endpoint")
        cos_username = runtime_configuration.metadata.get("cos_username")
        cos_password = runtime_configuration.metadata.get("cos_password")
        cos_secret = runtime_configuration.metadata.get("cos_secret")
        cos_bucket = runtime_configuration.metadata.get("cos_bucket")

        pipeline_instance_id = pipeline_instance_id or pipeline_name
        artifact_object_prefix = join_paths(pipeline.pipeline_parameters.get(COS_OBJECT_PREFIX), pipeline_instance_id)

        self.log_pipeline_info(
            pipeline_name,
            f"processing pipeline dependencies for upload to '{cos_endpoint}' "
            f"bucket '{cos_bucket}' folder '{artifact_object_prefix}'",
        )

        # Create a list of target operation dictionaries, one per node, used to render the DAG
        target_ops = []

        t0_all = time.time()

        # Sort operations based on dependency graph (topological order)
        sorted_operations = PipelineProcessor._sort_operations(pipeline.operations)

        # Determine whether access to cloud storage is required and check connectivity
        for operation in sorted_operations:
            if isinstance(operation, GenericOperation):
                self._verify_cos_connectivity(runtime_configuration)
                break

        # All previous operation outputs should be propagated throughout the pipeline.
        # To process this recursively, the current operation's inputs are combined with its
        # parent's inputs (which are themselves derived from the outputs of their own parents)
        # and its parent's outputs.

        PipelineProcessor._propagate_operation_inputs_outputs(pipeline, sorted_operations)

        # Scrub all node labels of invalid characters
        scrubbed_operations = self._scrub_invalid_characters_from_list(sorted_operations)
        # Generate unique names for all operations
        unique_operations = self._create_unique_node_names(scrubbed_operations)

        for operation in unique_operations:

            if isinstance(operation, GenericOperation):
                operation_artifact_archive = self._get_dependency_archive_name(operation)

                self.log.debug(f"Creating pipeline component:\n {operation} archive : {operation_artifact_archive}")

                # Collect env variables
                pipeline_envs = self._collect_envs(
                    operation, cos_secret=cos_secret, cos_username=cos_username, cos_password=cos_password
                )

                # Generate a unique ELYRA_RUN_NAME value and expose it as an
                # environment variable in the container.
                # Notebook and script nodes are implemented using the kubernetes_pod_operator
                # (https://airflow.apache.org/docs/apache-airflow/1.10.12/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html).
                # Environment variables passed to this operator are pre-processed by Airflow
                # at runtime, and placeholder values (expressed as '{{ xyz }}' -- see
                # https://airflow.apache.org/docs/apache-airflow/1.10.12/macros-ref#default-variables)
                # are replaced.
                if pipeline_envs is None:
                    pipeline_envs = {}
                pipeline_envs["ELYRA_RUN_NAME"] = f"{pipeline_name}-{{{{ ts_nodash }}}}"

                image_pull_policy = None
                runtime_image_pull_secret = None
                for image_instance in image_namespace:
                    if image_instance.metadata["image_name"] == operation.runtime_image:
                        if image_instance.metadata.get("pull_policy"):
                            image_pull_policy = image_instance.metadata["pull_policy"]
                        if image_instance.metadata.get("pull_secret"):
                            runtime_image_pull_secret = image_instance.metadata["pull_secret"]
                        break

                bootscript = BootscriptBuilder(
                    filename=operation.filename,
                    pipeline_name=pipeline_name,
                    cos_endpoint=cos_endpoint,
                    cos_bucket=cos_bucket,
                    cos_directory=artifact_object_prefix,
                    cos_dependencies_archive=operation_artifact_archive,
                    inputs=operation.inputs,
                    outputs=operation.outputs,
                )

                target_op = {
                    "notebook": operation.name,
                    "id": operation.id,
                    "argument_list": bootscript.container_cmd,
                    "runtime_image": operation.runtime_image,
                    "pipeline_envs": pipeline_envs,
                    "parent_operation_ids": operation.parent_operation_ids,
                    "image_pull_policy": image_pull_policy,
                    "cpu_request": operation.cpu,
                    "mem_request": operation.memory,
                    "gpu_limit": operation.gpu,
                    "operator_source": operation.component_params["filename"],
                    "is_generic_operator": True,
                    "doc": operation.doc,
                    "volume_mounts": operation.component_params.get(MOUNTED_VOLUMES, []),
                    "kubernetes_secrets": operation.component_params.get(KUBERNETES_SECRETS, []),
                }

                if runtime_image_pull_secret is not None:
                    target_op["runtime_image_pull_secret"] = runtime_image_pull_secret

                target_ops.append(target_op)

                self.log_pipeline_info(
                    pipeline_name,
                    f"processing operation dependencies for id '{operation.id}'",
                    operation_name=operation.name,
                )

                self._upload_dependencies_to_object_store(
                    runtime_configuration, pipeline_name, operation, prefix=artifact_object_prefix
                )

            else:
                # Retrieve component from cache
                component = ComponentCache.instance().get_component(self._type, operation.classifier)

                # Convert the user-entered value of certain properties according to their type
                for component_property in component.properties:
                    # Skip properties for which no value was given
                    if component_property.ref not in operation.component_params.keys():
                        continue

                    # Get corresponding property's value from parsed pipeline
                    property_value_dict = operation.component_params.get(component_property.ref)

                    # The type and value of this property can vary depending on what the user chooses
                    # in the pipeline editor. So we get the current active parameter (e.g. StringControl)
                    # from the activeControl value
                    active_property_name = property_value_dict["activeControl"]

                    # Once we have the active control name (e.g. StringControl), we can
                    # retrieve the value assigned to it
                    property_value = property_value_dict.get(active_property_name, None)

                    # If the value is not found, assign it the default value assigned in parser
                    if property_value is None:
                        property_value = component_property.value

                    self.log.debug(f"Active property name : {active_property_name}, value : {property_value}")
                    self.log.debug(
                        f"Processing component parameter '{component_property.name}' "
                        f"of type '{component_property.data_type}'"
                    )

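                    # A dict value with exactly the keys {"value", "option"} identifies the
                    # output of a parent node; replace it with an Airflow XCom pull expression
                    # so the parent task's return value is injected when the DAG runs.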
                    if (
                        property_value
                        and str(property_value)[0] == "{"
                        and str(property_value)[-1] == "}"
                        and isinstance(json.loads(json.dumps(property_value)), dict)
                        and set(json.loads(json.dumps(property_value)).keys()) == {"value", "option"}
                    ):
                        parent_node_name = self._get_node_name(
                            target_ops, json.loads(json.dumps(property_value))["value"]
                        )
                        processed_value = "\"{{ ti.xcom_pull(task_ids='" + parent_node_name + "') }}\""
                        operation.component_params[component_property.ref] = processed_value
                    elif component_property.data_type == "boolean":
                        operation.component_params[component_property.ref] = property_value
                    elif component_property.data_type == "string":
                        # Add surrounding quotation marks to string value for correct rendering
                        # in jinja DAG template
                        operation.component_params[component_property.ref] = json.dumps(property_value)
                    elif component_property.data_type == "dictionary":
                        processed_value = self._process_dictionary_value(property_value)
                        operation.component_params[component_property.ref] = processed_value
                    elif component_property.data_type == "list":
                        processed_value = self._process_list_value(property_value)
                        operation.component_params[component_property.ref] = processed_value

                # Remove inputs and outputs from params dict until support for data exchange is provided
                operation.component_params_as_dict.pop("inputs")
                operation.component_params_as_dict.pop("outputs")

                # Locate the import statement. If not found raise...
                import_stmts = []
                # Check for import statement on Component object, otherwise get from class_import_map
                import_stmt = component.import_statement or self.class_import_map.get(component.name)
                if import_stmt:
                    import_stmts.append(import_stmt)
                else:
                    # If we didn't find a mapping to the import statement, let's check if the component
                    # name includes a package prefix.  If it does, log a warning, but proceed, otherwise
                    # raise an exception.
                    if len(component.name.split(".")) > 1:  # We (presumably) have a package prefix
                        self.log.warning(
                            f"Operator '{component.name}' of node '{operation.name}' is not configured "
                            f"in the list of available Airflow operators but appears to include a "
                            f"package prefix and processing will proceed."
                        )
                    else:
                        raise ValueError(
                            f"Operator '{component.name}' of node '{operation.name}' is not configured "
                            f"in the list of available operators.  Please add the fully-qualified "
                            f"package name for '{component.name}' to the "
                            f"AirflowPipelineProcessor.available_airflow_operators configuration."
                        )

                target_op = {
                    "notebook": operation.name,
                    "id": operation.id,
                    "imports": import_stmts,
                    "class_name": component.name,
                    "parent_operation_ids": operation.parent_operation_ids,
                    "component_params": operation.component_params_as_dict,
                    "operator_source": component.component_source,
                    "is_generic_operator": False,
                    "doc": operation.doc,
                }

                target_ops.append(target_op)

        ordered_target_ops = OrderedDict()

        while target_ops:
            for _ in range(len(target_ops)):
                target_op = target_ops.pop(0)
                if not target_op["parent_operation_ids"]:
                    ordered_target_ops[target_op["id"]] = target_op
                    self.log.debug(f"Added root node {ordered_target_ops[target_op['id']]}")
                elif all(deps in ordered_target_ops.keys() for deps in target_op["parent_operation_ids"]):
                    ordered_target_ops[target_op["id"]] = target_op
                    self.log.debug(f"Added dependent node {ordered_target_ops[target_op['id']]}")
                else:
                    target_ops.append(target_op)

        self.log_pipeline_info(pipeline_name, "pipeline dependencies processed", duration=(time.time() - t0_all))

        return ordered_target_ops
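
The ordering loop at the end of _cc_pipeline performs a simple dependency-driven pass: an operation is emitted only once all of its parent_operation_ids are already present in the result, and anything not yet ready is pushed back for a later pass (so the loop assumes the dependency graph is acyclic). A minimal standalone sketch of that behavior, using hypothetical operation dictionaries rather than Elyra objects:

from collections import OrderedDict

def order_target_ops(target_ops):
    # Hypothetical helper mirroring the loop above: emit ops whose parents have all
    # been emitted; defer the rest to a subsequent pass.
    ordered = OrderedDict()
    while target_ops:
        for _ in range(len(target_ops)):
            op = target_ops.pop(0)
            if all(dep in ordered for dep in op["parent_operation_ids"]):
                ordered[op["id"]] = op
            else:
                target_ops.append(op)
    return ordered

# Hypothetical three-node pipeline: "c" depends on "b", which depends on "a"
ops = [
    {"id": "c", "parent_operation_ids": ["b"]},
    {"id": "b", "parent_operation_ids": ["a"]},
    {"id": "a", "parent_operation_ids": []},
]
print(list(order_target_ops(ops)))  # ['a', 'b', 'c']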