Beispiel #1
0
    def get_component(self, id: str, name: str, description: str,
                      properties: List[ComponentParameter],
                      file_extension: str) -> Component:
        """
        Build a Component from the values parsed out of a definition file,
        combined with the information this catalog entry already holds.

        :param id: the identifier to give the component
        :param name: the display name of the component
        :param description: the description of the component
        :param properties: the parsed component properties
        :param file_extension: extension used only when the entry data
            does not supply a (truthy) file extension of its own

        :returns: the assembled Component object
        """
        entry_data = self.entry_data

        component_kwargs = dict(
            id=id,
            name=name,
            description=description,
            properties=properties,
            catalog_type=self.catalog_type,
            component_reference=self.entry_reference,
            definition=entry_data.definition,
            runtime_type=self.runtime_type,
            categories=self.categories,
            # The extension recorded on the entry data takes precedence
            extensions=[entry_data.file_extension or file_extension],
        )

        # package_name is forwarded only for Airflow entry data
        if isinstance(entry_data, AirflowEntryData):
            component_kwargs["package_name"] = entry_data.package_name

        return Component(**component_kwargs)
Beispiel #2
0
class ComponentCache(SingletonConfigurable):
    """Represents the cache of component definitions indexed by runtime-type, then by catalog name."""

    # The component_cache is indexed at the top level by runtime type name, e.g. 'APACHE_AIRFLOW',
    # and has as its value another dictionary. At the second level, each sub-dictionary is indexed by
    # a ComponentCatalogMetadata instance name; its value is also a sub-dictionary. This sub-dictionary
    # consists of two additional dictionaries: 1.) one with key "components" whose dictionary is
    # indexed by component id and maps to the corresponding Component object, and 2.) one with key
    # "status" and value of a final sub-dictionary with key-value pairs "state":"<current/updating/errors>"
    # and "errors":["<error1>", "<error2>", ...] to dynamically indicate the status of this catalog instance
    _component_cache: ComponentCacheType = {}

    # Category label shared by all of the built-in (generic) components below
    _generic_category_label = "Elyra"
    # Built-in components, indexed by component id
    _generic_components: Dict[str, Component] = {
        "notebook": Component(
            id="notebook",
            name="Notebook",
            description="Run notebook file",
            op="execute-notebook-node",
            catalog_type="elyra",
            component_reference="elyra",
            extensions=[".ipynb"],
            categories=[_generic_category_label],
        ),
        "python-script": Component(
            id="python-script",
            name="Python Script",
            description="Run Python script",
            op="execute-python-node",
            catalog_type="elyra",
            component_reference="elyra",
            extensions=[".py"],
            categories=[_generic_category_label],
        ),
        "r-script": Component(
            id="r-script",
            name="R Script",
            description="Run R script",
            op="execute-r-node",
            catalog_type="elyra",
            component_reference="elyra",
            extensions=[".r"],
            categories=[_generic_category_label],
        ),
    }

    def __init__(self, **kwargs):
        """
        Set up the cache instance.

        Server processes get refresh/update queues, a manifest-directory observer and a
        started CacheUpdateManager. Non-server processes only get the name of their own
        (pid-specific) manifest file; their queues remain None.
        """
        super().__init__(**kwargs)

        self.is_server_process = ComponentCache._determine_server_process(**kwargs)
        self.manifest_dir = jupyter_runtime_dir()
        # Ensure queue attribute exists for non-server instances as well.
        self.refresh_queue: Optional[RefreshQueue] = None
        self.update_queue: Optional[UpdateQueue] = None
        if self.is_server_process:
            self.refresh_queue = RefreshQueue()
            self.update_queue = UpdateQueue()

            # Set up watchdog for manifest file for out-of-process updates
            self.observer = Observer()
            self.observer.schedule(ManifestFileChangeHandler(self), self.manifest_dir)

            # Start a thread to manage updates to the component cache
            manager = CacheUpdateManager(self.log, self._component_cache, self.refresh_queue, self.update_queue)
            self.cache_manager = manager
            self.cache_manager.start()
            self.log.debug("CacheUpdateManager started...")
        else:
            self.manifest_filename = os.path.join(self.manifest_dir, f"elyra-component-manifest-{os.getpid()}.json")

    @staticmethod
    def _determine_server_process(**kwargs) -> bool:
        """
        Determines if this process is a server (extension) process.

        This is the case when the 'parent' keyword argument is an instance of a class
        named 'ServerApp' or 'ElyraApp', or when 'emulate_server_app' is given and truthy.
        """
        app_names = ["ServerApp", "ElyraApp"]
        if "parent" in kwargs and kwargs["parent"].__class__.__name__ in app_names:
            return True

        # Used in unittests
        return bool(kwargs.get("emulate_server_app"))

    def load(self):
        """
        Completes a series of actions during system startup, such as creating
        the component manifest file and triggering the build of the component
        cache for existing ComponentCatalog metadata instances.

        Nothing happens unless the singleton instance has been created and this
        is a server process.
        """
        # Proceed only if singleton instance has been created.  `initialized` is a
        # classmethod on SingletonConfigurable, so it has to be called: the bare
        # attribute is a bound method, which is always truthy.
        if not self.initialized():
            return

        # The cache manager will work on manifest and cache tasks on an
        # in-process basis as load() is only called during startup from
        # the server process.
        if not self.is_server_process:
            return

        # Remove all existing manifest files from previous processes
        self._remove_all_manifest_files()

        # Start the watchdog if it's not alive, prevents redundant starts
        if not self.observer.is_alive():
            self.observer.start()

        # Fetch all component catalog instances and trigger their add to the
        # component cache if this is not already happening (it seems some server
        # test fixtures could be loading the server extensions multiple times).
        if not self.cache_manager.is_refreshing():
            self.refresh()

    def refresh(self):
        """Triggers a refresh of all catalogs in the component cache.

        Raises RefreshInProgressError if a complete refresh is in progress.
        Note that we do not preclude non-server processes from performing a
        complete refresh.  In such cases, each of the catalog entries will be
        written to the manifest, which will be placed into the update queue.
        As a result, non-server applications could by-pass the "refresh in progress"
        constraint, but we're assuming a CLI application won't be as likely to
        "pound" refresh like a UI application can.
        """
        if self.is_server_process and self.cache_manager.is_refreshing():
            raise RefreshInProgressError()
        catalogs = MetadataManager(schemaspace=ComponentCatalogs.COMPONENT_CATALOGS_SCHEMASPACE_ID).get_all()
        for catalog in catalogs:
            self._insert_request(self.refresh_queue, catalog, "modify")

    def update(self, catalog: Metadata, action: str):
        """
        Triggers an update of the component cache for the given catalog name.  If this is a non-server
        process, the entry is written to the manifest file where it will be "processed" by the watchdog
        and inserted into the component cache queue, otherwise we update the cache queue directly.

        :param catalog: the catalog metadata instance the update applies to
        :param action: the action to record for the catalog
        """
        self._insert_request(self.update_queue, catalog, action)

    def _insert_request(self, queue: Queue, catalog: Metadata, action: str):
        """
        If running as a server process, the request is submitted to the desired queue, otherwise
        it is posted to the manifest where the server process (if running) can detect the manifest
        file update and send the request to the update queue.

        Note that any calls to ComponentCache.refresh() from non-server processes will still
        perform the refresh, but via the update queue rather than the refresh queue.  We could,
        instead, raise NotImplementedError in such cases, but we may want the ability to refresh
        the entire component cache from a CLI utility and the current implementation would allow that.

        :param queue: the queue to submit to; only used by server processes
        :param catalog: the catalog metadata instance the request applies to
        :param action: the action to submit or to record in the manifest under the catalog's name
        """
        if self.is_server_process:
            queue.put((catalog, action))
        else:
            manifest: Dict[str, str] = self._load_manifest()
            manifest[catalog.name] = action
            self.update_manifest(manifest=manifest)

    def _remove_all_manifest_files(self):
        """
        Remove all existing manifest files in the Jupyter runtimes directory.
        """
        manifest_files = Path(self.manifest_dir).glob("**/elyra-component-manifest-*.json")
        for file in manifest_files:
            os.remove(str(file))

    def _load_manifest(self, filename: Optional[str] = None) -> Dict[str, str]:
        """Read and return the contents of a manifest file.

        If 'filename' is not provided, this process's manifest file will be read.
        An empty dictionary is returned when the file does not exist.
        """
        filename = filename or self.manifest_filename
        if not os.path.isfile(filename):
            self.log.debug(f"Manifest file '{filename}' doesn't exist and will be created.")
            return {}
        with open(filename, "r") as f:
            manifest: Dict[str, str] = json.load(f)
        self.log.debug(f"Reading manifest '{manifest}' from file '{filename}'")
        return manifest

    def update_manifest(self, filename: Optional[str] = None, manifest: Optional[Dict[str, str]] = None) -> None:
        """Overwrite the manifest file with the given manifest contents.

        If 'filename' is not provided, this process's manifest file is written.
        If 'manifest' is not provided (or is empty), an empty manifest is written.
        """
        filename = filename or self.manifest_filename
        manifest = manifest or {}
        self.log.debug(f"Updating manifest '{manifest}' to file '{filename}'")
        with open(filename, "w") as f:
            json.dump(manifest, f, indent=2)

    def wait_for_all_cache_tasks(self):
        """
        Block execution and wait for all tasks in the cache task update queue to complete.
        Primarily used for testing.
        """
        if self.is_server_process:
            self.update_queue.join()
            self.refresh_queue.join()

    def get_all_components(self, platform: RuntimeProcessorType) -> List[Component]:
        """
        Retrieve all components from component catalog cache

        :param platform: the runtime type whose catalogs are consulted

        :returns: the components of every catalog cached for the platform; an error is
            logged when none are found and the platform is not LOCAL
        """
        components: List[Component] = []

        catalogs = self._component_cache.get(platform.name, {})
        for catalog_properties in catalogs.values():
            components.extend(catalog_properties.get("components", {}).values())

        if not components and platform != RuntimeProcessorType.LOCAL:
            self.log.error(f"No components could be found in any catalog for platform type '{platform.name}'.")

        return components

    def get_component(self, platform: RuntimeProcessorType, component_id: str) -> Optional[Component]:
        """
        Retrieve the component with a given component_id from component catalog cache

        :param platform: the runtime type whose catalogs are consulted
        :param component_id: the id of the component to look up

        :returns: the first matching component found, or None (after logging an error)
            when no catalog cached for the platform contains it
        """
        component: Optional[Component] = None

        catalogs = self._component_cache.get(platform.name, {})
        for catalog_properties in catalogs.values():
            component = catalog_properties.get("components", {}).get(component_id)
            if component:
                break

        if not component:
            self.log.error(f"Component with ID '{component_id}' could not be found in any catalog.")

        return component

    def _load_catalog_reader_class(
        self, catalog: ComponentCatalogMetadata, file_types: List[str]
    ) -> Optional[ComponentCatalogConnector]:
        """
        Load the appropriate entrypoint class based on the schema name indicated in
        the ComponentCatalogMetadata instance and the file types associated with the component
        parser in use

        :returns: an instance of the loaded class, or None (after logging an error) when no
            matching entrypoint exists or loading/instantiating it raises
        """
        try:
            catalog_reader = entrypoints.get_group_named("elyra.component.catalog_types").get(catalog.schema_name)
            if not catalog_reader:
                self.log.error(
                    f"No entrypoint with name '{catalog.schema_name}' was found in group "
                    f"'elyra.component.catalog_types' to match the 'schema_name' given in catalog "
                    f"'{catalog.display_name}'. Skipping..."
                )
                return None

            catalog_reader = catalog_reader.load()(file_types, parent=self.parent)
        except Exception as e:
            self.log.error(f"Could not load appropriate ComponentCatalogConnector class: {e}. Skipping...")
            return None

        return catalog_reader

    def read_component_catalog(self, catalog: ComponentCatalogMetadata) -> Dict[str, Component]:
        """
        Read a component catalog and return a dictionary of components indexed by component_id.

        Entries that fail to parse are logged as warnings and skipped.

        :param catalog: a metadata instances from which to read and construct Component objects

        :returns: a dictionary of component id to Component object for all read/parsed components
        """
        components: Dict[str, Component] = {}

        # Assign component parser based on the runtime platform type
        parser = ComponentParser.create_instance(platform=catalog.runtime_type)

        # Assign reader based on the type of the catalog (the 'schema_name')
        catalog_reader = self._load_catalog_reader_class(catalog, parser.file_types)
        if not catalog_reader:
            return components

        # Get content of component definition file for each component in this catalog
        self.log.debug(f"Processing components in catalog '{catalog.display_name}'")
        catalog_entries = catalog_reader.read_component_definitions(catalog)
        if not catalog_entries:
            return components

        for catalog_entry in catalog_entries:
            # Parse the entry to get a fully qualified Component object
            try:
                parsed_components = parser.parse(catalog_entry) or []
            except Exception as e:
                self.log.warning(
                    f"Could not parse definition for component with identifying information: "
                    f"'{catalog_entry.entry_reference}' -> {str(e)}"
                )
            else:
                for component in parsed_components:
                    components[component.id] = component

        return components

    @staticmethod
    def get_generic_components() -> List[Component]:
        """Return the built-in generic components as a list."""
        return list(ComponentCache._generic_components.values())

    @staticmethod
    def get_generic_component(component_id: str) -> Optional[Component]:
        """Return the generic component with the given id, or None if there is none."""
        return ComponentCache._generic_components.get(component_id)

    @staticmethod
    def get_generic_component_ops() -> List[str]:
        """Return the 'op' value of each generic component."""
        return [component.op for component in ComponentCache.get_generic_components()]

    @staticmethod
    def load_jinja_template(template_name: str) -> Template:
        """
        Loads the jinja template of the given name from the
        elyra/templates/components folder
        """
        loader = PackageLoader("elyra", "templates/components")
        template_env = Environment(loader=loader)

        return template_env.get_template(template_name)

    @staticmethod
    def to_canvas_palette(components: List[Component]) -> Dict:
        """
        Converts catalog components into appropriate canvas palette format

        Components are grouped by category; a component appears at most once per
        category (matched by id), and components without categories are placed
        in a fallback category.
        """
        template = ComponentCache.load_jinja_template("canvas_palette_template.jinja2")

        # Define a fallback category for components with no given categories
        fallback_category_name = "No Category"

        # Convert the list of all components into a dictionary of
        # component lists keyed by category
        category_to_components: Dict[str, List[Component]] = {}
        # Ids already placed in each category, so that duplicate detection does
        # not have to rescan the category's component list for every component
        category_to_ids: Dict[str, set] = {}
        for component in components:
            # Assign a fallback category so that component is not
            # lost during palette render
            categories = component.categories or [fallback_category_name]

            for category in categories:
                category_components = category_to_components.setdefault(category, [])
                seen_ids = category_to_ids.setdefault(category, set())

                if component.id not in seen_ids:
                    seen_ids.add(component.id)
                    category_components.append(component)

        # Render template
        canvas_palette = template.render(category_dict=category_to_components)
        return json.loads(canvas_palette)

    @staticmethod
    def to_canvas_properties(component: Component) -> Dict:
        """
        Converts catalog components into appropriate canvas properties format

        If component_id is one of the generic set, generic template is rendered,
        otherwise, the  runtime-specific property template is rendered
        """
        if ComponentCache.get_generic_component(component.id) is not None:
            template = ComponentCache.load_jinja_template("generic_properties_template.jinja2")
        else:
            template = ComponentCache.load_jinja_template("canvas_properties_template.jinja2")

        canvas_properties = template.render(component=component)
        return json.loads(canvas_properties)
Beispiel #3
0
def test_processing_filename_runtime_specific_component(
        monkeypatch, processor, sample_metadata, tmpdir):
    """
    Compiles a single-operation pipeline whose operation refers to a
    file-based component, then checks the display name and the URL
    argument found in the compiled pipeline file.
    """
    # Reader for filesystem-type component definitions (YAML files)
    reader = FilesystemComponentCatalogConnector([".yaml"])

    # Locate the test resource relative to this test module
    resources_dir = os.path.join(os.path.dirname(__file__), "..", "resources")
    absolute_path = os.path.abspath(
        os.path.join(resources_dir, "components", "download_data.yaml"))

    # Fetch the entry data for the component stored at that path
    entry_data = reader.get_entry_data({"path": absolute_path}, {})

    # Build the file-based component from the definition just read
    component_id = "test-component"
    component = Component(
        id=component_id,
        name="Download data",
        description="",
        op="download-data",
        catalog_type="elyra-kfp-examples-catalog",
        component_reference={"path": absolute_path},
        definition=entry_data.definition,
        properties=[],
        categories=[],
    )

    # Plant the component in the cache under a made-up catalog name
    spoofed_catalogs = {"spoofed_catalog": {"components": {component_id: component}}}
    ComponentCache.instance()._component_cache[processor._type.name] = spoofed_catalogs

    # Describe a hypothetical operation that uses the component
    operation_name = "Download data test"
    operation_params = {
        "url":
        "https://raw.githubusercontent.com/elyra-ai/elyra/master/tests/assets/helloworld.ipynb",
        "curl_options": "--location",
    }
    operation = Operation(
        id="download-data-id",
        type="execution_node",
        classifier=component_id,
        name=operation_name,
        parent_operation_ids=[],
        component_params=operation_params,
    )

    # Replace the processor's runtime configuration lookup with a mock
    mocked_runtime = Metadata(
        name="test-metadata",
        display_name="test",
        schema_name="kfp",
        metadata=sample_metadata,
    )
    metadata_lookup = mock.Mock(
        return_value="default",
        side_effect=[mocked_runtime, sample_metadata],
    )
    monkeypatch.setattr(processor, "_get_metadata_configuration", metadata_lookup)

    # Assemble the single-operation pipeline
    pipeline = Pipeline(
        id="pipeline-id",
        name="kfp_test",
        runtime="kfp",
        runtime_config="test",
        source="download_data.pipeline",
    )
    pipeline.operations[operation.id] = operation

    # NOTE(review): kept as a lambda on purpose -- the compiler may derive
    # names from the callable, so confirm before converting this to a def
    constructed_pipeline_function = lambda: processor._cc_pipeline(
        pipeline=pipeline, pipeline_name="test_pipeline")
    pipeline_path = os.path.join(tmpdir, "kfp_test.yaml")

    # TODO Check against both argo and tekton compilations
    # Compile the pipeline into the file at pipeline_path
    compiler = kfp_argo_compiler.Compiler()
    compiler.compile(constructed_pipeline_function, pipeline_path)

    # Load the compiled pipeline YAML
    with open(pipeline_path) as compiled_file:
        pipeline_yaml = yaml.safe_load(compiled_file)

    # Check the first template of the compiled pipeline for correctness
    first_template = pipeline_yaml["spec"]["templates"][0]
    annotations = first_template["metadata"]["annotations"]
    assert annotations["pipelines.kubeflow.org/task_display_name"] == operation_name
    assert first_template["container"]["command"][3] == operation_params["url"]
Beispiel #4
0
def test_cc_pipeline_component_no_input(monkeypatch, processor,
                                        component_cache, sample_metadata,
                                        tmpdir):
    """
    Checks that _cc_pipeline copes with a KFP component definition that
    declares no inputs: the pipeline is compiled and no result is inspected.
    """
    # Reader for filesystem-type component definitions (YAML files)
    reader = FilesystemComponentCatalogConnector([".yaml"])

    # Locate the test resource relative to this test module
    components_dir = Path(__file__).parent / ".." / "resources" / "components"
    resolved_path = (components_dir / "kfp_test_operator_no_inputs.yaml").resolve()
    assert resolved_path.is_file()
    cpath = str(resolved_path)

    # Fetch the entry data for the component stored at that path
    entry_data = reader.get_entry_data({"path": cpath}, {})

    # Build the file-based component from the definition just read
    component_id = "test-component"
    component = Component(
        id=component_id,
        name="No input data",
        description="",
        op="no-input-data",
        catalog_type="elyra-kfp-examples-catalog",
        component_reference={"path": cpath},
        definition=entry_data.definition,
        properties=[],
        categories=[],
    )

    # Plant the component in the cache under a made-up catalog name
    spoofed_catalogs = {"spoofed_catalog": {"components": {component_id: component}}}
    component_cache._component_cache[processor._type.name] = spoofed_catalogs

    # Describe a hypothetical, parameterless operation that uses the component
    operation = Operation(
        id="no-input-id",
        type="execution_node",
        classifier=component_id,
        name="no-input-test",
        parent_operation_ids=[],
        component_params={},
    )

    # Replace the processor's runtime configuration lookup with a mock
    mocked_runtime = Metadata(
        name="test-metadata",
        display_name="test",
        schema_name="kfp",
        metadata=sample_metadata,
    )
    metadata_lookup = mock.Mock(
        return_value="default",
        side_effect=[mocked_runtime, sample_metadata],
    )
    monkeypatch.setattr(processor, "_get_metadata_configuration", metadata_lookup)

    # Assemble the single-operation pipeline
    pipeline = Pipeline(
        id="pipeline-id",
        name="kfp_test",
        runtime="kfp",
        runtime_config="test",
        source="no_input.pipeline",
    )
    pipeline.operations[operation.id] = operation

    # NOTE(review): kept as a lambda on purpose -- the compiler may derive
    # names from the callable, so confirm before converting this to a def
    constructed_pipeline_function = lambda: processor._cc_pipeline(
        pipeline=pipeline, pipeline_name="test_pipeline")
    pipeline_path = str(Path(tmpdir) / "no_inputs_test.yaml")

    # Compile the pipeline into the file at pipeline_path
    compiler = kfp_argo_compiler.Compiler()
    compiler.compile(constructed_pipeline_function, pipeline_path)