Example #1
    def _create_catalog(  # pylint: disable=no-self-use,too-many-arguments
        self,
        conf_catalog: Dict[str, Any],
        conf_creds: Dict[str, Any],
        save_version: str = None,
        journal: Journal = None,
        load_versions: Dict[str, str] = None,
    ) -> DataCatalog:
        """A factory method for the DataCatalog instantiation.

        Returns:
            DataCatalog defined in `catalog.yml`.

        """
        hook_manager = get_hook_manager()
        catalog = hook_manager.hook.register_catalog(  # pylint: disable=no-member
            catalog=conf_catalog,
            credentials=conf_creds,
            load_versions=load_versions,
            save_version=save_version,
            journal=journal,
        )

        return catalog or DataCatalog.from_config(  # for backwards compatibility
            conf_catalog, conf_creds, load_versions, save_version, journal)
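The factory above falls back to `DataCatalog.from_config` only when no `register_catalog` hook implementation returns a catalog. A minimal sketch of such a project-side hook, assuming Kedro's `hook_impl` decorator and the argument names used at the call site:

from typing import Any, Dict

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.versioning import Journal


class ProjectHooks:
    @hook_impl
    def register_catalog(
        self,
        catalog: Dict[str, Any],
        credentials: Dict[str, Any],
        load_versions: Dict[str, str],
        save_version: str,
        journal: Journal,
    ) -> DataCatalog:
        # Mirror the backwards-compatible fallback above: build the catalog
        # directly from the loaded configuration.
        return DataCatalog.from_config(
            catalog, credentials, load_versions, save_version, journal
        )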
Example #2
    def _get_config_loader(self) -> ConfigLoader:
        """A hook for changing the creation of a ConfigLoader instance.

        Returns:
            Instance of `ConfigLoader` created by `register_config_loader` hook.
        Raises:
            KedroContextError: Incorrect ``ConfigLoader`` registered for the project.

        """
        conf_root = _get_project_settings(self.package_name, "CONF_ROOT",
                                          self.CONF_ROOT)
        conf_paths = [
            str(self.project_path / conf_root / "base"),
            str(self.project_path / conf_root / self.env),
        ]
        hook_manager = get_hook_manager()
        config_loader = hook_manager.hook.register_config_loader(  # pylint: disable=no-member
            conf_paths=conf_paths,
            env=self.env,
            extra_params=self._extra_params,
        )
        if not isinstance(config_loader, ConfigLoader):
            raise KedroContextError(
                f"Expected an instance of `ConfigLoader`, "
                f"got `{type(config_loader).__name__}` instead.")
        return config_loader
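For reference, a sketch of a `register_config_loader` implementation that would satisfy the `isinstance` check above; the signature is assumed from the call site:

from typing import Any, Dict, Iterable

from kedro.config import ConfigLoader
from kedro.framework.hooks import hook_impl


class ProjectHooks:
    @hook_impl
    def register_config_loader(
        self,
        conf_paths: Iterable[str],
        env: str,
        extra_params: Dict[str, Any],
    ) -> ConfigLoader:
        # Return a plain ConfigLoader; a project could substitute a subclass here.
        return ConfigLoader(conf_paths)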
Example #3
    def _register_hooks_setuptools(self):
        """Register pluggy hooks from setuptools entrypoints."""
        hook_manager = get_hook_manager()
        already_registered = hook_manager.get_plugins()
        found = hook_manager.load_setuptools_entrypoints(_PLUGIN_HOOKS)
        disable_plugins = set(self.static_data.get("disable_hooks_for_plugins", []))

        # Get list of plugin/distinfo tuples for all setuptools registered plugins.
        plugininfo = hook_manager.list_plugin_distinfo()
        plugin_names = []
        disabled_plugin_names = []
        for plugin, dist in plugininfo:
            if dist.project_name in disable_plugins:
                # `unregister()` is used instead of `set_blocked()` because we
                # want to disable hooks for a specific plugin based on its
                # project name rather than its `entry_point` name. Also, we log
                # project names with versions for which hooks were registered.
                hook_manager.unregister(plugin=plugin)
                found -= 1
                disabled_plugin_names.append(f"{dist.project_name}-{dist.version}")
            elif plugin not in already_registered:
                plugin_names.append(f"{dist.project_name}-{dist.version}")

        if disabled_plugin_names:
            logging.info(
                "Hooks are disabled for plugin(s): %s",
                ", ".join(sorted(disabled_plugin_names)),
            )

        if plugin_names:
            logging.info(
                "Registered hooks from %d installed plugin(s): %s",
                found,
                ", ".join(sorted(plugin_names)),
            )
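Example #4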
    def test_package_name_provided(
        self,
        mock_logging,
        mock_run_node,
        mock_register_hooks,
        mock_configure_project,
        is_async,
        mocker,
    ):
        mocker.patch("multiprocessing.get_start_method", return_value="spawn")
        node = mocker.sentinel.node
        catalog = mocker.sentinel.catalog
        run_id = "fake_run_id"
        package_name = mocker.sentinel.package_name
        hook_manager = get_hook_manager()

        _run_node_synchronization(node,
                                  catalog,
                                  is_async,
                                  run_id,
                                  package_name=package_name)
        mock_run_node.assert_called_once_with(node, catalog, is_async, run_id)
        mock_register_hooks.assert_called_once_with(hook_manager)
        mock_logging.assert_called_once_with({})
        mock_configure_project.assert_called_once_with(package_name)
Example #5
def _call_node_run(
    node: Node,
    catalog: DataCatalog,
    inputs: Dict[str, Any],
    is_async: bool,
    run_id: str = None,
) -> Dict[str, Any]:
    hook_manager = get_hook_manager()
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )
    return outputs
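A hedged sketch of an `on_node_error` implementation matching the error hook fired above (pluggy allows implementations to accept a subset of the spec's arguments):

import logging

from kedro.framework.hooks import hook_impl
from kedro.pipeline.node import Node


class NodeErrorHooks:
    @hook_impl
    def on_node_error(self, error: Exception, node: Node) -> None:
        # Log which node failed; the full spec also passes catalog, inputs,
        # is_async and run_id.
        logging.error("Node `%s` failed: %s", node.name, error)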
Example #6
def _run_node_sequential(node: Node,
                         catalog: DataCatalog,
                         run_id: str = None) -> Node:
    inputs = {}
    hook_manager = get_hook_manager()

    for name in node.inputs:
        hook_manager.hook.before_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name)
        inputs[name] = catalog.load(name)
        hook_manager.hook.after_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name, data=inputs[name])

    is_async = False

    additional_inputs = _collect_inputs_from_hook(node,
                                                  catalog,
                                                  inputs,
                                                  is_async,
                                                  run_id=run_id)
    inputs.update(additional_inputs)

    outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id)

    for name, data in outputs.items():
        hook_manager.hook.before_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data)
        catalog.save(name, data)
        hook_manager.hook.after_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data)
    return node
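The dataset-level hooks invoked in the loops above can be implemented on the project side. A minimal logging sketch using only the specs called in this example:

import logging

from kedro.framework.hooks import hook_impl


class DatasetLoggingHooks:
    @hook_impl
    def before_dataset_loaded(self, dataset_name: str) -> None:
        logging.info("Loading dataset `%s`", dataset_name)

    @hook_impl
    def after_dataset_saved(self, dataset_name: str, data: object) -> None:
        logging.info("Saved dataset `%s`", dataset_name)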
Example #7
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {name: catalog.load(name) for name in node.inputs}
    hook_manager = get_hook_manager()
    is_async = False
    hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
    )
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )

    for name, data in outputs.items():
        catalog.save(name, data)
    return node
Example #8
def _collect_inputs_from_hook(
    node: Node,
    catalog: DataCatalog,
    inputs: Dict[str, Any],
    is_async: bool,
    run_id: str = None,
) -> Dict[str, Any]:
    # shallow copy to prevent in-place modification by the hook
    inputs = inputs.copy()
    hook_manager = get_hook_manager()
    hook_response = hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        is_async=is_async,
        run_id=run_id,
    )

    additional_inputs = {}
    for response in hook_response:
        if response is not None and not isinstance(response, dict):
            response_type = type(response).__name__
            raise TypeError(
                f"`before_node_run` must return either None or a dictionary mapping "
                f"dataset names to updated values, got `{response_type}` instead."
            )
        response = response or {}
        additional_inputs.update(response)

    return additional_inputs
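As the type check above implies, a `before_node_run` implementation may return a dictionary of replacement inputs. A hypothetical override hook (the dataset name `params:sample_size` is made up for illustration):

from typing import Any, Dict, Optional

from kedro.framework.hooks import hook_impl
from kedro.pipeline.node import Node


class InputOverrideHooks:
    @hook_impl
    def before_node_run(
        self, node: Node, inputs: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        # Hypothetical: pin one input to a fixed value before the node runs.
        if "params:sample_size" in inputs:
            return {"params:sample_size": 10}
        return None  # returning None leaves the inputs unchanged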
Example #9
def _clear_hook_manager():
    from kedro.framework.hooks import get_hook_manager

    hook_manager = get_hook_manager()
    name_plugin_pairs = hook_manager.list_name_plugin()
    for name, plugin in name_plugin_pairs:
        hook_manager.unregister(name=name, plugin=plugin)  # pragma: no cover
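A sketch of how such a helper would typically be wired into a pytest suite; the fixture name and `autouse` placement are assumptions:

import pytest


@pytest.fixture(autouse=True)
def clean_hook_manager():
    # Start and finish every test with an empty global hook manager.
    _clear_hook_manager()
    yield
    _clear_hook_manager()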
Example #10
    def create(  # pylint: disable=too-many-arguments
        cls,
        package_name: str = None,
        project_path: Union[Path, str] = None,
        save_on_close: bool = True,
        env: str = None,
        extra_params: Dict[str, Any] = None,
    ) -> "KedroSession":
        """Create a new instance of ``KedroSession`` with the session data.

        Args:
            package_name: Package name for the Kedro project the session is
                created for.
            project_path: Path to the project root directory. Default is
                current working directory Path.cwd().
            save_on_close: Whether or not to save the session when it's closed.
            env: Environment for the KedroContext.
            extra_params: Optional dictionary containing extra project parameters
                for underlying KedroContext. If specified, will update (and
                therefore take precedence over) the parameters retrieved from
                the project configuration.

        Returns:
            A new ``KedroSession`` instance.
        """
        # pylint: disable=protected-access
        session = cls(
            package_name=package_name,
            project_path=project_path,
            session_id=generate_timestamp(),
            save_on_close=save_on_close,
        )

        # have to explicitly type session_data, otherwise mypy will complain
        # possibly related to this: https://github.com/python/mypy/issues/1430
        session_data: Dict[str, Any] = {
            "package_name": session._package_name,
            "project_path": session._project_path,
            "session_id": session.session_id,
            **_describe_git(session._project_path),
        }

        ctx = click.get_current_context(silent=True)
        if ctx:
            session_data["cli"] = _jsonify_cli_context(ctx)

        if env:
            session_data["env"] = env

        if extra_params:
            session_data["extra_params"] = extra_params

        session._store.update(session_data)

        hook_manager = get_hook_manager()
        _register_all_project_hooks(hook_manager)
        # we need a ConfigLoader registered in order to be able to set up logging
        session._setup_logging()
        return session
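A minimal usage sketch for this factory method; the package name and paths are hypothetical:

from kedro.framework.session import KedroSession

session = KedroSession.create(
    package_name="my_project",
    project_path="/path/to/my-kedro-project",
    env="local",
)
context = session.load_context()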
Example #11
def reload_kedro(path, line=None):
    """Line magic which reloads all Kedro default variables."""
    global startup_error
    global context
    global catalog
    global session

    try:
        import kedro.config.default_logger
        from kedro.framework.hooks import get_hook_manager
        from kedro.framework.project import configure_project
        from kedro.framework.session import KedroSession
        from kedro.framework.session.session import _activate_session
        from kedro.framework.cli.jupyter import collect_line_magic
    except ImportError:
        logging.error(
            "Kedro appears not to be installed in your current environment "
            "or your current IPython session was not started in a valid Kedro project."
        )
        raise

    try:
        path = path or project_path

        # clear hook manager
        hook_manager = get_hook_manager()
        name_plugin_pairs = hook_manager.list_name_plugin()
        for name, plugin in name_plugin_pairs:
            hook_manager.unregister(name=name, plugin=plugin)

        # remove cached user modules
        metadata = _get_project_metadata(path)
        to_remove = [
            mod for mod in sys.modules if mod.startswith(metadata.package_name)
        ]
        # `del` is used instead of `reload()` because, if the new version of a
        # module does not define a name that was defined by the old version,
        # the old definition remains.
        for module in to_remove:
            del sys.modules[module]

        configure_project(metadata.package_name)
        session = KedroSession.create(metadata.package_name, path)
        _activate_session(session, force=True)
        logging.debug("Loading the context from %s", str(path))
        context = session.load_context()
        catalog = context.catalog

        logging.info("** Kedro project %s", str(metadata.project_name))
        logging.info("Defined global variable `context` and `catalog`")

        for line_magic in collect_line_magic():
            register_line_magic(needs_local_scope(line_magic))
            logging.info("Registered line magic `%s`", line_magic.__name__)
    except Exception as err:
        startup_error = err
        logging.exception("Kedro's ipython session startup script failed:\n%s",
                          str(err))
        raise err
Example #12
def dummy_context(tmp_path, prepare_project_dir, env, extra_params):  # pylint: disable=unused-argument
    context = KedroContext(MOCK_PACKAGE_NAME,
                           str(tmp_path),
                           env=env,
                           extra_params=extra_params)

    hook_manager = get_hook_manager()
    _register_all_project_hooks(hook_manager, MOCK_PACKAGE_NAME)
    return context
Example #13
    def _register_hooks(self) -> None:
        """Register all hooks as specified in ``hooks`` with the global ``hook_manager``."""
        self._hook_manager = get_hook_manager()
        for hooks_collection in self.hooks:
            # Sometimes users might create more than one context instance, in which case
            # hooks have already been registered, so we perform a simple check here
            # to avoid an error being raised and breaking the user's workflow.
            if not self._hook_manager.is_registered(hooks_collection):
                self._hook_manager.register(hooks_collection)
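Example #14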
def reload_kedro(path, line=None):
    """Line magic which reloads all Kedro default variables."""
    global startup_error
    global context
    global catalog

    try:
        import kedro.config.default_logger  # noqa
        from kedro.framework.cli.jupyter import collect_line_magic
        from kedro.framework.context import load_context
    except ImportError:
        logging.error(
            "Kedro appears not to be installed in your current environment "
            "or your current IPython session was not started in a valid Kedro project."
        )
        raise

    try:
        path = path or project_path

        # remove cached user modules
        context = load_context(path)
        to_remove = [
            mod for mod in sys.modules if mod.startswith(context.package_name)
        ]
        # `del` is used instead of `reload()` because, if the new version of a
        # module does not define a name that was defined by the old version,
        # the old definition remains.
        for module in to_remove:
            del sys.modules[module]

        # clear hook manager; hook implementations will be re-registered when the
        # context is instantiated again in `load_context()` below
        hook_manager = get_hook_manager()
        name_plugin_pairs = hook_manager.list_name_plugin()
        for name, plugin in name_plugin_pairs:
            hook_manager.unregister(name=name, plugin=plugin)

        logging.debug("Loading the context from %s", str(path))
        # Reload context to fix `pickle` related error (it is unable to serialize reloaded objects)
        # Some details can be found here:
        # https://modwsgi.readthedocs.io/en/develop/user-guides/issues-with-pickle-module.html#packing-and-script-reloading
        context = load_context(path)
        catalog = context.catalog

        logging.info("** Kedro project %s", str(context.project_name))
        logging.info("Defined global variable `context` and `catalog`")

        for line_magic in collect_line_magic():
            register_line_magic(needs_local_scope(line_magic))
            logging.info("Registered line magic `%s`", line_magic.__name__)
    except Exception as err:
        startup_error = err
        logging.exception("Kedro's ipython session startup script failed:\n%s",
                          str(err))
        raise err
Example #15
def _run_node_async(node: Node,
                    catalog: DataCatalog,
                    run_id: str = None) -> Node:
    def _synchronous_dataset_load(dataset_name: str):
        """Minimal wrapper to ensure Hooks are run synchronously
        within an asynchronous dataset load."""
        hook_manager.hook.before_dataset_loaded(  # pylint: disable=no-member
            dataset_name=dataset_name)
        return_ds = catalog.load(dataset_name)
        hook_manager.hook.after_dataset_loaded(  # pylint: disable=no-member
            dataset_name=dataset_name,
            data=return_ds)
        return return_ds

    with ThreadPoolExecutor() as pool:
        inputs: Dict[str, Future] = {}
        hook_manager = get_hook_manager()

        for name in node.inputs:
            inputs[name] = pool.submit(_synchronous_dataset_load, name)

        wait(inputs.values(), return_when=ALL_COMPLETED)
        inputs = {key: value.result() for key, value in inputs.items()}
        is_async = True
        additional_inputs = _collect_inputs_from_hook(node,
                                                      catalog,
                                                      inputs,
                                                      is_async,
                                                      run_id=run_id)
        inputs.update(additional_inputs)

        outputs = _call_node_run(node,
                                 catalog,
                                 inputs,
                                 is_async,
                                 run_id=run_id)

        # Map each save future back to its dataset so that `after_dataset_saved`
        # fires for the dataset that actually completed, instead of reusing
        # whichever loop variables happened to be bound last.
        save_futures = {}

        for name, data in outputs.items():
            hook_manager.hook.before_dataset_saved(  # pylint: disable=no-member
                dataset_name=name, data=data)
            save_futures[pool.submit(catalog.save, name, data)] = (name, data)

        for future in as_completed(save_futures):
            exception = future.exception()
            if exception:
                raise exception
            name, data = save_futures[future]
            hook_manager.hook.after_dataset_saved(  # pylint: disable=no-member
                dataset_name=name, data=data)
    return node
Example #16
    def _create_config_loader(  # pylint: disable=no-self-use
            self, conf_paths: Iterable[str]) -> ConfigLoader:
        """A factory method for the ConfigLoader instantiation.

        Returns:
            Instance of `ConfigLoader`.

        """
        hook_manager = get_hook_manager()
        config_loader = hook_manager.hook.register_config_loader(  # pylint: disable=no-member
            conf_paths=conf_paths)
        return config_loader or ConfigLoader(
            conf_paths)  # for backwards compatibility
Example #17
def _run_node_async(node: Node,
                    catalog: DataCatalog,
                    run_id: str = None) -> Node:
    with ThreadPoolExecutor() as pool:
        inputs = {
            name: pool.submit(catalog.load, name)
            for name in node.inputs
        }  # Python dict is thread-safe
        wait(inputs.values(), return_when=ALL_COMPLETED)
        inputs = {key: value.result() for key, value in inputs.items()}
        hook_manager = get_hook_manager()
        is_async = True
        hook_manager.hook.before_node_run(  # pylint: disable=no-member
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id)
        try:
            outputs = node.run(inputs)
        except Exception as exc:
            hook_manager.hook.on_node_error(  # pylint: disable=no-member
                error=exc,
                node=node,
                catalog=catalog,
                inputs=inputs,
                is_async=is_async,
                run_id=run_id,
            )
            raise exc
        hook_manager.hook.after_node_run(  # pylint: disable=no-member
            node=node,
            catalog=catalog,
            inputs=inputs,
            outputs=outputs,
            is_async=is_async,
            run_id=run_id,
        )

        save_futures = set()

        for name, data in outputs.items():
            save_futures.add(pool.submit(catalog.save, name, data))

        for future in as_completed(save_futures):
            exception = future.exception()
            if exception:
                raise exception
    return node
Example #18
def configure_project(package_name: str):
    """Configure a Kedro project by populating its settings with values
    defined in user's settings.py and pipeline_registry.py.
    """
    settings_module = f"{package_name}.settings"
    _validate_module(settings_module)
    settings.configure(settings_module)

    # set up all hooks so we can discover all pipelines
    hook_manager = get_hook_manager()
    _register_hooks(hook_manager, settings.HOOKS)
    _register_hooks_setuptools(hook_manager,
                               settings.DISABLE_HOOKS_FOR_PLUGINS)

    pipelines_module = f"{package_name}.pipeline_registry"
    pipelines.configure(pipelines_module)
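A sketch of the `_register_hooks` helper assumed by `configure_project` above, following the registration pattern used throughout these examples:

from typing import Iterable


def _register_hooks(hook_manager, hooks: Iterable) -> None:
    """Register each hooks collection once, skipping any already registered."""
    for hooks_collection in hooks:
        if not hook_manager.is_registered(hooks_collection):
            hook_manager.register(hooks_collection)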
Example #19
    def _register_hooks_setuptools():
        """Register pluggy hooks from setuptools entrypoints."""
        hook_manager = get_hook_manager()
        already_registered = hook_manager.get_plugins()
        found = hook_manager.load_setuptools_entrypoints(_PLUGIN_HOOKS)

        if found:
            plugininfo = hook_manager.list_plugin_distinfo()
            plugin_names = sorted(
                f"{dist.project_name}-{dist.version}"
                for plugin, dist in plugininfo
                if plugin not in already_registered
            )
            logging.info(
                "Registered hooks from %d installed plugin(s): %s",
                found,
                ", ".join(plugin_names),
            )
Example #20
    def _register_hooks(self, auto: bool = False) -> None:
        """Register all hooks as specified in ``hooks`` with the global ``hook_manager``,
        and, optionally, from installed plugins.

        Args:
            auto: An optional flag to enable auto-discovery and registration of plugin hooks.
        """
        hook_manager = get_hook_manager()

        if auto:
            self._register_hooks_setuptools()

        for hooks_collection in self.hooks:
            # Sometimes users might create more than one context instance, in which case
            # hooks have already been registered, so we perform a simple check here
            # to avoid an error being raised and breaking the user's workflow.
            if not hook_manager.is_registered(hooks_collection):
                hook_manager.register(hooks_collection)
Example #21
    def _get_pipelines(self) -> Dict[str, Pipeline]:  # pylint: disable=no-self-use
        """Abstract method for a hook for changing the creation of a Pipeline instance.

        Returns:
            A dictionary of defined pipelines.
        """
        hook_manager = get_hook_manager()
        pipelines_dicts = (
            hook_manager.hook.register_pipelines()  # pylint: disable=no-member
        )

        pipelines = {}  # type: Dict[str, Pipeline]
        for pipeline_collection in pipelines_dicts:
            duplicate_keys = pipeline_collection.keys() & pipelines.keys()
            if duplicate_keys:
                warn(
                    f"Found duplicate pipeline entries. "
                    f"The following will be overwritten: {', '.join(duplicate_keys)}"
                )
            pipelines.update(pipeline_collection)

        return pipelines
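On the project side, the `register_pipelines` hook collected above would look roughly like this; the identity node is a made-up placeholder:

from typing import Dict

from kedro.framework.hooks import hook_impl
from kedro.pipeline import Pipeline, node


class ProjectHooks:
    @hook_impl
    def register_pipelines(self) -> Dict[str, Pipeline]:
        identity = Pipeline(
            [node(lambda x: x, inputs="raw_data", outputs="clean_data",
                  name="identity")]
        )
        return {"__default__": identity, "identity": identity}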
Example #22
def configure_project(package_name: str):
    """Configure a Kedro project by populating its settings with values
    defined in user's settings.py and pipeline_registry.py.
    """
    settings_module = f"{package_name}.settings"
    settings.configure(settings_module)

    # set up all hooks so we can discover all pipelines
    hook_manager = get_hook_manager()
    _register_hooks(hook_manager, settings.HOOKS)
    _register_hooks_setuptools(hook_manager,
                               settings.DISABLE_HOOKS_FOR_PLUGINS)

    pipelines_module = f"{package_name}.pipeline_registry"
    pipelines.configure(pipelines_module)

    # Once the project is successfully configured once, store PACKAGE_NAME as a
    # global variable to make it easily accessible. This is used by validate_settings()
    # below, and also by ParallelRunner on Windows, as package_name is required every
    # time a new subprocess is spawned.
    global PACKAGE_NAME
    PACKAGE_NAME = package_name
Example #23
    def _get_catalog(
        self,
        save_version: str = None,
        journal: Journal = None,
        load_versions: Dict[str, str] = None,
    ) -> DataCatalog:
        """A hook for changing the creation of a DataCatalog instance.

        Returns:
            DataCatalog defined in `catalog.yml`.

        """
        # '**/catalog*' reads modular pipeline configs
        conf_catalog = self.config_loader.get("catalog*", "catalog*/**", "**/catalog*")
        # turn relative paths in conf_catalog into absolute paths
        # before initializing the catalog
        conf_catalog = _convert_paths_to_absolute_posix(
            project_path=self.project_path, conf_dictionary=conf_catalog
        )
        conf_creds = self._get_config_credentials()
        catalog = self._create_catalog(
            conf_catalog, conf_creds, save_version, journal, load_versions
        )
        feed_dict = self._get_feed_dict()
        catalog.add_feed_dict(feed_dict)
        if catalog.layers:
            _validate_layers_for_transcoding(catalog)
        hook_manager = get_hook_manager()
        hook_manager.hook.after_catalog_created(  # pylint: disable=no-member
            catalog=catalog,
            conf_catalog=conf_catalog,
            conf_creds=conf_creds,
            feed_dict=feed_dict,
            save_version=save_version,
            load_versions=load_versions,
            run_id=self.run_id,
        )
        return catalog
Example #24
    def _register_hooks(self, auto: bool = False) -> None:
        """Register all hooks as specified in ``hooks`` with the global ``hook_manager``,
        and, optionally, from installed plugins.

        Args:
            auto: An optional flag to enable auto-discovery and registration of plugin hooks.
        """
        hook_manager = get_hook_manager()

        # enrich with hooks specified in .kedro.yml or pyproject.toml if .kedro.yml doesn't exist
        hooks_locations = self.static_data.get("hooks", [])
        configured_hooks = tuple(load_obj(hook) for hook in hooks_locations)

        all_hooks = self.hooks + configured_hooks
        for hooks_collection in all_hooks:
            # Sometimes users might create more than one context instance, in which case
            # hooks have already been registered, so we perform a simple check here
            # to avoid an error being raised and breaking the user's workflow.
            if not hook_manager.is_registered(hooks_collection):
                hook_manager.register(hooks_collection)

        if auto:
            self._register_hooks_setuptools()
Example #25
    def _load_data(self):
        """Lazily read pipelines defined in the pipelines registry module"""

        # If the pipelines dictionary has not been configured with a pipelines module
        # or if data has been loaded
        if self._pipelines_module is None or self._is_data_loaded:
            return

        try:
            register_pipelines = self._get_pipelines_registry_callable(
                self._pipelines_module)
        except (ModuleNotFoundError, AttributeError) as exc:
            # for backwards compatibility with templates < 0.17.2
            # where no pipelines_registry is defined
            if self._pipelines_module in str(exc):  # pragma: no cover
                project_pipelines = {}
            else:
                raise
        else:
            project_pipelines = register_pipelines()

        hook_manager = get_hook_manager()
        pipelines_dicts = (
            hook_manager.hook.register_pipelines()  # pylint: disable=no-member
        )
        for pipeline_collection in pipelines_dicts:
            duplicate_keys = pipeline_collection.keys() & project_pipelines.keys()
            if duplicate_keys:
                warn(
                    f"Found duplicate pipeline entries. "
                    f"The following will be overwritten: {', '.join(duplicate_keys)}"
                )
            project_pipelines.update(pipeline_collection)

        self._content = project_pipelines
        self._is_data_loaded = True
Example #26
    def _register_hooks(self, auto: bool = False) -> None:
        """Register all hooks as specified in ``hooks`` with the global ``hook_manager``,
        and, optionally, from installed plugins.

        Args:
            auto: An optional flag to enable auto-discovery and registration of plugin hooks.
        """
        self._hook_manager = get_hook_manager()

        if auto:
            found = self._hook_manager.load_setuptools_entrypoints(
                _PLUGIN_HOOKS)
            if found:  # pragma: no cover
                logging.info("Registered hooks from %d installed plugin(s)",
                             found)

        for hooks_collection in self.hooks:
            # Sometimes users might create more than one context instance, in which case
            # hooks have already been registered, so we perform a simple check here
            # to avoid an error being raised and breaking the user's workflow.
            if not self._hook_manager.is_registered(
                    hooks_collection):  # pragma: no cover
                self._hook_manager.register(
                    hooks_collection)  # pragma: no cover
Example #27
    def run(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        tags: Iterable[str] = None,
        runner: AbstractRunner = None,
        node_names: Iterable[str] = None,
        from_nodes: Iterable[str] = None,
        to_nodes: Iterable[str] = None,
        from_inputs: Iterable[str] = None,
        to_outputs: Iterable[str] = None,
        load_versions: Dict[str, str] = None,
        pipeline_name: str = None,
    ) -> Dict[str, Any]:
        """Runs the pipeline with a specified runner.

        Args:
            tags: An optional list of node tags which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                containing *any* of these tags will be run.
            runner: An optional parameter specifying the runner that you want to run
                the pipeline with.
            node_names: An optional list of node names which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                with these names will be run.
            from_nodes: An optional list of node names which should be used as a
                starting point of the new ``Pipeline``.
            to_nodes: An optional list of node names which should be used as an
                end point of the new ``Pipeline``.
            from_inputs: An optional list of input datasets which should be used as a
                starting point of the new ``Pipeline``.
            to_outputs: An optional list of output datasets which should be used as an
                end point of the new ``Pipeline``.
            load_versions: An optional flag to specify a particular dataset version timestamp
                to load.
            pipeline_name: Name of the ``Pipeline`` to execute.
                Defaults to "__default__".
        Raises:
            KedroContextError: If the resulting ``Pipeline`` is empty
                or incorrect tags are provided.
            Exception: Any uncaught exception will be re-raised
                after being passed to ``on_pipeline_error``.
        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.
        """
        # Report project name
        logging.info("** Kedro project %s", self.project_path.name)

        pipeline = self._get_pipeline(name=pipeline_name)
        filtered_pipeline = self._filter_pipeline(
            pipeline=pipeline,
            tags=tags,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            node_names=node_names,
            from_inputs=from_inputs,
            to_outputs=to_outputs,
        )

        save_version = self._get_save_version()
        run_id = self.run_id or save_version

        record_data = {
            "run_id": run_id,
            "project_path": str(self.project_path),
            "env": self.env,
            "tags": tags,
            "from_nodes": from_nodes,
            "to_nodes": to_nodes,
            "node_names": node_names,
            "from_inputs": from_inputs,
            "to_outputs": to_outputs,
            "load_versions": load_versions,
            "pipeline_name": pipeline_name,
            "extra_params": self._extra_params,
        }
        journal = Journal(record_data)

        catalog = self._get_catalog(save_version=save_version,
                                    journal=journal,
                                    load_versions=load_versions)

        # Run the runner
        runner = runner or SequentialRunner()
        hook_manager = get_hook_manager()
        hook_manager.hook.before_pipeline_run(  # pylint: disable=no-member
            run_params=record_data,
            pipeline=filtered_pipeline,
            catalog=catalog)

        try:
            run_result = runner.run(filtered_pipeline, catalog, run_id)
        except Exception as exc:
            hook_manager.hook.on_pipeline_error(  # pylint: disable=no-member
                error=exc,
                run_params=record_data,
                pipeline=filtered_pipeline,
                catalog=catalog,
            )
            raise exc

        hook_manager.hook.after_pipeline_run(  # pylint: disable=no-member
            run_params=record_data,
            run_result=run_result,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        return run_result
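The pipeline-level hooks fired by `run` can be observed with a simple project-side implementation; a hedged logging sketch using only the specs called above:

import logging
from typing import Any, Dict

from kedro.framework.hooks import hook_impl


class PipelineMonitoringHooks:
    @hook_impl
    def before_pipeline_run(self, run_params: Dict[str, Any]) -> None:
        logging.info("Starting run `%s`", run_params["run_id"])

    @hook_impl
    def on_pipeline_error(self, error: Exception,
                          run_params: Dict[str, Any]) -> None:
        logging.error("Run `%s` failed: %s", run_params["run_id"], error)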
Example #28
    def run(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        pipeline_name: str = None,
        tags: Iterable[str] = None,
        runner: AbstractRunner = None,
        node_names: Iterable[str] = None,
        from_nodes: Iterable[str] = None,
        to_nodes: Iterable[str] = None,
        from_inputs: Iterable[str] = None,
        load_versions: Dict[str, str] = None,
        extra_params: Dict[str, Any] = None,
    ) -> Dict[str, Any]:
        """Runs the pipeline with a specified runner.

        Args:
            pipeline_name: Name of the pipeline that is being run.
            tags: An optional list of node tags which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                containing *any* of these tags will be run.
            runner: An optional parameter specifying the runner that you want to run
                the pipeline with.
            node_names: An optional list of node names which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                with these names will be run.
            from_nodes: An optional list of node names which should be used as a
                starting point of the new ``Pipeline``.
            to_nodes: An optional list of node names which should be used as an
                end point of the new ``Pipeline``.
            from_inputs: An optional list of input datasets which should be
                used as a starting point of the new ``Pipeline``.
            load_versions: An optional flag to specify a particular dataset
                version timestamp to load.
            extra_params: Additional run parameters.
        Raises:
            Exception: Any uncaught exception during the run will be re-raised
                after being passed to ``on_pipeline_error`` hook.
        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.
        """
        # pylint: disable=protected-access,no-member
        # Report project name
        logging.info("** Kedro project %s", self._project_path.name)

        save_version = run_id = self.store["session_id"]
        extra_params = deepcopy(extra_params) or dict()
        context = self.context

        pipeline = context._get_pipeline(name=pipeline_name)
        filtered_pipeline = context._filter_pipeline(
            pipeline=pipeline,
            tags=tags,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            node_names=node_names,
            from_inputs=from_inputs,
        )

        record_data = {
            "run_id": run_id,
            "project_path": self._project_path.as_posix(),
            "env": context.env,
            "kedro_version": self.store["kedro_version"],
            "tags": tags,
            "from_nodes": from_nodes,
            "to_nodes": to_nodes,
            "node_names": node_names,
            "from_inputs": from_inputs,
            "load_versions": load_versions,
            "extra_params": extra_params,
            "pipeline_name": pipeline_name,
        }

        catalog = context._get_catalog(save_version=save_version,
                                       load_versions=load_versions)

        # Run the runner
        runner = runner or SequentialRunner()
        hook = get_hook_manager().hook
        hook.before_pipeline_run(run_params=record_data,
                                 pipeline=filtered_pipeline,
                                 catalog=catalog)

        try:
            run_result = runner.run(filtered_pipeline, catalog, run_id)
        except Exception as error:
            hook.on_pipeline_error(
                error=error,
                run_params=record_data,
                pipeline=filtered_pipeline,
                catalog=catalog,
            )
            raise

        hook.after_pipeline_run(
            run_params=record_data,
            run_result=run_result,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        return run_result
Example #29
def clear_hook_manager():
    yield
    hook_manager = get_hook_manager()
    plugins = hook_manager.get_plugins()
    for plugin in plugins:
        hook_manager.unregister(plugin)
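The `yield` at the top makes this a teardown-only helper; in a test suite it would presumably be registered as a fixture, e.g. in `conftest.py`:

import pytest

# Hypothetical wiring: run the cleanup after every test.
clear_hook_manager = pytest.fixture(autouse=True)(clear_hook_manager)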