def _create_catalog(  # pylint: disable=no-self-use,too-many-arguments
    self,
    conf_catalog: Dict[str, Any],
    conf_creds: Dict[str, Any],
    save_version: str = None,
    journal: Journal = None,
    load_versions: Dict[str, str] = None,
) -> DataCatalog:
    """A factory method for the DataCatalog instantiation.

    Returns:
        DataCatalog defined in `catalog.yml`.

    """
    hook_manager = get_hook_manager()
    catalog = hook_manager.hook.register_catalog(  # pylint: disable=no-member
        catalog=conf_catalog,
        credentials=conf_creds,
        load_versions=load_versions,
        save_version=save_version,
        journal=journal,
    )
    return catalog or DataCatalog.from_config(  # for backwards compatibility
        conf_catalog, conf_creds, load_versions, save_version, journal
    )

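# For reference, a minimal sketch of a hook implementation that the
# `register_catalog` call above would pick up. The class name is illustrative;
# the `hook_impl` marker is assumed to come from `kedro.framework.hooks`, and
# the argument names simply mirror the call site above.
from typing import Any, Dict

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.versioning import Journal


class ProjectCatalogHooks:
    @hook_impl
    def register_catalog(
        self,
        catalog: Dict[str, Any],
        credentials: Dict[str, Any],
        load_versions: Dict[str, str],
        save_version: str,
        journal: Journal,
    ) -> DataCatalog:
        # Build the catalog from the raw config; a project could customise
        # dataset creation here before returning the catalog.
        return DataCatalog.from_config(
            catalog, credentials, load_versions, save_version, journal
        )
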
def _get_config_loader(self) -> ConfigLoader:
    """A hook for changing the creation of a ConfigLoader instance.

    Returns:
        Instance of `ConfigLoader` created by `register_config_loader` hook.
    Raises:
        KedroContextError: Incorrect ``ConfigLoader`` registered for the project.

    """
    conf_root = _get_project_settings(self.package_name, "CONF_ROOT", self.CONF_ROOT)
    conf_paths = [
        str(self.project_path / conf_root / "base"),
        str(self.project_path / conf_root / self.env),
    ]
    hook_manager = get_hook_manager()
    config_loader = hook_manager.hook.register_config_loader(  # pylint: disable=no-member
        conf_paths=conf_paths,
        env=self.env,
        extra_params=self._extra_params,
    )
    if not isinstance(config_loader, ConfigLoader):
        raise KedroContextError(
            f"Expected an instance of `ConfigLoader`, "
            f"got `{type(config_loader).__name__}` instead."
        )
    return config_loader

def _register_hooks_setuptools(self):
    """Register pluggy hooks from setuptools entrypoints."""
    hook_manager = get_hook_manager()
    already_registered = hook_manager.get_plugins()
    found = hook_manager.load_setuptools_entrypoints(_PLUGIN_HOOKS)
    disable_plugins = set(self.static_data.get("disable_hooks_for_plugins", []))

    # Get list of plugin/distinfo tuples for all setuptools registered plugins.
    plugininfo = hook_manager.list_plugin_distinfo()
    plugin_names = []
    disabled_plugin_names = []
    for plugin, dist in plugininfo:
        if dist.project_name in disable_plugins:
            # `unregister()` is used instead of `set_blocked()` because
            # we want to disable hooks for specific plugin based on project
            # name and not `entry_point` name. Also, we log project names with
            # version for which hooks were registered.
            hook_manager.unregister(plugin=plugin)
            found -= 1
            disabled_plugin_names.append(f"{dist.project_name}-{dist.version}")
        elif plugin not in already_registered:
            plugin_names.append(f"{dist.project_name}-{dist.version}")

    if disabled_plugin_names:
        logging.info(
            "Hooks are disabled for plugin(s): %s",
            ", ".join(sorted(disabled_plugin_names)),
        )

    if plugin_names:
        logging.info(
            "Registered hooks from %d installed plugin(s): %s",
            found,
            ", ".join(sorted(plugin_names)),
        )

def test_package_name_provided(
    self,
    mock_logging,
    mock_run_node,
    mock_register_hooks,
    mock_configure_project,
    is_async,
    mocker,
):
    mocker.patch("multiprocessing.get_start_method", return_value="spawn")
    node = mocker.sentinel.node
    catalog = mocker.sentinel.catalog
    run_id = "fake_run_id"
    package_name = mocker.sentinel.package_name

    hook_manager = get_hook_manager()
    _run_node_synchronization(
        node, catalog, is_async, run_id, package_name=package_name
    )
    mock_run_node.assert_called_once_with(node, catalog, is_async, run_id)
    mock_register_hooks.assert_called_once_with(hook_manager)
    mock_logging.assert_called_once_with({})
    mock_configure_project.assert_called_once_with(package_name)

def _call_node_run(
    node: Node,
    catalog: DataCatalog,
    inputs: Dict[str, Any],
    is_async: bool,
    run_id: str = None,
) -> Dict[str, Any]:
    hook_manager = get_hook_manager()
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )
    return outputs

def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {}
    hook_manager = get_hook_manager()

    for name in node.inputs:
        hook_manager.hook.before_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name
        )
        inputs[name] = catalog.load(name)
        hook_manager.hook.after_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name, data=inputs[name]
        )

    is_async = False

    additional_inputs = _collect_inputs_from_hook(
        node, catalog, inputs, is_async, run_id=run_id
    )
    inputs.update(additional_inputs)

    outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id)

    for name, data in outputs.items():
        hook_manager.hook.before_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
        catalog.save(name, data)
        hook_manager.hook.after_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
    return node

def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {name: catalog.load(name) for name in node.inputs}
    hook_manager = get_hook_manager()
    is_async = False
    hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
    )
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )
    for name, data in outputs.items():
        catalog.save(name, data)
    return node

def _collect_inputs_from_hook(
    node: Node,
    catalog: DataCatalog,
    inputs: Dict[str, Any],
    is_async: bool,
    run_id: str = None,
) -> Dict[str, Any]:
    inputs = inputs.copy()  # shallow copy to prevent in-place modification by the hook
    hook_manager = get_hook_manager()
    hook_response = hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        is_async=is_async,
        run_id=run_id,
    )

    additional_inputs = {}
    for response in hook_response:
        if response is not None and not isinstance(response, dict):
            response_type = type(response).__name__
            raise TypeError(
                f"`before_node_run` must return either None or a dictionary mapping "
                f"dataset names to updated values, got `{response_type}` instead."
            )
        response = response or {}
        additional_inputs.update(response)

    return additional_inputs

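# A minimal sketch of a `before_node_run` hook whose return value
# `_collect_inputs_from_hook` above would aggregate. The class, node and
# dataset names are illustrative only; the `hook_impl` marker is assumed to
# come from `kedro.framework.hooks`, and the hook arguments mirror the call
# site above.
from typing import Any, Dict, Optional

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.pipeline.node import Node


class ExtraInputsHooks:
    @hook_impl
    def before_node_run(
        self,
        node: Node,
        catalog: DataCatalog,
        inputs: Dict[str, Any],
        is_async: bool,
        run_id: str,
    ) -> Optional[Dict[str, Any]]:
        # Returning a dict overrides the named inputs for this node only;
        # returning None leaves the inputs untouched.
        if node.name == "train_model":  # hypothetical node name
            return {"params:train_seed": 42}  # hypothetical parameter name
        return None
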
def _clear_hook_manager():
    from kedro.framework.hooks import get_hook_manager

    hook_manager = get_hook_manager()
    name_plugin_pairs = hook_manager.list_name_plugin()
    for name, plugin in name_plugin_pairs:
        hook_manager.unregister(name=name, plugin=plugin)  # pragma: no cover

def create(  # pylint: disable=too-many-arguments
    cls,
    package_name: str = None,
    project_path: Union[Path, str] = None,
    save_on_close: bool = True,
    env: str = None,
    extra_params: Dict[str, Any] = None,
) -> "KedroSession":
    """Create a new instance of ``KedroSession`` with the session data.

    Args:
        package_name: Package name for the Kedro project the session is
            created for.
        project_path: Path to the project root directory. Default is
            current working directory Path.cwd().
        save_on_close: Whether or not to save the session when it's closed.
        env: Environment for the KedroContext.
        extra_params: Optional dictionary containing extra project parameters
            for the underlying KedroContext. If specified, will update (and
            therefore take precedence over) the parameters retrieved from the
            project configuration.

    Returns:
        A new ``KedroSession`` instance.
    """
    # pylint: disable=protected-access
    session = cls(
        package_name=package_name,
        project_path=project_path,
        session_id=generate_timestamp(),
        save_on_close=save_on_close,
    )

    # have to explicitly type session_data otherwise mypy will complain
    # possibly related to this: https://github.com/python/mypy/issues/1430
    session_data: Dict[str, Any] = {
        "package_name": session._package_name,
        "project_path": session._project_path,
        "session_id": session.session_id,
        **_describe_git(session._project_path),
    }
    ctx = click.get_current_context(silent=True)
    if ctx:
        session_data["cli"] = _jsonify_cli_context(ctx)
    if env:
        session_data["env"] = env
    if extra_params:
        session_data["extra_params"] = extra_params
    session._store.update(session_data)

    hook_manager = get_hook_manager()
    _register_all_project_hooks(hook_manager)

    # we need a ConfigLoader registered in order to be able to set up logging
    session._setup_logging()
    return session

def reload_kedro(path, line=None):
    """Line magic which reloads all Kedro default variables."""
    global startup_error
    global context
    global catalog
    global session

    try:
        import kedro.config.default_logger
        from kedro.framework.hooks import get_hook_manager
        from kedro.framework.project import configure_project
        from kedro.framework.session import KedroSession
        from kedro.framework.session.session import _activate_session
        from kedro.framework.cli.jupyter import collect_line_magic
    except ImportError:
        logging.error(
            "Kedro appears not to be installed in your current environment "
            "or your current IPython session was not started in a valid Kedro project."
        )
        raise

    try:
        path = path or project_path

        # clear hook manager
        hook_manager = get_hook_manager()
        name_plugin_pairs = hook_manager.list_name_plugin()
        for name, plugin in name_plugin_pairs:
            hook_manager.unregister(name=name, plugin=plugin)

        # remove cached user modules
        metadata = _get_project_metadata(path)
        to_remove = [
            mod for mod in sys.modules if mod.startswith(metadata.package_name)
        ]
        # `del` is used instead of `reload()` because: If the new version of a module
        # does not define a name that was defined by the old version, the old
        # definition remains.
        for module in to_remove:
            del sys.modules[module]

        configure_project(metadata.package_name)
        session = KedroSession.create(metadata.package_name, path)
        _activate_session(session, force=True)
        logging.debug("Loading the context from %s", str(path))
        context = session.load_context()
        catalog = context.catalog

        logging.info("** Kedro project %s", str(metadata.project_name))
        logging.info("Defined global variable `context` and `catalog`")

        for line_magic in collect_line_magic():
            register_line_magic(needs_local_scope(line_magic))
            logging.info("Registered line magic `%s`", line_magic.__name__)
    except Exception as err:
        startup_error = err
        logging.exception(
            "Kedro's ipython session startup script failed:\n%s", str(err)
        )
        raise err

def dummy_context(
    tmp_path, prepare_project_dir, env, extra_params
):  # pylint: disable=unused-argument
    context = KedroContext(
        MOCK_PACKAGE_NAME, str(tmp_path), env=env, extra_params=extra_params
    )
    hook_manager = get_hook_manager()
    _register_all_project_hooks(hook_manager, MOCK_PACKAGE_NAME)
    return context

def _register_hooks(self) -> None:
    """Register all hooks as specified in ``hooks`` with the global
    ``hook_manager``.
    """
    self._hook_manager = get_hook_manager()

    for hooks_collection in self.hooks:
        # Sometimes users might create more than one context instance, in which case
        # hooks have already been registered, so we perform a simple check here
        # to avoid an error being raised and break user's workflow.
        if not self._hook_manager.is_registered(hooks_collection):
            self._hook_manager.register(hooks_collection)

def reload_kedro(path, line=None):
    """Line magic which reloads all Kedro default variables."""
    global startup_error
    global context
    global catalog

    try:
        import kedro.config.default_logger  # noqa
        from kedro.framework.cli.jupyter import collect_line_magic
        from kedro.framework.context import load_context
    except ImportError:
        logging.error(
            "Kedro appears not to be installed in your current environment "
            "or your current IPython session was not started in a valid Kedro project."
        )
        raise

    try:
        path = path or project_path

        # remove cached user modules
        context = load_context(path)
        to_remove = [
            mod for mod in sys.modules if mod.startswith(context.package_name)
        ]
        # `del` is used instead of `reload()` because: If the new version of a module
        # does not define a name that was defined by the old version, the old
        # definition remains.
        for module in to_remove:
            del sys.modules[module]

        # clear hook manager; hook implementations will be re-registered when the
        # context is instantiated again in `load_context()` below
        hook_manager = get_hook_manager()
        name_plugin_pairs = hook_manager.list_name_plugin()
        for name, plugin in name_plugin_pairs:
            hook_manager.unregister(name=name, plugin=plugin)

        logging.debug("Loading the context from %s", str(path))
        # Reload context to fix `pickle` related error (it is unable to serialize
        # reloaded objects). Some details can be found here:
        # https://modwsgi.readthedocs.io/en/develop/user-guides/issues-with-pickle-module.html#packing-and-script-reloading
        context = load_context(path)
        catalog = context.catalog

        logging.info("** Kedro project %s", str(context.project_name))
        logging.info("Defined global variable `context` and `catalog`")

        for line_magic in collect_line_magic():
            register_line_magic(needs_local_scope(line_magic))
            logging.info("Registered line magic `%s`", line_magic.__name__)
    except Exception as err:
        startup_error = err
        logging.exception(
            "Kedro's ipython session startup script failed:\n%s", str(err)
        )
        raise err

def _run_node_async(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    def _synchronous_dataset_load(dataset_name: str):
        """Minimal wrapper to ensure Hooks are run synchronously
        within an asynchronous dataset load."""
        hook_manager.hook.before_dataset_loaded(  # pylint: disable=no-member
            dataset_name=dataset_name
        )
        return_ds = catalog.load(dataset_name)
        hook_manager.hook.after_dataset_loaded(  # pylint: disable=no-member
            dataset_name=dataset_name, data=return_ds
        )
        return return_ds

    with ThreadPoolExecutor() as pool:
        inputs: Dict[str, Future] = {}
        hook_manager = get_hook_manager()

        for name in node.inputs:
            inputs[name] = pool.submit(_synchronous_dataset_load, name)

        wait(inputs.values(), return_when=ALL_COMPLETED)
        inputs = {key: value.result() for key, value in inputs.items()}
        is_async = True

        additional_inputs = _collect_inputs_from_hook(
            node, catalog, inputs, is_async, run_id=run_id
        )
        inputs.update(additional_inputs)

        outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id)

        save_futures = set()

        for name, data in outputs.items():
            hook_manager.hook.before_dataset_saved(  # pylint: disable=no-member
                dataset_name=name, data=data
            )
            save_futures.add(pool.submit(catalog.save, name, data))

        for future in as_completed(save_futures):
            exception = future.exception()
            if exception:
                raise exception
            hook_manager.hook.after_dataset_saved(  # pylint: disable=no-member
                dataset_name=name, data=data  # pylint: disable=undefined-loop-variable
            )
    return node

def _create_config_loader(  # pylint: disable=no-self-use
    self, conf_paths: Iterable[str]
) -> ConfigLoader:
    """A factory method for the ConfigLoader instantiation.

    Returns:
        Instance of `ConfigLoader`.

    """
    hook_manager = get_hook_manager()
    config_loader = hook_manager.hook.register_config_loader(  # pylint: disable=no-member
        conf_paths=conf_paths
    )
    return config_loader or ConfigLoader(conf_paths)  # for backwards compatibility

def _run_node_async(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    with ThreadPoolExecutor() as pool:
        inputs = {
            name: pool.submit(catalog.load, name) for name in node.inputs
        }  # Python dict is thread-safe
        wait(inputs.values(), return_when=ALL_COMPLETED)
        inputs = {key: value.result() for key, value in inputs.items()}
        hook_manager = get_hook_manager()
        is_async = True
        hook_manager.hook.before_node_run(  # pylint: disable=no-member
            node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
        )
        try:
            outputs = node.run(inputs)
        except Exception as exc:
            hook_manager.hook.on_node_error(  # pylint: disable=no-member
                error=exc,
                node=node,
                catalog=catalog,
                inputs=inputs,
                is_async=is_async,
                run_id=run_id,
            )
            raise exc
        hook_manager.hook.after_node_run(  # pylint: disable=no-member
            node=node,
            catalog=catalog,
            inputs=inputs,
            outputs=outputs,
            is_async=is_async,
            run_id=run_id,
        )

        save_futures = set()

        for name, data in outputs.items():
            save_futures.add(pool.submit(catalog.save, name, data))

        for future in as_completed(save_futures):
            exception = future.exception()
            if exception:
                raise exception
    return node

def configure_project(package_name: str):
    """Configure a Kedro project by populating its settings with values
    defined in user's settings.py and pipeline_registry.py.
    """
    settings_module = f"{package_name}.settings"
    _validate_module(settings_module)
    settings.configure(settings_module)
    # set up all hooks so we can discover all pipelines
    hook_manager = get_hook_manager()
    _register_hooks(hook_manager, settings.HOOKS)
    _register_hooks_setuptools(hook_manager, settings.DISABLE_HOOKS_FOR_PLUGINS)

    pipelines_module = f"{package_name}.pipeline_registry"
    pipelines.configure(pipelines_module)

def _register_hooks_setuptools():
    """Register pluggy hooks from setuptools entrypoints."""
    hook_manager = get_hook_manager()
    already_registered = hook_manager.get_plugins()
    found = hook_manager.load_setuptools_entrypoints(_PLUGIN_HOOKS)
    if found:
        plugininfo = hook_manager.list_plugin_distinfo()
        plugin_names = sorted(
            f"{dist.project_name}-{dist.version}"
            for plugin, dist in plugininfo
            if plugin not in already_registered
        )
        logging.info(
            "Registered hooks from %d installed plugin(s): %s",
            found,
            ", ".join(plugin_names),
        )

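# For illustration, how a third-party plugin could expose its hooks so that
# `load_setuptools_entrypoints(_PLUGIN_HOOKS)` above discovers them. This
# sketch assumes the entry-point group behind `_PLUGIN_HOOKS` is "kedro.hooks";
# the package, module and object names are hypothetical.
from setuptools import setup

setup(
    name="my-kedro-plugin",  # hypothetical distribution name
    version="0.1.0",
    packages=["my_kedro_plugin"],
    entry_points={
        # Each entry points at an object whose `@hook_impl` methods pluggy
        # should register, e.g. an instance of a hooks class.
        "kedro.hooks": ["my_kedro_plugin = my_kedro_plugin.plugin:hooks"],
    },
)
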
def _register_hooks(self, auto: bool = False) -> None:
    """Register all hooks as specified in ``hooks`` with the global
    ``hook_manager``, and, optionally, from installed plugins.

    Args:
        auto: An optional flag to enable auto-discovery and registration of
            plugin hooks.
    """
    hook_manager = get_hook_manager()

    if auto:
        self._register_hooks_setuptools()

    for hooks_collection in self.hooks:
        # Sometimes users might create more than one context instance, in which case
        # hooks have already been registered, so we perform a simple check here
        # to avoid an error being raised and break user's workflow.
        if not hook_manager.is_registered(hooks_collection):
            hook_manager.register(hooks_collection)

def _get_pipelines(self) -> Dict[str, Pipeline]:  # pylint: disable=no-self-use
    """Abstract method for a hook for changing the creation of a Pipeline
    instance.

    Returns:
        A dictionary of defined pipelines.
    """
    hook_manager = get_hook_manager()
    pipelines_dicts = (
        hook_manager.hook.register_pipelines()  # pylint: disable=no-member
    )

    pipelines = {}  # type: Dict[str, Pipeline]
    for pipeline_collection in pipelines_dicts:
        duplicate_keys = pipeline_collection.keys() & pipelines.keys()
        if duplicate_keys:
            warn(
                f"Found duplicate pipeline entries. "
                f"The following will be overwritten: {', '.join(duplicate_keys)}"
            )
        pipelines.update(pipeline_collection)

    return pipelines

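# A minimal sketch of a `register_pipelines` hook that `_get_pipelines` above
# would aggregate. The `hook_impl` marker is assumed to come from
# `kedro.framework.hooks`; the node function and dataset names are
# hypothetical.
from typing import Dict

from kedro.framework.hooks import hook_impl
from kedro.pipeline import Pipeline, node


def _identity(data):
    return data


class ProjectPipelineHooks:
    @hook_impl
    def register_pipelines(self) -> Dict[str, Pipeline]:
        # Each hook implementation returns its own mapping; duplicate keys
        # across implementations are overwritten with a warning.
        de_pipeline = Pipeline(
            [node(_identity, inputs="raw_data", outputs="primary_data")]
        )
        return {"de": de_pipeline, "__default__": de_pipeline}
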
def configure_project(package_name: str):
    """Configure a Kedro project by populating its settings with values
    defined in user's settings.py and pipeline_registry.py.
    """
    settings_module = f"{package_name}.settings"
    settings.configure(settings_module)
    # set up all hooks so we can discover all pipelines
    hook_manager = get_hook_manager()
    _register_hooks(hook_manager, settings.HOOKS)
    _register_hooks_setuptools(hook_manager, settings.DISABLE_HOOKS_FOR_PLUGINS)

    pipelines_module = f"{package_name}.pipeline_registry"
    pipelines.configure(pipelines_module)

    # Once the project is successfully configured once, store PACKAGE_NAME as a
    # global variable to make it easily accessible. This is used by validate_settings()
    # below, and also by ParallelRunner on Windows, as package_name is required every
    # time a new subprocess is spawned.
    global PACKAGE_NAME
    PACKAGE_NAME = package_name

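# For reference, a minimal sketch of the project settings.py that
# `settings.HOOKS` and `settings.DISABLE_HOOKS_FOR_PLUGINS` above read from.
# In a real project the hooks class would usually live in the project package;
# it is defined inline here only to keep the sketch self-contained, and the
# disabled plugin name is hypothetical.
from kedro.framework.hooks import hook_impl


class ProjectHooks:
    @hook_impl
    def after_catalog_created(self, catalog):
        # no-op placeholder implementation
        pass


# Instantiated hook implementations to register with the hook manager.
HOOKS = (ProjectHooks(),)

# Installed plugins (by distribution name) whose hooks should not be registered
# even if their setuptools entry points are discovered.
DISABLE_HOOKS_FOR_PLUGINS = ("my-kedro-plugin",)
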
def _get_catalog(
    self,
    save_version: str = None,
    journal: Journal = None,
    load_versions: Dict[str, str] = None,
) -> DataCatalog:
    """A hook for changing the creation of a DataCatalog instance.

    Returns:
        DataCatalog defined in `catalog.yml`.

    """
    # '**/catalog*' reads modular pipeline configs
    conf_catalog = self.config_loader.get("catalog*", "catalog*/**", "**/catalog*")
    # turn relative paths in conf_catalog into absolute paths
    # before initializing the catalog
    conf_catalog = _convert_paths_to_absolute_posix(
        project_path=self.project_path, conf_dictionary=conf_catalog
    )
    conf_creds = self._get_config_credentials()
    catalog = self._create_catalog(
        conf_catalog, conf_creds, save_version, journal, load_versions
    )
    feed_dict = self._get_feed_dict()
    catalog.add_feed_dict(feed_dict)
    if catalog.layers:
        _validate_layers_for_transcoding(catalog)
    hook_manager = get_hook_manager()
    hook_manager.hook.after_catalog_created(  # pylint: disable=no-member
        catalog=catalog,
        conf_catalog=conf_catalog,
        conf_creds=conf_creds,
        feed_dict=feed_dict,
        save_version=save_version,
        load_versions=load_versions,
        run_id=self.run_id,
    )
    return catalog

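# A minimal sketch of an `after_catalog_created` hook matching the call in
# `_get_catalog` above. The class name is illustrative and the `hook_impl`
# marker is assumed to come from `kedro.framework.hooks`; the arguments mirror
# the keyword arguments passed at the call site.
import logging
from typing import Any, Dict

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog


class CatalogLoggingHooks:
    @hook_impl
    def after_catalog_created(
        self,
        catalog: DataCatalog,
        conf_catalog: Dict[str, Any],
        conf_creds: Dict[str, Any],
        feed_dict: Dict[str, Any],
        save_version: str,
        load_versions: Dict[str, str],
        run_id: str,
    ) -> None:
        # Log how large the catalog config was for this run.
        logging.getLogger(__name__).info(
            "Catalog created for run %s with %d config entries",
            run_id,
            len(conf_catalog),
        )
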
def _register_hooks(self, auto: bool = False) -> None:
    """Register all hooks as specified in ``hooks`` with the global
    ``hook_manager``, and, optionally, from installed plugins.

    Args:
        auto: An optional flag to enable auto-discovery and registration of
            plugin hooks.
    """
    hook_manager = get_hook_manager()

    # enrich with hooks specified in .kedro.yml or pyproject.toml if .kedro.yml
    # doesn't exist
    hooks_locations = self.static_data.get("hooks", [])
    configured_hooks = tuple(load_obj(hook) for hook in hooks_locations)

    all_hooks = self.hooks + configured_hooks
    for hooks_collection in all_hooks:
        # Sometimes users might create more than one context instance, in which case
        # hooks have already been registered, so we perform a simple check here
        # to avoid an error being raised and break user's workflow.
        if not hook_manager.is_registered(hooks_collection):
            hook_manager.register(hooks_collection)

    if auto:
        self._register_hooks_setuptools()

def _load_data(self):
    """Lazily read pipelines defined in the pipelines registry module"""
    # If the pipelines dictionary has not been configured with a pipelines module
    # or if data has been loaded
    if self._pipelines_module is None or self._is_data_loaded:
        return

    try:
        register_pipelines = self._get_pipelines_registry_callable(
            self._pipelines_module
        )
    except (ModuleNotFoundError, AttributeError) as exc:
        # for backwards compatibility with templates < 0.17.2
        # where no pipelines_registry is defined
        if self._pipelines_module in str(exc):  # pragma: no cover
            project_pipelines = {}
        else:
            raise
    else:
        project_pipelines = register_pipelines()

    hook_manager = get_hook_manager()
    pipelines_dicts = (
        hook_manager.hook.register_pipelines()  # pylint: disable=no-member
    )
    for pipeline_collection in pipelines_dicts:
        duplicate_keys = pipeline_collection.keys() & project_pipelines.keys()
        if duplicate_keys:
            warn(
                f"Found duplicate pipeline entries. "
                f"The following will be overwritten: {', '.join(duplicate_keys)}"
            )
        project_pipelines.update(pipeline_collection)

    self._content = project_pipelines
    self._is_data_loaded = True

def _register_hooks(self, auto: bool = False) -> None:
    """Register all hooks as specified in ``hooks`` with the global
    ``hook_manager``, and, optionally, from installed plugins.

    Args:
        auto: An optional flag to enable auto-discovery and registration of
            plugin hooks.
    """
    self._hook_manager = get_hook_manager()

    if auto:
        found = self._hook_manager.load_setuptools_entrypoints(_PLUGIN_HOOKS)
        if found:  # pragma: no cover
            logging.info("Registered hooks from %d installed plugin(s)", found)

    for hooks_collection in self.hooks:
        # Sometimes users might create more than one context instance, in which case
        # hooks have already been registered, so we perform a simple check here
        # to avoid an error being raised and break user's workflow.
        if not self._hook_manager.is_registered(hooks_collection):  # pragma: no cover
            self._hook_manager.register(hooks_collection)  # pragma: no cover

def run(  # pylint: disable=too-many-arguments,too-many-locals
    self,
    tags: Iterable[str] = None,
    runner: AbstractRunner = None,
    node_names: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
    to_outputs: Iterable[str] = None,
    load_versions: Dict[str, str] = None,
    pipeline_name: str = None,
) -> Dict[str, Any]:
    """Runs the pipeline with a specified runner.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be run.
        runner: An optional parameter specifying the runner that you want to run
            the pipeline with.
        node_names: An optional list of node names which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the nodes
            with these names will be run.
        from_nodes: An optional list of node names which should be used as a
            starting point of the new ``Pipeline``.
        to_nodes: An optional list of node names which should be used as an
            end point of the new ``Pipeline``.
        from_inputs: An optional list of input datasets which should be used as a
            starting point of the new ``Pipeline``.
        to_outputs: An optional list of output datasets which should be used as an
            end point of the new ``Pipeline``.
        load_versions: An optional flag to specify a particular dataset version
            timestamp to load.
        pipeline_name: Name of the ``Pipeline`` to execute.
            Defaults to "__default__".
    Raises:
        KedroContextError: If the resulting ``Pipeline`` is empty
            or incorrect tags are provided.
        Exception: Any uncaught exception will be re-raised
            after being passed to ``on_pipeline_error``.
    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
    """
    # Report project name
    logging.info("** Kedro project %s", self.project_path.name)

    pipeline = self._get_pipeline(name=pipeline_name)
    filtered_pipeline = self._filter_pipeline(
        pipeline=pipeline,
        tags=tags,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        node_names=node_names,
        from_inputs=from_inputs,
        to_outputs=to_outputs,
    )

    save_version = self._get_save_version()
    run_id = self.run_id or save_version

    record_data = {
        "run_id": run_id,
        "project_path": str(self.project_path),
        "env": self.env,
        "tags": tags,
        "from_nodes": from_nodes,
        "to_nodes": to_nodes,
        "node_names": node_names,
        "from_inputs": from_inputs,
        "to_outputs": to_outputs,
        "load_versions": load_versions,
        "pipeline_name": pipeline_name,
        "extra_params": self._extra_params,
    }
    journal = Journal(record_data)

    catalog = self._get_catalog(
        save_version=save_version, journal=journal, load_versions=load_versions
    )

    # Run the runner
    runner = runner or SequentialRunner()

    hook_manager = get_hook_manager()
    hook_manager.hook.before_pipeline_run(  # pylint: disable=no-member
        run_params=record_data, pipeline=filtered_pipeline, catalog=catalog
    )

    try:
        run_result = runner.run(filtered_pipeline, catalog, run_id)
    except Exception as exc:
        hook_manager.hook.on_pipeline_error(  # pylint: disable=no-member
            error=exc,
            run_params=record_data,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        raise exc

    hook_manager.hook.after_pipeline_run(  # pylint: disable=no-member
        run_params=record_data,
        run_result=run_result,
        pipeline=filtered_pipeline,
        catalog=catalog,
    )
    return run_result

def run(  # pylint: disable=too-many-arguments,too-many-locals
    self,
    pipeline_name: str = None,
    tags: Iterable[str] = None,
    runner: AbstractRunner = None,
    node_names: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
    load_versions: Dict[str, str] = None,
    extra_params: Dict[str, Any] = None,
) -> Dict[str, Any]:
    """Runs the pipeline with a specified runner.

    Args:
        pipeline_name: Name of the pipeline that is being run.
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be run.
        runner: An optional parameter specifying the runner that you want to run
            the pipeline with.
        node_names: An optional list of node names which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the nodes
            with these names will be run.
        from_nodes: An optional list of node names which should be used as a
            starting point of the new ``Pipeline``.
        to_nodes: An optional list of node names which should be used as an
            end point of the new ``Pipeline``.
        from_inputs: An optional list of input datasets which should be used as a
            starting point of the new ``Pipeline``.
        load_versions: An optional flag to specify a particular dataset version
            timestamp to load.
        extra_params: Additional run parameters.
    Raises:
        Exception: Any uncaught exception during the run will be re-raised
            after being passed to ``on_pipeline_error`` hook.
    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
    """
    # pylint: disable=protected-access,no-member
    # Report project name
    logging.info("** Kedro project %s", self._project_path.name)

    save_version = run_id = self.store["session_id"]
    extra_params = deepcopy(extra_params) or dict()
    context = self.context

    pipeline = context._get_pipeline(name=pipeline_name)
    filtered_pipeline = context._filter_pipeline(
        pipeline=pipeline,
        tags=tags,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        node_names=node_names,
        from_inputs=from_inputs,
    )

    record_data = {
        "run_id": run_id,
        "project_path": self._project_path.as_posix(),
        "env": context.env,
        "kedro_version": self.store["kedro_version"],
        "tags": tags,
        "from_nodes": from_nodes,
        "to_nodes": to_nodes,
        "node_names": node_names,
        "from_inputs": from_inputs,
        "load_versions": load_versions,
        "extra_params": extra_params,
        "pipeline_name": pipeline_name,
    }

    catalog = context._get_catalog(
        save_version=save_version, load_versions=load_versions
    )

    # Run the runner
    runner = runner or SequentialRunner()

    hook = get_hook_manager().hook
    hook.before_pipeline_run(
        run_params=record_data, pipeline=filtered_pipeline, catalog=catalog
    )

    try:
        run_result = runner.run(filtered_pipeline, catalog, run_id)
    except Exception as error:
        hook.on_pipeline_error(
            error=error,
            run_params=record_data,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        raise

    hook.after_pipeline_run(
        run_params=record_data,
        run_result=run_result,
        pipeline=filtered_pipeline,
        catalog=catalog,
    )
    return run_result

def clear_hook_manager():
    yield
    hook_manager = get_hook_manager()
    plugins = hook_manager.get_plugins()
    for plugin in plugins:
        hook_manager.unregister(plugin)