Example 1
    def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
        """The method implementing sequential pipeline running.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.

        """
        nodes = pipeline.nodes

        load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))

        for exec_index, node in enumerate(nodes):
            run_node(node, catalog)

            # decrement load counts and release any data sets we've finished with
            for data_set in node.inputs:
                load_counts[data_set] -= 1
                if load_counts[data_set] < 1 and data_set not in pipeline.inputs():
                    catalog.release(data_set)
            for data_set in node.outputs:
                if load_counts[data_set] < 1 and data_set not in pipeline.outputs():
                    catalog.release(data_set)

            self._logger.info(
                "Completed %d out of %d tasks", exec_index + 1, len(nodes)
            )
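
The memory management here is the interesting part: before the loop, the runner counts how many nodes consume each dataset; after each node it decrements those counts and releases any dataset nothing downstream still needs, skipping the pipeline's own inputs and outputs. A minimal, Kedro-free sketch of that bookkeeping, with made-up node and dataset names:

from collections import Counter
from itertools import chain

# Hypothetical nodes, each listing the datasets it reads and writes.
nodes = [
    {"name": "split", "inputs": ["raw"], "outputs": ["train", "test"]},
    {"name": "fit", "inputs": ["train"], "outputs": ["model"]},
    {"name": "score", "inputs": ["model", "test"], "outputs": ["metrics"]},
]
pipeline_inputs = {"raw"}       # never released: they belong to the caller
pipeline_outputs = {"metrics"}  # never released: the caller wants them back

# How many nodes still need to load each dataset.
load_counts = Counter(chain.from_iterable(n["inputs"] for n in nodes))

for node in nodes:
    print(f"running {node['name']}")
    for data_set in node["inputs"]:
        load_counts[data_set] -= 1
        if load_counts[data_set] < 1 and data_set not in pipeline_inputs:
            print(f"  releasing {data_set}")  # no remaining consumers
    for data_set in node["outputs"]:
        if load_counts[data_set] < 1 and data_set not in pipeline_outputs:
            print(f"  releasing {data_set}")  # produced but never consumed
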
Example 2
    def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
        """The method implementing sequential pipeline running.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.

        """
        nodes = pipeline.nodes
        for exec_index, node in enumerate(nodes):
            run_node(node, catalog)
            self._logger.info("Completed %d out of %d tasks", exec_index + 1,
                              len(nodes))
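
This variant drops the eager release logic and simply runs the nodes in order. In practice it is invoked through the public ``run`` method; a minimal sketch of driving it end to end (exact import paths and the ``MemoryDataSet`` name vary between Kedro versions):

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

def double(x):
    return x * 2

# One registered input; "y" is a free output and comes back from run().
catalog = DataCatalog({"x": MemoryDataSet(2)})
pipeline = Pipeline([node(double, inputs="x", outputs="y", name="double_x")])

print(SequentialRunner().run(pipeline, catalog))  # {'y': 4}
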
Example 3
def _run_node_synchronization(node: Node,
                              catalog: DataCatalog,
                              is_async: bool = False,
                              run_id: str = None) -> Node:
    """Run a single `Node` with inputs from and outputs to the `catalog`.
    The `KedroContext` class is initialized in every subprocess because of a Windows
    (and latest OSX with Python 3.8) limitation.
    Windows has no "fork", so every subprocess is a brand-new process created via "spawn",
    and `KedroContext` needs to be created in every subprocess in order to make
    the `KedroContext` logging setup and hook manager work.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.
        is_async: If True, the node inputs and outputs are loaded and saved
            asynchronously with threads. Defaults to False.
        run_id: The id of the pipeline run.

    Returns:
        The node argument.

    """

    if multiprocessing.get_start_method() == "spawn":  # type: ignore
        # pylint: disable=import-outside-toplevel
        import kedro.framework.context.context as context  # pragma: no cover

        context.load_context(Path.cwd())  # pragma: no cover
    # The hard-coded current working directory causes
    # parallel runner to not work in notebook environment,
    # but we will revisit this when we work on access `project_path`
    # from within the runner and data in KedroContext
    # See https://github.com/quantumblacklabs/private-kedro/issues/701.
    return run_node(node, catalog, is_async, run_id)
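
The "spawn" caveat exists because a spawned worker starts from a fresh Python interpreter: nothing configured at module level in the parent (logging, the loaded context) carries over. A stdlib-only sketch of that effect, independent of Kedro:

import logging
import multiprocessing as mp

def work(x):
    # Under "spawn" this runs in a brand-new interpreter: the parent's
    # logging setup was never applied here, so it has to be redone.
    logging.basicConfig(level=logging.INFO, format="%(processName)s %(message)s")
    logging.getLogger(__name__).info("processing %s", x)
    return x * x

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)  # configures the parent only
    ctx = mp.get_context("spawn")            # the start method Windows always uses
    with ctx.Pool(2) as pool:
        print(pool.map(work, [1, 2, 3]))
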
Example 4
def _run_node_synchronization(  # pylint: disable=too-many-arguments
    node: Node,
    catalog: DataCatalog,
    is_async: bool = False,
    run_id: str = None,
    package_name: str = None,
    conf_logging: Dict[str, Any] = None,
) -> Node:
    """Run a single `Node` with inputs from and outputs to the `catalog`.
    A `KedroSession` instance is activated in every subprocess because of a Windows
    (and latest OSX with Python 3.8) limitation.
    Windows has no "fork", so every subprocess is a brand-new process
    created via "spawn", hence the need to a) set up the logging, b) register
    the hooks, and c) activate a `KedroSession` in every subprocess.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.
        is_async: If True, the node inputs and outputs are loaded and saved
            asynchronously with threads. Defaults to False.
        run_id: The id of the pipeline run.
        package_name: The name of the project Python package.
        conf_logging: A dictionary containing logging configuration.

    Returns:
        The node argument.

    """
    if multiprocessing.get_start_method() == "spawn" and package_name:  # type: ignore
        conf_logging = conf_logging or {}
        _bootstrap_subprocess(package_name, conf_logging)

    return run_node(node, catalog, is_async, run_id)
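
Compared with the previous example, the subprocess here rebuilds its environment from plain arguments (package name, logging config) before running the node, which is what makes it safe under "spawn". A rough standard-library sketch of the same bootstrap-then-run pattern, using hypothetical task and bootstrap functions rather than Kedro's internals:

import logging
import logging.config
import multiprocessing
from concurrent.futures import ProcessPoolExecutor

def _bootstrap(conf_logging):
    # Re-apply configuration a spawned worker does not inherit from the parent.
    if conf_logging:
        logging.config.dictConfig(conf_logging)

def run_task(payload, conf_logging=None):
    # Same shape as _run_node_synchronization: bootstrap first if spawned,
    # then do the actual work.
    if multiprocessing.get_start_method() == "spawn":
        _bootstrap(conf_logging)
    logging.getLogger(__name__).info("running task %s", payload)
    return payload * 2

if __name__ == "__main__":
    conf = {"version": 1, "root": {"level": "INFO", "handlers": []}}
    with ProcessPoolExecutor(max_workers=2) as pool:
        futures = [pool.submit(run_task, i, conf) for i in range(3)]
        print([f.result() for f in futures])
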
Example 5
    def _run(self,
             pipeline: Pipeline,
             catalog: DataCatalog,
             run_id: str = None) -> None:
        """The method implementing sequential pipeline running.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.
            run_id: The id of the run.

        Raises:
            Exception: in case of any downstream node failure.
        """
        nodes = pipeline.nodes
        done_nodes = set()

        load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))

        for exec_index, node in enumerate(nodes):
            try:
                run_node(node, catalog, self._is_async, run_id)
                done_nodes.add(node)
            except Exception:
                self._suggest_resume_scenario(pipeline, done_nodes)
                raise

            # decrement load counts and release any data sets we've finished with
            for data_set in node.inputs:
                load_counts[data_set] -= 1
                if load_counts[data_set] < 1 and data_set not in pipeline.inputs():
                    catalog.release(data_set)
            for data_set in node.outputs:
                if load_counts[data_set] < 1 and data_set not in pipeline.outputs():
                    catalog.release(data_set)

            self._logger.info("Completed %d out of %d tasks", exec_index + 1,
                              len(nodes))
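
On failure this runner calls ``_suggest_resume_scenario`` before re-raising, which boils down to: every node not yet in ``done_nodes`` still has to run, so resume from the earliest of them. A rough, Kedro-free sketch of that suggestion (the exact CLI flag it recommends varies between Kedro versions):

# Nodes in topological order and the ones that finished before the failure.
nodes = ["clean", "split", "fit", "score"]
done_nodes = {"clean", "split"}

remaining = [n for n in nodes if n not in done_nodes]
print(f"Run failed at {remaining[0]!r}; after fixing the issue, resume with:")
print(f"  kedro run --from-nodes={remaining[0]}")
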