Example #1
    def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
        """The method implementing sequential pipeline running.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.

        """
        nodes = pipeline.nodes

        load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))

        for exec_index, node in enumerate(nodes):
            run_node(node, catalog)

            # decrement load counts and release any data sets we've finished with
            for data_set in node.inputs:
                load_counts[data_set] -= 1
                if load_counts[data_set] < 1 and data_set not in pipeline.inputs():
                    catalog.release(data_set)
            for data_set in node.outputs:
                if load_counts[data_set] < 1 and data_set not in pipeline.outputs():
                    catalog.release(data_set)

            self._logger.info(
                "Completed %d out of %d tasks", exec_index + 1, len(nodes)
            )
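
Example #1 shows the simplest runner: nodes execute one at a time in topological order, and `load_counts` tracks how many nodes still need each dataset so the catalog can release data as soon as it is no longer needed. As a rough orientation only, driving such a runner through the public `run()` method might look like the sketch below, assuming a Kedro 0.16/0.17-style API; the `greet` node is hypothetical and not part of the examples above.

    # Minimal usage sketch, assuming a Kedro 0.16/0.17-style API; `greet` is a
    # made-up node function.
    from kedro.io import DataCatalog, MemoryDataSet
    from kedro.pipeline import Pipeline, node
    from kedro.runner import SequentialRunner

    def greet(name: str) -> str:
        return f"Hello, {name}!"

    catalog = DataCatalog({"name": MemoryDataSet("world")})
    pipeline = Pipeline([node(greet, inputs="name", outputs="greeting")])

    # run() validates the pipeline's inputs against the catalog and then
    # delegates to the _run() method shown above.
    SequentialRunner().run(pipeline, catalog)
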
Example #2
    def _run(  # pylint: disable=too-many-locals,useless-suppression
            self, pipeline: Pipeline, catalog: DataCatalog) -> None:
        """The abstract interface for running pipelines.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.

        Raises:
            AttributeError: when the provided pipeline is not suitable for
                parallel execution.
            Exception: in case of any downstream node failure.

        """
        nodes = pipeline.nodes
        self._validate_catalog(catalog, pipeline)
        self._validate_nodes(nodes)

        load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))
        node_dependencies = pipeline.node_dependencies
        todo_nodes = set(node_dependencies.keys())
        done_nodes = set()  # type: Set[Node]
        futures = set()
        done = None
        max_workers = self._get_required_workers_count(pipeline)

        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            while True:
                ready = {
                    n
                    for n in todo_nodes if node_dependencies[n] <= done_nodes
                }
                todo_nodes -= ready
                for node in ready:
                    futures.add(pool.submit(run_node, node, catalog))
                if not futures:
                    assert not todo_nodes, (todo_nodes, done_nodes, ready,
                                            done)
                    break
                done, futures = wait(futures, return_when=FIRST_COMPLETED)
                for future in done:
                    try:
                        node = future.result()
                    except Exception:
                        self._suggest_resume_scenario(pipeline, done_nodes)
                        raise
                    done_nodes.add(node)

                    # decrement load counts and release any data sets we've finished with
                    # this is particularly important for the shared datasets we create above
                    for data_set in node.inputs:
                        load_counts[data_set] -= 1
                        if (load_counts[data_set] < 1
                                and data_set not in pipeline.inputs()):
                            catalog.release(data_set)
                    for data_set in node.outputs:
                        if (load_counts[data_set] < 1
                                and data_set not in pipeline.outputs()):
                            catalog.release(data_set)
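
Example #2 (a process-pool parallel runner) reuses the same release discipline as Example #1: `load_counts` starts at the number of consuming nodes per dataset, each finished node decrements the counts for its inputs, and any dataset that drops below one and is not a pipeline boundary is released. Detached from the Kedro types, the bookkeeping reduces to the following sketch; node and dataset names are illustrative only.

    # Illustrative sketch of the load-count bookkeeping; names are made up.
    from collections import Counter
    from itertools import chain

    nodes = [
        {"inputs": ["raw"], "outputs": ["train", "test"]},
        {"inputs": ["train"], "outputs": ["model"]},
        {"inputs": ["model", "test"], "outputs": ["metrics"]},
    ]

    # One pending "load" per consuming node for every input dataset.
    load_counts = Counter(chain.from_iterable(n["inputs"] for n in nodes))

    for n in nodes:
        # ... the node would run here ...
        for data_set in n["inputs"]:
            load_counts[data_set] -= 1
            if load_counts[data_set] < 1:
                print(f"release {data_set}")  # catalog.release(data_set) in Kedro
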
Example #3
    def _run(self,
             pipeline: Pipeline,
             catalog: DataCatalog,
             run_id: str = None) -> None:
        """The method implementing sequential pipeline running.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.
            run_id: The id of the run.

        Raises:
            Exception: in case of any downstream node failure.
        """
        nodes = pipeline.nodes
        done_nodes = set()

        load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))

        for exec_index, node in enumerate(nodes):
            try:
                run_node(node, catalog, self._is_async, run_id)
                done_nodes.add(node)
            except Exception:
                self._suggest_resume_scenario(pipeline, done_nodes)
                raise

            # decrement load counts and release any data sets we've finished with
            for data_set in node.inputs:
                load_counts[data_set] -= 1
                if load_counts[data_set] < 1 and data_set not in pipeline.inputs():
                    catalog.release(data_set)
            for data_set in node.outputs:
                if load_counts[data_set] < 1 and data_set not in pipeline.outputs():
                    catalog.release(data_set)

            self._logger.info("Completed %d out of %d tasks", exec_index + 1,
                              len(nodes))
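
Example #3 extends Example #1 with a `run_id` forwarded to `run_node`, a `done_nodes` set used by `_suggest_resume_scenario` when a node fails, and an `is_async` flag. Conceptually, asynchronous mode loads inputs and saves outputs on background threads while the node function itself still runs in the calling thread; a much-simplified sketch of that idea (not Kedro's actual `run_node`) follows.

    # Conceptual sketch only; Kedro's run_node is considerably more involved.
    from concurrent.futures import ThreadPoolExecutor

    def run_node_async(func, load_fns, save_fns):
        with ThreadPoolExecutor() as pool:
            # Start every input load at once, then gather the results in order.
            inputs = [f.result() for f in [pool.submit(fn) for fn in load_fns]]
            outputs = func(*inputs)
            # Hand each output save to the pool as well.
            for future in [pool.submit(save, outputs) for save in save_fns]:
                future.result()

    run_node_async(
        lambda a, b: a + b,
        load_fns=[lambda: 1, lambda: 2],
        save_fns=[print],  # prints 3
    )
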
Example #4
    def _run(  # pylint: disable=too-many-locals,useless-suppression
            self,
            pipeline: Pipeline,
            catalog: DataCatalog,
            run_id: str = None) -> None:
        """The abstract interface for running pipelines.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.
            run_id: The id of the run.

        Raises:
            AttributeError: When the provided pipeline is not suitable for
                parallel execution.
            RuntimeError: If the runner is unable to schedule the execution of
                all pipeline nodes.
            Exception: In case of any downstream node failure.

        """
        # pylint: disable=import-outside-toplevel,cyclic-import
        from kedro.framework.session.session import get_current_session

        nodes = pipeline.nodes
        self._validate_catalog(catalog, pipeline)
        self._validate_nodes(nodes)

        load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))
        node_dependencies = pipeline.node_dependencies
        todo_nodes = set(node_dependencies.keys())
        done_nodes = set()  # type: Set[Node]
        futures = set()
        done = None
        max_workers = self._get_required_workers_count(pipeline)

        from kedro.framework.project import PACKAGE_NAME

        session = get_current_session(silent=True)
        # pylint: disable=protected-access
        conf_logging = session._get_logging_config() if session else None

        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            while True:
                ready = {
                    n
                    for n in todo_nodes if node_dependencies[n] <= done_nodes
                }
                todo_nodes -= ready
                for node in ready:
                    futures.add(
                        pool.submit(
                            _run_node_synchronization,
                            node,
                            catalog,
                            self._is_async,
                            run_id,
                            package_name=PACKAGE_NAME,
                            conf_logging=conf_logging,
                        ))
                if not futures:
                    if todo_nodes:
                        debug_data = {
                            "todo_nodes": todo_nodes,
                            "done_nodes": done_nodes,
                            "ready_nodes": ready,
                            "done_futures": done,
                        }
                        debug_data_str = "\n".join(
                            f"{k} = {v}" for k, v in debug_data.items())
                        raise RuntimeError(
                            f"Unable to schedule new tasks although some nodes "
                            f"have not been run:\n{debug_data_str}")
                    break  # pragma: no cover
                done, futures = wait(futures, return_when=FIRST_COMPLETED)
                for future in done:
                    try:
                        node = future.result()
                    except Exception:
                        self._suggest_resume_scenario(pipeline, done_nodes)
                        raise
                    done_nodes.add(node)

                    # decrement load counts and release any data sets we've finished with
                    # this is particularly important for the shared datasets we create above
                    for data_set in node.inputs:
                        load_counts[data_set] -= 1
                        if (load_counts[data_set] < 1
                                and data_set not in pipeline.inputs()):
                            catalog.release(data_set)
                    for data_set in node.outputs:
                        if (load_counts[data_set] < 1
                                and data_set not in pipeline.outputs()):
                            catalog.release(data_set)
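
The core of Examples #2 and #4 is a generic scheduling loop over `concurrent.futures`: collect the nodes whose dependencies are all done, submit them, wait for the first completion, and repeat until nothing is left. Example #4 additionally passes the package name and logging configuration to `_run_node_synchronization` so each worker process can be set up, and replaces the earlier `assert` with an explicit `RuntimeError`. The loop itself can be sketched independently of Kedro; the task names and timings below are made up.

    # Standalone sketch of the ready/submit/wait loop; tasks are illustrative.
    import time
    from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait

    def make_task(name):
        def task():
            time.sleep(0.1)
            return name
        return task

    dependencies = {"a": set(), "b": {"a"}, "c": {"a"}, "d": {"b", "c"}}
    todo, done_tasks, futures = set(dependencies), set(), set()

    with ThreadPoolExecutor(max_workers=2) as pool:
        while True:
            # A task is ready once all of its dependencies have completed.
            ready = {t for t in todo if dependencies[t] <= done_tasks}
            todo -= ready
            for t in ready:
                futures.add(pool.submit(make_task(t)))
            if not futures:
                break
            done, futures = wait(futures, return_when=FIRST_COMPLETED)
            for future in done:
                done_tasks.add(future.result())

    print(done_tasks)  # {'a', 'b', 'c', 'd'}
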
Example #5
    def _run(  # pylint: disable=too-many-locals,useless-suppression
            self,
            pipeline: Pipeline,
            catalog: DataCatalog,
            run_id: str = None) -> None:
        """The abstract interface for running pipelines.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.
            run_id: The id of the run.

        Raises:
            Exception: in case of any downstream node failure.

        """
        nodes = pipeline.nodes
        load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))
        node_dependencies = pipeline.node_dependencies
        todo_nodes = set(node_dependencies.keys())
        done_nodes = set()  # type: Set[Node]
        futures = set()
        done = None
        max_workers = self._get_required_workers_count(pipeline)

        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            while True:
                ready = {
                    n
                    for n in todo_nodes if node_dependencies[n] <= done_nodes
                }
                todo_nodes -= ready
                for node in ready:
                    futures.add(
                        pool.submit(run_node, node, catalog, self._is_async,
                                    run_id))
                if not futures:
                    assert not todo_nodes, (todo_nodes, done_nodes, ready,
                                            done)
                    break
                done, futures = wait(futures, return_when=FIRST_COMPLETED)
                for future in done:
                    try:
                        node = future.result()
                    except Exception:
                        self._suggest_resume_scenario(pipeline, done_nodes)
                        raise
                    done_nodes.add(node)
                    self._logger.info("Completed node: %s", node.name)
                    self._logger.info("Completed %d out of %d tasks",
                                      len(done_nodes), len(nodes))

                    # Decrement load counts, and release any datasets we
                    # have finished with.
                    for data_set in node.inputs:
                        load_counts[data_set] -= 1
                        if (load_counts[data_set] < 1
                                and data_set not in pipeline.inputs()):
                            catalog.release(data_set)
                    for data_set in node.outputs:
                        if (load_counts[data_set] < 1
                                and data_set not in pipeline.outputs()):
                            catalog.release(data_set)
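
Example #5 is the thread-based counterpart of Example #4: because the workers are threads in the same process, it skips the catalog and node serialisability checks and the per-process logging setup, at the cost of CPU-bound nodes contending for the GIL. A hedged usage sketch, again assuming a Kedro 0.17-style API; the `shout` node is hypothetical.

    # Hedged usage sketch, assuming kedro.runner.ThreadRunner in a 0.17-style API.
    from kedro.io import DataCatalog, MemoryDataSet
    from kedro.pipeline import Pipeline, node
    from kedro.runner import ThreadRunner

    def shout(text: str) -> str:
        return text.upper()

    catalog = DataCatalog({"text": MemoryDataSet("hello")})
    pipeline = Pipeline([node(shout, inputs="text", outputs="shouted")])

    # max_workers caps the thread pool; _get_required_workers_count() further
    # bounds it by how many nodes can actually run concurrently.
    ThreadRunner(max_workers=4).run(pipeline, catalog)
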