Code Example #1
File: parallel_runner.py  Project: zeta1999/kedro
    @classmethod
    def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline):
        """Ensure that all data sets are serializable and that we do not have
        any non proxied memory data sets being used as outputs as their content
        will not be synchronized across threads.
        """

        data_sets = catalog._data_sets  # pylint: disable=protected-access

        unserializable = []
        for name, data_set in data_sets.items():
            if getattr(data_set, "_SINGLE_PROCESS", False):  # SKIP_IF_NO_SPARK
                unserializable.append(name)
                continue
            try:
                ForkingPickler.dumps(data_set)
            except (AttributeError, PicklingError):
                unserializable.append(name)

        if unserializable:
            raise AttributeError(
                "The following data sets cannot be used with multiprocessing: "
                "{}\nIn order to utilize multiprocessing you need to make sure "
                "all data sets are serializable, i.e. data sets should not make "
                "use of lambda functions, nested functions, closures etc.\nIf you "
                "are using custom decorators ensure they are correctly using "
                "functools.wraps().".format(sorted(unserializable))
            )

        memory_data_sets = []
        for name, data_set in data_sets.items():
            if (
                name in pipeline.all_outputs()
                and isinstance(data_set, MemoryDataSet)
                and not isinstance(data_set, BaseProxy)
            ):
                memory_data_sets.append(name)

        if memory_data_sets:
            raise AttributeError(
                "The following data sets are memory data sets: {}\n"
                "ParallelRunner does not support output to externally created "
                "MemoryDataSets".format(sorted(memory_data_sets))
            )
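
The serializability check above simply tries to pickle each data set with ForkingPickler.dumps and records anything that raises AttributeError or PicklingError. Below is a minimal, self-contained sketch of the same probe; LambdaBackedDataSet and is_serializable are illustrative names, not part of Kedro, and only show why a data set holding a lambda gets reported as unserializable.

from multiprocessing.reduction import ForkingPickler
from pickle import PicklingError


class LambdaBackedDataSet:
    """Hypothetical data set whose loader is a lambda, so it cannot be pickled."""

    def __init__(self):
        self._load_func = lambda: 42  # lambdas/closures defeat pickling


def is_serializable(data_set) -> bool:
    # Same probe as the validation above: try to pickle, report any failure.
    try:
        ForkingPickler.dumps(data_set)
        return True
    except (AttributeError, PicklingError):
        return False


print(is_serializable(LambdaBackedDataSet()))  # False: the lambda attribute blocks pickling
print(is_serializable({"plain": "dict"}))      # True: ordinary picklable objects pass

Data sets flagged this way are exactly those the error message warns about: objects built around lambda functions, nested functions, closures, or custom decorators that do not apply functools.wraps().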
Code Example #2
    def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
        """The method implementing sequential pipeline running.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.

        Raises:
            ValueError: if the Pipeline is not compatible with Airflow
        """

        print("_run")

        data_sets = catalog._data_sets  # pylint: disable=protected-access
        memory_data_sets = []
        for name, data_set in data_sets.items():
            if name in pipeline.all_outputs() and isinstance(
                    data_set, MemoryDataSet):
                memory_data_sets.append(name)

        if memory_data_sets:
            raise ValueError(
                "The following output data sets are memory data sets: {}\n"
                "AirflowRunner does not support output to MemoryDataSets".format(
                    ", ".join("'{}'".format(ds) for ds in memory_data_sets)
                )
            )

        node_dependencies = pipeline.node_dependencies
        operators_by_node = {}
        for node in node_dependencies:
            name = slugify(node.name)
            operators_by_node[node] = PythonOperator(
                task_id=name,
                provide_context=True,
                python_callable=self.create_task(node, catalog),
                dag=self._dag,
                **self._operator_arguments(name))

        for node, dependencies in node_dependencies.items():
            for dependency in dependencies:
                operators_by_node[node].set_upstream(
                    operators_by_node[dependency])
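
The wiring loop at the end is the core of the translation: pipeline.node_dependencies maps each node to the set of nodes it depends on, one PythonOperator is created per node, and set_upstream reproduces those edges in the Airflow DAG. A minimal sketch of that pattern with a toy dependency mapping follows; the Task class and the node names are stand-ins for PythonOperator and real Kedro nodes, used only to show how the mapping becomes upstream edges.

class Task:
    """Illustrative stand-in for an Airflow operator."""

    def __init__(self, task_id):
        self.task_id = task_id
        self.upstream = []

    def set_upstream(self, other):
        # Same contract as operator.set_upstream(other): 'other' must run first.
        self.upstream.append(other)


# Shape of pipeline.node_dependencies: node -> set of nodes it consumes outputs from.
node_dependencies = {
    "split_data": set(),
    "train_model": {"split_data"},
    "evaluate_model": {"split_data", "train_model"},
}

tasks = {name: Task(task_id=name) for name in node_dependencies}

for node, dependencies in node_dependencies.items():
    for dependency in dependencies:
        tasks[node].set_upstream(tasks[dependency])

print(sorted(t.task_id for t in tasks["evaluate_model"].upstream))
# ['split_data', 'train_model']

With real operators the same double loop makes every task wait for all tasks whose outputs it consumes, so the Airflow scheduler executes the pipeline in topological order.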