def run_only_missing(
    self, pipeline: Pipeline, catalog: DataCatalog
) -> Dict[str, Any]:
    """Run only the missing outputs from the ``Pipeline`` using the
    ``DataSet``s provided by ``catalog`` and save results back to the
    same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.

    """
    free_outputs = pipeline.outputs() - set(catalog.list())
    missing = {ds for ds in catalog.list() if not catalog.exists(ds)}
    to_build = free_outputs | missing
    to_rerun = pipeline.only_nodes_with_outputs(*to_build) + pipeline.from_inputs(
        *to_build
    )

    # We also need any memory data sets that feed into that,
    # including chains of memory data sets.
    memory_sets = pipeline.data_sets() - set(catalog.list())
    output_to_memory = pipeline.only_nodes_with_outputs(*memory_sets)
    input_from_memory = to_rerun.inputs() & memory_sets
    to_rerun += output_to_memory.to_outputs(*input_from_memory)

    return self.run(to_rerun, catalog)
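# --- Usage sketch (not part of the original source) ---
# A minimal, hedged example of calling run_only_missing via Kedro's
# SequentialRunner, which inherits this method. Dataset and function
# names are hypothetical; the call signature assumes the pre-0.18
# Kedro API shown above (no hook_manager argument).
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

def add_one(x):
    return x + 1

demo_pipeline = Pipeline([node(add_one, "raw", "incremented")])
demo_catalog = DataCatalog({"raw": MemoryDataSet(41)})

# "incremented" is a free output (absent from the catalog), so it is
# treated as missing and its producing node is re-run.
outputs = SequentialRunner().run_only_missing(demo_pipeline, demo_catalog)
print(outputs)  # {'incremented': 42}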
def run(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``
    and save results back to the same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.

    """
    catalog = catalog.shallow_copy()

    unsatisfied = pipeline.inputs() - set(catalog.list())
    if unsatisfied:
        raise ValueError(
            "Pipeline input(s) {} not found in the DataCatalog".format(unsatisfied)
        )

    free_outputs = pipeline.outputs() - set(catalog.list())
    unregistered_ds = pipeline.data_sets() - set(catalog.list())
    for ds_name in unregistered_ds:
        catalog.add(ds_name, self.create_default_data_set(ds_name))

    self._run(pipeline, catalog)

    self._logger.info("Pipeline execution completed successfully.")

    return {ds_name: catalog.load(ds_name) for ds_name in free_outputs}
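# --- Usage sketch (not part of the original source) ---
# A hedged illustration of the unsatisfied-input check: running a
# pipeline whose input is absent from the catalog raises the documented
# ValueError. Names are hypothetical; the runner API assumed here is
# the pre-0.18 one matching the snippet above.
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

def identity(x):
    return x

demo_pipeline = Pipeline([node(identity, "missing_input", "result")])

try:
    SequentialRunner().run(demo_pipeline, DataCatalog())
except ValueError as err:
    print(err)  # Pipeline input(s) {'missing_input'} not found in the DataCatalog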
def run(
    self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``.

    Parameters
    ----------
    pipeline : Pipeline
        The ``Pipeline`` to run.
    catalog : DataCatalog
        The ``DataCatalog`` from which to fetch data.
    run_id : str
        The id of the run.

    Returns
    -------
    dict
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.

    """
    # If the only_missing flag is set, narrow the pipeline to the nodes
    # producing missing outputs plus all of their downstream nodes.
    if self.only_missing:
        to_build = {
            ds for ds in catalog.list() if not catalog.exists(ds)
        }.intersection(pipeline.data_sets())
        pipeline = pipeline.only_nodes_with_outputs(
            *to_build
        ) + pipeline.from_inputs(*to_build)
    return super().run(pipeline, catalog, run_id)
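# --- Usage sketch (not part of the original source) ---
# The narrowing expression above can be checked in isolation with plain
# kedro.pipeline primitives. Node names here are hypothetical: combining
# only_nodes_with_outputs and from_inputs keeps the producers of the
# missing datasets plus everything downstream of them.
from kedro.pipeline import Pipeline, node

def step(x):
    return x

full = Pipeline([
    node(step, "a", "b", name="make_b"),
    node(step, "b", "c", name="make_c"),
    node(step, "c", "d", name="make_d"),
])

to_build = {"c"}  # pretend only "c" is missing from the catalog
narrowed = full.only_nodes_with_outputs(*to_build) + full.from_inputs(*to_build)
print(sorted(n.name for n in narrowed.nodes))  # ['make_c', 'make_d']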
def before_pipeline_run(
    self, run_params: Dict, pipeline: Pipeline, catalog: DataCatalog
):
    if not self._enabled:
        return
    logger.info("KedroWings is Enabled")

    # Collect every dataset name used as a node input or output.
    all_dataset_names = {
        ds
        for node in pipeline.nodes
        for ds in list(node.inputs) + list(node.outputs)
    }
    catalog_entries = self._create_catalog_entries(all_dataset_names)

    # Add the generated entries without overriding existing catalog entries.
    existing_catalog_names = set(catalog.list())
    for catalog_name, catalog_dataset in catalog_entries.items():
        if catalog_name in existing_catalog_names:
            continue
        catalog.add(catalog_name, catalog_dataset)
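# --- Registration sketch (not part of the original source) ---
# How this hook might be wired into a project. The import path
# kedro_wings.KedroWings and the pre-0.19 context-level `hooks` tuple
# are assumptions; verify against the plugin's own README.
from kedro.framework.context import KedroContext
from kedro_wings import KedroWings

class ProjectContext(KedroContext):
    project_name = "my_project"
    project_version = "0.16.5"
    hooks = (KedroWings(),)  # before_pipeline_run will add inferred catalog entries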