def run_only_missing(
    self, pipeline: Pipeline, catalog: DataCatalog
) -> Dict[str, Any]:
    """Run only the missing outputs from the ``Pipeline`` using the
    ``DataSet``s provided by ``catalog`` and save results back to the
    same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.

    """
    free_outputs = pipeline.outputs() - set(catalog.list())
    missing = {ds for ds in catalog.list() if not catalog.exists(ds)}
    to_build = free_outputs | missing
    to_rerun = pipeline.only_nodes_with_outputs(*to_build) + pipeline.from_inputs(
        *to_build
    )

    # We also need any memory data sets that feed into that,
    # including chains of memory data sets.
    memory_sets = pipeline.data_sets() - set(catalog.list())
    output_to_memory = pipeline.only_nodes_with_outputs(*memory_sets)
    input_from_memory = to_rerun.inputs() & memory_sets
    to_rerun += output_to_memory.to_outputs(*input_from_memory)

    return self.run(to_rerun, catalog)
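# --- Usage sketch (not part of the original source) ---
# A minimal, hedged example of calling run_only_missing via Kedro's
# SequentialRunner, which inherits this method. Dataset and function
# names are hypothetical; the call signature assumes the pre-0.18
# Kedro API shown above (no hook_manager argument).
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

def add_one(x):
    return x + 1

demo_pipeline = Pipeline([node(add_one, "raw", "incremented")])
demo_catalog = DataCatalog({"raw": MemoryDataSet(41)})

# "incremented" is a free output (absent from the catalog), so it is
# treated as missing and its producing node is re-run.
outputs = SequentialRunner().run_only_missing(demo_pipeline, demo_catalog)
print(outputs)  # {'incremented': 42}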
def run(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``
    and save results back to the same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.

    """
    catalog = catalog.shallow_copy()

    unsatisfied = pipeline.inputs() - set(catalog.list())
    if unsatisfied:
        raise ValueError(
            "Pipeline input(s) {} not found in the DataCatalog".format(unsatisfied)
        )

    free_outputs = pipeline.outputs() - set(catalog.list())
    unregistered_ds = pipeline.data_sets() - set(catalog.list())
    for ds_name in unregistered_ds:
        catalog.add(ds_name, self.create_default_data_set(ds_name))

    self._run(pipeline, catalog)

    self._logger.info("Pipeline execution completed successfully.")

    return {ds_name: catalog.load(ds_name) for ds_name in free_outputs}
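# --- Usage sketch (not part of the original source) ---
# A hedged illustration of the unsatisfied-input check: running a
# pipeline whose input is absent from the catalog raises the documented
# ValueError. Names are hypothetical; the runner API assumed here is
# the pre-0.18 one matching the snippet above.
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

def identity(x):
    return x

demo_pipeline = Pipeline([node(identity, "missing_input", "result")])

try:
    SequentialRunner().run(demo_pipeline, DataCatalog())
except ValueError as err:
    print(err)  # Pipeline input(s) {'missing_input'} not found in the DataCatalog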
def run(
    self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``.

    Parameters
    ----------
    pipeline : Pipeline
        The ``Pipeline`` to run.
    catalog : DataCatalog
        The ``DataCatalog`` from which to fetch data.
    run_id : str
        The id of the run.

    Returns
    -------
    dict
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.

    """
    # If the only_missing flag is set, narrow the pipeline to the nodes
    # producing missing outputs plus all of their downstream nodes.
    if self.only_missing:
        to_build = {
            ds for ds in catalog.list() if not catalog.exists(ds)
        }.intersection(pipeline.data_sets())
        pipeline = pipeline.only_nodes_with_outputs(
            *to_build
        ) + pipeline.from_inputs(*to_build)
    return super().run(pipeline, catalog, run_id)
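# --- Usage sketch (not part of the original source) ---
# The narrowing expression above can be checked in isolation with plain
# kedro.pipeline primitives. Node names here are hypothetical: combining
# only_nodes_with_outputs and from_inputs keeps the producers of the
# missing datasets plus everything downstream of them.
from kedro.pipeline import Pipeline, node

def step(x):
    return x

full = Pipeline([
    node(step, "a", "b", name="make_b"),
    node(step, "b", "c", name="make_c"),
    node(step, "c", "d", name="make_d"),
])

to_build = {"c"}  # pretend only "c" is missing from the catalog
narrowed = full.only_nodes_with_outputs(*to_build) + full.from_inputs(*to_build)
print(sorted(n.name for n in narrowed.nodes))  # ['make_c', 'make_d']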
def before_pipeline_run(
    self, run_params: Dict, pipeline: Pipeline, catalog: DataCatalog
):
    if not self._enabled:
        return
    logger.info("KedroWings is Enabled")

    # Collect every dataset name used as a node input or output.
    all_dataset_names = {
        ds
        for node in pipeline.nodes
        for ds in list(node.inputs) + list(node.outputs)
    }
    catalog_entries = self._create_catalog_entries(all_dataset_names)

    # Add the generated entries without overriding existing catalog entries.
    existing_catalog_names = set(catalog.list())
    for catalog_name, catalog_dataset in catalog_entries.items():
        if catalog_name in existing_catalog_names:
            continue
        catalog.add(catalog_name, catalog_dataset)
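# --- Registration sketch (not part of the original source) ---
# How this hook might be wired into a project. The import path
# kedro_wings.KedroWings and the pre-0.19 context-level `hooks` tuple
# are assumptions; verify against the plugin's own README.
from kedro.framework.context import KedroContext
from kedro_wings import KedroWings

class ProjectContext(KedroContext):
    project_name = "my_project"
    project_version = "0.16.5"
    hooks = (KedroWings(),)  # before_pipeline_run will add inferred catalog entries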