def _filter_pipeline(
    self,
    pipeline: Pipeline,
    tags: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    node_names: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
) -> Pipeline:
    """Filter the pipeline as the intersection of all conditions.

    Each supplied condition is evaluated against the *original* pipeline
    and intersected with the running result.  The conditions do not
    commute, so applying them incrementally would change the answer:
    with nodes 1, 2, 3, doing "from 1" and then "only 1 and 3" yields
    just node 1, whereas the intersection yields nodes 1 and 3.

    Args:
        pipeline: The pipeline to filter.
        tags: Keep only nodes carrying at least one of these tags.
        from_nodes: Keep only these nodes and everything downstream.
        to_nodes: Keep only these nodes and everything upstream.
        node_names: Keep only nodes with exactly these names.
        from_inputs: Keep only nodes downstream of these inputs.

    Raises:
        KedroContextError: If the filters leave no nodes in the pipeline.

    Returns:
        The filtered ``Pipeline``.
    """
    filtered = pipeline
    if tags:
        filtered &= pipeline.only_nodes_with_tags(*tags)
        # Tag filtering gets a dedicated error so the user sees which
        # tags matched nothing, before any other condition is applied.
        if not filtered.nodes:
            raise KedroContextError(
                "Pipeline contains no nodes with tags: {}".format(str(tags))
            )
    if from_nodes:
        filtered &= pipeline.from_nodes(*from_nodes)
    if to_nodes:
        filtered &= pipeline.to_nodes(*to_nodes)
    if node_names:
        filtered &= pipeline.only_nodes(*node_names)
    if from_inputs:
        filtered &= pipeline.from_inputs(*from_inputs)
    if not filtered.nodes:
        raise KedroContextError("Pipeline contains no nodes")
    return filtered
def run_only_missing(
    self, pipeline: Pipeline, catalog: DataCatalog
) -> Dict[str, Any]:
    """Run only the missing outputs from the ``Pipeline`` using the
    ``DataSet``s provided by ``catalog`` and save results back to the
    same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.

    """
    # Outputs with no catalog entry, plus catalog entries whose backing
    # data does not exist yet — both must be (re)built.
    unregistered_outputs = pipeline.outputs() - set(catalog.list())
    absent = {name for name in catalog.list() if not catalog.exists(name)}
    to_build = unregistered_outputs | absent
    to_rerun = pipeline.only_nodes_with_outputs(*to_build) + pipeline.from_inputs(
        *to_build
    )

    # In-memory data sets vanish between runs, so any chain of memory
    # intermediates feeding the rerun set must be recomputed as well.
    memory_sets = pipeline.data_sets() - set(catalog.list())
    produced_in_memory = pipeline.only_nodes_with_outputs(*memory_sets)
    consumed_from_memory = to_rerun.inputs() & memory_sets
    to_rerun += produced_in_memory.to_outputs(*consumed_from_memory)

    return self.run(to_rerun, catalog)
def run(
    self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``.

    When ``self.only_missing`` is set, the pipeline is first pruned to
    the nodes whose outputs are registered in the catalog but do not yet
    exist, together with everything downstream of those outputs.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.
        run_id: The id of the run.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.

    """
    if self.only_missing:
        # Registered data sets that have not been materialised yet and
        # are actually consumed or produced by this pipeline.
        to_build = {
            ds for ds in catalog.list() if not catalog.exists(ds)
        }.intersection(pipeline.data_sets())
        pipeline = pipeline.only_nodes_with_outputs(
            *to_build
        ) + pipeline.from_inputs(*to_build)
    # Zero-argument super() — the file is Python 3, so the legacy
    # two-argument form super(DatalabRunner, self) is unnecessary.
    return super().run(pipeline, catalog, run_id)