def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {name: catalog.load(name) for name in node.inputs}
    hook_manager = get_hook_manager()
    is_async = False
    hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
    )
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )
    for name, data in outputs.items():
        catalog.save(name, data)
    return node
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {}
    hook_manager = get_hook_manager()
    for name in node.inputs:
        hook_manager.hook.before_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name
        )
        inputs[name] = catalog.load(name)
        hook_manager.hook.after_dataset_loaded(  # pylint: disable=no-member
            dataset_name=name, data=inputs[name]
        )

    is_async = False

    additional_inputs = _collect_inputs_from_hook(
        node, catalog, inputs, is_async, run_id=run_id
    )
    inputs.update(additional_inputs)

    outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id)

    for name, data in outputs.items():
        hook_manager.hook.before_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
        catalog.save(name, data)
        hook_manager.hook.after_dataset_saved(  # pylint: disable=no-member
            dataset_name=name, data=data
        )
    return node
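The hook calls around catalog.load and catalog.save above only have an effect if a plugin registers implementations of the matching hook specs. Below is a minimal sketch of such a hook class, assuming a Kedro version that exposes kedro.framework.hooks.hook_impl; the class name and logging behaviour are illustrative, not part of the snippet above.

import logging

from kedro.framework.hooks import hook_impl


class DatasetLoggingHooks:
    """Log every dataset load/save performed around catalog.load / catalog.save."""

    @hook_impl
    def before_dataset_loaded(self, dataset_name: str) -> None:
        logging.getLogger(__name__).info("Loading dataset: %s", dataset_name)

    @hook_impl
    def after_dataset_saved(self, dataset_name: str, data) -> None:
        logging.getLogger(__name__).info("Saved dataset: %s", dataset_name)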
def test_add_save_and_load(self, data_set, dummy_dataframe):
    """Test adding and then saving and reloading the data set"""
    catalog = DataCatalog(data_sets={})
    catalog.add("test", data_set)
    catalog.save("test", dummy_dataframe)
    reloaded_df = catalog.load("test")
    assert_frame_equal(reloaded_df, dummy_dataframe)
def test_all_before_adding(self, fake_data_set, fake_transformer):
    catalog = DataCatalog()
    catalog.add_transformer(fake_transformer)
    catalog.add("test", fake_data_set)

    catalog.save("test", 42)
    assert catalog.load("test") == 44
    assert fake_data_set.log == [("save", 43), ("load", 43)]
    assert fake_transformer.log == [("save", 42), ("load", 43)]
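The expected values in test_all_before_adding (42 saved, 44 loaded back, 43 seen by the dataset) only make sense with fixtures that add 1 on every save and every load. The sketch below shows one possible pair of fixtures, assuming a Kedro version that still ships kedro.io.AbstractTransformer (transformers were later removed in favour of hooks); the class names are illustrative.

import pytest
from kedro.io import AbstractDataSet, AbstractTransformer


class FakeDataSet(AbstractDataSet):
    """In-memory dataset that records every load/save call."""

    def __init__(self, data):
        self.log = []
        self.data = data

    def _load(self):
        self.log.append(("load", self.data))
        return self.data

    def _save(self, data):
        self.log.append(("save", data))
        self.data = data

    def _describe(self):
        return {"data": self.data}


class FakeTransformer(AbstractTransformer):
    """Transformer that adds 1 on save and on load, and records both."""

    def __init__(self):
        self.log = []

    def load(self, data_set_name, load):
        data = load()
        self.log.append(("load", data))
        return data + 1

    def save(self, data_set_name, save, data):
        self.log.append(("save", data))
        save(data + 1)


@pytest.fixture
def fake_data_set():
    return FakeDataSet(data=None)


@pytest.fixture
def fake_transformer():
    return FakeTransformer()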
def all_catalog(dataframex, dataframey, dataframey_bad):
    # Build the DataFrames as they would live in Kedro once loaded into memory
    # https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html
    from kedro.io import DataCatalog, MemoryDataSet

    catalog = DataCatalog({
        "dataframex": MemoryDataSet(),
        "dataframey": MemoryDataSet(),
        "dataframey_bad": MemoryDataSet()
    })
    catalog.save("dataframex", dataframex)
    catalog.save("dataframey", dataframey)
    catalog.save("dataframey_bad", dataframey_bad)
    return catalog
def run_node(node: Node, catalog: DataCatalog) -> Node:
    """Run a single `Node` with inputs from and outputs to the `catalog`.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.

    Returns:
        The node argument.

    """
    inputs = {name: catalog.load(name) for name in node.inputs}
    outputs = node.run(inputs)
    for name, data in outputs.items():
        catalog.save(name, data)
    return node
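A quick way to see run_node and catalog.save interact is to run a one-node pipeline by hand. The sketch below is illustrative only: the double function and the dataset names are assumptions, not part of the snippet above.

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node


def double(x):
    return 2 * x


# One node reading "x" from the catalog and writing "doubled_x" back to it.
doubling_node = node(double, inputs="x", outputs="doubled_x")
catalog = DataCatalog({"x": MemoryDataSet(), "doubled_x": MemoryDataSet()})

catalog.save("x", 21)
run_node(doubling_node, catalog)
assert catalog.load("doubled_x") == 42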
def _run_one_task(self, config_filename):
    # create node from Task
    expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
    config, log = self._init_config_log(config_filename,
                                        self.base_directory,
                                        self.config_directory)
    # Prepare a data catalog
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'expanded_config': MemoryDataSet()})
    data_catalog.save('config', config)
    data_catalog.save('log', log)
    # Assemble nodes into a pipeline
    pipeline = Pipeline([expand_config_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    return log, data_catalog
def run(self):
    """
    Run all tasks
    """
    # data
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'base_directory': MemoryDataSet(),
                                'dataset': MemoryDataSet(),
                                'data': MemoryDataSet()})
    data_catalog.save('config', self.config)
    data_catalog.save('log', self.log)
    data_catalog.save('base_directory', self.base_directory)

    load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
    prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
    split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
    learn_node = mls.sl.workflows.tasks.LearnTask.get_node()
    evaluate_node = mls.sl.workflows.tasks.EvaluateTask.get_node()
    # Assemble nodes into a pipeline
    pipeline = Pipeline([load_data_node,
                         prepare_data_node,
                         split_data_node,
                         learn_node,
                         evaluate_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    self.terminate()
def _run_one_task(self, config_filename):
    # create node from Task
    load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
    prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
    split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
    learn_data_node = mls.sl.workflows.tasks.LearnTask.get_node()
    config, log = self._init_config_log(config_filename,
                                        self.base_directory,
                                        self.config_directory)
    # Prepare a data catalog
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'base_directory': MemoryDataSet()})
    data_catalog.save('config', config)
    data_catalog.save('log', log)
    data_catalog.save('base_directory', self.base_directory)
    # Assemble nodes into a pipeline
    pipeline = Pipeline([load_data_node,
                         prepare_data_node,
                         split_data_node,
                         learn_data_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    return log, config, data_catalog
def run(self):
    """
    Run the workflow: run each config
    """
    # data
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'base_directory': MemoryDataSet()})
    data_catalog.save('config', self.config)
    data_catalog.save('log', self.log)
    data_catalog.save('base_directory', self.base_directory)

    expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
    multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node()
    # Assemble nodes into a pipeline
    pipeline = Pipeline([expand_config_node, multiple_learning_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    result = runner.run(pipeline, data_catalog)
    if len(result) == 0:
        self.terminate()
def test_save_to_unregistered(self, dummy_dataframe):
    """Check the error when attempting to save to an unregistered data set"""
    catalog = DataCatalog(data_sets={})
    pattern = r"DataSet 'test' not found in the catalog"
    with pytest.raises(DataSetNotFoundError, match=pattern):
        catalog.save("test", dummy_dataframe)
join_statements_node = node(
    join_statements, inputs="my_salutation", outputs="my_message"
)

# adder_node = node(
#     func=add, inputs=["a", "b"], outputs="sum"
# )

generation_node = node(
    generation, inputs=["num_samples"], outputs=None
)

# Assemble nodes into a pipeline
# pipeline = Pipeline([return_greeting_node, join_statements_node])
# pipeline = Pipeline([return_greeting_node])
pipeline = Pipeline([generation_node])

# Create a runner to run the pipeline
runner = SequentialRunner()

# Run the pipeline
# print(runner.run(pipeline, data_catalog))
# print(runner.run(pipeline, DataCatalog(dict(a=2, b=3))))
# io = DataCatalog(dict(a=MemoryDataSet(), b=MemoryDataSet()))
io = DataCatalog(dict(num_samples=MemoryDataSet()))
io.save("num_samples", 1)
# print(adder_node.run(dict(a=2, b=3)))
print(runner.run(pipeline, io))
def _run_one_task(self, config_filename):
    # create node from Task
    expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
    multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node()
    # Prepare a data catalog
    config, log = self._init_config_log(config_filename,
                                        self.base_directory,
                                        self.config_directory)
    expanded_config = [
        {"input": {"type": "NClassRandomClassificationWithNoise",
                   "parameters": {"n_samples": 100,
                                  "shuffle": True,
                                  "random_state": 0,
                                  "noise": 0}},
         "split": {"type": "traintest",
                   "parameters": {"test_size": 20,
                                  "random_state": 0,
                                  "shuffle": True}},
         "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                       "hyperparameters": {"n_neighbors": 15,
                                           "algorithm": "auto",
                                           "weights": "uniform"}}},
        {"input": {"type": "make_circles",
                   "parameters": {"n_samples": 100,
                                  "shuffle": True,
                                  "noise": 0,
                                  "random_state": 0,
                                  "factor": 0.3}},
         "split": {"type": "traintest",
                   "parameters": {"test_size": 20,
                                  "random_state": 0,
                                  "shuffle": True}},
         "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                       "hyperparameters": {"n_neighbors": 15,
                                           "algorithm": "auto",
                                           "weights": "uniform"}}},
        {"input": {"type": "load_iris",
                   "parameters": {}},
         "split": {"type": "traintest",
                   "parameters": {"test_size": 20,
                                  "random_state": 0,
                                  "shuffle": True}},
         "algorithm": {"type": "sklearn.neighbors.KNeighborsClassifier",
                       "hyperparameters": {"n_neighbors": 15,
                                           "algorithm": "auto",
                                           "weights": "uniform"}}}
    ]
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'base_directory': MemoryDataSet(),
                                'expanded_config': MemoryDataSet()})
    data_catalog.save('config', config)
    data_catalog.save('log', log)
    data_catalog.save('base_directory', self.base_directory)
    data_catalog.save('expanded_config', expanded_config)
    # Assemble nodes into a pipeline
    pipeline = Pipeline([expand_config_node, multiple_learning_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    return log, data_catalog
def basic_catalog(basic_data):
    from kedro.io import DataCatalog, MemoryDataSet

    catalog = DataCatalog({"basic_data": MemoryDataSet()})
    catalog.save("basic_data", basic_data)
    return catalog
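As a hedged illustration of how such catalog helpers are typically consumed, the sketch below wraps basic_catalog as a pytest fixture and round-trips a DataFrame through catalog.save / catalog.load; the fixture data and the test name are assumptions, not part of the snippet above.

import pandas as pd
import pytest
from kedro.io import DataCatalog, MemoryDataSet


@pytest.fixture
def basic_data():
    return pd.DataFrame({"a": [1, 2], "b": [3, 4]})


@pytest.fixture
def basic_catalog(basic_data):
    catalog = DataCatalog({"basic_data": MemoryDataSet()})
    catalog.save("basic_data", basic_data)
    return catalog


def test_basic_data_round_trip(basic_catalog, basic_data):
    pd.testing.assert_frame_equal(basic_catalog.load("basic_data"), basic_data)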
class KedroPipelineModel(PythonModel):
    def __init__(
        self,
        pipeline: Pipeline,
        catalog: DataCatalog,
        input_name: str,
        runner: Optional[AbstractRunner] = None,
        copy_mode: Optional[Union[Dict[str, str], str]] = None,
    ):
        """[summary]

        Args:
            pipeline (Pipeline): A Kedro Pipeline object to store as a Mlflow Model.
                Also works with kedro_mlflow PipelineML objects.
            catalog (DataCatalog): The DataCatalog associated to the PipelineML
            runner (Optional[AbstractRunner], optional): The kedro AbstractRunner to use.
                Defaults to SequentialRunner if None.
            copy_mode (Optional[Union[Dict[str, str], str]]): The copy_mode of each
                DataSet of the catalog when reconstructing the DataCatalog in memory.
                You can pass either:
                    - None to use Kedro default mode for each dataset
                    - a single string ("deepcopy", "copy" or "assign") to apply to all datasets
                    - a dictionary with (dataset name, copy_mode) key/value pairs. The
                      associated mode must be a valid kedro mode ("deepcopy", "copy" or
                      "assign") for each. Defaults to None.
        """
        self.pipeline = (
            pipeline.inference if isinstance(pipeline, PipelineML) else pipeline
        )
        self.input_name = input_name
        self.initial_catalog = self._extract_pipeline_catalog(catalog)

        nb_outputs = len(self.pipeline.outputs())
        if nb_outputs != 1:
            outputs_list_str = "\n - ".join(self.pipeline.outputs())
            raise ValueError(
                f"Pipeline must have one and only one output, got '{nb_outputs}' outputs: \n - {outputs_list_str}"
            )
        self.output_name = list(self.pipeline.outputs())[0]
        self.runner = runner or SequentialRunner()

        self.copy_mode = copy_mode or {}
        # copy mode has been converted because it is a property
        # TODO: we need to use the runner's default dataset in case of multithreading
        self.loaded_catalog = DataCatalog(
            data_sets={
                name: MemoryDataSet(copy_mode=copy_mode)
                for name, copy_mode in self.copy_mode.items()
            }
        )

    @property
    def _logger(self) -> logging.Logger:
        return logging.getLogger(__name__)

    @property
    def copy_mode(self):
        return self._copy_mode

    @copy_mode.setter
    def copy_mode(self, copy_mode):
        if isinstance(copy_mode, str) or copy_mode is None:
            # if it is a string, we must create manually the dictionary
            # of all catalog entries with this copy_mode
            self._copy_mode = {
                name: copy_mode
                for name in self.pipeline.data_sets()
                if name != self.output_name
            }
        elif isinstance(copy_mode, dict):
            # if it is a dict we will retrieve the copy mode when necessary
            # it does not matter if this dict does not contain all the catalog entries
            # the others will be returned as None when accessing with dict.get()
            self._copy_mode = {
                name: None
                for name in self.pipeline.data_sets()
                if name != self.output_name
            }
            self._copy_mode.update(copy_mode)
        else:
            raise TypeError(
                f"'copy_mode' must be a 'str' or a 'dict', not '{type(copy_mode)}'"
            )

    def _extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
        sub_catalog = DataCatalog()
        for data_set_name in self.pipeline.inputs():
            if data_set_name == self.input_name:
                # there is no obligation that this dataset is persisted
                # and even if it is, we keep only an empty memory dataset to avoid
                # extra unnecessary dependencies: this dataset will be replaced at
                # inference time and we do not need to know the original type, see
                # https://github.com/Galileo-Galilei/kedro-mlflow/issues/273
                sub_catalog.add(data_set_name=data_set_name, data_set=MemoryDataSet())
            else:
                try:
                    data_set = catalog._data_sets[data_set_name]
                    if isinstance(data_set, MemoryDataSet) and not data_set_name.startswith(
                        "params:"
                    ):
                        raise KedroPipelineModelError(
                            """
                The datasets of the training pipeline must be persisted locally
                to be used by the inference pipeline. You must enforce them as
                non 'MemoryDataSet' in the 'catalog.yml'.
                Dataset '{data_set_name}' is not persisted currently.
                """.format(data_set_name=data_set_name)
                        )
                    self._logger.info(
                        f"The data_set '{data_set_name}' is added to the Pipeline catalog."
                    )
                    sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
                except KeyError:
                    raise KedroPipelineModelError(
                        f"The provided catalog must contain '{data_set_name}' data_set "
                        "since it is the input of the pipeline."
                    )

        return sub_catalog

    def extract_pipeline_artifacts(self, parameters_saving_folder: Optional[Path] = None):
        artifacts = {}
        for name, dataset in self.initial_catalog._data_sets.items():
            if name != self.input_name:
                if name.startswith("params:"):
                    # we need to persist it locally for mlflow access
                    absolute_param_path = (
                        parameters_saving_folder / f"params_{name[7:]}.pkl"
                    )
                    persisted_dataset = PickleDataSet(
                        filepath=absolute_param_path.as_posix()
                    )
                    persisted_dataset.save(dataset.load())
                    artifact_path = absolute_param_path.as_uri()
                    self._logger.info(
                        f"The parameter '{name[7:]}' is persisted (as pickle) "
                        f"at the following location: '{artifact_path}'"
                    )
                else:
                    # In this second case, we know it cannot be a MemoryDataSet
                    # weird bug when directly converting PurePosixPath to windows:
                    # it is considered as relative
                    artifact_path = (
                        Path(dataset._filepath.as_posix()).resolve().as_uri()
                    )

                artifacts[name] = artifact_path

        return artifacts

    def load_context(self, context):
        # a consistency check is made when loading the model
        # it would be better to check when saving the model
        # but we rely on a mlflow function for saving, and it is unaware of kedro
        # pipeline structure
        mlflow_artifacts_keys = set(context.artifacts.keys())
        kedro_artifacts_keys = set(self.pipeline.inputs() - {self.input_name})
        if mlflow_artifacts_keys != kedro_artifacts_keys:
            in_artifacts_but_not_inference = (
                mlflow_artifacts_keys - kedro_artifacts_keys
            )
            in_inference_but_not_artifacts = (
                kedro_artifacts_keys - mlflow_artifacts_keys
            )
            raise ValueError(
                "Provided artifacts do not match catalog entries:"
                f"\n - 'artifacts - inference.inputs()' = : {in_artifacts_but_not_inference}"
                f"\n - 'inference.inputs() - artifacts' = : {in_inference_but_not_artifacts}"
            )

        updated_catalog = self.initial_catalog.shallow_copy()
        for name, uri in context.artifacts.items():
            updated_catalog._data_sets[name]._filepath = Path(uri)
            self.loaded_catalog.save(name=name, data=updated_catalog.load(name))

    def predict(self, context, model_input):
        # we create an empty hook manager but do NOT register hooks
        # because we want this model to be executable outside of a kedro project
        hook_manager = _create_hook_manager()

        self.loaded_catalog.save(
            name=self.input_name,
            data=model_input,
        )

        run_output = self.runner.run(
            pipeline=self.pipeline,
            catalog=self.loaded_catalog,
            hook_manager=hook_manager,
        )

        # unpack the result to avoid messing the json
        # file with the name of the Kedro dataset
        unpacked_output = run_output[self.output_name]

        return unpacked_output
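For context, a hedged sketch of how a KedroPipelineModel like the one above is usually handed to MLflow: build the wrapper, extract its artifacts, and log it with mlflow.pyfunc.log_model. The inference_pipeline, catalog, and "instances" names are assumptions about the calling project, not part of the class itself.

from pathlib import Path

import mlflow

kedro_model = KedroPipelineModel(
    pipeline=inference_pipeline,   # assumed: the project's inference Pipeline
    catalog=catalog,               # assumed: the project's DataCatalog
    input_name="instances",        # assumed: name of the model-input dataset
)
artifacts = kedro_model.extract_pipeline_artifacts(Path.cwd())

with mlflow.start_run():
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=kedro_model,
        artifacts=artifacts,
    )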
class KedroPipelineModel(PythonModel):
    def __init__(
        self,
        pipeline_ml: PipelineML,
        catalog: DataCatalog,
        runner: Optional[AbstractRunner] = None,
        copy_mode: Optional[Union[Dict[str, str], str]] = None,
    ):
        """[summary]

        Args:
            pipeline_ml (PipelineML): A PipelineML object to store as a Mlflow Model
            catalog (DataCatalog): The DataCatalog associated to the PipelineML
            runner (Optional[AbstractRunner], optional): The kedro AbstractRunner to use.
                Defaults to SequentialRunner if None.
            copy_mode (Optional[Union[Dict[str, str], str]]): The copy_mode of each
                DataSet of the catalog when reconstructing the DataCatalog in memory.
                You can pass either:
                    - None to use Kedro default mode for each dataset
                    - a single string ("deepcopy", "copy" or "assign") to apply to all datasets
                    - a dictionary with (dataset name, copy_mode) key/value pairs. The
                      associated mode must be a valid kedro mode ("deepcopy", "copy" or
                      "assign") for each. Defaults to None.
        """
        self.pipeline_ml = pipeline_ml
        self.initial_catalog = pipeline_ml._extract_pipeline_catalog(catalog)
        # we have the guarantee that there is only one output in inference
        self.output_name = list(pipeline_ml.inference.outputs())[0]
        self.runner = runner or SequentialRunner()

        self.copy_mode = copy_mode or {}
        # copy mode has been converted because it is a property
        # TODO: we need to use the runner's default dataset in case of multithreading
        self.loaded_catalog = DataCatalog(
            data_sets={
                name: MemoryDataSet(copy_mode=copy_mode)
                for name, copy_mode in self.copy_mode.items()
            }
        )

    @property
    def copy_mode(self):
        return self._copy_mode

    @copy_mode.setter
    def copy_mode(self, copy_mode):
        if isinstance(copy_mode, str) or copy_mode is None:
            # if it is a string, we must create manually the dictionary
            # of all catalog entries with this copy_mode
            self._copy_mode = {
                name: copy_mode
                for name in self.pipeline_ml.inference.data_sets()
                if name != self.output_name
            }
        elif isinstance(copy_mode, dict):
            # if it is a dict we will retrieve the copy mode when necessary
            # it does not matter if this dict does not contain all the catalog entries
            # the others will be returned as None when accessing with dict.get()
            self._copy_mode = {
                name: None
                for name in self.pipeline_ml.inference.data_sets()
                if name != self.output_name
            }
            self._copy_mode.update(copy_mode)
        else:
            raise TypeError(
                f"'copy_mode' must be a 'str' or a 'dict', not '{type(copy_mode)}'"
            )

    def load_context(self, context):
        # a consistency check is made when loading the model
        # it would be better to check when saving the model
        # but we rely on a mlflow function for saving, and it is unaware of kedro
        # pipeline structure
        mlflow_artifacts_keys = set(context.artifacts.keys())
        kedro_artifacts_keys = set(
            self.pipeline_ml.inference.inputs() - {self.pipeline_ml.input_name}
        )
        if mlflow_artifacts_keys != kedro_artifacts_keys:
            in_artifacts_but_not_inference = (
                mlflow_artifacts_keys - kedro_artifacts_keys
            )
            in_inference_but_not_artifacts = (
                kedro_artifacts_keys - mlflow_artifacts_keys
            )
            raise ValueError(
                "Provided artifacts do not match catalog entries:"
                f"\n - 'artifacts - inference.inputs()' = : {in_artifacts_but_not_inference}"
                f"\n - 'inference.inputs() - artifacts' = : {in_inference_but_not_artifacts}"
            )

        updated_catalog = deepcopy(self.initial_catalog)
        for name, uri in context.artifacts.items():
            updated_catalog._data_sets[name]._filepath = Path(uri)
            self.loaded_catalog.save(name=name, data=updated_catalog.load(name))

    def predict(self, context, model_input):
        # TODO: check out how to pass extra args in predict,
        # for instance to enable parallelization
        self.loaded_catalog.save(
            name=self.pipeline_ml.input_name,
            data=model_input,
        )
        run_output = self.runner.run(
            pipeline=self.pipeline_ml.inference, catalog=self.loaded_catalog
        )

        # unpack the result to avoid messing the json
        # file with the name of the Kedro dataset
        unpacked_output = run_output[self.output_name]

        return unpacked_output