def test_bad_confirm(self, sane_config, dataset_name, pattern):
    """Test confirming a non-existent dataset or one that does not have a `confirm` method"""
    data_catalog = DataCatalog.from_config(**sane_config)
    with pytest.raises(DataSetError, match=re.escape(pattern)):
        data_catalog.confirm(dataset_name)
def _run(  # pylint: disable=too-many-locals,useless-suppression
    self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
) -> None:
    """The abstract interface for running pipelines.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.
        run_id: The id of the run.

    Raises:
        AttributeError: When the provided pipeline is not suitable for
            parallel execution.
        RuntimeError: If the runner is unable to schedule the execution of
            all pipeline nodes.
        Exception: In case of any downstream node failure.
    """
    # pylint: disable=import-outside-toplevel,cyclic-import
    from kedro.framework.session.session import get_current_session

    nodes = pipeline.nodes
    self._validate_catalog(catalog, pipeline)
    self._validate_nodes(nodes)

    load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))
    node_dependencies = pipeline.node_dependencies
    todo_nodes = set(node_dependencies.keys())
    done_nodes = set()  # type: Set[Node]
    futures = set()
    done = None
    max_workers = self._get_required_workers_count(pipeline)

    from kedro.framework.project import PACKAGE_NAME

    session = get_current_session(silent=True)
    # pylint: disable=protected-access
    conf_logging = session._get_logging_config() if session else None

    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        while True:
            ready = {n for n in todo_nodes if node_dependencies[n] <= done_nodes}
            todo_nodes -= ready
            for node in ready:
                futures.add(
                    pool.submit(
                        _run_node_synchronization,
                        node,
                        catalog,
                        self._is_async,
                        run_id,
                        package_name=PACKAGE_NAME,
                        conf_logging=conf_logging,
                    )
                )
            if not futures:
                if todo_nodes:
                    debug_data = {
                        "todo_nodes": todo_nodes,
                        "done_nodes": done_nodes,
                        "ready_nodes": ready,
                        "done_futures": done,
                    }
                    debug_data_str = "\n".join(
                        f"{k} = {v}" for k, v in debug_data.items()
                    )
                    raise RuntimeError(
                        f"Unable to schedule new tasks although some nodes "
                        f"have not been run:\n{debug_data_str}"
                    )
                break  # pragma: no cover
            done, futures = wait(futures, return_when=FIRST_COMPLETED)
            for future in done:
                try:
                    node = future.result()
                except Exception:
                    self._suggest_resume_scenario(pipeline, done_nodes)
                    raise
                done_nodes.add(node)

                # Decrement load counts and release any data sets we've finished with;
                # this is particularly important for the shared datasets we create above.
                for data_set in node.inputs:
                    load_counts[data_set] -= 1
                    if load_counts[data_set] < 1 and data_set not in pipeline.inputs():
                        catalog.release(data_set)
                for data_set in node.outputs:
                    if load_counts[data_set] < 1 and data_set not in pipeline.outputs():
                        catalog.release(data_set)
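For orientation, a minimal, hypothetical usage sketch of the runner this `_run` method backs; the pipeline, node function, and dataset names below are illustrative assumptions, not taken from the snippet above.

# Hypothetical usage sketch: the public AbstractRunner.run() entry point
# eventually delegates to the _run() method shown above. Node functions must
# be importable/picklable for process-based execution, hence the top-level
# definition and the __main__ guard.
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import ParallelRunner


def double(x):
    return x * 2


if __name__ == "__main__":
    pipeline = Pipeline(
        [
            node(double, inputs="numbers", outputs="doubled"),
            node(double, inputs="doubled", outputs="quadrupled"),
        ]
    )
    catalog = DataCatalog({"numbers": MemoryDataSet(21)})
    print(ParallelRunner().run(pipeline, catalog))  # expected: {"quadrupled": 84}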
def data_catalog_from_config(sane_config):
    return DataCatalog.from_config(**sane_config)
def register_catalog(
    self, catalog, credentials, load_versions, save_version, journal
) -> DataCatalog:
    return DataCatalog.from_config(
        catalog, credentials, load_versions, save_version, journal
    )
def catalog():
    return DataCatalog()
def test_not_found_error(self, fake_transformer):
    catalog = DataCatalog()
    with pytest.raises(DataSetNotFoundError):
        catalog.add_transformer(fake_transformer, "test")
def catalog(fake_data_set):
    return DataCatalog({"test": fake_data_set})
def test_missing_credentials(self, sane_config):
    """Check the error if credentials can't be located"""
    sane_config["catalog"]["cars"]["credentials"] = "missing"
    with pytest.raises(KeyError, match=r"Unable to find credentials"):
        DataCatalog.from_config(**sane_config)
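An illustrative sketch of why the test above raises: the string under a dataset's "credentials" key must match a key in the credentials mapping passed to DataCatalog.from_config. The entry names and the dataset type string below are assumptions (the type shorthand assumes a Kedro version that ships kedro.extras.datasets), not the test fixtures themselves.

# Assumed, minimal config sketch: "dev_s3" appears in both mappings, so
# from_config can resolve it; replacing it with a missing name reproduces the
# KeyError asserted above. Loading an s3:// path additionally requires s3fs.
from kedro.io import DataCatalog

catalog_config = {
    "cars": {
        "type": "pandas.CSVDataSet",
        "filepath": "s3://my-bucket/cars.csv",
        "credentials": "dev_s3",
    }
}
credentials = {"dev_s3": {"key": "AKIA...", "secret": "***"}}

catalog = DataCatalog.from_config(catalog_config, credentials)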
def test_idempotent_catalog(self, sane_config):
    """Test that data catalog instantiations are idempotent"""
    _ = DataCatalog.from_config(**sane_config)  # NOQA
    catalog = DataCatalog.from_config(**sane_config)
    assert catalog
def test_config_invalid_module(self, sane_config):
    """Check the error if the type points to a nonexistent module"""
    sane_config["catalog"]["boats"][
        "type"
    ] = "kedro.invalid_module_name.io.CSVLocalDataSet"
    with pytest.raises(DataSetError, match=r"Cannot import module"):
        DataCatalog.from_config(**sane_config)
def test_empty_config(self):
    """Test that an empty config still produces a (truthy) catalog"""
    assert DataCatalog.from_config(None)
def test_save_to_unregistered(self, dummy_dataframe):
    """Check the error when attempting to save to an unregistered data set"""
    catalog = DataCatalog(data_sets={})
    pattern = r"DataSet 'test' not found in the catalog"
    with pytest.raises(DataSetNotFoundError, match=pattern):
        catalog.save("test", dummy_dataframe)
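A hedged counterpart sketch (dataset name and DataFrame are assumed for illustration): once a dataset is registered under the name being saved to, DataCatalog.save succeeds instead of raising DataSetNotFoundError.

# Registering the missing dataset first makes the save succeed.
import pandas as pd
from kedro.io import DataCatalog, MemoryDataSet

catalog = DataCatalog(data_sets={})
catalog.add("test", MemoryDataSet())            # register the dataset
catalog.save("test", pd.DataFrame({"a": [1]}))  # now succeeds
print(catalog.load("test"))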
def test_mlflow_hook_metrics_dataset_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline, dummy_run_params
):
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
                "my_metric": MlflowMetricDataSet(run_id=existing_run_id),
                "another_metric": MlflowMetricDataSet(
                    run_id=existing_run_id, key="foo"
                ),
                "my_metric_history": MlflowMetricHistoryDataSet(run_id=existing_run_id),
                "another_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id, key="bar"
                ),
            }
        )

        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog_with_run_id,
        )
        runner.run(dummy_pipeline, dummy_catalog_with_run_id, session._hook_manager)

        current_run_id = mlflow.active_run().info.run_id

        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog_with_run_id,
        )

        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        # The first run is created in the Default experiment (id 0), but the one
        # initialised in before_pipeline_run is created in the kedro_project
        # experiment (id 1).
        all_runs_id = {
            run.run_id
            for k in range(2)
            for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
        }

        # The metrics are supposed to have been logged inside existing_run_id.
        run_data = mlflow_client.get_run(existing_run_id).data

        # Check that metric datasets are prefixed with their names.
        assert all_runs_id == {current_run_id, existing_run_id}
        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1
        assert run_data.metrics["my_metric"] == 1.1
        assert run_data.metrics["foo"] == 1.1
        # The full history is stored, but only the last value is retrieved.
        assert run_data.metrics["my_metric_history"] == 0.2
        assert run_data.metrics["bar"] == 0.2
def preprocess_raw_data(parameters: Dict): import glob, os os.chdir(parameters["path_raw"]) files = [] for file in glob.glob("*.csv"): files.append(file) filenames = [] wells_data = [] for file in files: filename, extension = file.split('.') filenames.append(filename) for file, filename in zip(files, filenames): io = DataCatalog({ filename: CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file), }) well_data = io.load(filename) wells_data.append(well_data) Raw_Data_preprocessed = [] Raw_Data_dated = [] wells_life = [] wells_data_ = [] for well in wells_data: # well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer', # 'Bottom Hole Heel Temperature', 'Emulsion Pressure', 'Producer Bottom Hole Pressure', # 'ESP Current', 'Emulsion Flow Rate']] # well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer', 'Emulsion Flow Rate']] well = well[[ 'Date', 'IBHP', 'PBHP', 'Steam [m3/d]', 'Emulsion [m3/d]' ]] for i in range(1, len(well.columns)): well[well.columns[i]] = pd.to_numeric(well[well.columns[i]], errors='coerce') well['Date'] = pd.to_datetime(well['Date']) well = well.set_index('Date') # well = well.resample('7D').mean() # weekly data # well = well.resample('30D').mean() # monthly data wells_life.append(len(well)) wells_data_.append(well) min_well_length = np.min(np.array(wells_life)) timesteps = 983 for well, file, filename in zip(wells_data_, files, filenames): # well = well.iloc[:min_well_length] # use minimum well life well = well.iloc[:timesteps] # daily, weekly, monthly data # well = well.fillna(0) well = well.fillna(well.rolling(30, min_periods=1).median()) well = well.fillna(well.median()) well_dated = well.copy() well_dated["Well"] = filename # create a column for well name data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] + "/pre_processed_data_" + file) data_set.save(well_dated) Raw_Data_dated.append(well_dated) Raw_Data_preprocessed.append(well) os.chdir(parameters["path_raw_static"]) static_files = [] for static_file in glob.glob("*.csv"): static_files.append(static_file) static_filenames = [] statics_data = [] for static_file in static_files: static_filename, others = static_file.split('_') static_filenames.append(static_filename) for static_file, static_filename in zip(static_files, static_filenames): io = DataCatalog({ static_filename: CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" + static_file), }) static_data = io.load(static_filename) statics_data.append(static_data) statics_data_new = [] well_name_list = [] for pad_static in statics_data: well_name = pad_static['WELLPAIR_NAME'].values well_name_list.append(well_name) pad_static = pad_static.set_index('WELLPAIR_NAME') pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE']) statics_data_new.append(pad_static) properties = [] probabilities = [] asset_names = [] for pad_static, names in zip(statics_data_new, well_name_list): for well in names: prob = pad_static.loc[well, 'Forecast_Prob'] probabilities.append(prob) pad_code = pad_static.loc[well, 'PAD_CODE'] asset_name, pad = pad_code.split('_') asset_names.append(asset_name) property_ = pad_static.loc[ well, 'Effective_Length':'BottomWater_Oil_Saturation'].values properties.append(property_) properties = np.array(properties) return [ timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities, asset_names, properties ]
def preprocess_raw_data(parameters: Dict): import glob, os os.chdir(parameters["path_raw"]) files = [] for file in glob.glob("*.csv"): files.append(file) filenames = [] wells_data = [] for file in files: filename, extension = file.split('.') filenames.append(filename) for file, filename in zip(files, filenames): io = DataCatalog({ filename: CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file), }) well_data = io.load(filename) wells_data.append(well_data) Raw_Data_preprocessed = [] Raw_Data_dated = [] for well, file, filename in zip(wells_data, files, filenames): well = well[[ 'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer', 'Bottom Hole Heel Temperature', 'Emulsion Pressure', 'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate' ]] for i in range(1, len(well.columns)): well[well.columns[i]] = pd.to_numeric(well[well.columns[i]], errors='coerce') well = well.iloc[:1399] well = well.fillna(well.rolling(30, min_periods=1).median()) well = well.fillna(well.median()) well_dated = well.copy() well_dated["Well"] = filename # create a column for well name data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] + "/pre_processed_data_" + file) data_set.save(well_dated) Raw_Data_dated.append(well_dated) well['Date'] = pd.to_datetime(well['Date']) well = well.set_index('Date') Raw_Data_preprocessed.append(well) os.chdir(parameters["path_raw_static"]) static_files = [] for static_file in glob.glob("*.csv"): static_files.append(static_file) static_filenames = [] statics_data = [] for static_file in static_files: static_filename, others = static_file.split('_') static_filenames.append(static_filename) for static_file, static_filename in zip(static_files, static_filenames): io = DataCatalog({ static_filename: CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" + static_file), }) static_data = io.load(static_filename) statics_data.append(static_data) statics_data_new = [] well_name_list = [] for pad_static in statics_data: well_name = pad_static['WELLPAIR_NAME'].values well_name_list.append(well_name) pad_static = pad_static.set_index('WELLPAIR_NAME') pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE']) statics_data_new.append(pad_static) properties = [] probabilities = [] asset_names = [] for pad_static, names in zip(statics_data_new, well_name_list): for well in names: prob = pad_static.loc[well, 'Forecast_Prob'] probabilities.append(prob) pad_code = pad_static.loc[well, 'PAD_CODE'] asset_name, pad = pad_code.split('_') asset_names.append(asset_name) property_ = pad_static.loc[ well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values properties.append(property_) properties = np.array(properties) return [ Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities, asset_names, properties ]
def test_error_dataset_init(self, bad_config):
    """Check the error when trying to instantiate an erroneous data set"""
    pattern = (
        r"Failed to instantiate DataSet \'bad\' "
        r"of type `.*BadDataSet`"
    )
    with pytest.raises(DataSetError, match=pattern):
        DataCatalog.from_config(bad_config, None)
def register_catalog(  # pylint: disable=no-self-use, too-many-arguments
    self, catalog, credentials, load_versions, save_version, journal
):
    return DataCatalog.from_config(
        catalog, credentials, load_versions, save_version, journal
    )
def multi_catalog(mocker):
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal)
def test_not_found_error_in_constructor(self):
    with pytest.raises(DataSetNotFoundError):
        DataCatalog(transformers={"test": []})
def memory_catalog():
    ds1 = MemoryDataSet({"data": 42})
    ds2 = MemoryDataSet([1, 2, 3, 4, 5])
    return DataCatalog({"ds1": ds1, "ds2": ds2})
"""Contents of hello_kedro.py""" # prerequisite # kedro jupyter convert notebooks/generationCopy1.ipynb from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node, Pipeline from kedro.runner import SequentialRunner from src.kedro_bioimage.nodes.generationCopy1 import generation # Prepare a data catalog data_catalog = DataCatalog({"example_data": MemoryDataSet()}) # Prepare first node def return_greeting(): return "Hello" return_greeting_node = node( return_greeting, inputs=None, outputs="my_salutation" ) # Prepare second node def join_statements(greeting): return f"{greeting} Kedro!" join_statements_node = node( join_statements, inputs="my_salutation", outputs="my_message" ) #
"""Contents of hello_kedro.py""" from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node, Pipeline from kedro.runner import SequentialRunner # Prepare a data catalog data_catalog = DataCatalog({"my_salutation": MemoryDataSet()}) # Prepare first node def return_greeting(): return "Hello" return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation") # Prepare second node def join_statements(greeting): return f"{greeting} Kedro!" join_statements_node = node( join_statements, inputs="my_salutation", outputs="my_message" ) # Assemble nodes into a pipeline pipeline = Pipeline([return_greeting_node, join_statements_node]) # Create a runner to run the pipeline runner = SequentialRunner()
def test_node_returning_none(self):
    pipeline = Pipeline([node(identity, "A", "B"), node(return_none, "B", "C")])
    catalog = DataCatalog({"A": MemoryDataSet("42")})
    pattern = "Saving `None` to a `DataSet` is not allowed"
    with pytest.raises(DataSetError, match=pattern):
        ThreadRunner().run(pipeline, catalog)
def test_config_good_version(self):
    config = yaml.safe_load(StringIO(YML_CONFIG_VERSIONED))
    catalog = DataCatalog.from_config(config, load_versions={"test_ds": "42"})
    assert catalog._data_sets["test_ds"]._dataset._version.load == "42"
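YML_CONFIG_VERSIONED itself is not shown above; the sketch below uses an assumed, minimal versioned config of a similar shape (the `._dataset` attribute in the assertion suggests a wrapping dataset such as CachedDataSet, so that shape is assumed here) to show how load_versions pins which version gets loaded.

# Hypothetical stand-in for a versioned catalog config; this is NOT the actual
# YML_CONFIG_VERSIONED constant from the test module.
from io import StringIO

import yaml
from kedro.io import DataCatalog

ASSUMED_YML_CONFIG_VERSIONED = """
test_ds:
  type: CachedDataSet
  versioned: true
  dataset:
    type: pandas.CSVDataSet
    filepath: data/01_raw/test.csv
"""

config = yaml.safe_load(StringIO(ASSUMED_YML_CONFIG_VERSIONED))
# Pin the load version of "test_ds" to "42"; saves still get a fresh version.
catalog = DataCatalog.from_config(config, load_versions={"test_ds": "42"})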
def test_memory_data_set_input(self, fan_out_fan_in):
    catalog = DataCatalog({"A": MemoryDataSet("42")})
    result = ThreadRunner().run(fan_out_fan_in, catalog)
    assert "Z" in result
    assert result["Z"] == ("42", "42", "42")
def load_well_validation_data(dummy2, timesteps, parameters: Dict):
    import glob, os

    os.chdir(parameters["path_model_input"])
    files_val = []
    for file in glob.glob("*.csv"):
        files_val.append(file)

    filenames_val = []
    wells_data = []
    for file in files_val:
        filename, extension = file.split('.')
        filenames_val.append(filename)
    for file, filename in zip(files_val, filenames_val):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed_val = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer', 'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.dropna(axis=0)  # may change
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)

    min_well_length = np.min(np.array(wells_life))
    if min_well_length < timesteps:
        timesteps_validation = min_well_length
    else:
        timesteps_validation = timesteps
    for well, file, filename in zip(wells_data_, files_val, filenames_val):
        well = well.iloc[:timesteps_validation]  # daily data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        Raw_Data_preprocessed_val.append(well)

    stats_validation = CSVLocalDataSet(filepath=parameters["path_val_stats"] +
                                       "/static_P50_data_validation.csv")
    stats_val = stats_validation.load()
    stats_val_ROIP = stats_val.loc[:, 'ROIP']
    stats_val = stats_val.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats_val = stats_val.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats_val['Rich_Pay_Thickness'] - stats_val['Stand_Off']
    # stats_val.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats_val = stats_val.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names_val = list(stats_val.columns)
    properties_val = list(stats_val.values)
    # properties_val = stats.loc[:, ['Effective_Length', 'Spacing', 'Effective_Rich_Pay_Thickness',
    #                                'Non_Rich_Pay_Thickness', 'Rich_Vertical_Permeability',
    #                                'Non_Rich_Vertical_Permeability', 'Rich_Porosity',
    #                                'Non_Rich_Porosity', 'Rich_Oil_Saturation',
    #                                'Non_Rich_Oil_Saturation']].values
    properties_val = np.array(properties_val)
    dummy11 = files_val

    return [
        dummy11, timesteps_validation, Raw_Data_preprocessed_val, files_val,
        filenames_val, properties_val, stats_val_ROIP, property_names_val
    ]
def data_catalog(data_set):
    return DataCatalog(data_sets={"test": data_set})
from kedro.io import DataCatalog
from kedro.extras.datasets.pandas import CSVDataSet

io = DataCatalog({"titanic_training_data": CSVDataSet(filepath="train.csv")})

# Load your file and print the output
df = io.load("titanic_training_data")
print(df.head())
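As a hedged follow-on to the snippet above, the same catalog API can also write a DataFrame back out through a registered dataset; the "titanic_cleaned" entry and its output path are assumptions added for illustration, not part of the original example.

# Assumed example: register an output CSVDataSet and save a modified DataFrame.
from kedro.io import DataCatalog
from kedro.extras.datasets.pandas import CSVDataSet

io = DataCatalog(
    {
        "titanic_training_data": CSVDataSet(filepath="train.csv"),
        "titanic_cleaned": CSVDataSet(filepath="train_cleaned.csv"),  # assumed path
    }
)

df = io.load("titanic_training_data")
io.save("titanic_cleaned", df.dropna())  # writes train_cleaned.csv to disk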
def test_load_from_unregistered(self):
    """Check the error when attempting to load an unregistered data set"""
    catalog = DataCatalog(data_sets={})
    pattern = r"DataSet 'test' not found in the catalog"
    with pytest.raises(DataSetNotFoundError, match=pattern):
        catalog.load("test")
def test_missing_nested_credentials(self, sane_config_with_nested_creds):
    del sane_config_with_nested_creds["credentials"]["other_credentials"]
    pattern = "Unable to find credentials 'other_credentials'"
    with pytest.raises(KeyError, match=pattern):
        DataCatalog.from_config(**sane_config_with_nested_creds)