def test_no_param_datasets_in_response(
    self, fake_cli_invoke, fake_load_context, mocker
):
    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
    mocked_context = fake_load_context.return_value
    catalog_data_sets = {
        "iris_data": CSVDataSet("test.csv"),
        "parameters": MemoryDataSet(),
        "params:data_ratio": MemoryDataSet(),
        "intermediate": MemoryDataSet(),
        "not_used": CSVDataSet("test2.csv"),
    }
    pl_obj_data_sets = catalog_data_sets.keys() - {"not_used"}
    mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
    mocked_context.pipelines.keys.return_value = (self.PIPELINE_NAME,)
    mocked_pl_obj = mocked_context.pipelines.get.return_value
    mocked_pl_obj.data_sets.return_value = pl_obj_data_sets

    result = fake_cli_invoke(["catalog", "list"])

    assert not result.exit_code
    # 'parameters' and 'params:data_ratio' should not appear in the response
    expected_dict = {
        "DataSets in 'pipeline' pipeline": {
            "Datasets mentioned in pipeline": {
                "CSVDataSet": ["iris_data"],
                "MemoryDataSet": ["intermediate"],
            },
            "Datasets not mentioned in pipeline": {"CSVDataSet": ["not_used"]},
        }
    }
    yaml_dump_mock.assert_called_once_with(expected_dict)
def test_no_missing_datasets(self, fake_cli_invoke, fake_load_context, fake_repo_path):
    mocked_context = fake_load_context.return_value

    catalog_data_sets = {
        "input_data": CSVDataSet("test.csv"),
        "output_data": CSVDataSet("test2.csv"),
    }
    mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
    mocked_context.pipelines = {
        self.PIPELINE_NAME: Pipeline([node(identity, "input_data", "output_data")])
    }
    mocked_context.project_path = fake_repo_path
    mocked_context.CONF_ROOT = "conf"
    data_catalog_file = (
        fake_repo_path / "conf" / "base" / "catalog" / f"{self.PIPELINE_NAME}.yml"
    )

    result = fake_cli_invoke(["catalog", "create", "--pipeline", self.PIPELINE_NAME])

    assert not result.exit_code
    assert not data_catalog_file.exists()
def test_no_missing_datasets(
    self,
    fake_project_cli,
    fake_metadata,
    fake_load_context,
    fake_repo_path,
    mock_pipelines,
):
    mocked_context = fake_load_context.return_value

    catalog_data_sets = {
        "input_data": CSVDataSet("test.csv"),
        "output_data": CSVDataSet("test2.csv"),
    }
    mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
    mocked_context.project_path = fake_repo_path
    mock_pipelines[self.PIPELINE_NAME] = Pipeline(
        [node(identity, "input_data", "output_data")]
    )
    data_catalog_file = (
        fake_repo_path / "conf" / "base" / "catalog" / f"{self.PIPELINE_NAME}.yml"
    )

    result = CliRunner().invoke(
        fake_project_cli,
        ["catalog", "create", "--pipeline", self.PIPELINE_NAME],
        obj=fake_metadata,
    )

    assert not result.exit_code
    assert not data_catalog_file.exists()
def test_save_options_csv(self, tmp_path, sample_spark_df):
    # To cross-check the correct Spark save operation, we save to a single
    # Spark partition in CSV format and retrieve it with Kedro's CSVDataSet
    temp_dir = Path(str(tmp_path / "test_data"))
    spark_data_set = SparkDataSet(
        filepath=str(temp_dir),
        file_format="csv",
        save_args={"sep": "|", "header": True},
    )
    spark_df = sample_spark_df.coalesce(1)
    spark_data_set.save(spark_df)

    single_csv_file = [
        f for f in temp_dir.iterdir() if f.is_file() and f.suffix == ".csv"
    ][0]

    csv_local_data_set = CSVDataSet(
        filepath=str(single_csv_file), load_args={"sep": "|"}
    )
    pandas_df = csv_local_data_set.load()

    assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
def catalog_with_encoder():
    return DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "encoder": CSVDataSet("fake/path/to/encoder.csv"),
            "model": CSVDataSet("fake/path/to/model.csv"),
        }
    )
def test_catalog_release(self, mocker):
    fs_mock = mocker.patch("fsspec.filesystem").return_value
    filepath = "test.csv"
    data_set = CSVDataSet(filepath=filepath)
    assert data_set._version_cache.currsize == 0  # no cache if unversioned
    data_set.release()
    fs_mock.invalidate_cache.assert_called_once_with(filepath)
    assert data_set._version_cache.currsize == 0
def test_load_options_csv(self, tmp_path, sample_pandas_df):
    filepath = str(tmp_path / "data")
    local_csv_data_set = CSVDataSet(filepath=filepath)
    local_csv_data_set.save(sample_pandas_df)

    spark_data_set = SparkDataSet(
        filepath=filepath, file_format="csv", load_args={"header": True}
    )
    spark_df = spark_data_set.load()

    assert spark_df.filter(col("Name") == "Alex").count() == 1
def catalog_with_stopwords():
    return DataCatalog(
        {
            "data": MemoryDataSet(),
            "cleaned_data": MemoryDataSet(),
            "stopwords_from_nltk": CSVDataSet("fake/path/to/stopwords.csv"),
            "model": CSVDataSet("fake/path/to/model.csv"),
        }
    )
def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe, mocker):
    class DummyContext(KedroContext):
        project_name = "bob"
        package_name = "bob"
        project_version = kedro_version

        def _get_pipelines(self) -> Dict[str, Pipeline]:
            return {"__default__": Pipeline([node(identity, "cars", "boats")])}

    mocker.patch("logging.config.dictConfig")
    dummy_context = DummyContext(str(tmp_path))
    filepath = str(dummy_context.project_path / "cars.csv")

    old_save_version = generate_timestamp()
    old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
    old_csv_data_set = CSVDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, old_save_version),
    )
    old_csv_data_set.save(old_df)

    new_save_version = generate_timestamp()
    new_csv_data_set = CSVDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, new_save_version),
    )
    new_csv_data_set.save(dummy_dataframe)

    load_versions = {"cars": old_save_version}
    dummy_context.run(load_versions=load_versions)

    assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
    assert dummy_context.catalog.load("boats").equals(old_df)
def test_run_load_versions(self, dummy_context, dummy_dataframe):
    filepath = (dummy_context.project_path / "cars.csv").as_posix()

    old_save_version = generate_timestamp()
    old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
    old_csv_data_set = CSVDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, old_save_version),
    )
    old_csv_data_set.save(old_df)

    sleep(0.5)
    new_save_version = generate_timestamp()
    new_csv_data_set = CSVDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, new_save_version),
    )
    new_csv_data_set.save(dummy_dataframe)

    load_versions = {"cars": old_save_version}
    dummy_context.run(load_versions=load_versions, pipeline_name="simple")

    assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
    assert dummy_context.catalog.load("boats").equals(old_df)
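# --- Illustrative sketch, not part of the test suite --------------------------
# The two tests above pin the load version through the same primitives that
# CSVDataSet exposes directly. A minimal sketch of that mechanism outside a
# pipeline run, assuming a writable local path ("data/cars.csv" is a placeholder):
import pandas as pd

from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io import Version
from kedro.io.core import generate_timestamp

# Save two versions of the same dataset; the generated timestamps double as
# version identifiers.
v_old = generate_timestamp()
CSVDataSet(filepath="data/cars.csv", version=Version(None, v_old)).save(
    pd.DataFrame({"col1": [0, 0]})
)
v_new = generate_timestamp()
CSVDataSet(filepath="data/cars.csv", version=Version(None, v_new)).save(
    pd.DataFrame({"col1": [1, 1]})
)

# An explicit load version makes load() read the older copy instead of resolving
# to the latest one, which is what run(load_versions=...) does for "cars" above.
pinned = CSVDataSet(filepath="data/cars.csv", version=Version(v_old, None))
assert pinned.load()["col1"].tolist() == [0, 0]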
def dummy_catalog():
    return DataCatalog(
        {
            "raw_data": MemoryDataSet(),
            "data": MemoryDataSet(),
            "model": CSVDataSet("fake/path/to/model.csv"),
        }
    )
def test_http_filesystem_no_versioning(self):
    pattern = r"HTTP\(s\) DataSet doesn't support versioning\."

    with pytest.raises(DataSetError, match=pattern):
        CSVDataSet(
            filepath="https://example.com/file.csv", version=Version(None, None)
        )
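# Illustrative note, not part of the test suite: only the `version` argument is
# rejected for HTTP(S) paths. An unversioned dataset pointing at a URL constructs
# fine and can be loaded read-only via fsspec's HTTP filesystem (the URL below is
# a placeholder):
from kedro.extras.datasets.pandas import CSVDataSet

remote_csv = CSVDataSet(filepath="https://example.com/file.csv")
df = remote_csv.load()  # read-only access; no versioned save over HTTP(S)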
def test_default_dataset(
    self, fake_project_cli, fake_metadata, fake_load_context, mocker, mock_pipelines
):
    """Test that datasets found in `Pipeline.data_sets()`, but not in the
    catalog, are reported under the key `DefaultDataSet`.
    """
    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
    mocked_context = fake_load_context.return_value
    catalog_data_sets = {"some_dataset": CSVDataSet("test.csv")}
    mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
    mocker.patch.object(
        mock_pipelines[PIPELINE_NAME],
        "data_sets",
        return_value=catalog_data_sets.keys() | {"intermediate"},
    )

    result = CliRunner().invoke(
        fake_project_cli, ["catalog", "list"], obj=fake_metadata
    )

    assert not result.exit_code
    expected_dict = {
        f"DataSets in '{PIPELINE_NAME}' pipeline": {
            "Datasets mentioned in pipeline": {
                "CSVDataSet": ["some_dataset"],
                "DefaultDataSet": ["intermediate"],
            }
        }
    }
    key = f"DataSets in '{PIPELINE_NAME}' pipeline"
    assert yaml_dump_mock.call_count == 1
    assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key]
def test_protocol_usage(self, filepath, instance_type, credentials):
    data_set = CSVDataSet(filepath=filepath, credentials=credentials)
    assert isinstance(data_set._fs, instance_type)

    path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]

    assert str(data_set._filepath) == path
    assert isinstance(data_set._filepath, PurePosixPath)
def test_no_param_datasets_in_response(
    self, fake_project_cli, fake_metadata, fake_load_context, mocker, mock_pipelines
):
    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
    mocked_context = fake_load_context.return_value
    catalog_data_sets = {
        "iris_data": CSVDataSet("test.csv"),
        "intermediate": MemoryDataSet(),
        "parameters": MemoryDataSet(),
        "params:data_ratio": MemoryDataSet(),
        "not_used": CSVDataSet("test2.csv"),
    }
    mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
    mocker.patch.object(
        mock_pipelines[PIPELINE_NAME],
        "data_sets",
        return_value=catalog_data_sets.keys() - {"not_used"},
    )

    result = CliRunner().invoke(
        fake_project_cli, ["catalog", "list"], obj=fake_metadata
    )

    assert not result.exit_code
    # 'parameters' and 'params:data_ratio' should not appear in the response
    expected_dict = {
        f"DataSets in '{PIPELINE_NAME}' pipeline": {
            "Datasets mentioned in pipeline": {
                "CSVDataSet": ["iris_data"],
                "MemoryDataSet": ["intermediate"],
            },
            "Datasets not mentioned in pipeline": {"CSVDataSet": ["not_used"]},
        }
    }
    key = f"DataSets in '{PIPELINE_NAME}' pipeline"
    assert yaml_dump_mock.call_count == 1
    assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key]
def test_multiple_loads(self, versioned_csv_data_set, dummy_dataframe, filepath_csv):
    """Test that if a new version is created mid-run, by an external system,
    it won't be loaded in the current run."""
    versioned_csv_data_set.save(dummy_dataframe)
    versioned_csv_data_set.load()
    v1 = versioned_csv_data_set.resolve_load_version()

    # force-drop a newer version into the same location
    v_new = generate_timestamp()
    CSVDataSet(filepath=filepath_csv, version=Version(v_new, v_new)).save(
        dummy_dataframe
    )

    versioned_csv_data_set.load()
    v2 = versioned_csv_data_set.resolve_load_version()

    assert v2 == v1  # v2 should not be v_new!

    # the new version is discoverable by a fresh dataset instance
    ds_new = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
    assert ds_new.resolve_load_version() == v_new
def test_version_str_repr(self, load_version, save_version):
    """Test that version is in string representation of the class instance
    when applicable."""
    filepath = "test.csv"
    ds = CSVDataSet(filepath=filepath)
    ds_versioned = CSVDataSet(
        filepath=filepath, version=Version(load_version, save_version)
    )

    assert filepath in str(ds)
    assert "version" not in str(ds)

    assert filepath in str(ds_versioned)
    ver_str = f"version=Version(load={load_version}, save='{save_version}')"
    assert ver_str in str(ds_versioned)

    assert "CSVDataSet" in str(ds_versioned)
    assert "CSVDataSet" in str(ds)
    assert "protocol" in str(ds_versioned)
    assert "protocol" in str(ds)

    # Default save_args
    assert "save_args={'index': False}" in str(ds)
    assert "save_args={'index': False}" in str(ds_versioned)
def multi_catalog(mocker):
    csv = CSVDataSet(filepath="abc.csv")
    parq = ParquetDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    layers = {"raw": {"abc.csv"}, "model": {"xyz.parq"}}
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal, layers=layers)
def test_protocol_usage(self, filepath, instance_type):
    data_set = CSVDataSet(filepath=filepath)
    assert isinstance(data_set._fs, instance_type)

    # _strip_protocol() doesn't strip the http(s) protocol
    if data_set._protocol == "https":
        path = filepath.split("://")[-1]
    else:
        path = data_set._fs._strip_protocol(filepath)

    assert str(data_set._filepath) == path
    assert isinstance(data_set._filepath, PurePosixPath)
def test_replacing_non_alphanumeric_characters(self):
    """Test replacing non-alphanumeric characters in dataset names."""
    csv = CSVDataSet(filepath="abc.csv")
    datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv}
    catalog = DataCatalog(data_sets=datasets)

    assert "ds1@spark" not in catalog.datasets.__dict__
    assert "ds3.csv" not in catalog.datasets.__dict__

    assert "ds2_spark" in catalog.datasets.__dict__
    assert "ds1__spark" in catalog.datasets.__dict__
    assert "ds3__csv" in catalog.datasets.__dict__
def test_replacing_nonword_characters(self):
    """Test replacing non-word characters in dataset names."""
    csv = CSVDataSet(filepath="abc.csv")
    datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv, "jalapeño": csv}
    catalog = DataCatalog(data_sets=datasets)

    assert "ds1@spark" not in catalog.datasets.__dict__
    assert "ds2__spark" not in catalog.datasets.__dict__
    assert "ds3.csv" not in catalog.datasets.__dict__
    assert "jalape__o" not in catalog.datasets.__dict__

    assert "ds1__spark" in catalog.datasets.__dict__
    assert "ds2_spark" in catalog.datasets.__dict__
    assert "ds3__csv" in catalog.datasets.__dict__
    assert "jalapeño" in catalog.datasets.__dict__
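# Illustrative sketch, not part of the test suite: the behaviour asserted in the
# two tests above boils down to replacing runs of non-word characters with a
# double underscore before a dataset is exposed as an attribute. The helper name
# below is hypothetical, not Kedro's internal function:
import re


def sanitise_dataset_name(name: str) -> str:
    """Replace runs of non-word characters with '__' so the name is attribute-safe.
    `\\W` is Unicode-aware, so 'jalapeño' is left untouched."""
    return re.sub(r"\W+", "__", name)


assert sanitise_dataset_name("ds1@spark") == "ds1__spark"
assert sanitise_dataset_name("ds3.csv") == "ds3__csv"
assert sanitise_dataset_name("jalapeño") == "jalapeño"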
def catalog_with_parameters():
    return DataCatalog(
        {
            "data": MemoryDataSet(),
            "cleaned_data": MemoryDataSet(),
            "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
            "params:penalty": MemoryDataSet(0.1),
            "model": CSVDataSet("fake/path/to/model.csv"),
            "params:threshold": MemoryDataSet(0.5),
        }
    )
def before_pipeline_run(self, run_params: Dict[str, Any], pipeline, catalog):
    """A hook implementation to add a catalog entry based on the filename
    passed to the command line, e.g.:

        kedro run --params=input:iris_1.csv
        kedro run --params=input:iris_2.csv
        kedro run --params=input:iris_3.csv
    """
    filename = run_params["extra_params"]["input"]

    # add input dataset
    input_dataset_name = "example_iris_data"
    input_dataset = CSVDataSet(filepath=f"data/01_raw/{filename}")
    catalog.add(input_dataset_name, input_dataset)

    # add output dataset
    output_dataset_name = "example_reporting_data"
    output_dataset = TextDataSet(filepath=f"data/08_reporting/{filename}")
    catalog.add(output_dataset_name, output_dataset)
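# Illustrative sketch, not part of the original snippet: for the hook above to
# fire, it typically lives on a hooks class whose method is marked with
# @hook_impl and which is registered with the project. The class name and the
# settings.py registration below are assumptions for Kedro >= 0.17-style projects:
from typing import Any, Dict

from kedro.extras.datasets.pandas import CSVDataSet
from kedro.extras.datasets.text import TextDataSet
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline


class DynamicCatalogHooks:
    @hook_impl
    def before_pipeline_run(
        self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog
    ) -> None:
        # Same idea as above: derive catalog entries from --params=input:<file>
        filename = run_params["extra_params"]["input"]
        catalog.add(
            "example_iris_data", CSVDataSet(filepath=f"data/01_raw/{filename}")
        )
        catalog.add(
            "example_reporting_data",
            TextDataSet(filepath=f"data/08_reporting/{filename}"),
        )


# In <package_name>/settings.py:
# HOOKS = (DynamicCatalogHooks(),)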
def test_datasets_on_add(self, data_catalog_from_config):
    """Check datasets are updated correctly after adding."""
    data_catalog_from_config.add("new_dataset", CSVDataSet("some_path"))
    assert isinstance(data_catalog_from_config.datasets.new_dataset, CSVDataSet)
    assert isinstance(data_catalog_from_config.datasets.boats, CSVDataSet)
def multi_catalog(mocker):
    csv = CSVDataSet(filepath="abc.csv")
    parq = ParquetDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal)
def data_set(filepath):
    return CSVDataSet(filepath=filepath, save_args={"index": False})
def versioned_csv_data_set(filepath_csv, load_version, save_version):
    return CSVDataSet(
        filepath=filepath_csv, version=Version(load_version, save_version)
    )
def csv_data_set(filepath_csv, load_args, save_args):
    return CSVDataSet(filepath=filepath_csv, load_args=load_args, save_args=save_args)
def test_catalog_release(self, mocker):
    fs_mock = mocker.patch("fsspec.filesystem").return_value
    filepath = "test.csv"
    data_set = CSVDataSet(filepath=filepath)
    data_set.release()
    fs_mock.invalidate_cache.assert_called_once_with(filepath)
def test_release_instance_cache(self, dummy_dataframe, filepath_csv):
    """Test that cache invalidation does not affect other instances."""
    ds_a = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
    assert ds_a._version_cache.currsize == 0
    ds_a.save(dummy_dataframe)  # create a version
    assert ds_a._version_cache.currsize == 2

    ds_b = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
    assert ds_b._version_cache.currsize == 0
    ds_b.resolve_save_version()
    assert ds_b._version_cache.currsize == 1
    ds_b.resolve_load_version()
    assert ds_b._version_cache.currsize == 2

    ds_a.release()

    # dataset A cache is cleared
    assert ds_a._version_cache.currsize == 0

    # dataset B cache is unaffected
    assert ds_b._version_cache.currsize == 2