def test_multiple_loads(self, versioned_csv_data_set, dummy_dataframe, filepath_csv):
    """A version dropped in mid-run by an external writer must not be picked
    up by an already-resolved dataset instance within the same run."""
    versioned_csv_data_set.save(dummy_dataframe)
    versioned_csv_data_set.load()
    first_version = versioned_csv_data_set.resolve_load_version()

    sleep(0.5)
    # Simulate an external system writing a newer version to the same path.
    external_version = generate_timestamp()
    external_writer = GenericDataSet(
        filepath=filepath_csv.as_posix(),
        file_format="csv",
        version=Version(external_version, external_version),
    )
    external_writer.save(dummy_dataframe)

    versioned_csv_data_set.load()
    second_version = versioned_csv_data_set.resolve_load_version()
    # The cached resolution must win: still the original version, not the new one.
    assert second_version == first_version

    # A brand-new instance, however, should discover the externally written version.
    fresh_dataset = GenericDataSet(
        filepath=filepath_csv.as_posix(),
        file_format="csv",
        version=Version(None, None),
    )
    assert fresh_dataset.resolve_load_version() == external_version
def test_catalog_release(self, mocker):
    """Releasing an unversioned dataset invalidates the filesystem cache
    for its path and leaves the (empty) version cache untouched."""
    filesystem_mock = mocker.patch("fsspec.filesystem").return_value
    filepath = "test.csv"
    dataset = GenericDataSet(filepath=filepath, file_format="sas")

    # An unversioned dataset never populates the version cache.
    assert dataset._version_cache.currsize == 0

    dataset.release()
    filesystem_mock.invalidate_cache.assert_called_once_with(filepath)
    assert dataset._version_cache.currsize == 0
def test_generic_no_filepaths(self, file_format):
    """Formats with no filepath-based reader/writer must raise on load and save."""
    expected_error = (
        f"Cannot create a dataset of file_format `{file_format}` "
        "as it does not support a filepath target/source"
    )

    with pytest.raises(DataSetError, match=expected_error):
        GenericDataSet(filepath="/file/thing.file", file_format=file_format).load()

    with pytest.raises(DataSetError, match=expected_error):
        GenericDataSet(filepath="/file/thing.file", file_format=file_format).save(
            pd.DataFrame([1])
        )
def versioned_csv_data_set(filepath_csv, load_version, save_version):
    """Build a versioned CSV ``GenericDataSet`` targeting ``filepath_csv``."""
    posix_path = filepath_csv.as_posix()
    dataset = GenericDataSet(
        filepath=posix_path,
        file_format="csv",
        version=Version(load_version, save_version),
        save_args={"index": False},
    )
    return dataset
def sas_data_set_bad_config(filepath_sas, fs_args):
    """Build a SAS ``GenericDataSet`` with deliberately incomplete load args."""
    # The SAS reader requires a type parameter; leaving load_args empty is the
    # misconfiguration under test.
    dataset = GenericDataSet(
        filepath=filepath_sas.as_posix(),
        file_format="sas",
        load_args={},
        fs_args=fs_args,
    )
    return dataset
def html_data_set(filepath_html, fs_args):
    """Build an HTML ``GenericDataSet`` targeting ``filepath_html``."""
    posix_path = filepath_html.as_posix()
    dataset = GenericDataSet(
        filepath=posix_path,
        file_format="html",
        fs_args=fs_args,
        save_args={"index": False},
    )
    return dataset
def sas_data_set(filepath_sas, fs_args):
    """Build a correctly configured SAS ``GenericDataSet``."""
    posix_path = filepath_sas.as_posix()
    dataset = GenericDataSet(
        filepath=posix_path,
        file_format="sas",
        load_args={"format": "sas7bdat"},
        fs_args=fs_args,
    )
    return dataset
def test_bad_file_format_argument(self):
    """An unknown ``file_format`` raises a helpful error on load and on save."""
    dataset = GenericDataSet(filepath="test.kedro", file_format="kedro")

    load_error_pattern = (
        "Unable to retrieve `pandas.read_kedro` method, please ensure that your 'file_format' "
        "parameter has been defined correctly as per the Pandas API "
        "https://pandas.pydata.org/docs/reference/io.html"
    )
    with pytest.raises(DataSetError, match=load_error_pattern):
        dataset.load()

    save_error_pattern = (
        "Unable to retrieve `pandas.DataFrame.to_kedro` method, please ensure that your 'file_format' "
        "parameter has been defined correctly as per the Pandas API "
        "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html"
    )
    with pytest.raises(DataSetError, match=save_error_pattern):
        dataset.save(pd.DataFrame([1]))
def test_version_str_repr(self, filepath_csv, load_version, save_version):
    """The string representation mentions the version only when one is set,
    and always includes the class name, filepath and protocol."""
    filepath = filepath_csv.as_posix()
    dataset = GenericDataSet(filepath=filepath, file_format="csv")
    dataset_versioned = GenericDataSet(
        filepath=filepath,
        file_format="csv",
        version=Version(load_version, save_version),
    )

    assert filepath in str(dataset)
    assert filepath in str(dataset_versioned)

    ver_str = f"version=Version(load={load_version}, save='{save_version}')"
    assert ver_str in str(dataset_versioned)

    for representation in (str(dataset_versioned), str(dataset)):
        assert "GenericDataSet" in representation
    for representation in (str(dataset_versioned), str(dataset)):
        assert "protocol" in representation
def test_protocol_usage(self, filepath, instance_type, credentials):
    """The filesystem implementation is chosen from the filepath protocol,
    and the stored path is protocol-stripped and POSIX-flavoured."""
    dataset = GenericDataSet(
        filepath=filepath, file_format="sas", credentials=credentials
    )
    assert isinstance(dataset._fs, instance_type)

    expected_path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]
    assert str(dataset._filepath) == expected_path
    assert isinstance(dataset._filepath, PurePosixPath)
def csv_data_set(filepath_csv):
    """Build an unversioned CSV ``GenericDataSet`` targeting ``filepath_csv``."""
    dataset = GenericDataSet(
        filepath=filepath_csv.as_posix(),
        file_format="csv",
        save_args={"index": False},
    )
    return dataset
def test_release_instance_cache(self, dummy_dataframe, filepath_csv):
    """Each instance owns its version cache: releasing one must not clear another."""
    dataset_one = GenericDataSet(
        filepath=filepath_csv.as_posix(),
        file_format="csv",
        version=Version(None, None),
    )
    assert dataset_one._version_cache.currsize == 0
    # Saving resolves both save and load versions, filling the cache.
    dataset_one.save(dummy_dataframe)
    assert dataset_one._version_cache.currsize == 2

    dataset_two = GenericDataSet(
        filepath=filepath_csv.as_posix(),
        file_format="csv",
        version=Version(None, None),
    )
    assert dataset_two._version_cache.currsize == 0
    dataset_two.resolve_save_version()
    assert dataset_two._version_cache.currsize == 1
    dataset_two.resolve_load_version()
    assert dataset_two._version_cache.currsize == 2

    dataset_one.release()

    # Only the released instance's cache is emptied.
    assert dataset_one._version_cache.currsize == 0
    assert dataset_two._version_cache.currsize == 2
def test_multiple_saves(self, dummy_dataframe, filepath_csv):
    """Repeated save/load cycles on one dataset yield monotonically newer,
    mutually consistent save and load versions."""
    ds_versioned = GenericDataSet(
        filepath=filepath_csv.as_posix(),
        file_format="csv",
        version=Version(None, None),
    )

    # First cycle: save then check both resolved versions agree.
    ds_versioned.save(dummy_dataframe)
    first_save = ds_versioned.resolve_save_version()
    first_load = ds_versioned.resolve_load_version()
    assert first_load == first_save

    # Second cycle after a pause, so the timestamp strictly advances.
    sleep(0.5)
    ds_versioned.save(dummy_dataframe)
    second_save = ds_versioned.resolve_save_version()
    second_load = ds_versioned.resolve_load_version()
    assert second_load == second_save
    assert second_load > first_load

    # An independent instance resolves to the latest version on disk.
    fresh_dataset = GenericDataSet(
        filepath=filepath_csv.as_posix(),
        file_format="csv",
        version=Version(None, None),
    )
    assert fresh_dataset.resolve_load_version() == second_load