def test_http_filesystem_no_versioning(self):
    """Constructing a versioned dataset on an HTTP(s) path must raise."""
    expected_error = r"HTTP\(s\) DataSet doesn't support versioning\."
    with pytest.raises(DataSetError, match=expected_error):
        TextDataSet(
            filepath="https://example.com/file.txt",
            version=Version(None, None),
        )
def test_protocol_usage(self, filepath, instance_type):
    """The filesystem class and stored path must reflect the filepath's protocol."""
    data_set = TextDataSet(filepath=filepath)
    assert isinstance(data_set._fs, instance_type)

    # Everything after the protocol delimiter is the bare path.
    expected_path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]
    assert str(data_set._filepath) == expected_path
    assert isinstance(data_set._filepath, PurePosixPath)
def test_version_str_repr(self, load_version, save_version):
    """Test that version is in string representation of the class instance
    when applicable."""
    filepath = "test.txt"
    plain = TextDataSet(filepath=filepath)
    versioned = TextDataSet(
        filepath=filepath, version=Version(load_version, save_version)
    )

    # The unversioned dataset never mentions a version.
    assert filepath in str(plain)
    assert "version" not in str(plain)
    assert "TextDataSet" in str(plain)
    assert "protocol" in str(plain)

    # The versioned dataset embeds the exact Version repr.
    assert filepath in str(versioned)
    expected_version = f"version=Version(load={load_version}, save='{save_version}')"
    assert expected_version in str(versioned)
    assert "TextDataSet" in str(versioned)
    assert "protocol" in str(versioned)
def test_protocol_usage(self, filepath, instance_type):
    """Filesystem type and internal path must match the filepath's protocol."""
    data_set = TextDataSet(filepath=filepath)
    assert isinstance(data_set._fs, instance_type)

    if data_set._protocol == "https":
        # _strip_protocol() doesn't strip http(s) protocol
        expected = filepath.split("://")[-1]
    else:
        expected = data_set._fs._strip_protocol(filepath)
    assert str(data_set._filepath) == expected
    assert isinstance(data_set._filepath, PurePosixPath)
def test_force_checkpoint_checkpoint_file_exists(
    self, forced_checkpoint, expected_partitions, mocked_csvs_in_s3
):
    """Test how forcing checkpoint value affects the available partitions
    in S3 if the checkpoint file exists"""
    # Write the checkpoint file first so the forced value has something to
    # override, then verify it landed where expected.
    IncrementalDataSet(mocked_csvs_in_s3, DATASET).confirm()
    checkpoint_path = (
        f"{mocked_csvs_in_s3}/{IncrementalDataSet.DEFAULT_CHECKPOINT_FILENAME}"
    )
    assert TextDataSet(checkpoint_path).load() == "p04/data.csv"

    data_set = IncrementalDataSet(
        mocked_csvs_in_s3, DATASET, checkpoint=forced_checkpoint
    )
    assert data_set._checkpoint.exists()
    assert data_set.load().keys() == expected_partitions
def before_pipeline_run(self, run_params: Dict[str, Any], pipeline, catalog):
    """A hook implementation to add a catalog entry based on the filename
    passed to the command line, e.g.:
        kedro run --params=input:iris_1.csv
        kedro run --params=input:iris_2.csv
        kedro run --params=input:iris_3.csv
    """
    filename = run_params["extra_params"]["input"]

    # add input dataset
    # BUG FIX: the original f-strings contained no placeholder, so `filename`
    # was never used and every run targeted the same hard-coded path.
    input_dataset_name = "example_iris_data"
    input_dataset = CSVDataSet(filepath=f"data/01_raw/{filename}")
    catalog.add(input_dataset_name, input_dataset)

    # add output dataset
    output_dataset_name = "example_reporting_data"
    output_dataset = TextDataSet(filepath=f"data/08_reporting/{filename}")
    catalog.add(output_dataset_name, output_dataset)
def versioned_txt_data_set(filepath_txt, load_version, save_version):
    """Return a ``TextDataSet`` pinned to the given load/save versions."""
    version = Version(load_version, save_version)
    return TextDataSet(filepath=filepath_txt, version=version)
def txt_data_set(filepath_txt, fs_args):
    """Return a ``TextDataSet`` configured with the given filesystem args."""
    return TextDataSet(
        filepath=filepath_txt,
        fs_args=fs_args,
    )
def test_catalog_release(self, mocker):
    """``release()`` must invalidate the fsspec cache entry for the filepath."""
    mocked_fs = mocker.patch("fsspec.filesystem").return_value
    filepath = "test.txt"
    TextDataSet(filepath=filepath).release()
    mocked_fs.invalidate_cache.assert_called_once_with(filepath)
from kedro.extras.datasets.text import TextDataSet

# Catalog entries keyed by dataset name; "my_output_dataset" is a plain-text
# dataset written to the given filepath.
catalog_dict = {
    "my_output_dataset": TextDataSet(filepath="data/load/my_output.txt")
}