def test_save_and_load_non_existing_dir(self, tmp_path, dummy_dataframe):
    """Saving into a not-yet-existing directory should still round-trip."""
    # The parent directory does not exist yet; saving must create it.
    target = tmp_path / "non-existing" / FILENAME
    dataset = ParquetDataSet(filepath=target.as_posix())
    dataset.save(dummy_dataframe)
    round_tripped = dataset.load()
    assert_frame_equal(dummy_dataframe, round_tripped)
def test_load_parquet(self, tmp_path, sample_pandas_df):
    """Write a parquet file with pandas, then read it back through Spark."""
    data_path = str(tmp_path / "data")
    ParquetDataSet(filepath=data_path).save(sample_pandas_df)
    spark_df = SparkDataSet(filepath=data_path).load()
    assert spark_df.count() == 4
def test_catalog_release(self, protocol, path, mocker):
    """Releasing the data set should invalidate the fsspec cache entry."""
    fs_mock = mocker.patch("fsspec.filesystem").return_value
    full_path = protocol + path + FILENAME
    ParquetDataSet(filepath=full_path).release()
    # http(s) URLs keep their protocol in the cache key; every other
    # protocol is stripped down to the bare path.
    expected = full_path if protocol == "https://" else path + FILENAME
    fs_mock.invalidate_cache.assert_called_once_with(expected)
def test_read_from_file(self, mocker):
    """Loading a single file goes through fs.open rather than a dataset scan."""
    fs_mock = mocker.patch("fsspec.filesystem").return_value
    fs_mock.isdir.return_value = False
    mocker.patch("pandas.read_parquet")
    ParquetDataSet(filepath="/tmp/test.parquet").load()
    fs_mock.isdir.assert_called_once()
    fs_mock.open.assert_called_once()
def test_save_and_load(self, tmp_path, dummy_dataframe):
    """Test saving and reloading the data set."""
    dataset = ParquetDataSet(filepath=(tmp_path / FILENAME).as_posix())
    dataset.save(dummy_dataframe)
    assert_frame_equal(dummy_dataframe, dataset.load())
    assert dataset._fs_open_args_load == {}
    # Exactly one regular file must have been written to the directory.
    is_file_flags = [entry.is_file() for entry in tmp_path.iterdir()]
    assert all(is_file_flags)
    assert len(is_file_flags) == 1
def test_read_from_non_local_dir(self, mocker):
    """Remote directories should be read via pyarrow's ParquetDataset."""
    fs_mock = mocker.patch("fsspec.filesystem").return_value
    fs_mock.isdir.return_value = True
    pq_ds_mock = mocker.patch("pyarrow.parquet.ParquetDataset")
    ParquetDataSet(filepath="s3://bucket/dir").load()
    fs_mock.isdir.assert_called_once()
    # A directory read must not open any individual file handle.
    assert not fs_mock.open.called
    pq_ds_mock.assert_called_once_with("bucket/dir", filesystem=fs_mock)
    pq_ds_mock().read().to_pandas.assert_called_once_with()
def parquet_data_set(filepath_parquet, load_args, save_args, fs_args):
    """Fixture: a ParquetDataSet wired up with the parametrized arguments."""
    kwargs = {
        "filepath": filepath_parquet,
        "load_args": load_args,
        "save_args": save_args,
        "fs_args": fs_args,
    }
    return ParquetDataSet(**kwargs)
def test_http_filesystem_no_versioning(self):
    """Versioning over HTTP(s) must be rejected at construction time."""
    with pytest.raises(
        DataSetError, match=r"HTTP\(s\) DataSet doesn't support versioning\."
    ):
        ParquetDataSet(
            filepath="https://example.com/test.parquet",
            version=Version(None, None),
        )
def test_read_partitioned_file(self, mocker, tmp_path, dummy_dataframe):
    """Test read partitioned parquet file from local directory."""
    pq_ds_mock = mocker.patch(
        "pyarrow.parquet.ParquetDataset", wraps=pq.ParquetDataset
    )
    dummy_dataframe.to_parquet(str(tmp_path), partition_cols=["col2"])
    dataset = ParquetDataSet(filepath=tmp_path.as_posix())
    # Partitioned reads may reorder columns, so sort before comparing;
    # the partition column comes back with dtype 'category', hence the
    # relaxed dtype checks.
    reloaded = dataset.load().sort_index(axis=1)
    assert_frame_equal(
        dummy_dataframe, reloaded, check_dtype=False, check_categorical=False
    )
    pq_ds_mock.assert_called_once()
def test_protocol_usage(self, filepath, instance_type):
    """The protocol prefix should select the matching fsspec filesystem."""
    dataset = ParquetDataSet(filepath=filepath)
    assert isinstance(dataset._fs, instance_type)
    expected_path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]
    assert str(dataset._filepath) == expected_path
    assert isinstance(dataset._filepath, PurePosixPath)
def test_credentials_propagated(self, mocker):
    """Test propagating credentials for connecting to GCS"""
    fs_factory = mocker.patch("fsspec.filesystem")
    creds = {"key": "value"}
    ParquetDataSet(filepath=FILENAME, credentials=creds)
    # The credentials dict must be forwarded verbatim to fsspec.
    fs_factory.assert_called_once_with("file", auto_mkdir=True, **creds)
def test_version_str_repr(self, load_version, save_version):
    """Test that version is in string representation of the class instance
    when applicable."""
    plain = ParquetDataSet(filepath=FILENAME)
    versioned = ParquetDataSet(
        filepath=FILENAME, version=Version(load_version, save_version)
    )

    # Unversioned instance: no version info in its repr.
    assert FILENAME in str(plain)
    assert "version" not in str(plain)
    assert "ParquetDataSet" in str(plain)
    assert "protocol" in str(plain)

    # Versioned instance: the version tuple must be spelled out.
    assert FILENAME in str(versioned)
    expected = f"version=Version(load={load_version}, save='{save_version}')"
    assert expected in str(versioned)
    assert "ParquetDataSet" in str(versioned)
    assert "protocol" in str(versioned)
def multi_catalog(mocker):
    """Fixture: a two-entry catalog with layer metadata and a mock journal."""
    data_sets = {
        "abc": CSVDataSet(filepath="abc.csv"),
        "xyz": ParquetDataSet(filepath="xyz.parq"),
    }
    layers = {"raw": {"abc.csv"}, "model": {"xyz.parq"}}
    return DataCatalog(data_sets, journal=mocker.Mock(), layers=layers)
def test_save_parquet(self, tmp_path, sample_spark_df):
    """Cross-check the Spark save path: write a single Spark partition,
    then read that part-file back with the pandas-based ParquetDataSet."""
    temp_dir = Path(str(tmp_path / "test_data"))
    spark_data_set = SparkDataSet(
        filepath=str(temp_dir), save_args={"compression": "none"}
    )
    spark_data_set.save(sample_spark_df.coalesce(1))
    # Spark emits part-* files; with one partition there is exactly one.
    part_files = [
        entry
        for entry in temp_dir.iterdir()
        if entry.is_file() and entry.name.startswith("part")
    ]
    pandas_df = ParquetDataSet(filepath=str(part_files[0])).load()
    assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12
def test_protocol_usage(self, filepath, instance_type):
    """The protocol prefix should select the matching fsspec filesystem."""
    dataset = ParquetDataSet(filepath=filepath)
    assert isinstance(dataset._fs, instance_type)
    # _strip_protocol() doesn't strip http(s) protocol
    if dataset._protocol == "https":
        expected = filepath.split("://")[-1]
    else:
        expected = dataset._fs._strip_protocol(filepath)
    assert str(dataset._filepath) == expected
    assert isinstance(dataset._filepath, PurePosixPath)
def multi_catalog(mocker):
    """Fixture: a two-entry catalog backed by a mock journal."""
    entries = {
        "abc": CSVDataSet(filepath="abc.csv"),
        "xyz": ParquetDataSet(filepath="xyz.parq"),
    }
    return DataCatalog(entries, journal=mocker.Mock())
def test_write_to_dir(self, dummy_dataframe, tmp_path):
    """Saving to a path that is a directory must raise a DataSetError."""
    dataset = ParquetDataSet(filepath=tmp_path.as_posix())
    with pytest.raises(
        DataSetError, match="Saving ParquetDataSet to a directory is not supported"
    ):
        dataset.save(dummy_dataframe)
def versioned_parquet_data_set(filepath_parquet, load_version, save_version):
    """Fixture: a ParquetDataSet pinned to the given load/save versions."""
    version = Version(load_version, save_version)
    return ParquetDataSet(filepath=filepath_parquet, version=version)