def test_read_from_file(self, mocker):
    """Loading a single (non-directory) file goes through ``fs.open``."""
    filesystem_mock = mocker.patch("fsspec.filesystem").return_value
    filesystem_mock.isdir.return_value = False
    mocker.patch("pandas.read_parquet")

    dataset = ParquetDataSet(filepath="/tmp/test.parquet")
    dataset.load()

    filesystem_mock.isdir.assert_called_once()
    filesystem_mock.open.assert_called_once()
def test_read_from_non_local_dir(self, mocker):
    """Loading a remote directory delegates to ``pyarrow.parquet.ParquetDataset``
    rather than opening a single file handle."""
    fs_mock = mocker.patch("fsspec.filesystem").return_value
    fs_mock.isdir.return_value = True
    pq_ds_mock = mocker.patch("pyarrow.parquet.ParquetDataset")

    data_set = ParquetDataSet(filepath="s3://bucket/dir")
    data_set.load()

    fs_mock.isdir.assert_called_once()
    assert not fs_mock.open.called
    pq_ds_mock.assert_called_once_with("bucket/dir", filesystem=fs_mock)
    # Walk the ``return_value`` chain instead of re-calling the mock:
    # ``pq_ds_mock()`` would record an extra call on the mock after the
    # call-count assertion above, polluting its call history.
    pq_ds_mock.return_value.read.return_value.to_pandas.assert_called_once_with()
def test_save_and_load_non_existing_dir(self, tmp_path, dummy_dataframe):
    """Saving into a directory that does not yet exist creates it, and the
    data round-trips intact."""
    target = (tmp_path / "non-existing" / FILENAME).as_posix()
    dataset = ParquetDataSet(filepath=target)

    dataset.save(dummy_dataframe)

    assert_frame_equal(dummy_dataframe, dataset.load())
def test_save_and_load(self, tmp_path, dummy_dataframe):
    """Round-trip the data set and verify exactly one file was written."""
    target = (tmp_path / FILENAME).as_posix()
    dataset = ParquetDataSet(filepath=target)

    dataset.save(dummy_dataframe)
    assert_frame_equal(dummy_dataframe, dataset.load())
    assert dataset._fs_open_args_load == {}

    # Every entry under tmp_path must be a regular file, and there is one.
    file_flags = [entry.is_file() for entry in tmp_path.iterdir()]
    assert all(file_flags)
    assert len(file_flags) == 1
def test_read_partitioned_file(self, mocker, tmp_path, dummy_dataframe):
    """Reading a locally partitioned parquet directory round-trips the data."""
    dataset_spy = mocker.patch(
        "pyarrow.parquet.ParquetDataset", wraps=pq.ParquetDataset
    )
    dummy_dataframe.to_parquet(str(tmp_path), partition_cols=["col2"])

    dataset = ParquetDataSet(filepath=tmp_path.as_posix())
    # Partitioned reads return columns in a different order, so normalise
    # before comparing.
    reloaded = dataset.load().sort_index(axis=1)

    # The partition column comes back as dtype 'category', hence the
    # relaxed dtype/categorical checks.
    assert_frame_equal(
        dummy_dataframe, reloaded, check_dtype=False, check_categorical=False
    )
    dataset_spy.assert_called_once()
def test_save_parquet(self, tmp_path, sample_spark_df):
    """A single-partition Spark write can be read back with the
    pandas-based ParquetDataSet."""
    output_dir = Path(str(tmp_path / "test_data"))
    spark_data_set = SparkDataSet(
        filepath=str(output_dir), save_args={"compression": "none"}
    )
    # Coalesce to one partition so Spark emits exactly one part-file.
    spark_data_set.save(sample_spark_df.coalesce(1))

    part_files = [
        entry
        for entry in output_dir.iterdir()
        if entry.is_file() and entry.name.startswith("part")
    ]
    pandas_df = ParquetDataSet(filepath=str(part_files[0])).load()

    assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12