def test_save_parquet(self, tmp_path, sample_spark_df):
    """Cross-check the Spark save operation: save to a single Spark
    partition, then read that part file back with Kedro's
    ParquetLocalDataSet and verify a known row survived the round trip.
    """
    # tmp_path is already a pathlib.Path, so "/" yields a Path directly —
    # the original Path(str(...)) round-trip was redundant.
    temp_dir = tmp_path / "test_data"
    spark_data_set = SparkDataSet(
        filepath=str(temp_dir), save_args={"compression": "none"}
    )
    # coalesce(1) forces Spark to emit exactly one "part-*" data file.
    spark_df = sample_spark_df.coalesce(1)
    spark_data_set.save(spark_df)

    # Pick the single part file; next() avoids building a throwaway list
    # just to index its first element.
    single_parquet = next(
        f
        for f in temp_dir.iterdir()
        if f.is_file() and f.name.startswith("part")
    )

    local_parquet_data_set = ParquetLocalDataSet(filepath=str(single_parquet))
    pandas_df = local_parquet_data_set.load()

    assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12
def test_save_parquet():
    """Cross-check the Spark save operation: save to a single Spark
    partition inside a temporary directory, then read the part file back
    with Kedro's ParquetLocalDataSet and verify a known row round-trips.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        target_path = join(scratch_dir, "test_data")
        data_set = SparkDataSet(
            filepath=target_path, save_args={"compression": "none"}
        )
        # One partition -> exactly one "part-*" file on disk.
        one_partition_df = _get_sample_spark_data_frame().coalesce(1)
        data_set.save(one_partition_df)

        part_files = [
            join(target_path, name)
            for name in listdir(target_path)
            if name.startswith("part")
        ]
        single_parquet = part_files[0]

        local_data_set = ParquetLocalDataSet(filepath=single_parquet)
        loaded_df = local_data_set.load()

        assert loaded_df[loaded_df["name"] == "Bob"]["age"].iloc[0] == 12