def test_load_parquet(self, tmp_path, sample_pandas_df):
    # Save with the pandas-backed dataset, then read the same files back
    # through Spark to check that the two data sets interoperate.
    temp_path = str(tmp_path / "data")
    local_parquet_set = ParquetLocalDataSet(filepath=temp_path)
    local_parquet_set.save(sample_pandas_df)
    spark_data_set = SparkDataSet(filepath=temp_path)
    spark_df = spark_data_set.load()
    assert spark_df.count() == 4
def test_load_parquet(tmpdir):
    temp_path = str(tmpdir.join("data"))
    pandas_df = _get_sample_pandas_data_frame()
    local_parquet_set = ParquetLocalDataSet(filepath=temp_path)
    local_parquet_set.save(pandas_df)
    spark_data_set = SparkDataSet(filepath=temp_path)
    spark_df = spark_data_set.load()
    assert spark_df.count() == 4
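# A minimal sketch (an assumption, not shown in the original source) of the
# ``_get_sample_pandas_data_frame`` helper referenced above. Four rows make
# the ``count() == 4`` assertion hold, and the Bob/age pair matches the
# lookup used in the save tests further down.
import pandas as pd

def _get_sample_pandas_data_frame():
    # Hypothetical sample data; any four-row frame would satisfy the test.
    return pd.DataFrame(
        {"name": ["Alex", "Bob", "Clarke", "Dave"], "age": [31, 12, 65, 29]}
    )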
def test_version_str_repr(self, load_version, save_version):
    """Test that version is in string representation of the class
    instance when applicable."""
    filepath = "data"
    ds = ParquetLocalDataSet(filepath=filepath)
    ds_versioned = ParquetLocalDataSet(
        filepath=filepath, version=Version(load_version, save_version)
    )

    assert filepath in str(ds)
    assert "version" not in str(ds)

    assert filepath in str(ds_versioned)
    ver_str = "version=Version(load={}, save='{}')".format(
        load_version, save_version
    )
    assert ver_str in str(ds_versioned)
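# A hedged sketch of the pytest fixtures that would feed ``load_version`` and
# ``save_version`` into the test above (assumed, not part of the original
# snippet); ``generate_timestamp`` from ``kedro.io.core`` is the usual way to
# produce a save-version string.
import pytest
from kedro.io.core import generate_timestamp

@pytest.fixture(params=[None])
def load_version(request):
    return request.param

@pytest.fixture(params=[None])
def save_version(request):
    # Fall back to a fresh timestamp when no explicit version is requested.
    return request.param or generate_timestamp()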
def write_parquet_locally(context):
    """Writes DataFrame as Parquet in a temporary directory."""
    file_name = "dummy.parq"
    context.full_path = context.temp_dir / file_name
    context.data_set = ParquetLocalDataSet(str(context.full_path))
    context.data_set.save(context.pandas_df)
    assert context.full_path.exists()
def test_str_representation(self):
    """Test string representation of the data set instance."""
    parquet_data_set = ParquetLocalDataSet("test_file.parquet")
    pattern = (
        "ParquetLocalDataSet(engine=auto, "
        "filepath=test_file.parquet, save_args={})"
    )
    assert pattern in str(parquet_data_set)
def build_catalog(root_dir):
    """Create the Kedro catalog object, conventionally stored as ``io``."""
    return DataCatalog(
        {
            "raw_cities": CSVHTTPDataSet(
                fileurl="https://people.sc.fsu.edu/~jburkardt/data/csv/cities.csv",
                auth=None,
                load_args=None,
            ),
            "int_cities": ParquetLocalDataSet(
                filepath=root_dir / "data" / "int" / "cities",
            ),
            "pri_cities": ParquetLocalDataSet(
                filepath=root_dir / "data" / "pri" / "cities",
            ),
        }
    )
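# A possible usage sketch for ``build_catalog`` (assumed; ``Path.cwd()`` as
# the project root is illustrative only): load the remote CSV once, then
# persist it through one of the local Parquet entries.
from pathlib import Path

io = build_catalog(Path.cwd())
cities = io.load("raw_cities")    # fetches the CSV over HTTP
io.save("int_cities", cities)     # writes it locally as Parquet
assert io.load("int_cities").shape == cities.shape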
def test_save_parquet(self, tmp_path, sample_spark_df):
    # To cross-check the correct Spark save operation we save to a single
    # Spark partition and retrieve it with Kedro ParquetLocalDataSet.
    temp_dir = tmp_path / "test_data"
    spark_data_set = SparkDataSet(
        filepath=str(temp_dir), save_args={"compression": "none"}
    )
    spark_df = sample_spark_df.coalesce(1)
    spark_data_set.save(spark_df)

    single_parquet = [
        f for f in temp_dir.iterdir() if f.is_file() and f.name.startswith("part")
    ][0]

    local_parquet_data_set = ParquetLocalDataSet(filepath=str(single_parquet))
    pandas_df = local_parquet_data_set.load()

    assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12
def test_save_parquet():
    # To cross-check the correct Spark save operation we save to a single
    # Spark partition and retrieve it with Kedro ParquetLocalDataSet.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = join(temp_dir, "test_data")
        spark_data_set = SparkDataSet(
            filepath=temp_path, save_args={"compression": "none"}
        )
        spark_df = _get_sample_spark_data_frame().coalesce(1)
        spark_data_set.save(spark_df)

        single_parquet = [
            join(temp_path, f) for f in listdir(temp_path) if f.startswith("part")
        ][0]

        local_parquet_data_set = ParquetLocalDataSet(filepath=single_parquet)
        pandas_df = local_parquet_data_set.load()

        assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12
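# A minimal sketch (an assumption) of the ``_get_sample_spark_data_frame``
# helper the two Spark tests rely on; the Alex/Bob/Clarke/Dave rows mirror
# the pandas helper above so that the ``age == 12`` lookup for Bob holds.
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

def _get_sample_spark_data_frame():
    schema = StructType(
        [
            StructField("name", StringType(), True),
            StructField("age", IntegerType(), True),
        ]
    )
    data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)]
    return SparkSession.builder.getOrCreate().createDataFrame(data, schema)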
def versioned_parquet_data_set(data_path, load_version, save_version):
    return ParquetLocalDataSet(
        filepath=data_path, version=Version(load_version, save_version)
    )
def parquet_data_set(data_path, request):
    return ParquetLocalDataSet(filepath=data_path, **request.param)
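# How the ``request.param`` fixture above is typically driven (a hedged
# sketch; the ``@pytest.fixture`` decorator, the ``data_path`` fixture, and
# the test below are assumptions, not part of the original snippet). Indirect
# parametrization lets individual tests swap in custom constructor arguments
# such as ``save_args``.
import pandas as pd
import pytest

@pytest.fixture
def data_path(tmp_path):
    return str(tmp_path / "test.parquet")

@pytest.fixture(params=[{}])  # default: no extra constructor arguments
def parquet_data_set(data_path, request):
    return ParquetLocalDataSet(filepath=data_path, **request.param)

@pytest.mark.parametrize(
    "parquet_data_set", [{"save_args": {"compression": "gzip"}}], indirect=True
)
def test_save_with_compression(parquet_data_set):
    parquet_data_set.save(pd.DataFrame({"col": [1, 2, 3]}))
    assert parquet_data_set.exists()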
def multi_catalog():
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    return DataCatalog({"abc": csv, "xyz": parq})
def multi_catalog(mocker):
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal)