def test_null_count(store, column, expected_null_count):
    serialiser = ParquetSerializer(chunk_size=2)
    df = pd.DataFrame(
        {
            "no_nulls_int": [1, 2, 3, 4, 5, 6],
            "partial_nulls_int": [1, 2, 3, None, None, None],
            "no_nulls_float": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6],
            "partial_nulls_float": [1.0, 2.2, 3.3, np.nan, np.nan, np.nan],
            "partial_nulls_obj": [1.0, 2.2, 3.3, np.nan, np.nan, np.nan],
            "no_nulls_obj": ["1.1", "2", "3", "vier", "fuenfeinhalb", "6.6"],
            "partial_nulls_obj_mixed": [1.0, 2.2, None, np.nan, np.nan, 6.6],
            "nulls_reverse_rg": [3.3, np.nan, 1.0, 2.0, np.nan, -1.1],
        }
    )

    key = serialiser.store(store, "prefix", df)
    reader = pa.BufferReader(store.get(key))
    parquet_file = ParquetFile(reader)
    col_idx = parquet_file.reader.column_name_idx(column)

    assert parquet_file.num_row_groups == 3

    for idx in range(0, 3):
        rg = parquet_file.metadata.row_group(idx)
        assert rg.column(col_idx).statistics.null_count == expected_null_count[idx]

def test_reconstruct_index_categories(store):
    ser = ParquetSerializer()
    df = pd.DataFrame(
        {"index_col": [1, 1], "second_index_col": [2, 2], "column": list("ab")}
    )
    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/second_index_col=2/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store, "table")
    mp = MetaPartition(
        label="index_col=2/dontcare",
        files={"table": key},
        metadata_version=4,
        table_meta={"table": schema},
        partition_keys=["index_col", "second_index_col"],
    )
    categories = ["second_index_col", "index_col"]
    mp = mp.load_dataframes(store, categoricals={"table": categories})
    df_actual = mp.data["table"]
    df_expected = pd.DataFrame(
        OrderedDict(
            [
                ("index_col", [2, 2]),
                ("second_index_col", [2, 2]),
                ("column", list("ab")),
            ]
        )
    )
    df_expected = df_expected.astype({col: "category" for col in categories})
    pdt.assert_frame_equal(df_actual, df_expected)

def test_reconstruct_index_empty_df(store, categoricals):
    ser = ParquetSerializer()
    df = pd.DataFrame({"index_col": [1, 1], "column": list("ab")})
    df = df[0:0]
    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)
    mp = MetaPartition(
        label="index_col=2/dontcare",
        file=key,
        metadata_version=4,
        schema=schema,
        partition_keys=["index_col"],
    )
    # Translate the parametrized flag into the list of columns to load as categoricals.
    if categoricals:
        categoricals = ["index_col"]

    mp = mp.load_dataframes(store, categoricals=categoricals)

    df_actual = mp.data
    df_expected = pd.DataFrame(
        OrderedDict([("index_col", [2, 2]), ("column", list("ab"))])
    )
    df_expected = df_expected[0:0]
    if categoricals:
        df_expected = df_expected.astype({"index_col": "category"})
    pdt.assert_frame_equal(df_actual, df_expected)

def test_pushdown_null_intermediate(store):
    binary = b"\x8f\xb6\xe5@\x90\xdc\x11\xe8\x00\xae\x02B\xac\x12\x01\x06"
    df = pd.DataFrame({"byte_with_null": [binary]})
    serialiser = ParquetSerializer(chunk_size=1)
    key = serialiser.store(store, "key", df)
    predicate = [[("byte_with_null", "==", binary)]]
    restored = serialiser.restore_dataframe(store, key, predicates=predicate)
    pdt.assert_frame_equal(restored, df)

def test_timestamp_us(store):
    # Check that a DataFrame with microsecond-precision timestamps round-trips
    # through parquet without losing precision.
    ts = datetime(2000, 1, 1, 15, 23, 24, 123456)
    df = pd.DataFrame({"ts": [ts]})
    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", df)
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)

def test_read_empty_file_with_predicates(store):
    ser = ParquetSerializer()
    df = pd.DataFrame(dict(col=pd.Series([], dtype=str)))
    key = ser.store(store, "key", df)
    restored_df = ser.restore_dataframe(
        store, key, columns=["col"], predicates=[[("col", "==", "1")]]
    )
    pdt.assert_frame_equal(restored_df, df)

def test_read_categorical(store):
    df = pd.DataFrame({"col": ["a"]}).astype({"col": "category"})
    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", df)

    # Without explicitly requesting categories, the column comes back as object ...
    df = serialiser.restore_dataframe(store, key)
    assert df.dtypes["col"] == "O"

    # ... but passing `categories` restores it as a categorical dtype.
    df = serialiser.restore_dataframe(store, key, categories=["col"])
    assert df.dtypes["col"] == pd.CategoricalDtype(["a"], ordered=False)

def test_pushdown_binaries(store, dataframe_not_nested, binary_value, chunk_size):
    if _check_contains_null(binary_value):
        pytest.xfail("Null-terminated binary strings are not supported")
    serialiser = ParquetSerializer(chunk_size=chunk_size)
    key = serialiser.store(store, "prefix", dataframe_not_nested)

    predicates = [[("bytes", "==", binary_value)]]

    df_restored = serialiser.restore_dataframe(store, key, predicates=predicates)

    assert len(df_restored) == 1
    assert df_restored.iloc[0].bytes == binary_value

def test_int64_statistics_overflow(reference_store, predicate_pushdown_to_io):
    # Test case for ARROW-5166
    ser = ParquetSerializer()

    v = 705449463447499237
    predicates = [[("x", "==", v)]]
    result = ser.restore_dataframe(
        reference_store,
        "int64_statistics_overflow.parquet",
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        predicates=predicates,
    )
    assert not result.empty
    assert (result["x"] == v).all()

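# A minimal sketch of how a reference file like "int64_statistics_overflow.parquet"
# could be regenerated. The key prefix, the single int64 column "x", and the use of a
# plain ParquetSerializer are assumptions for illustration only; they are not taken
# from the test above.
def _generate_int64_statistics_overflow_reference(store):
    df = pd.DataFrame({"x": pd.Series([705449463447499237], dtype="int64")})
    # store() returns the key under which the parquet file was written
    return ParquetSerializer().store(store, "int64_statistics_overflow", df)
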
def test_column_string_cast(df_all_types, store, metadata_version):
    original_columns = df_all_types.columns.copy()
    df_all_types.columns = df_all_types.columns.str.encode("utf-8")

    ser = ParquetSerializer()
    key = ser.store(store, "uuid/table/something", df_all_types)

    mp = MetaPartition(
        label="something",
        file=key,
        schema=make_meta(df_all_types, origin="table"),
        metadata_version=metadata_version,
    )
    mp = mp.load_dataframes(store)
    df = mp.data
    assert all(original_columns == df.columns)

def test_rowgroup_writing(store, use_categorical, chunk_size):
    df = pd.DataFrame({"string": ["abc", "affe", "banane", "buchstabe"]})
    serialiser = ParquetSerializer(chunk_size=2)
    # Arrow 0.9.0 has a bug in writing categorical columns to more than a single
    # RowGroup: "ArrowIOError: Column 2 had 2 while previous column had 4".
    # We have special handling for that in pandas-serialiser that should be
    # removed once we switch to 0.10.0
    if use_categorical:
        df_write = df.astype({"string": "category"})
    else:
        df_write = df
    key = serialiser.store(store, "prefix", df_write)

    parquet_file = ParquetFile(store.open(key))
    assert parquet_file.num_row_groups == 2

def test_predicate_accept_in(store, predicate_value, expected):
    df = pd.DataFrame({"A": [0, 4, 13, 29]})  # min = 0, max = 29
    predicate = ("A", "in", predicate_value)
    serialiser = ParquetSerializer(chunk_size=None)
    key = serialiser.store(store, "prefix", df)

    parquet_file = ParquetFile(store.open(key))
    row_meta = parquet_file.metadata.row_group(0)
    arrow_schema = parquet_file.schema.to_arrow_schema()
    parquet_reader = parquet_file.reader
    assert (
        _predicate_accepts(
            predicate,
            row_meta=row_meta,
            arrow_schema=arrow_schema,
            parquet_reader=parquet_reader,
        )
        == expected
    )

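# For reference, the predicate objects handed to restore_dataframe throughout these
# tests follow kartothek's disjunctive-normal-form convention: the outer list
# OR-connects groups, and each inner list AND-connects (column, operator, value)
# tuples. The concrete values below are made up purely for illustration.
EXAMPLE_PREDICATES = [
    [("A", ">=", 4), ("A", "<", 29)],  # group 1: 4 <= A < 29
    [("A", "in", [0, 29])],  # OR group 2: A is 0 or 29
]
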
def test_store_single_dataframe_as_partition(
    store, metadata_storage_format, metadata_version, expected_key
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(
        label="test_label", data={"core": df}, metadata_version=metadata_version
    )

    meta_partition = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=True,
        metadata_storage_format=metadata_storage_format,
    )

    assert len(meta_partition.data) == 0
    assert meta_partition.files == {"core": expected_key}
    assert meta_partition.label == "test_label"

    files_in_store = list(store.keys())

    expected_num_files = 1
    assert len(files_in_store) == expected_num_files
    stored_df = DataFrameSerializer.restore_dataframe(store=store, key=expected_key)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_key)
    assert len(files_in_store) == expected_num_files - 1

def test_store_single_dataframe_as_partition_no_metadata(store, metadata_version):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(
        label="test_label", data={"core": df}, metadata_version=metadata_version
    )
    partition = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=False,
    )

    assert len(partition.data) == 0

    expected_file = "dataset_uuid/core/test_label.parquet"
    assert partition.files == {"core": expected_file}
    assert partition.label == "test_label"

    # No partition metadata is written, so the parquet file is the only object
    # in the store.
    files_in_store = list(store.keys())
    assert len(files_in_store) == 1

    stored_df = DataFrameSerializer.restore_dataframe(store=store, key=expected_file)
    pdt.assert_frame_equal(df, stored_df)

def test_store_single_dataframe_as_partition(store, metadata_version):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(label="test_label", data=df, metadata_version=metadata_version)

    meta_partition = mp.store_dataframes(
        store=store, df_serializer=ParquetSerializer(), dataset_uuid="dataset_uuid"
    )

    assert meta_partition.data is None

    expected_key = "dataset_uuid/table/test_label.parquet"
    assert meta_partition.file == expected_key
    assert meta_partition.label == "test_label"

    files_in_store = list(store.keys())

    expected_num_files = 1
    assert len(files_in_store) == expected_num_files
    stored_df = DataFrameSerializer.restore_dataframe(store=store, key=expected_key)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_key)
    assert len(files_in_store) == expected_num_files - 1

def test_get_parquet_metadata_row_group_size(store):
    df = pd.DataFrame({"P": np.arange(0, 10), "L": np.arange(0, 10)})
    mp = MetaPartition(label="test_label", data=df)
    ps = ParquetSerializer(chunk_size=5)
    meta_partition = mp.store_dataframes(
        store=store, dataset_uuid="dataset_uuid", df_serializer=ps
    )

    actual = meta_partition.get_parquet_metadata(store=store)
    actual.drop(
        columns=[
            "serialized_size",
            "row_group_compressed_size",
            "row_group_uncompressed_size",
        ],
        inplace=True,
    )

    expected = pd.DataFrame(
        {
            "partition_label": ["test_label", "test_label"],
            "row_group_id": [0, 1],
            "number_rows_total": [10, 10],
            "number_row_groups": [2, 2],
            "number_rows_per_row_group": [5, 5],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)

def _validate_predicate_pushdown(df, column, value, store, chunk_size):
    serialiser = ParquetSerializer(chunk_size=chunk_size)
    key = serialiser.store(store, "prefix", df)

    predicates = [[(column, "==", value)]]

    df_restored = serialiser.restore_dataframe(store, key, predicates=predicates)

    # date objects are converted to datetime in pyarrow
    df_restored["date"] = df_restored["date"].dt.date

    expected = df.iloc[[3]]
    # ARROW-5138 index isn't preserved when doing predicate pushdown
    pdt.assert_frame_equal(
        df_restored.reset_index(drop=True), expected.reset_index(drop=True)
    )

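# Illustrative (hypothetical) caller of the helper above; the fixture names and the
# choice of column are assumptions, not taken from this module. The helper expects
# the predicate to match exactly the row at positional index 3.
def test_predicate_pushdown_int64(dataframe_not_nested, store, chunk_size):
    _validate_predicate_pushdown(
        dataframe_not_nested,
        "int64",
        dataframe_not_nested["int64"].iloc[3],
        store,
        chunk_size,
    )
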
def test_predicate_not_in_columns(store, chunk_size):
    ser = ParquetSerializer(chunk_size=chunk_size)
    df = pd.DataFrame(
        {
            "date": [date(2000, 1, 1), date(2000, 1, 2), date(2000, 1, 2)],
            "col": [1, 2, 1],
        }
    )
    key = ser.store(store, "key", df)
    restored_df = ser.restore_dataframe(
        store, key, columns=[], predicates=[[("col", "==", 1)]]
    )
    if chunk_size:
        expected_df = pd.DataFrame(index=[0, 1])
    else:
        expected_df = pd.DataFrame(index=[0, 2])

    pdt.assert_frame_equal(restored_df, expected_df)

def test_date_as_object(store, chunk_size):
    ser = ParquetSerializer(chunk_size=chunk_size)
    df = pd.DataFrame({"date": [date(2000, 1, 1), date(2000, 1, 2)]})
    key = ser.store(store, "key", df)
    restored_df = ser.restore_dataframe(
        store, key, categories=["date"], date_as_object=True
    )
    categories = pd.Series([date(2000, 1, 1), date(2000, 1, 2)])
    expected_df = pd.DataFrame({"date": pd.Categorical(categories)})
    # expected_df.date = expected_df.date.cat.rename_categories([date(2000, 1, 1)])
    pdt.assert_frame_equal(restored_df, expected_df)

    restored_df = ser.restore_dataframe(
        store, key, date_as_object=True, predicates=[[("date", "==", "2000-01-01")]]
    )
    expected_df = pd.DataFrame({"date": [date(2000, 1, 1)]})
    pdt.assert_frame_equal(restored_df, expected_df)

class TimeRestore(object):
    """
    Benchmark the time it takes to restore a serialized DataFrame from a store,
    in particular with predicate pushdown enabled.
    """

    params = [(10 ** 3, 10 ** 4), (10, 10 ** 2, 10 ** 3)]
    param_names = ["num_rows", "chunk_size"]

    def setup(self, num_rows, chunk_size):
        self.df = get_dataframe_not_nested(num_rows)
        self.serialiser = ParquetSerializer(chunk_size=chunk_size)
        self.store = get_store_from_url("memory://")
        self.key = self.serialiser.store(self.store, "key_prefix", self.df)
        self.predicates = [[("int16", "==", 123)]]

    def time_predicate_pushdown(self, num_rows, chunk_size):
        self.serialiser.restore_dataframe(
            self.store,
            self.key,
            predicate_pushdown_to_io=True,
            predicates=self.predicates,
        )

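# ``TimeRestore`` follows the airspeed velocity (asv) benchmark convention:
# ``params``/``param_names`` span the parameter grid, ``setup`` runs before each
# combination, and every ``time_*`` method is timed. A minimal sketch of a companion
# benchmark (hypothetical, not part of the original suite) that times a full restore
# without predicate pushdown for comparison:
class TimeRestoreFull(TimeRestore):
    def time_restore_full(self, num_rows, chunk_size):
        self.serialiser.restore_dataframe(self.store, self.key)
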
def test_reconstruct_date_index(store, metadata_version, dates_as_object):
    ser = ParquetSerializer()
    # Even if the parquet file includes the primary index column, the index is
    # reconstructed from the partition key and the file content is ignored.
    df = pd.DataFrame(
        {"index_col": [date(2018, 6, 1), date(2018, 6, 1)], "column": list("ab")}
    )
    label = "dontcare"
    key_prefix = "uuid/table/index_col=2018-06-02/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)
    mp = MetaPartition(
        label="dontcare",
        file=key,
        metadata_version=metadata_version,
        schema=schema,
        partition_keys=["index_col"],
    )

    mp = mp.load_dataframes(store, dates_as_object=dates_as_object)
    df_actual = mp.data
    if dates_as_object:
        dt_constructor = date
    else:
        dt_constructor = datetime
    df_expected = pd.DataFrame(
        OrderedDict(
            [
                ("index_col", [dt_constructor(2018, 6, 2), dt_constructor(2018, 6, 2)]),
                ("column", list("ab")),
            ]
        )
    )
    pdt.assert_frame_equal(df_actual, df_expected)

def test_collect_dataset_metadata_predicates_row_group_size(store_factory):
    ps = ParquetSerializer(chunk_size=2)
    df = pd.DataFrame(
        data={"P": range(10), "L": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"]}
    )
    store_dataframes_as_dataset(
        store=store_factory,
        dataset_uuid="dataset_uuid",
        partition_on=["L"],
        dfs=[df],
        df_serializer=ps,
    )

    predicates = [[("L", "==", "a")]]

    df_stats = collect_dataset_metadata(
        store=store_factory,
        dataset_uuid="dataset_uuid",
        table_name="table",
        predicates=predicates,
        frac=1,
    ).compute()

    for part_label in df_stats["partition_label"]:
        assert "L=a" in part_label

    df_stats.sort_values(by=["partition_label", "row_group_id"], inplace=True)
    actual = df_stats.drop(
        columns=[
            "partition_label",
            "row_group_compressed_size",
            "row_group_uncompressed_size",
            "serialized_size",
        ]
    )

    expected = pd.DataFrame(
        data={
            "row_group_id": [0, 1, 2],
            "number_rows_total": [5, 5, 5],
            "number_row_groups": [3, 3, 3],
            "number_rows_per_row_group": [2, 2, 1],
        },
        index=[0, 1, 2],
    )
    pd.testing.assert_frame_equal(actual, expected)

def test_arrow_compat(arrow_version, reference_store, mocker):
    """
    Test whether reading and writing across the supported arrow versions is
    actually compatible.

    Generate new reference files by going to the `reference-data/arrow-compat`
    directory and executing `generate_reference.py` or
    `batch_generate_reference.sh`.
    """
    uuid_hook = mocker.patch("kartothek.core.uuid._uuid_hook_object")
    uuid_hook.return_value = uuid.UUID(
        bytes=b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"
    )

    orig = get_dataframe_alltypes()
    restored = ParquetSerializer().restore_dataframe(
        store=reference_store, key=arrow_version + ".parquet", date_as_object=True
    )
    pdt.assert_frame_equal(orig, restored)

def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF since some column types weren't supported before anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (always null)
                and "8" not in c  # 8 bit types are casted to 64 bit
                and "16" not in c  # 16 bit types are casted to 64 bit
                and "32" not in c  # 32 bit types are casted to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)

def test_arrow_compat(arrow_version, reference_store, mocker):
    """
    Test whether reading and writing across the supported arrow versions is
    actually compatible.

    Generate new reference files with::

        import pyarrow as pa
        ParquetSerializer().store(reference_store, pa.__version__, orig)
    """
    uuid_hook = mocker.patch("kartothek.core.uuid._uuid_hook_object")
    uuid_hook.return_value = uuid.UUID(
        bytes=b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"
    )

    orig = get_dataframe_alltypes()
    restored = ParquetSerializer().restore_dataframe(
        store=reference_store, key=arrow_version + ".parquet", date_as_object=True
    )

    if arrow_version == "0.14.1" and not ARROW_LARGER_EQ_0141:
        orig = orig.astype({"null": float})

    pdt.assert_frame_equal(orig, restored)

#!/usr/bin/env python
import os

import pyarrow as pa
from storefact import get_store_from_url

from kartothek.core.testing import get_dataframe_alltypes
from kartothek.serialization import ParquetSerializer

if __name__ == "__main__":
    ser = ParquetSerializer()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    store = get_store_from_url(f"hfs://{dir_path}")

    df = get_dataframe_alltypes()
    df["byte"] = b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"
    ref_file = f"{pa.__version__}"
    ser.store(store, ref_file, df)

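# A sketch of how a regenerated reference file could be sanity-checked right away.
# This helper is hypothetical (not part of the script above) and mirrors how
# test_arrow_compat reads the reference files back.
def _verify_reference_file(store, version, expected_df):
    import pandas.testing as pdt

    restored = ParquetSerializer().restore_dataframe(
        store, version + ".parquet", date_as_object=True
    )
    pdt.assert_frame_equal(expected_df, restored)
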
"KTK_CUBE_METADATA_STORAGE_FORMAT", "KTK_CUBE_METADATA_VERSION", "KTK_CUBE_UUID_SEPERATOR", ) # # !!!! WARNING !!! # # If you change any of these constants, this may break backwards compatibility. # Also, always ensure to also adapt the docs (especially the format specification in the README). # # !!!!!!!!!!!!!!!! # #: DataFrame serializer that is be used to write data. KTK_CUBE_DF_SERIALIZER = ParquetSerializer(compression="ZSTD") #: Storage format for kartothek metadata that is be used by default. KTK_CUBE_METADATA_STORAGE_FORMAT = "json" #: Kartothek metadata version that ktk_cube is based on. KTK_CUBE_METADATA_VERSION = 4 #: Metadata key that is used to mark seed datasets KTK_CUBE_METADATA_KEY_IS_SEED = "ktk_cube_is_seed" #: Metadata key to store dimension columns KTK_CUBE_METADATA_DIMENSION_COLUMNS = "ktk_cube_dimension_columns" #: Metadata key to store partition columns KTK_CUBE_METADATA_PARTITION_COLUMNS = "ktk_cube_partition_columns"
import numpy as np
import pandas as pd
import pandas.testing as pdt
import pyarrow as pa
import pytest

from kartothek.serialization import (
    CsvSerializer,
    DataFrameSerializer,
    ParquetSerializer,
    default_serializer,
)
from kartothek.serialization._util import ensure_unicode_string_type

TYPE_STABLE_SERIALISERS = [ParquetSerializer()]

SERLIALISERS = TYPE_STABLE_SERIALISERS + [
    CsvSerializer(),
    CsvSerializer(compress=False),
    default_serializer(),
]

type_stable_serialisers = pytest.mark.parametrize(
    "serialiser", TYPE_STABLE_SERIALISERS
)

predicate_serialisers = pytest.mark.parametrize(
    "serialiser",
    [
        ParquetSerializer(chunk_size=1),
        ParquetSerializer(chunk_size=2),
def df_serializer():
    return ParquetSerializer()