def generate_mp(dataset_metadata=None):
    # Fake MetaPartition: a real all-types schema, but only a placeholder file.
    return MetaPartition(
        label=uuid.uuid4().hex,
        table_meta={"table": make_meta(get_dataframe_alltypes())},
        files={"table": "fakefile"},
        dataset_metadata=dataset_metadata,
    )
def setup(self, num_schemas, has_na):
    self.df = get_dataframe_alltypes()
    schema = make_meta(self.df, origin="df")
    self.schemas = [deepcopy(schema) for _ in range(num_schemas)]
    if has_na:
        empty_schema = make_meta(self.df[0:0], origin="empty")
        # insert alternating empty schemas
        self.schemas[::2] = [
            deepcopy(empty_schema) for _ in range(len(self.schemas[::2]))
        ]
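# Hypothetical standalone sketch (not from the benchmark) of the
# slice-assignment trick used above: `lst[::2] = ...` replaces every second
# element, so with has_na=True the schema list ends up alternating between
# empty-frame and full-frame schemas.
schemas = ["full"] * 5
schemas[::2] = ["empty"] * len(schemas[::2])
assert schemas == ["empty", "full", "empty", "full", "empty"]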
def test_arrow_compat(arrow_version, reference_store, mocker):
    """
    Test if reading/writing across the supported arrow versions is actually
    compatible.

    Generate new reference files by going to the `reference-data/arrow-compat`
    directory and executing `generate_reference.py` or
    `batch_generate_reference.sh`.
    """
    uuid_hook = mocker.patch("kartothek.core.uuid._uuid_hook_object")
    uuid_hook.return_value = uuid.UUID(
        bytes=b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"
    )

    orig = get_dataframe_alltypes()
    restored = ParquetSerializer().restore_dataframe(
        store=reference_store, key=arrow_version + ".parquet", date_as_object=True
    )
    pdt.assert_frame_equal(orig, restored)
def test_arrow_compat(arrow_version, reference_store, mocker):
    """
    Test if reading/writing across the supported arrow versions is actually
    compatible.

    Generate new reference files with::

        import pyarrow as pa
        ParquetSerializer().store(reference_store, pa.__version__, orig)
    """
    uuid_hook = mocker.patch("kartothek.core.uuid._uuid_hook_object")
    uuid_hook.return_value = uuid.UUID(
        bytes=b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"
    )

    orig = get_dataframe_alltypes()
    restored = ParquetSerializer().restore_dataframe(
        store=reference_store, key=arrow_version + ".parquet", date_as_object=True
    )

    if arrow_version == "0.14.1" and not ARROW_LARGER_EQ_0141:
        # Adjust the expectation: pyarrow versions older than 0.14.1 restore
        # the all-null column of the 0.14.1 reference file as float.
        orig = orig.astype({"null": float})

    pdt.assert_frame_equal(orig, restored)
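# A hedged sketch (not part of the test suite) of the round trip the test
# above verifies; simplekv's in-memory DictStore stands in for the
# reference_store fixture, and the key prefix "reference" is made up.
import pandas.testing as pdt
from simplekv.memory import DictStore

from kartothek.core.testing import get_dataframe_alltypes
from kartothek.serialization import ParquetSerializer

store = DictStore()
orig = get_dataframe_alltypes()

# As in the docstring above, store() takes a key prefix and returns the full
# key ("<prefix>.parquet"), which restore_dataframe() then reads back.
key = ParquetSerializer().store(store, "reference", orig)
restored = ParquetSerializer().restore_dataframe(
    store=store, key=key, date_as_object=True
)
pdt.assert_frame_equal(orig, restored)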
def df_all_types():
    return get_dataframe_alltypes()
def test_get_dataframe_alltypes():
    df = get_dataframe_alltypes()
    assert isinstance(df, pd.DataFrame)
    assert not df.empty
    assert "byte" in df.columns
#!/usr/bin/env python
import os

import pyarrow as pa
from storefact import get_store_from_url

from kartothek.core.testing import get_dataframe_alltypes
from kartothek.serialization import ParquetSerializer

if __name__ == "__main__":
    ser = ParquetSerializer()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    store = get_store_from_url(f"hfs://{dir_path}")

    df = get_dataframe_alltypes()
    df["byte"] = b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"

    ref_file = f"{pa.__version__}"
    ser.store(store, ref_file, df)
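# Hypothetical follow-up check, assuming the script above has just been run in
# the current working directory: the directory-backed store should now contain
# a "<pyarrow version>.parquet" reference file, since ParquetSerializer.store()
# appends ".parquet" to the key prefix.
import os

import pyarrow as pa
from storefact import get_store_from_url

store = get_store_from_url(f"hfs://{os.getcwd()}")
assert f"{pa.__version__}.parquet" in set(store.keys())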
def setup(self, num_schemas):
    self.df = get_dataframe_alltypes()
    schema = make_meta(self.df, origin="df")
    self.schemas = [deepcopy(schema) for _ in range(num_schemas)]
def setup(self):
    self.df = get_dataframe_alltypes()
def generate_mp():
    return MetaPartition(
        label=uuid.uuid4().hex,
        schema=make_meta(get_dataframe_alltypes(), origin="alltypes"),
        file="fakefile",
    )
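# Hedged usage sketch (imports assumed from the surrounding test module): each
# call yields an independent fake partition with a fresh hex UUID label and a
# real all-types schema, but no actual Parquet file behind "fakefile".
mp_a = generate_mp()
mp_b = generate_mp()
assert len(mp_a.label) == 32      # uuid4().hex is 32 hex characters
assert mp_a.label != mp_b.label   # labels are unique per call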