def test_store_df_to_store(store):
    """Round-trip a small mixed-dtype frame through the default serializer.

    Checks that the default serializer is a ``ParquetSerializer`` and that
    the frame restored from the store equals the frame that was written.
    """
    # Integer, float and non-ASCII string columns.
    expected = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["∆", "€"]})

    serializer = default_serializer()
    assert isinstance(serializer, ParquetSerializer)

    stored_key = serializer.store(store, "prefix", expected)
    restored = DataFrameSerializer.restore_dataframe(store, stored_key)
    pdt.assert_frame_equal(restored, expected)
def store_dataframes(
    self,
    store: StoreInput,
    dataset_uuid: str,
    df_serializer: Optional[DataFrameSerializer] = None,
) -> "MetaPartition":
    """
    Store the dataframe held in ``self.data`` and return a copy of this
    MetaPartition that references the stored file instead of the data.

    The returned copy is created with ``file`` set to the value returned by
    the serializer and ``data=None``, i.e. it no longer holds the dataframe.
    If ``self.data`` is ``None`` nothing is written and ``self`` is returned
    unchanged.

    Parameters
    ----------
    store
        If it is a function, the result of calling it must be a KeyValueStore.
    dataset_uuid
        The dataset UUID the partition will be assigned to
    df_serializer
        Serialiser to be used to store the dataframe. Falls back to
        ``default_serializer()`` when ``None``.

    Returns
    -------
    MetaPartition

    Raises
    ------
    Exception
        Any exception raised by ``df_serializer.store`` is logged and then
        re-raised unchanged.
    """
    if df_serializer is None:
        df_serializer = default_serializer()

    key = get_partition_file_prefix(
        partition_label=self.label,
        dataset_uuid=dataset_uuid,
        metadata_version=self.metadata_version,
        table=self.table_name,
    )

    if self.data is None:
        return self

    df = self.data
    try:
        file = df_serializer.store(store, key, df)
    except Exception as exc:
        # Best-effort diagnostics. A failure while collecting or emitting
        # them must never replace the storage error the caller needs to see,
        # so it is contained here instead of relying on ``finally: raise``
        # (which would re-raise the secondary error).
        try:
            if isinstance(df, pd.DataFrame):
                buf = io.StringIO()
                df.info(buf=buf)
                LOGGER.error(
                    "Writing dataframe failed.\n%s\n%s\n%s",
                    exc,
                    buf.getvalue(),
                    df.head(),
                )
            else:
                LOGGER.error("Storage of dask dataframe failed.")
        except Exception as log_exc:
            LOGGER.error(
                "Could not log diagnostics for the failed dataframe write: %s",
                log_exc,
            )
        # Re-raise the original storage exception with its traceback intact.
        raise

    return self.copy(file=file, data=None)
import pytest

from kartothek.serialization import (
    CsvSerializer,
    DataFrameSerializer,
    ParquetSerializer,
    default_serializer,
)
from kartothek.serialization._util import ensure_unicode_string_type

# Serialisers grouped under the "type stable" label for parametrisation.
TYPE_STABLE_SERIALISERS = [ParquetSerializer()]

# Type-stable serialisers followed by both CSV variants and the default
# serialiser. NOTE(review): the name is misspelled but is kept as-is because
# other modules may reference it.
SERLIALISERS = [
    *TYPE_STABLE_SERIALISERS,
    CsvSerializer(),
    CsvSerializer(compress=False),
    default_serializer(),
]

# Parametrises a test's ``serialiser`` argument over the type-stable set only.
type_stable_serialisers = pytest.mark.parametrize(
    "serialiser", TYPE_STABLE_SERIALISERS
)

# Parametrises ``serialiser`` over Parquet serialisers with chunk sizes 1, 2
# and 4, followed by every entry of SERLIALISERS.
_CHUNKED_PARQUET_SERIALISERS = [
    ParquetSerializer(chunk_size=size) for size in (1, 2, 4)
]
predicate_serialisers = pytest.mark.parametrize(
    "serialiser",
    _CHUNKED_PARQUET_SERIALISERS + SERLIALISERS,
)