Example #1
import pandas as pd
import pandas.testing as pdt
from kartothek.serialization import DataFrameSerializer, ParquetSerializer, default_serializer


# `store` is a pytest fixture providing a simplekv-compatible key-value store.
def test_store_df_to_store(store):
    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["∆", "€"]})
    dataframe_format = default_serializer()
    assert isinstance(dataframe_format, ParquetSerializer)
    key = dataframe_format.store(store, "prefix", df)
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key),
                           df)
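
The test above relies on a pytest `store` fixture. As a minimal sketch of the same round trip outside pytest, assuming simplekv's in-memory backend (the fixture in the real test suite may be backed by a different store):

# Sketch: round-trip a dataframe through the default serializer using an
# in-memory simplekv store. MemoryStore here is an assumption, not the
# fixture used by the original test.
import pandas as pd
import pandas.testing as pdt
from simplekv.memory import MemoryStore
from kartothek.serialization import DataFrameSerializer, default_serializer

store = MemoryStore()
df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

serializer = default_serializer()            # ParquetSerializer by default
key = serializer.store(store, "prefix", df)  # returns the storage key
restored = DataFrameSerializer.restore_dataframe(store, key)
pdt.assert_frame_equal(restored, df)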
Example #2
    def store_dataframes(
        self,
        store: StoreInput,
        dataset_uuid: str,
        df_serializer: Optional[DataFrameSerializer] = None,
    ) -> "MetaPartition":
        """
        Stores all dataframes of the MetaPartition and registers the saved
        file under the `file` attribute. The dataframe itself is deleted from memory.

        Parameters
        ----------
        store
            If it is a function, the result of calling it must be a KeyValueStore.
        dataset_uuid
            The dataset UUID the partition will be assigned to
        df_serializer
            Serialiser to be used to store the dataframe

        Returns
        -------
        MetaPartition
        """
        if df_serializer is None:
            df_serializer = default_serializer()

        key = get_partition_file_prefix(
            partition_label=self.label,
            dataset_uuid=dataset_uuid,
            metadata_version=self.metadata_version,
            table=self.table_name,
        )
        if self.data is not None:
            df = self.data
            try:
                file = df_serializer.store(store, key, df)
            except Exception as exc:
                # Log diagnostics about the failing dataframe, then re-raise.
                try:
                    if isinstance(df, pd.DataFrame):
                        buf = io.StringIO()
                        df.info(buf=buf)
                        LOGGER.error(
                            "Writing dataframe failed.\n"
                            "%s\n"
                            "%s\n"
                            "%s",
                            exc,
                            buf.getvalue(),
                            df.head(),
                        )
                    else:
                        LOGGER.error("Storage of dask dataframe failed.")
                finally:
                    raise

            new_metapartition = self.copy(file=file, data=None)

            return new_metapartition
        else:
            return self
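
For orientation, here is a hedged usage sketch of this method. The MetaPartition constructor arguments (label, data, metadata_version) and the in-memory store are assumptions for illustration; exact signatures differ between kartothek versions:

# Hypothetical usage sketch; not taken from the source above.
import pandas as pd
from simplekv.memory import MemoryStore
from kartothek.io_components.metapartition import MetaPartition

store = MemoryStore()
mp = MetaPartition(
    label="part_0",
    data=pd.DataFrame({"x": [1, 2]}),
    metadata_version=4,
)
stored = mp.store_dataframes(store=store, dataset_uuid="demo_dataset")
assert stored.data is None  # the dataframe was dropped from memory
assert stored.file          # key of the serialized file in the store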
Example #3
import pytest

from kartothek.serialization import (
    CsvSerializer,
    DataFrameSerializer,
    ParquetSerializer,
    default_serializer,
)
from kartothek.serialization._util import ensure_unicode_string_type

TYPE_STABLE_SERIALISERS = [ParquetSerializer()]

SERIALISERS = TYPE_STABLE_SERIALISERS + [
    CsvSerializer(),
    CsvSerializer(compress=False),
    default_serializer(),
]

type_stable_serialisers = pytest.mark.parametrize("serialiser",
                                                  TYPE_STABLE_SERIALISERS)

predicate_serialisers = pytest.mark.parametrize(
    "serialiser",
    [
        ParquetSerializer(chunk_size=1),
        ParquetSerializer(chunk_size=2),
        ParquetSerializer(chunk_size=4),
    ] + SERIALISERS,
)
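
These marks decorate test functions so that each serialiser is exercised in turn. A minimal sketch of how they are consumed (the test body and the `store` fixture are illustrative, not part of the source; predicates use kartothek's list-of-lists disjunctive-normal-form format):

# Sketch: applying the parametrize mark defined above.
import pandas as pd

@predicate_serialisers
def test_predicate_filtering(serialiser, store):
    df = pd.DataFrame({"a": [1, 2, 3]})
    key = serialiser.store(store, "prefix", df)
    restored = DataFrameSerializer.restore_dataframe(
        store, key, predicates=[[("a", "==", 2)]]
    )
    assert restored["a"].tolist() == [2]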