Example 1
def hash_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    subset=None,
    group_key=None,
    table: str = SINGLE_TABLE,
    predicates: Optional[PredicatesType] = None,
    factory: Optional[DatasetFactory] = None,
) -> dd.Series:
    """
    Calculate a partition-wise, or group-wise, hash of the dataset.

    .. note::

        We do not guarantee that the hash values remain constant across versions.


    Example output::

        Assuming a dataset with two unique values in column `P`, this gives

        >>> hash_dataset(factory=dataset_with_index_factory, group_key=["P"]).compute()
        ... P
        ... 1    11462879952839863487
        ... 2    12568779102514529673
        ... dtype: uint64

    Parameters
    ----------
    subset
        If provided, only take these columns into account when hashing the dataset
    group_key
        If provided, calculate hash per group instead of per partition
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    columns = subset
    if subset and group_key:
        columns = sorted(set(subset) | set(group_key))
    ddf = read_dataset_as_ddf(
        table=table,
        predicates=predicates,
        factory=dataset_factory,
        columns=columns,
        dates_as_object=True,
    )
    if not group_key:
        return ddf.map_partitions(_hash_partition, meta="uint64").astype("uint64")
    else:
        ddf2 = pack_payload(ddf, group_key=group_key)
        return (
            ddf2.groupby(group_key)
            .apply(_unpack_hash, unpack_meta=ddf._meta, subset=subset, meta="uint64")
            .astype("uint64")
        )
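
The partition-wise branch above reduces every dask partition to a single hash with ``map_partitions``. The helper ``_hash_partition`` is not part of this excerpt, so the following self-contained sketch only illustrates the pattern; hashing each row with ``pd.util.hash_pandas_object`` and summing the result is an assumption made for illustration.

import dask.dataframe as dd
import pandas as pd

def _hash_partition_sketch(partition: pd.DataFrame) -> pd.Series:
    # One uint64 per row; summing (with uint64 wrap-around) collapses the
    # partition into a single value.
    row_hashes = pd.util.hash_pandas_object(partition, index=False)
    return pd.Series([row_hashes.sum()], dtype="uint64")

pdf = pd.DataFrame({"P": [1, 1, 2, 2], "x": [10, 20, 30, 40]})
ddf = dd.from_pandas(pdf, npartitions=2)
print(ddf.map_partitions(_hash_partition_sketch, meta=("hash", "uint64")).compute())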
Example 2
def test_pack_payload_roundtrip(df_all_types, num_group_cols):
    group_key = list(df_all_types.columns[-num_group_cols:])
    df_all_types = dd.from_pandas(df_all_types, npartitions=2)
    pdt.assert_frame_equal(
        df_all_types.compute(),
        unpack_payload(
            pack_payload(df_all_types, group_key=group_key),
            unpack_meta=df_all_types._meta,
        ).compute(),
    )
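
The docstring of ``shuffle_store_dask_partitions`` in Example 5 below notes that the payload is collapsed into a single bytes object per group via ``distributed.protocol.serialize_bytes``. The roundtrip exercised by the test above ultimately rests on that primitive and its counterpart ``deserialize_bytes``; here is a minimal sketch of just this serialization step, not the actual ``pack_payload``/``unpack_payload`` implementation.

import pandas as pd
from distributed.protocol import deserialize_bytes, serialize_bytes

df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
blob = serialize_bytes(df)          # one opaque bytes object
restored = deserialize_bytes(blob)  # back to an equivalent DataFrame
pd.testing.assert_frame_equal(df, restored)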
Example 3
def test_pack_payload_empty(df_all_types):
    # Packing an empty (or single-row) dataframe actually adds a few bytes,
    # so this test only checks the roundtrip, not the size.
    df_empty = dd.from_pandas(df_all_types.iloc[:0], npartitions=1)

    group_key = [df_all_types.columns[-1]]
    pdt.assert_frame_equal(
        df_empty.compute(),
        unpack_payload(pack_payload(df_empty, group_key=group_key),
                       unpack_meta=df_empty._meta).compute(),
    )
Example 4
def test_pack_payload(df_all_types):
    # Packing a single-row dataframe would actually add a few bytes, so use a
    # larger frame to make the size reduction observable.
    df = dd.from_pandas(pd.concat([df_all_types] * 10, ignore_index=True),
                        npartitions=3)
    size_before = df.memory_usage(deep=True).sum()

    packed_df = pack_payload(df, group_key=list(df.columns[-2:]))

    size_after = packed_df.memory_usage(deep=True).sum()

    assert (size_after < size_before).compute()
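
A rough, hypothetical illustration of why the packed frame tends to be smaller: collapsing each group's payload into one serialized blob removes the per-row object overhead. The grouping below is a simplification and not the real ``pack_payload``; the ``__payload__`` column name is made up for the sketch.

import pandas as pd
from distributed.protocol import serialize_bytes

df = pd.DataFrame({"g": list("ab") * 500, "txt": ["some payload text"] * 1000})

packed = (
    df.groupby("g", sort=False)["txt"]
    .apply(lambda payload: serialize_bytes(payload.to_frame()))
    .rename("__payload__")
    .reset_index()
)

# For repetitive payloads the packed frame is typically much smaller.
print(df.memory_usage(deep=True).sum(), packed.memory_usage(deep=True).sum())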
Example 5
def shuffle_store_dask_partitions(
    ddf: dd.DataFrame,
    table: str,
    secondary_indices: Optional[Union[str, Sequence[str]]],
    metadata_version: int,
    partition_on: List[str],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    dataset_uuid: str,
    num_buckets: int,
    sort_partitions_by: List[str],
    bucket_by: Sequence[str],
) -> da.Array:
    """
    Perform a dataset update with dask reshuffling to control partitioning.

    The shuffle operation will perform the following steps

    1. Pack payload data

       Payload data is serialized and compressed into a single bytes object
       using ``distributed.protocol.serialize_bytes``; see also ``pack_payload``.

    2. Apply bucketing

       Hash the column subset ``bucket_by`` and distribute the hashes into
       ``num_buckets`` bins/buckets. Internally, every bucket is identified by
       an integer, and one physical file is created for every bucket ID. The
       bucket ID is not exposed to the user and is dropped after the shuffle,
       before storing, since we do not currently want to guarantee that the
       hash function remains stable.

    3. Perform shuffle (dask.DataFrame.groupby.apply)

       The groupby key will be the combination of the ``partition_on`` fields
       and the hash bucket ID. This creates one physical file for every unique
       tuple in ``partition_on + bucket_ID``. The function applied to the
       dataframe performs all subtasks necessary to store the dataset
       (``partition_on`` handling, index calculation, etc.).

    4. Unpack data (within the apply-function)

       After the shuffle, the first step is to unpack the payload data, since
       the follow-up tasks require the full dataframe.

    5. Pre-storage processing and Parquet serialization

       We apply important pre-storage processing such as sorting the data and
       applying the final partitioning (at this point there should be only one
       group in the payload data, but going through
       ``MetaPartition.partition_on`` guarantees that the data structures
       kartothek expects are created). After this preprocessing, the data is
       serialized and stored as Parquet. The applied function returns an
       (empty) MetaPartition with indices and metadata, which is then used to
       commit the dataset.

    Returns
    -------

    A dask.Array holding relevant MetaPartition objects as values

    """
    if ddf.npartitions == 0:
        return ddf

    group_cols = partition_on.copy()

    if num_buckets is None:
        raise ValueError(
            "``num_buckets`` must not be None when shuffling data.")

    meta = ddf._meta
    meta[_KTK_HASH_BUCKET] = np.uint64(0)
    ddf = ddf.map_partitions(_hash_bucket, bucket_by, num_buckets, meta=meta)
    group_cols.append(_KTK_HASH_BUCKET)

    unpacked_meta = ddf._meta

    ddf = pack_payload(ddf, group_key=group_cols)
    ddf = ddf.groupby(by=group_cols)
    ddf = ddf.apply(
        partial(
            _unpack_store_partition,
            secondary_indices=secondary_indices,
            sort_partitions_by=sort_partitions_by,
            table=table,
            dataset_uuid=dataset_uuid,
            partition_on=partition_on,
            store_factory=store_factory,
            df_serializer=df_serializer,
            metadata_version=metadata_version,
            unpacked_meta=unpacked_meta,
        ),
        meta=("MetaPartition", "object"),
    )
    return ddf
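
The bucketing described in step 2 of the docstring is performed by ``_hash_bucket``, which is not shown in this excerpt. Below is a minimal sketch of the idea, assuming a hash-modulo assignment and using a stand-in name for the ``_KTK_HASH_BUCKET`` column.

import numpy as np
import pandas as pd

BUCKET_COLUMN = "__hash_bucket"  # stand-in for the library's _KTK_HASH_BUCKET constant

def hash_bucket_sketch(df: pd.DataFrame, bucket_by, num_buckets: int) -> pd.DataFrame:
    # Hash the bucket_by subset row-wise and map every hash to one of
    # num_buckets integer bucket IDs.
    row_hashes = pd.util.hash_pandas_object(df[bucket_by], index=False)
    return df.assign(**{BUCKET_COLUMN: (row_hashes % num_buckets).astype(np.uint64)})

df = pd.DataFrame({"P": [1, 1, 2, 2, 3, 3], "x": range(6)})
print(hash_bucket_sketch(df, bucket_by=["P"], num_buckets=4))

In the real function the bucket column is appended to ``group_cols``, so the shuffle groups on ``partition_on`` plus the bucket ID, and the column is dropped again before the data is stored.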