Example #1
def test_input_to_metaframes_dict():
    df_input = {
        "label":
        "cluster_1",
        "data": [
            ("some_file", pd.DataFrame({"A": [1]})),
            ("some_other_file", pd.DataFrame({"B": [2]})),
        ],
    }
    mp = parse_input_to_metapartition(obj=df_input)
    assert isinstance(mp, MetaPartition)
    assert len(mp.data) == 2
    assert len(mp.files) == 0

    assert mp.label == "cluster_1"

    data = mp.data

    df = data["some_file"]
    pdt.assert_frame_equal(df,
                           pd.DataFrame({"A": [1]}),
                           check_dtype=False,
                           check_like=True)

    df2 = data["some_other_file"]
    pdt.assert_frame_equal(df2,
                           pd.DataFrame({"B": [2]}),
                           check_dtype=False,
                           check_like=True)
Example #2
def write_partition(
    partition_df: MetaPartitionInput,
    secondary_indices: List[str],
    sort_partitions_by: List[str],
    dataset_uuid: str,
    partition_on: List[str],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    metadata_version: int,
    dataset_table_name: str = SINGLE_TABLE,
) -> MetaPartition:
    """
    Write a dataframe to store, performing all necessary preprocessing tasks
    like partitioning, bucketing (NotImplemented), indexing, etc. in the correct order.
    """
    store = ensure_store(store_factory)

    # I don't have access to the group values
    mps = parse_input_to_metapartition(
        partition_df,
        metadata_version=metadata_version,
        table_name=dataset_table_name,
    )
    if sort_partitions_by:
        mps = mps.apply(
            partial(sort_values_categorical, columns=sort_partitions_by))
    if partition_on:
        mps = mps.partition_on(partition_on)
    if secondary_indices:
        mps = mps.build_indices(secondary_indices)
    return mps.store_dataframes(store=store,
                                dataset_uuid=dataset_uuid,
                                df_serializer=df_serializer)
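Below is a minimal, hypothetical direct invocation of write_partition for a single pandas DataFrame; the store URL, dataset UUID, and column names are placeholders, and the store factory is built with storefact's get_store_from_url rather than any canonical setup.

from functools import partial

import pandas as pd
from storefact import get_store_from_url

# Placeholder store location and dataset identifier.
store_factory = partial(get_store_from_url, "hfs://my_store")
df = pd.DataFrame({"part": [1, 1, 2], "value": [10.0, 11.0, 12.0]})

mp = write_partition(
    partition_df=df,
    secondary_indices=["value"],   # build a secondary index on "value"
    sort_partitions_by=[],         # no sorting inside the partition
    dataset_uuid="example_uuid",   # placeholder
    partition_on=["part"],         # split the frame by the "part" column
    store_factory=store_factory,
    df_serializer=None,            # fall back to the default serializer
    metadata_version=4,
)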
Example #3
def test_parse_nested_input_schema_compatible_but_different():
    # Ensure that input can be parsed even though the schemas are not identical but compatible
    df_input = [[pd.DataFrame({"A": [None]}), pd.DataFrame({"A": ["str"]})]]
    mp = parse_input_to_metapartition(df_input, metadata_version=4)
    expected_schema = make_meta(pd.DataFrame({"A": ["str"]}),
                                origin="expected")
    assert mp.schema == expected_schema
Example #4
def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store ``pd.DataFrame`` objects iteratively as a partitioned dataset with multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.

    """

    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df,
                                          metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store the dataframe, thereby removing it from the `mp` metapartition
        mp = mp.store_dataframes(store=store,
                                 dataset_uuid=dataset_uuid,
                                 df_serializer=df_serializer)

        # Add `kartothek.io_components.metapartition.MetaPartition` object to list to track partitions
        new_partitions.append(mp)

    # Store metadata and return `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
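A minimal usage sketch follows, assuming a storefact store; the generator, store URL, and dataset UUID are placeholders chosen for illustration.

from functools import partial

import pandas as pd
from storefact import get_store_from_url

store_factory = partial(get_store_from_url, "hfs://my_store")  # placeholder store

def generate_frames():
    # Yield one small DataFrame per partition; in practice these could be
    # produced lazily so the full dataset never has to fit into memory.
    for i in range(3):
        yield pd.DataFrame({"part": [i], "value": [float(i)]})

dataset = store_dataframes_as_dataset__iter(
    generate_frames(),
    store=store_factory,
    dataset_uuid="example_iter_uuid",  # placeholder
    partition_on=["part"],
)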
Example #5
def _store_partition(
    df: pd.DataFrame,
    secondary_indices: List[str],
    sort_partitions_by: Optional[str],
    table: str,
    dataset_uuid: str,
    partition_on: Optional[List[str]],
    store_factory: StoreFactoryType,
    df_serializer: DataFrameSerializer,
    metadata_version: int,
    unpacked_meta: pd.DataFrame,
) -> MetaPartition:
    df = unpack_payload_pandas(df, unpacked_meta)
    if _KTK_HASH_BUCKET in df:
        df = df.drop(_KTK_HASH_BUCKET, axis=1)
    store = store_factory()
    # I don't have access to the group values
    mps = parse_input_to_metapartition({"data": {
        table: df
    }},
                                       metadata_version=metadata_version)
    # delete reference to enable release after partition_on; before index build
    del df
    if sort_partitions_by:
        mps = mps.apply(
            partial(sort_values_categorical, column=sort_partitions_by))
    if partition_on:
        mps = mps.partition_on(partition_on)
    if secondary_indices:
        mps = mps.build_indices(secondary_indices)
    return mps.store_dataframes(store=store,
                                dataset_uuid=dataset_uuid,
                                df_serializer=df_serializer)
Example #6
def _write_dataframe_partitions(
    ddf: dd.DataFrame,
    store: StoreFactory,
    dataset_uuid: str,
    table: str,
    secondary_indices: List[str],
    shuffle: bool,
    repartition_ratio: Optional[SupportsFloat],
    num_buckets: int,
    sort_partitions_by: List[str],
    df_serializer: Optional[DataFrameSerializer],
    metadata_version: int,
    partition_on: List[str],
    bucket_by: List[str],
) -> dd.Series:
    if repartition_ratio and ddf is not None:
        ddf = ddf.repartition(
            npartitions=int(np.ceil(ddf.npartitions / repartition_ratio)))

    if ddf is None:
        mps = dd.from_pandas(
            pd.Series([
                parse_input_to_metapartition(
                    None,
                    metadata_version=metadata_version,
                    table_name=table,
                )
            ]),
            npartitions=1,
        )
    else:
        if shuffle:
            mps = shuffle_store_dask_partitions(
                ddf=ddf,
                table=table,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                num_buckets=num_buckets,
                sort_partitions_by=sort_partitions_by,
                bucket_by=bucket_by,
            )
        else:
            mps = ddf.map_partitions(
                write_partition,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                sort_partitions_by=sort_partitions_by,
                dataset_table_name=table,
                meta=(MetaPartition),
            )
    return mps
Example #7
def test_input_to_metaframes_simple():
    df_input = pd.DataFrame({"A": [1]})
    mp = parse_input_to_metapartition(obj=df_input)

    assert isinstance(mp, MetaPartition)
    assert len(mp.data) == 1
    assert len(mp.files) == 0

    df = list(mp.data.values())[0]
    pdt.assert_frame_equal(df, df_input)

    assert isinstance(mp.label, six.string_types)
Example #8
def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed, it will be stored as the `core` table.

    Returns
    -------
    The stored dataset

    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version)

    if partition_on:
        mp = MetaPartition.partition_on(mp, partition_on)

    mps = mp.store_dataframes(store=store,
                              dataset_uuid=dataset_uuid,
                              df_serializer=df_serializer)

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
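A minimal usage sketch, assuming a storefact store; the table names, store URL, and UUID are placeholders. Passing a dict stores each entry as its own table within one partition, while a plain DataFrame would be stored as the `core` table.

import pandas as pd
from storefact import get_store_from_url

store = get_store_from_url("hfs://my_store")  # placeholder store

dfs = {
    "core": pd.DataFrame({"A": [1, 2]}),
    "extension": pd.DataFrame({"B": ["x", "y"]}),
}

dataset = store_dataframes_as_dataset(
    store=store,
    dataset_uuid="example_eager_uuid",  # placeholder
    dfs=dfs,
)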
Example #9
def test_input_to_metaframes_empty():
    mp = parse_input_to_metapartition(obj=[None])
    assert mp == MetaPartition(label=None)
    mp = parse_input_to_metapartition(obj=[])
    assert mp == MetaPartition(label=None)
Example #10
def commit_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    new_partitions: Optional[Iterable[MetaPartition]] = None,
    output_dataset_uuid: Optional[str] = None,
    delete_scope: Optional[Iterable[Dict[str, Any]]] = None,
    metadata: Dict = None,
    df_serializer: DataFrameSerializer = None,
    metadata_merger: Callable[[List[Dict]], Dict] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[Iterable[str]] = None,
    factory: Optional[DatasetFactory] = None,
    secondary_indices: Optional[Iterable[str]] = None,
):
    """
    Commit new state to an existing dataset. This can be used for three distinct operations:

    1. Add previously written partitions to this dataset

        If, for some reason, the existing pipelines are not sufficient and you need more control, you can write the files outside of a kartothek pipeline and commit them whenever you choose to.

        This should be used in combination with
        :func:`~kartothek.io.eager.write_single_partition` and :func:`~kartothek.io.eager.create_empty_dataset_header`.

        .. code::

            import pandas as pd
            from kartothek.io.eager import write_single_partition, commit_dataset

            store = "hfs://my_store"

            # The partition writing can be done concurrently and distributed if wanted.
            # Only the information about what partitions have been written is required for the commit.
            new_partitions = [
                write_single_partition(
                    store=store,
                    dataset_uuid='dataset_uuid',
                    data=pd.DataFrame({'column': [1, 2]}),
                )
            ]

            new_dataset = commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                new_partitions=new_partitions,
            )

    2. Simple delete of partitions

        If you want to remove some partitions, this is one of the simplest ways of doing so. By providing a delete_scope, this removes the references to these files in an atomic commit.

        .. code::

            commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                delete_scope=[
                    {
                        "partition_column": "part_value_to_be_removed"
                    }
                ],
            )

    3. Add additional metadata

        To add new metadata to an existing dataset

        .. code::

            commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                metadata={"new": "user_metadata"},
            )

        .. note::

            If you do not want the new metadata to be merged with the existing one, provide a custom ``metadata_merger``.

    Parameters
    ----------
    new_partitions:
        Input partition to be committed.

    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not new_partitions and not metadata and not delete_scope:
        raise ValueError(
            "Need to provide either new data, new metadata or a delete scope. None of it was provided."
        )
    store = lazy_store(store)
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(new_partitions,
                                       metadata_version=metadata_version)

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps_list = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps_list,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
Example #11
def update_dataset_from_ddf(
    ddf,
    store=None,
    dataset_uuid=None,
    table=None,
    secondary_indices=None,
    shuffle=False,
    repartition_ratio=None,
    num_buckets=1,
    sort_partitions_by=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
):
    """
    Update a dataset from a dask.dataframe.


    .. admonition:: Behavior with ``shuffle==False``

        Without ``partition_on``, every dask partition is mapped to a single kartothek partition.

        With ``partition_on``, every dask partition is mapped to N kartothek partitions, where N
        depends on the content of the respective partition, such that every resulting kartothek partition has
        only a single value in the respective ``partition_on`` columns.

    .. admonition:: Behavior with ``shuffle==True``

        ``partition_on`` is mandatory

        Perform a data shuffle to ensure that every primary key will have at most ``num_buckets`` files.

        .. note::
            The number of allowed buckets will have an impact on the required resources and runtime.
            Using a larger number of allowed buckets will usually reduce resource consumption and in some
            cases also improves runtime performance.

        :Example:

            >>> partition_on="primary_key"
            >>> num_buckets=2  # doctest: +SKIP
            primary_key=1/bucket1.parquet
            primary_key=1/bucket2.parquet

    .. note:: This can only be used for datasets with a single table!

    Parameters
    ----------
    ddf: Union[dask.dataframe.DataFrame, None]
        The dask.Dataframe to be used to calculate the new partitions from. If this parameter is `None`, the update pipeline
        will only delete partitions without creating new ones.
    shuffle: bool
        If True and ``partition_on`` is requested, shuffle the data to reduce the number of output partitions.
    repartition_ratio: Optional[Union[int, float]]
        If provided, repartition the dataframe before calculation starts to ``ceil(ddf.npartitions / repartition_ratio)``
    num_buckets: int
        If provided, the output partitioning will have ``num_buckets`` files per primary-key partition.
        This effectively splits up the execution ``num_buckets`` times. Setting this parameter may be helpful when
        scaling.
        This only has an effect if ``shuffle==True``
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    if shuffle and not partition_on:
        raise ValueError(
            "If ``shuffle`` is requested, at least one ``partition_on`` column needs to be provided."
        )
    if ds_factory is not None:
        check_single_table_dataset(ds_factory, table)

    if repartition_ratio and ddf is not None:
        ddf = ddf.repartition(
            npartitions=int(np.ceil(ddf.npartitions / repartition_ratio)))

    if ddf is None:
        mps = [
            parse_input_to_metapartition(
                None, metadata_version=default_metadata_version)
        ]
    else:
        secondary_indices = _ensure_compatible_indices(ds_factory,
                                                       secondary_indices)

        if shuffle and partition_on:
            mps = _update_dask_partitions_shuffle(
                ddf=ddf,
                table=table,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                num_buckets=num_buckets,
                sort_partitions_by=sort_partitions_by,
            )
        else:
            delayed_tasks = ddf.to_delayed()
            delayed_tasks = [{"data": {table: task}} for task in delayed_tasks]
            mps = _update_dask_partitions_one_to_one(
                delayed_tasks=delayed_tasks,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                sort_partitions_by=sort_partitions_by,
            )
    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
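A hedged usage sketch: since the function returns a dask.delayed task (see the final return above), nothing is written until compute() is called. The store factory, dataset UUID, and column names are placeholders.

from functools import partial

import dask.dataframe as dd
import pandas as pd
from storefact import get_store_from_url

store_factory = partial(get_store_from_url, "hfs://my_store")  # placeholder store
df = pd.DataFrame({"primary_key": [1, 1, 2, 2], "value": [0.0, 1.0, 2.0, 3.0]})
ddf = dd.from_pandas(df, npartitions=2)

delayed_update = update_dataset_from_ddf(
    ddf,
    store=store_factory,
    dataset_uuid="example_ddf_uuid",  # placeholder
    table="table",
    partition_on=["primary_key"],
    shuffle=True,      # requires partition_on, see the check above
    num_buckets=2,     # at most two files per primary_key value
)
dataset_metadata = delayed_update.compute()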
Example #12
def update_dataset_from_dataframes(
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]],
    store: Optional[KeyValueStore] = None,
    dataset_uuid: Optional[str] = None,
    delete_scope=None,
    metadata=None,
    df_serializer: Optional[ParquetSerializer] = None,
    metadata_merger: Callable = None,
    central_partition_metadata: bool = True,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    load_dynamic_metadata: bool = True,
    sort_partitions_by: Optional[str] = None,
    secondary_indices: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
) -> DatasetMetadata:
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets where all data fits into memory.

    Parameters
    ----------
    df_list:
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory,
                                                  secondary_indices)
    del secondary_indices

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=inferred_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(
            partial(sort_values_categorical, columns=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if inferred_indices:
        mp = mp.build_indices(inferred_indices)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
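A minimal sketch of an eager update, assuming the dataset identified by the placeholder UUID either already exists or should be created; the store URL and column names are placeholders as well.

import pandas as pd
from storefact import get_store_from_url

store = get_store_from_url("hfs://my_store")  # placeholder store

new_df = pd.DataFrame({"part": [3], "value": [30.0]})

dataset_metadata = update_dataset_from_dataframes(
    [new_df],
    store=store,
    dataset_uuid="example_eager_uuid",  # placeholder
    partition_on=["part"],
)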
Example #13
def test_dataframe_input_to_metapartition():
    with pytest.raises(ValueError):
        parse_input_to_metapartition(tuple([1]))
    with pytest.raises(ValueError):
        parse_input_to_metapartition("abc")