Example #1
def update_dataset_from_delayed(
    delayed_tasks,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    A dask.delayed graph to add and store a list of dictionaries containing
    dataframes to a kartothek dataset in store. The input should be a list
    (or splitter pipeline) containing
    :class:`~kartothek.io_components.metapartition.MetaPartition`. If you want to use this
    pipeline step just for deleting partitions without adding new ones, you
    have to give an empty meta partition as input (``[MetaPartition(None)]``).

    Parameters
    ----------
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory,
                                                   secondary_indices)
    mps = _update_dask_partitions_one_to_one(
        delayed_tasks=delayed_tasks,
        secondary_indices=secondary_indices,
        metadata_version=metadata_version,
        partition_on=partition_on,
        store_factory=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
        sort_partitions_by=sort_partitions_by,
    )

    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example #2
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """
    if load_dynamic_metadata is not True:
        warnings.warn(
            "The keyword `load_dynamic_metadata` has no use and will be removed soon",
            DeprecationWarning,
        )

    if central_partition_metadata is not True:
        warnings.warn(
            "The keyword `central_partition_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    if sort_partitions_by:  # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(
            sort_values_categorical, column=sort_partitions_by
        )

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df,
            metadata_version=metadata_version,
            expected_secondary_indices=secondary_indices,
        )

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, df_serializer=df_serializer, dataset_uuid=dataset_uuid
        )

        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example #3
def update_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    delete_scope: Optional[Iterable[Mapping[str, str]]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
):
    """
    Update a dataset from a dask.dataframe.

    See Also
    --------
    :ref:`mutating_datasets`
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    sort_partitions_by = normalize_arg("sort_partitions_by", sort_partitions_by)
    bucket_by = normalize_arg("bucket_by", bucket_by)
    store = normalize_arg("store", store)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory, secondary_indices)
    del secondary_indices

    if ds_factory is not None:
        check_single_table_dataset(ds_factory, table)

    mps = _write_dataframe_partitions(
        ddf=ddf,
        store=store,
        dataset_uuid=dataset_uuid or ds_factory.dataset_uuid,
        table=table,
        secondary_indices=inferred_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=cast(List[str], partition_on),
        bucket_by=bucket_by,
    )
    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example #4
def update_dataset_from_ddf(
    ddf,
    store=None,
    dataset_uuid=None,
    table=None,
    secondary_indices=None,
    shuffle=False,
    repartition_ratio=None,
    num_buckets=1,
    sort_partitions_by=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
    bucket_by=None,
):
    """
    Update a dataset from a dask.dataframe.


    .. admonition:: Behavior with ``shuffle==False``

        In the case without ``partition_on`` every dask partition is mapped to a single kartothek partition

        In the case with ``partition_on`` every dask partition is mapped to N kartothek partitions, where N
        depends on the content of the respective partition, such that every resulting kartothek partition has
        only a single value in the respective ``partition_on`` columns.

    .. admonition:: Behavior with ``shuffle==True``

        ``partition_on`` is mandatory

        Perform a data shuffle to ensure that every primary key value ends up in at most ``num_buckets`` files.

        .. note::
            The number of allowed buckets will have an impact on the required resources and runtime.
            Using a larger number of allowed buckets will usually reduce resource consumption and in some
            cases also improves runtime performance.

        :Example:

            >>> partition_on="primary_key"
            >>> num_buckets=2  # doctest: +SKIP
            primary_key=1/bucket1.parquet
            primary_key=1/bucket2.parquet

    .. note:: This can only be used for datasets with a single table!

    See also, :ref:`partitioning_dask`.

    Parameters
    ----------
    ddf: Union[dask.dataframe.DataFrame, None]
        The dask.dataframe.DataFrame from which the new partitions are calculated. If this parameter is `None`, the update pipeline
        will only delete partitions without creating new ones.
    shuffle: bool
        If `True` and `partition_on` is requested, shuffle the data to reduce the number of output partitions.

        See also, :ref:`shuffling`.

        .. warning::

            Dask uses a heuristic to determine how data is shuffled and there are two options, `partd` for local disk shuffling and `tasks` for distributed shuffling using a task graph. If there is no :class:`distributed.Client` in the context and the option is not set explicitly, dask will choose `partd` which may cause data loss when the graph is executed on a distributed cluster.

            Therefore, we recommend to specify the dask shuffle method explicitly, e.g. by using a context manager.

            .. code::

                with dask.config.set(shuffle='tasks'):
                    graph = update_dataset_from_ddf(...)
                graph.compute()

    repartition_ratio: Optional[Union[int, float]]
        If provided, repartition the dataframe before calculation starts to ``ceil(ddf.npartitions / repartition_ratio)``
    num_buckets: int
        If provided, the output partitioning will have ``num_buckets`` files per primary key partition.
        This effectively splits up the execution ``num_buckets`` times. Setting this parameter may be helpful when
        scaling.
        This only has an effect if ``shuffle==True``
    bucket_by:
        The subset of columns which should be considered for bucketing.

        This parameter ensures that groups of the given subset are never split
        across buckets within a given partition.

        Without specifying this the buckets will be created randomly.

        This only has an effect if ``shuffle==True``

        .. admonition:: Secondary indices

            This parameter has a strong effect on the performance of secondary
            indices. Since it guarantees that a given tuple of the subset will
            be entirely put into the same file you can build efficient indices
            with this approach.

        .. note::

            Only columns with hashable data types may be used here.
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    if shuffle and not partition_on:
        raise ValueError(
            "If ``shuffle`` is requested, at least one ``partition_on`` column needs to be provided."
        )
    if ds_factory is not None:
        check_single_table_dataset(ds_factory, table)

    if repartition_ratio and ddf is not None:
        ddf = ddf.repartition(
            npartitions=int(np.ceil(ddf.npartitions / repartition_ratio)))

    if ddf is None:
        mps = [
            parse_input_to_metapartition(
                None, metadata_version=default_metadata_version)
        ]
    else:
        secondary_indices = _ensure_compatible_indices(ds_factory,
                                                       secondary_indices)

        if shuffle and partition_on:
            mps = _update_dask_partitions_shuffle(
                ddf=ddf,
                table=table,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                num_buckets=num_buckets,
                sort_partitions_by=sort_partitions_by,
                bucket_by=bucket_by,
            )
        else:
            delayed_tasks = ddf.to_delayed()
            delayed_tasks = [{"data": {table: task}} for task in delayed_tasks]
            mps = _update_dask_partitions_one_to_one(
                delayed_tasks=delayed_tasks,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                sort_partitions_by=sort_partitions_by,
            )
    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
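To illustrate the shuffle behavior and the shuffle warning from the docstring above, here is a hedged sketch that pins the dask shuffle method explicitly, as the warning recommends; the store factory, dataset UUID, table and column names are hypothetical.

from functools import partial

import dask
import dask.dataframe as dd
import pandas as pd
import storefact

from kartothek.io.dask.dataframe import update_dataset_from_ddf

store_factory = partial(storefact.get_store_from_url, url="hfs://my_store")

df = pd.DataFrame(
    {"primary_key": [1, 1, 2, 2], "sort_col": [4, 3, 2, 1], "payload": list("abcd")}
)
ddf = dd.from_pandas(df, npartitions=2)

# Pin the dask shuffle method explicitly, as the docstring warning recommends.
with dask.config.set(shuffle="tasks"):
    graph = update_dataset_from_ddf(
        ddf,
        store=store_factory,
        dataset_uuid="my_dataset",
        table="table",
        partition_on=["primary_key"],   # mandatory when shuffle=True
        shuffle=True,
        num_buckets=2,                  # at most two files per primary_key value
        bucket_by=["payload"],          # rows with equal payload stay in one bucket
        sort_partitions_by="sort_col",
    )
graph.compute()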
Example #5
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
    table_name: str = SINGLE_TABLE,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    if sort_partitions_by:  # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(
            sort_values_categorical, columns=sort_partitions_by
        )

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df, metadata_version=metadata_version, table_name=table_name,
        )

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, df_serializer=df_serializer, dataset_uuid=dataset_uuid
        )

        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example #6
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_generator: Iterable[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        load_dynamic_metadata=load_dynamic_metadata,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory,
                                                   secondary_indices)

    if sort_partitions_by:  # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(sort_values_categorical,
                                        column=sort_partitions_by)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df, metadata_version=default_metadata_version)

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store,
            df_serializer=df_serializer,
            dataset_uuid=dataset_uuid,
            store_metadata=not central_partition_metadata,
        )

        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example #7
def write_single_partition(
    store=None,
    dataset_uuid=None,
    data=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
    secondary_indices=None,
):
    """
    Write the parquet file(s) for a single partition. This will **not** update the dataset header and can therefore
    be used for highly concurrent dataset writes.

    For datasets with explicit partitions, the dataset header can be updated by calling
    :func:`kartothek.io.eager.commit_dataset` with the output of this function.

    .. note::

        It is highly recommended to use the full pipelines whenever possible. This functionality should be
        used with caution and should only be necessary in cases where traditional pipeline scheduling is not an
        option.

    .. note::

        This function requires an existing dataset metadata file and the schemas for the tables to be present.
        Either you have ensured that the dataset always exists through some other means, or you use
        :func:`create_empty_dataset_header` at the start of your computation to ensure the basic dataset
        metadata is there.

    Parameters
    ----------
    data: Dict
        The input is defined according to :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`

    Returns
    -------
    An empty :class:`~kartothek.io_components.metapartition.MetaPartition` referencing the new files
    """
    if metadata is not None:
        warnings.warn(
            "The keyword `metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if overwrite is not False:
        warnings.warn(
            "The keyword `overwrite` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if metadata_merger is not None:
        warnings.warn(
            "The keyword `metadata_merger` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if data is None:
        raise TypeError("The parameter `data` is not optional")
    _, ds_metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        ds_factory=factory,
        default_metadata_version=metadata_version,
        partition_on=partition_on,
    )

    mp = parse_input_to_metapartition(obj=data,
                                      metadata_version=ds_metadata_version)
    if partition_on:
        mp = mp.partition_on(partition_on)

    if secondary_indices:
        mp = mp.build_indices(columns=secondary_indices)

    mp = mp.validate_schema_compatible(dataset_uuid=dataset_uuid, store=store)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)
    return mp
Example #8
def update_dataset_from_dataframes(
    df_list,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """
    if load_dynamic_metadata is not True:
        warnings.warn(
            "The keyword `load_dynamic_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if central_partition_metadata is not True:
        warnings.warn(
            "The keyword `central_partition_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory,
                                                   secondary_indices)

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=secondary_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(
            partial(sort_values_categorical, column=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if secondary_indices:
        mp = mp.build_indices(secondary_indices)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example #9
def commit_dataset(
    store=None,
    dataset_uuid=None,
    new_partitions=NoDefault(),
    output_dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
    secondary_indices=None,
):
    """
    Update an existing dataset with new, already written partitions. This should be used in combination with
    :func:`~kartothek.io.eager.write_single_partition` and :func:`~kartothek.io.eager.create_empty_dataset_header`.

    .. note::

        It is highly recommended to use the full pipelines whenever possible. This functionality should be
        used with caution and should only be necessary in cases where traditional pipeline scheduling is not an
        option.

    Example:

        .. code::

            import storefact
            import pandas as pd
            from functools import partial
            from kartothek.io.eager import write_single_partition
            from kartothek.io.eager import commit_dataset

            store = partial(storefact.get_store_from_url, url="hfs://my_store")

            new_data={
                "data": {
                    "table_1": pd.DataFrame({'column': [1, 2]}),
                    "table_1": pd.DataFrame({'other_column': ['a', 'b']}),
                }
            }
            # The partition writing can be done concurrently and distributed if wanted.
            # Only the information about what partitions have been written is required for the commit.
            new_partitions = [
                write_single_partition(
                    store=store,
                    dataset_uuid='dataset_uuid',
                    data=new_data
                )
            ]

            new_dataset = commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                new_partitions=new_partitions
            )

    Parameters
    ----------
    new_partitions: List[kartothek.io_components.metapartition.MetaPartition]
        Input partitions to be committed.

    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if isinstance(new_partitions, NoDefault):
        raise TypeError("The parameter `new_partitions` is not optional")
    store = _make_callable(store)
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(new_partitions,
                                       metadata_version=metadata_version)

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
Example #10
def commit_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    new_partitions: Optional[Iterable[MetaPartition]] = None,
    output_dataset_uuid: Optional[str] = None,
    delete_scope: Optional[Iterable[Dict[str, Any]]] = None,
    metadata: Dict = None,
    df_serializer: DataFrameSerializer = None,
    metadata_merger: Callable[[List[Dict]], Dict] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[Iterable[str]] = None,
    factory: Optional[DatasetFactory] = None,
    secondary_indices: Optional[Iterable[str]] = None,
):
    """
    Commit new state to an existing dataset. This can be used for three distinct operations:

    1. Add previously written partitions to this dataset

        If for some reason the existing pipelines are not sufficient and you need more control, you can write the files outside of a kartothek pipeline and commit them whenever you choose to.

        This should be used in combination with
        :func:`~kartothek.io.eager.write_single_partition` and :func:`~kartothek.io.eager.create_empty_dataset_header`.

        .. code::

            import pandas as pd
            from kartothek.io.eager import write_single_partition, commit_dataset

            store = "hfs://my_store"

            # The partition writing can be done concurrently and distributed if wanted.
            # Only the information about what partitions have been written is required for the commit.
            new_partitions = [
                write_single_partition(
                    store=store,
                    dataset_uuid='dataset_uuid',
                    data=pd.DataFrame({'column': [1, 2]}),
                )
            ]

            new_dataset = commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                new_partitions=new_partitions,
            )

    2. Simple delete of partitions

        If you want to remove some partitions, this is one of the simplest ways of doing so. By simply providing a delete_scope, this removes the references to these files in an atomic commit.

        .. code::

            commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                delete_scope=[
                    {
                        "partition_column": "part_value_to_be_removed"
                    }
                ],
            )

    3. Add additional metadata

        To add new metadata to an existing dataset

        .. code::

            commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                metadata={"new": "user_metadata"},
            )

        .. note::

            If you do not want the new metadata to be merged with the existing one, provide a custom ``metadata_merger``.

    Parameters
    ----------
    new_partitions:
        Input partitions to be committed.

    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not new_partitions and not metadata and not delete_scope:
        raise ValueError(
            "Need to provide either new data, new metadata or a delete scope. None of it was provided."
        )
    store = lazy_store(store)
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(new_partitions,
                                       metadata_version=metadata_version)

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps_list = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps_list,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
Example #11
def update_dataset_from_ddf(
    ddf,
    store=None,
    dataset_uuid=None,
    table=None,
    secondary_indices=None,
    shuffle=False,
    repartition_ratio=None,
    num_buckets=1,
    sort_partitions_by=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
):
    """
    Update a dataset from a dask.dataframe.


    .. admonition:: Behavior with ``shuffle==False``

        In the case without ``partition_on`` every dask partition is mapped to a single kartothek partition

        In the case with ``partition_on`` every dask partition is mapped to N kartothek partitions, where N
        depends on the content of the respective partition, such that every resulting kartothek partition has
        only a single value in the respective ``partition_on`` columns.

    .. admonition:: Behavior with ``shuffle==True``

        ``partition_on`` is mandatory

        Perform a data shuffle to ensure that every primary key value ends up in at most ``num_buckets`` files.

        .. note::
            The number of allowed buckets will have an impact on the required resources and runtime.
            Using a larger number of allowed buckets will usually reduce resource consumption and in some
            cases also improves runtime performance.

        :Example:

            >>> partition_on="primary_key"
            >>> num_buckets=2  # doctest: +SKIP
            primary_key=1/bucket1.parquet
            primary_key=1/bucket2.parquet

    .. note:: This can only be used for datasets with a single table!

    Parameters
    ----------
    ddf: Union[dask.dataframe.DataFrame, None]
        The dask.dataframe.DataFrame from which the new partitions are calculated. If this parameter is `None`, the update pipeline
        will only delete partitions without creating new ones.
    shuffle: bool
        If `True` and `partition_on` is requested, shuffle the data to reduce the number of output partitions.
    repartition_ratio: Optional[Union[int, float]]
        If provided, repartition the dataframe before calculation starts to ``ceil(ddf.npartitions / repartition_ratio)``
    num_buckets: int
        If provided, the output partitioning will have ``num_buckets`` files per primary key partition.
        This effectively splits up the execution ``num_buckets`` times. Setting this parameter may be helpful when
        scaling.
        This only has an effect if ``shuffle==True``
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    if shuffle and not partition_on:
        raise ValueError(
            "If ``shuffle`` is requested, at least one ``partition_on`` column needs to be provided."
        )
    if ds_factory is not None:
        check_single_table_dataset(ds_factory, table)

    if repartition_ratio and ddf is not None:
        ddf = ddf.repartition(
            npartitions=int(np.ceil(ddf.npartitions / repartition_ratio)))

    if ddf is None:
        mps = [
            parse_input_to_metapartition(
                None, metadata_version=default_metadata_version)
        ]
    else:
        secondary_indices = _ensure_compatible_indices(ds_factory,
                                                       secondary_indices)

        if shuffle and partition_on:
            mps = _update_dask_partitions_shuffle(
                ddf=ddf,
                table=table,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                num_buckets=num_buckets,
                sort_partitions_by=sort_partitions_by,
            )
        else:
            delayed_tasks = ddf.to_delayed()
            delayed_tasks = [{"data": {table: task}} for task in delayed_tasks]
            mps = _update_dask_partitions_one_to_one(
                delayed_tasks=delayed_tasks,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                sort_partitions_by=sort_partitions_by,
            )
    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example #12
def update_dataset_from_dataframes(
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]],
    store: Optional[KeyValueStore] = None,
    dataset_uuid: Optional[str] = None,
    delete_scope=None,
    metadata=None,
    df_serializer: Optional[ParquetSerializer] = None,
    metadata_merger: Callable = None,
    central_partition_metadata: bool = True,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    load_dynamic_metadata: bool = True,
    sort_partitions_by: Optional[str] = None,
    secondary_indices: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
) -> DatasetMetadata:
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_list:
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory,
                                                  secondary_indices)
    del secondary_indices

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=inferred_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(
            partial(sort_values_categorical, columns=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if inferred_indices:
        mp = mp.build_indices(inferred_indices)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example #13
def update_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    delete_scope: Optional[Iterable[Mapping[str, str]]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
):
    """
    Update a dataset from a dask.dataframe.

    See Also
    --------
    :ref:`mutating_datasets`
    """
    if table is None:
        raise TypeError("The parameter `table` is not optional.")

    # normalization done by normalize_args but mypy doesn't recognize this
    sort_partitions_by = cast(List[str], sort_partitions_by)
    secondary_indices = cast(List[str], secondary_indices)
    bucket_by = cast(List[str], bucket_by)
    partition_on = cast(List[str], partition_on)

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory,
                                                  secondary_indices)
    del secondary_indices

    mp_ser = _write_dataframe_partitions(
        ddf=ddf,
        store=ds_factory.store_factory if ds_factory else store,
        dataset_uuid=dataset_uuid or ds_factory.dataset_uuid,
        table=table,
        secondary_indices=inferred_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=cast(List[str], partition_on),
        bucket_by=bucket_by,
    )

    return mp_ser.reduction(
        chunk=_id,
        aggregate=_commit_update_from_reduction,
        split_every=False,
        token="commit-dataset",
        meta=object,
        aggregate_kwargs={
            "store_factory": store,
            "dataset_uuid": dataset_uuid,
            "ds_factory": ds_factory,
            "delete_scope": delete_scope,
            "metadata": metadata,
            "metadata_merger": metadata_merger,
        },
    )