Example 1
def _multiplex_store_dataset_from_partitions_flat(mpss, cube, metadata, update,
                                                  store, existing_datasets):
    dct = defaultdict(list)
    for sublist in mpss:
        for mp in sublist:
            for k, v in mp.items():
                dct[k].append(v)

    result = {}
    for k, v in dct.items():
        if update:
            ds_factory = metadata_factory_from_dataset(existing_datasets[k],
                                                       with_schema=True,
                                                       store=store)
            result[k] = update_dataset_from_partitions(
                v,
                dataset_uuid=cube.ktk_dataset_uuid(k),
                delete_scope=[],
                ds_factory=ds_factory,
                metadata=metadata[k],
                metadata_merger=None,
                store_factory=store,
            )
        else:
            result[k] = store_dataset_from_partitions(
                v,
                dataset_metadata=metadata[k],
                dataset_uuid=cube.ktk_dataset_uuid(k),
                metadata_merger=None,
                metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
                store=store,
            )

    # list required for dask.bag
    return [result]
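
The inner loops above group the multiplexed metapartitions by ktk_cube dataset ID before each dataset is stored or updated. A minimal, self-contained sketch of that flattening step, using placeholder strings instead of real metapartitions (the IDs "seed" and "enrich" are made up for illustration):

from collections import defaultdict

# Two upstream tasks, each yielding a dict that maps ktk_cube dataset ID -> metapartition.
mpss = [
    [{"seed": "mp-a", "enrich": "mp-b"}],
    [{"seed": "mp-c"}],
]

dct = defaultdict(list)
for sublist in mpss:
    for mp_dict in sublist:
        for k, v in mp_dict.items():
            dct[k].append(v)

print(dict(dct))  # {'seed': ['mp-a', 'mp-c'], 'enrich': ['mp-b']}
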
Example 2
def remove_partitions(cube,
                      store,
                      conditions=None,
                      ktk_cube_dataset_ids=None,
                      metadata=None):
    """
    Remove given partition range from cube using a transaction.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[Union[Str, Bytes]], Union[Str, Bytes]]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for all datasets, optional. Only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for (
            ktk_cube_dataset_id,
        (ds, mp, delete_scope),
    ) in prepare_metapartitions_for_removal_action(
            cube=cube,
            store=store_instance,
            conditions=conditions,
            ktk_cube_dataset_ids=ktk_cube_dataset_ids,
            existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(ds,
                                                   with_schema=True,
                                                   store=store_factory)

        existing_datasets[
            ktk_cube_dataset_id] = update_dataset_from_partitions(
                mp,
                store_factory=store_factory,
                dataset_uuid=ds.uuid,
                ds_factory=ds_factory,
                metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id,
                                              metadata),
                metadata_merger=None,
                delete_scope=delete_scope,
            )

    return existing_datasets
Example 3
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """
    if load_dynamic_metadata is not True:
        warnings.warn(
            "The keyword `load_dynamic_metadata` has no use and will be removed soon",
            DeprecationWarning,
        )

    if central_partition_metadata is not True:
        warnings.warn(
            "The keyword `central_partition_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    if sort_partitions_by:  # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(
            sort_values_categorical, column=sort_partitions_by
        )

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df,
            metadata_version=metadata_version,
            expected_secondary_indices=secondary_indices,
        )

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, df_serializer=df_serializer, dataset_uuid=dataset_uuid
        )

        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
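
A hedged usage sketch for the iterator variant, assuming a dataset "my_dataset" already exists in the store and was partitioned on the column "part"; the store URL, dataset UUID, and column names are placeholders:

from functools import partial

import pandas as pd
import storefact

store_factory = partial(storefact.get_store_from_url, url="hfs://my_store")

def df_generator():
    # Yield one chunk at a time so the whole update never has to fit into memory.
    for chunk in range(3):
        yield pd.DataFrame({"part": [chunk], "value": [chunk * 10]})

dm = update_dataset_from_dataframes__iter(
    df_generator(),
    store=store_factory,
    dataset_uuid="my_dataset",
    partition_on=["part"],
)
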
Example 4
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_generator: Iterable[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        load_dynamic_metadata=load_dynamic_metadata,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory,
                                                   secondary_indices)

    if sort_partitions_by:  # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(sort_values_categorical,
                                        column=sort_partitions_by)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df, metadata_version=default_metadata_version)

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store,
            df_serializer=df_serializer,
            dataset_uuid=dataset_uuid,
            store_metadata=not central_partition_metadata,
        )

        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example 5
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
    table_name: str = SINGLE_TABLE,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    if sort_partitions_by:  # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(
            sort_values_categorical, columns=sort_partitions_by
        )

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df, metadata_version=metadata_version, table_name=table_name,
        )

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, df_serializer=df_serializer, dataset_uuid=dataset_uuid
        )

        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example 6
def update_dataset_from_dataframes(
    df_list,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """
    if load_dynamic_metadata is not True:
        warnings.warn(
            "The keyword `load_dynamic_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if central_partition_metadata is not True:
        warnings.warn(
            "The keyword `central_partition_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory,
                                                   secondary_indices)

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=secondary_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(
            partial(sort_values_categorical, column=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if secondary_indices:
        mp = mp.build_indices(secondary_indices)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
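
A hedged sketch of the eager variant, again assuming an existing dataset "my_dataset" partitioned on "part"; it appends one new dataframe and drops an old partition in the same transactional commit (store URL and values are placeholders):

from functools import partial

import pandas as pd
import storefact

store_factory = partial(storefact.get_store_from_url, url="hfs://my_store")

dm = update_dataset_from_dataframes(
    [pd.DataFrame({"part": [4], "value": [40]})],  # new data to append
    store=store_factory,
    dataset_uuid="my_dataset",
    partition_on=["part"],
    delete_scope=[{"part": 1}],  # drop the references to partition part == 1 in the same commit
)
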
Example 7
def commit_dataset(
    store=None,
    dataset_uuid=None,
    new_partitions=NoDefault(),
    output_dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
    secondary_indices=None,
):
    """
    Update an existing dataset with new, already written partitions. This should be used in combination with
    :func:`~kartothek.io.eager.write_single_partition` and :func:`~kartothek.io.eager.create_empty_dataset_header`.

    .. note::

        It is highly recommended to use the full pipelines whenever possible. This functionality should be
        used with caution and should only be necessary in cases where traditional pipeline scheduling is not an
        option.

    Example:

        .. code::

            import storefact
            import pandas as pd
            from functools import partial
            from kartothek.io.eager import write_single_partition
            from kartothek.io.eager import commit_dataset

            store = partial(storefact.get_store_from_url, url="hfs://my_store")

            new_data={
                "data": {
                    "table_1": pd.DataFrame({'column': [1, 2]}),
                    "table_1": pd.DataFrame({'other_column': ['a', 'b']}),
                }
            }
            # The partition writing can be done concurrently and distributed if wanted.
            # Only the information about what partitions have been written is required for the commit.
            new_partitions = [
                write_single_partition(
                    store=store,
                    dataset_uuid='dataset_uuid',
                    data=new_data
                )
            ]

            new_dataset = commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                new_partitions=new_partitions
            )

    Parameters
    ----------
    new_partitions: List[kartothek.io_components.metapartition.MetaPartition]
        Input partitions to be committed.

    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if isinstance(new_partitions, NoDefault):
        raise TypeError("The parameter `new_partitions` is not optional")
    store = _make_callable(store)
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(new_partitions,
                                       metadata_version=metadata_version)

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
Example 8
def commit_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    new_partitions: Optional[Iterable[MetaPartition]] = None,
    output_dataset_uuid: Optional[str] = None,
    delete_scope: Optional[Iterable[Dict[str, Any]]] = None,
    metadata: Dict = None,
    df_serializer: DataFrameSerializer = None,
    metadata_merger: Callable[[List[Dict]], Dict] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[Iterable[str]] = None,
    factory: Optional[DatasetFactory] = None,
    secondary_indices: Optional[Iterable[str]] = None,
):
    """
    Commit new state to an existing dataset. This can be used for three distinct operations

    1. Add previously written partitions to this dataset

        If for some reason the existing pipelines are not sufficient and you need more control, you can write the files outside of a kartothek pipeline and commit them whenever you choose to.

        This should be used in combination with
        :func:`~kartothek.io.eager.write_single_partition` and :func:`~kartothek.io.eager.create_empty_dataset_header`.

        .. code::

            import pandas as pd
            from kartothek.io.eager import write_single_partition, commit_dataset

            store = "hfs://my_store"

            # The partition writing can be done concurrently and distributed if wanted.
            # Only the information about what partitions have been written is required for the commit.
            new_partitions = [
                write_single_partition(
                    store=store,
                    dataset_uuid='dataset_uuid',
                    data=pd.DataFrame({'column': [1, 2]}),
                )
            ]

            new_dataset = commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                new_partitions=new_partitions,
            )

    2. Simple delete of partitions

        If you want to remove some partitions, this is one of the simplest ways of doing so. Simply providing a delete_scope removes the references to these files in an atomic commit.

        .. code::

            commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                delete_scope=[
                    {
                        "partition_column": "part_value_to_be_removed"
                    }
                ],
            )

    3. Add additional metadata

        To add new metadata to an existing dataset

        .. code::

            commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                metadata={"new": "user_metadata"},
            )

        .. note::

            If you do not want the new metadata to be merged with the existing one, provide a custom ``metadata_merger``.

    Parameters
    ----------
    new_partitions:
        Input partitions to be committed.

    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not new_partitions and not metadata and not delete_scope:
        raise ValueError(
            "Need to provide either new data, new metadata or a delete scope. None of it was provided."
        )
    store = lazy_store(store)
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(new_partitions,
                                       metadata_version=metadata_version)

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps_list = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps_list,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
Example 9
def update_dataset_from_dataframes(
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]],
    store: Optional[KeyValueStore] = None,
    dataset_uuid: Optional[str] = None,
    delete_scope=None,
    metadata=None,
    df_serializer: Optional[ParquetSerializer] = None,
    metadata_merger: Callable = None,
    central_partition_metadata: bool = True,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    load_dynamic_metadata: bool = True,
    sort_partitions_by: Optional[str] = None,
    secondary_indices: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
) -> DatasetMetadata:
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_list:
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory,
                                                  secondary_indices)
    del secondary_indices

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=inferred_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(
            partial(sort_values_categorical, columns=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if inferred_indices:
        mp = mp.build_indices(inferred_indices)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example 10
def _commit_update_from_reduction(df_mps, **kwargs):
    partitions = pd.Series(df_mps.values.flatten()).dropna()
    return update_dataset_from_partitions(
        partition_list=partitions,
        **kwargs,
    )
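
The reduction result arrives as a DataFrame whose cells hold metapartitions, with NaN where a task produced nothing; flattening and dropping the gaps yields the plain partition list that update_dataset_from_partitions expects. A toy sketch with strings standing in for metapartitions:

import numpy as np
import pandas as pd

df_mps = pd.DataFrame({"a": ["mp-1", np.nan], "b": ["mp-2", "mp-3"]})

partitions = pd.Series(df_mps.values.flatten()).dropna()
print(list(partitions))  # ['mp-1', 'mp-2', 'mp-3']
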
Example 11
def remove_partitions(
    cube: Cube,
    store: Union[simplekv.KeyValueStore, StoreFactory],
    conditions: Union[None, Condition, Sequence[Condition],
                      Conjunction] = None,
    ktk_cube_dataset_ids: Optional[Union[Sequence[str], str]] = None,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """
    Remove given partition range from cube using a transaction.

    Remove the partitions selected by ``conditions``. If no ``conditions`` are given,
    remove all partitions. For each considered dataset, only the subset of
    ``conditions`` that refers to the partition columns of the respective dataset
    is used. In particular, a dataset that is not partitioned at all is always considered
    selected by ``conditions``.

    Parameters
    ----------
    cube
        Cube spec.
    store
        Store.
    conditions
        Select the partitions to be removed. Must be a condition only on partition columns.
    ktk_cube_dataset_ids
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata
        Metadata for all datasets, optional. Only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for (
            ktk_cube_dataset_id,
        (ds, mp, delete_scope),
    ) in prepare_metapartitions_for_removal_action(
            cube=cube,
            store=store_instance,
            conditions=conditions,
            ktk_cube_dataset_ids=ktk_cube_dataset_ids,
            existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(ds,
                                                   with_schema=True,
                                                   store=store_factory)

        existing_datasets[
            ktk_cube_dataset_id] = update_dataset_from_partitions(
                mp,
                store_factory=store_factory,
                dataset_uuid=ds.uuid,
                ds_factory=ds_factory,
                metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id,
                                              metadata),
                metadata_merger=None,
                delete_scope=delete_scope,
            )

    return existing_datasets
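
A hedged usage sketch, assuming a cube with partition column "day" already exists under the UUID prefix "my_cube"; the store URL, cube spec, and condition value are placeholders:

from functools import partial

import storefact
from kartothek.core.cube.conditions import C
from kartothek.core.cube.cube import Cube

store_factory = partial(storefact.get_store_from_url, url="hfs://my_store")
cube = Cube(
    dimension_columns=["x"],
    partition_columns=["day"],
    uuid_prefix="my_cube",
)

# Remove only the partitions for a single day. Per the docstring above, datasets
# that are not partitioned at all are considered fully selected by the conditions.
datasets = remove_partitions(
    cube=cube,
    store=store_factory,
    conditions=C("day") == "2021-01-01",
)
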