Code example #1
def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Append data to an existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".


    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove.
    df_serializer:
        Optional DataFrame-to-Parquet serializer.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag containing the compute graph for the append operation; computing it returns the dict of dataset
        metadata objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    # existing_payload is left empty because we do not check against any existing payload here; kartothek handles
    # the compatibility check within each individual dataset.
    existing_payload: Set[str] = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(
        existing_datasets=existing_datasets, ktk_cube_dataset_ids=ktk_cube_dataset_ids
    )

    if remove_conditions is not None:
        remove_metapartitions = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids,
            existing_datasets)
        delete_scopes = {
            k: delete_scope
            for k, (_, _, delete_scope) in remove_metapartitions.items()
        }
    else:
        delete_scopes = {}

    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(_fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
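
A minimal usage sketch for the function above, assuming the cube was already built via kartothek.io.eager_cube.build_cube (referenced in the docstring). The Cube arguments, the column names, the "seed" dataset id, and make_store_factory are illustrative assumptions; only the call signature of append_to_cube_from_bag_internal is taken from the code above.

import dask.bag as db
import pandas as pd

from kartothek.core.cube.cube import Cube

# Illustrative cube spec; dimension/partition column names are assumptions.
cube = Cube(
    dimension_columns=["x"],
    partition_columns=["p"],
    uuid_prefix="my_cube",
)

# Hypothetical helper: any zero-argument callable returning a simplekv KeyValueStore
# that points at the storage holding the already-built cube.
store_factory = make_store_factory()

# One bag element per physical partition to append; each element is a DataFrame
# (or a dict mapping ktk_cube dataset id to DataFrame).
new_rows = db.from_sequence(
    [pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [10.0, 11.0]})],
    npartitions=1,
)

graph = append_to_cube_from_bag_internal(
    data=new_rows,
    cube=cube,
    store=store_factory,
    ktk_cube_dataset_ids=["seed"],  # assumption: appending only to the seed dataset
    metadata=None,
)

# Computing the single-element bag yields the dict of updated dataset metadata objects.
(updated_datasets,) = graph.compute()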
Code example #2
def remove_partitions(cube,
                      store,
                      conditions=None,
                      ktk_cube_dataset_ids=None,
                      metadata=None):
    """
    Remove given partition range from cube using a transaction.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[Union[str, bytes]], str, bytes]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for the datasets, optional. Only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for ktk_cube_dataset_id, (ds, mp, delete_scope) in prepare_metapartitions_for_removal_action(
        cube=cube,
        store=store_instance,
        conditions=conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(
            ds, with_schema=True, store=store_factory
        )

        existing_datasets[ktk_cube_dataset_id] = update_dataset_from_partitions(
            mp,
            store_factory=store_factory,
            dataset_uuid=ds.uuid,
            ds_factory=ds_factory,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            metadata_merger=None,
            delete_scope=delete_scope,
        )

    return existing_datasets
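
A short usage sketch for remove_partitions above. The condition builder C (assumed to live in kartothek.core.cube.conditions), the Cube arguments, and make_store_factory are illustrative assumptions; the call itself follows the documented signature.

from kartothek.core.cube.cube import Cube
from kartothek.core.cube.conditions import C  # assumed location of the condition builder

cube = Cube(
    dimension_columns=["x"],
    partition_columns=["p"],
    uuid_prefix="my_cube",
)

# Hypothetical helper returning a simplekv KeyValueStore (a store instance works as well).
store_factory = make_store_factory()

# Remove every physical partition where the partition column "p" equals 2020,
# across all datasets of the cube (ktk_cube_dataset_ids defaults to "all").
updated_datasets = remove_partitions(
    cube=cube,
    store=store_factory,
    conditions=(C("p") == 2020),
)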
Code example #3
def remove_partitions(
    cube: Cube,
    store: Union[simplekv.KeyValueStore, StoreFactory],
    conditions: Union[None, Condition, Sequence[Condition], Conjunction] = None,
    ktk_cube_dataset_ids: Optional[Union[Sequence[str], str]] = None,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """
    Remove given partition range from cube using a transaction.

    Remove the partitions selected by ``conditions``. If no ``conditions`` are given,
    remove all partitions. For each considered dataset, only the subset of
    ``conditions`` that refers to the partition columns of the respective dataset
    is used. In particular, a dataset that is not partitioned at all is always considered
    selected by ``conditions``.

    Parameters
    ----------
    cube
        Cube spec.
    store
        Store.
    conditions
        Select the partitions to be removed. Must be a condition only on partition columns.
    ktk_cube_dataset_ids
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata
        Metadata for the datasets, optional. Only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for ktk_cube_dataset_id, (ds, mp, delete_scope) in prepare_metapartitions_for_removal_action(
        cube=cube,
        store=store_instance,
        conditions=conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(
            ds, with_schema=True, store=store_factory
        )

        existing_datasets[ktk_cube_dataset_id] = update_dataset_from_partitions(
            mp,
            store_factory=store_factory,
            dataset_uuid=ds.uuid,
            ds_factory=ds_factory,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            metadata_merger=None,
            delete_scope=delete_scope,
        )

    return existing_datasets
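
The typed variant is called in the same way; since ``conditions`` accepts a sequence, several partition-column conditions can be passed together, and the removal can be restricted to specific datasets. The dataset id "enrich" and the reuse of cube and store_factory from the earlier sketch are assumptions for illustration.

from kartothek.core.cube.conditions import C  # assumed location of the condition builder

# Remove the partitions with 2019 <= p <= 2020, but only from one (assumed) non-seed dataset.
updated_datasets = remove_partitions(
    cube=cube,               # cube spec as constructed in the earlier sketch
    store=store_factory,     # same hypothetical store factory
    conditions=[C("p") >= 2019, C("p") <= 2020],
    ktk_cube_dataset_ids=["enrich"],
)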