Example 1
def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove.
    df_serializer:
        Optional dataframe-to-Parquet serializer.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag containing the compute graph that appends to the cube and yields the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    # existing_payload is set to empty because we're not checking against any existing payload. ktk will account for the
    # compat check within 1 dataset
    existing_payload: Set[str] = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=ktk_cube_dataset_ids)

    if remove_conditions is not None:
        remove_metapartitions = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids,
            existing_datasets)
        delete_scopes = {
            k: delete_scope
            for k, (_, _, delete_scope) in remove_metapartitions.items()
        }
    else:
        delete_scopes = {}

    # Per-element preprocessing: split the user input per dataset, validate and
    # fill dataset IDs, then convert each dataframe into ktk-compatible parts.
    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(_fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
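
A minimal usage sketch for the function above, assuming the cube was already built (e.g. via build_cube_from_bag) so that discover_datasets succeeds. The cube layout, the storefact-backed store factory, and the single-element bag are illustrative assumptions, not part of the original code:

import dask.bag as db
import pandas as pd
from storefact import get_store_from_url  # assumed storage backend
from kartothek.core.cube.cube import Cube

# Hypothetical cube layout and store factory; adjust to your own setup.
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")

def store_factory():
    return get_store_from_url("hfs:///tmp/cube_data")

# One bag element per chunk of new rows; each element maps dataset IDs to dataframes.
new_rows = db.from_sequence(
    [{"seed": pd.DataFrame({"x": [3], "p": [0], "v": [42.0]})}],
    npartitions=1,
)

# Build the append graph and execute it; the resulting bag holds a single dict
# of dataset metadata objects, as described in the docstring above.
graph = append_to_cube_from_bag_internal(
    data=new_rows,
    cube=cube,
    store=store_factory,
    ktk_cube_dataset_ids=["seed"],
    metadata=None,
)
(datasets,) = graph.compute()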
Example 2
def append_to_cube(data, cube, store, metadata=None):
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are overwritten), you should use
        :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)

    existing_datasets = discover_datasets(cube, store)
    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=set(data.keys()))

    # do all data preparation before writing anything
    # existing_payload is set to empty because we're not checking against any existing payload. ktk will account for the
    # compat check within 1 dataset
    data = _prepare_data_for_ktk_all(data=data,
                                     cube=cube,
                                     existing_payload=set(),
                                     partition_on=partition_on)

    # update_dataset_from_dataframes requires a store factory, so create one
    # if not provided
    if not callable(store):

        def store_factory():
            return store

    else:
        store_factory = store

    updated_datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        updated_datasets[ktk_cube_dataset_id] = update_dataset_from_dataframes(
            store=store_factory,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            df_list=part,
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
        )

    return apply_postwrite_checks(
        datasets=updated_datasets,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
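
A usage sketch for this eager variant. The cube definition and the storefact store are illustrative assumptions, and the cube is assumed to have been created beforehand with build_cube:

import pandas as pd
from storefact import get_store_from_url  # assumed storage backend
from kartothek.core.cube.cube import Cube

# Hypothetical cube that was previously created with build_cube.
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")
store = get_store_from_url("hfs:///tmp/cube_data")

# A single dataframe is treated as the seed dataset (see the docstring above);
# the returned dict maps dataset IDs to their updated DatasetMetadata objects.
datasets = append_to_cube(
    data=pd.DataFrame({"x": [3, 4], "p": [0, 0], "v": [1.0, 2.0]}),
    cube=cube,
    store=store,
)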
Example 3
def append_to_cube_from_bag_internal(data, cube, store, ktk_cube_dataset_ids,
                                     metadata):
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are overwritten), you should use
        :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Callable[[], simplekv.KeyValueStore]
        Store to which the data should be written.
    ktk_cube_dataset_ids: Optional[Iterable[str]]
        Datasets that will be written, must be specified in advance.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag containing the compute graph that appends to the cube and yields the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets(cube, store)
    # existing_payload is set to empty because we're not checking against any existing payload. ktk will account for the
    # compat check within 1 dataset
    existing_payload = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=ktk_cube_dataset_ids)

    # Per-element preprocessing: split the user input per dataset, validate the
    # dataset IDs, then convert each dataframe into ktk-compatible parts.
    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
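
This older variant exposes neither remove_conditions nor df_serializer, so the overwrite mask is controlled by removing partitions beforehand, as the hint in its docstring suggests. A rough sketch of that pattern follows; the remove_partitions import path, the condition helper C, and the cube/store definitions are assumptions for illustration:

import dask.bag as db
import pandas as pd
from storefact import get_store_from_url  # assumed storage backend
from kartothek.core.cube.conditions import C
from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import remove_partitions

cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")

def store_factory():
    return get_store_from_url("hfs:///tmp/cube_data")

# Drop the physical partitions that are about to be rewritten ...
remove_partitions(
    cube=cube,
    store=store_factory,
    conditions=C("p") == 0,
    ktk_cube_dataset_ids=["seed"],
)

# ... then append the replacement data through the bag pipeline above.
replacement = db.from_sequence(
    [{"seed": pd.DataFrame({"x": [3], "p": [0], "v": [99.0]})}],
    npartitions=1,
)
graph = append_to_cube_from_bag_internal(
    data=replacement,
    cube=cube,
    store=store_factory,
    ktk_cube_dataset_ids=["seed"],
    metadata=None,
)
(datasets,) = graph.compute()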