def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Build the dask graph that appends data to an already existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated,
        the old data is treated as "removed".

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance. A falsy value is treated as "no datasets".
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove. ``None`` means that no removal is prepared.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube returning the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)

    # Work on a sorted list of dataset IDs from here on.
    ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids) if ktk_cube_dataset_ids else []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    # Deliberately empty: nothing is checked against an existing payload here.
    # ktk accounts for the compatibility check within a single dataset.
    existing_payload: Set[str] = set()

    partition_on = {
        dataset_id: dataset.partition_keys
        for dataset_id, dataset in existing_datasets.items()
    }

    check_existing_datasets(
        existing_datasets=existing_datasets,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )

    # Only the delete scope of every prepared removal is needed for the write step.
    delete_scopes = {}
    if remove_conditions is not None:
        removal_plan = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids, existing_datasets
        )
        delete_scopes = {
            dataset_id: scope for dataset_id, (_, _, scope) in removal_plan.items()
        }

    # Per-element preparation pipeline, one named stage per mapping step.
    multiplexed = data.map(multiplex_user_input, cube=cube)
    checked = multiplexed.map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids
    )
    filled = checked.map(_fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
    prepared = filled.map(
        _multiplex_prepare_data_for_ktk,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=partition_on,
    )

    dataset_metadata = {
        dataset_id: prepare_ktk_metadata(cube, dataset_id, metadata)
        for dataset_id in ktk_cube_dataset_ids
    }

    stored = _store_bag_as_dataset_parallel(
        bag=prepared,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=dataset_metadata,
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    return stored.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def remove_partitions(cube, store, conditions=None, ktk_cube_dataset_ids=None, metadata=None):
    """
    Remove given partition range from cube using a transaction.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store, either as an instance or as a factory returning one.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[Union[Str, Bytes]], Union[Str, Bytes]]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Default to "all".
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. Only given keys are updated/replaced. Deletion of metadata keys is
        not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    # Both a store instance and a store factory are needed below, whichever form was passed in.
    if not callable(store):
        store_instance = store

        def store_factory():
            return store

    else:
        store_factory = store
        store_instance = store()

    existing_datasets = discover_datasets(cube, store)

    removal_plan = prepare_metapartitions_for_removal_action(
        cube=cube,
        store=store_instance,
        conditions=conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        existing_datasets=existing_datasets,
    )

    for dataset_id, (dataset, metapartition, scope) in removal_plan.items():
        dataset_uuid = dataset.uuid

        stored_metapartition = metapartition.store_dataframes(
            store=store_instance,
            dataset_uuid=dataset_uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )
        factory = metadata_factory_from_dataset(
            dataset, with_schema=True, store=store_factory
        )

        # Replace the discovered entry with the dataset returned by the update.
        existing_datasets[dataset_id] = update_dataset_from_partitions(
            stored_metapartition,
            store_factory=store_factory,
            dataset_uuid=dataset_uuid,
            ds_factory=factory,
            metadata=prepare_ktk_metadata(cube, dataset_id, metadata),
            metadata_merger=None,
            delete_scope=scope,
        )

    return existing_datasets
def remove_partitions(
    cube: Cube,
    store: Union[simplekv.KeyValueStore, StoreFactory],
    conditions: Union[None, Condition, Sequence[Condition], Conjunction] = None,
    ktk_cube_dataset_ids: Optional[Union[Sequence[str], str]] = None,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """
    Remove given partition range from cube using a transaction.

    The partitions selected by ``conditions`` are removed; without ``conditions`` all partitions are removed.
    For each considered dataset, only the subset of ``conditions`` that refers to the partition columns of
    that dataset is used. In particular, a dataset that is not partitioned at all is always considered
    selected by ``conditions``.

    Parameters
    ----------
    cube
        Cube spec.
    store
        Store, given either as an instance or as a factory.
    conditions
        Select the partitions to be removed. Must be a condition only on partition columns.
    ktk_cube_dataset_ids
        Ktk_cube dataset IDs to apply the remove action to, optional. Default to "all".
    metadata
        Metadata for every dataset, optional. Only given keys are updated/replaced. Deletion of metadata keys is
        not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_factory = store
        store_instance = store()
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    # Arguments for the removal preparation, which always gets the store instance.
    preparation_kwargs = dict(
        cube=cube,
        store=store_instance,
        conditions=conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        existing_datasets=existing_datasets,
    )
    prepared_removals = prepare_metapartitions_for_removal_action(**preparation_kwargs)

    for ktk_cube_dataset_id, removal in prepared_removals.items():
        ds, mp, delete_scope = removal

        written_mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )
        ds_factory = metadata_factory_from_dataset(
            ds, with_schema=True, store=store_factory
        )

        updated_dataset = update_dataset_from_partitions(
            written_mp,
            store_factory=store_factory,
            dataset_uuid=ds.uuid,
            ds_factory=ds_factory,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            metadata_merger=None,
            delete_scope=delete_scope,
        )
        # Hand the updated dataset back to the caller in place of the discovered one.
        existing_datasets[ktk_cube_dataset_id] = updated_dataset

    return existing_datasets