def copy_cube(cube, src_store, tgt_store, overwrite=False, datasets=None):
    """
    Copy cube from one store to another.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    src_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Source KV store.
    tgt_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Target KV store.
    overwrite: bool
        Whether datasets that already exist in the target store should be overwritten.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to copy; all of them must be part of the cube. May be either the result of
        :meth:`discover_datasets`, a list of Ktk_cube dataset IDs, or ``None`` (in which case the
        entire cube will be copied).
    """
    # Resolve store factories into concrete stores.
    if callable(src_store):
        src_store = src_store()
    if callable(tgt_store):
        tgt_store = tgt_store()

    # Refuse to copy a cube onto itself.
    assert_stores_different(
        src_store, tgt_store, cube.ktk_dataset_uuid(cube.seed_dataset)
    )

    # Determine which keys have to be transferred (raises if targets exist and
    # ``overwrite`` is False).
    keys = get_copy_keys(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    copy_keys(keys, src_store, tgt_store)
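# Illustrative usage sketch (not part of the original module): copying a cube
# eagerly between two stores. The cube definition and the store URLs below are
# assumptions made for this example only; ``storefact.get_store_from_url`` is
# used here as one common way to obtain simplekv stores.
def _example_copy_cube_eager():
    from functools import partial

    from storefact import get_store_from_url

    from kartothek.core.cube.cube import Cube

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="example_cube",  # assumed cube name
    )
    # Store factories pointing at placeholder locations; replace with real URLs.
    src_store = partial(get_store_from_url, "hfs:///path/to/source")
    tgt_store = partial(get_store_from_url, "hfs:///path/to/target")

    # Copies all datasets of the cube; pass ``datasets=[...]`` to restrict the copy.
    copy_cube(cube=cube, src_store=src_store, tgt_store=tgt_store, overwrite=False)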
def copy_cube_bag(
    cube,
    src_store: StoreFactory,
    tgt_store: StoreFactory,
    blocksize: int = 100,
    overwrite: bool = False,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
):
    """
    Copy cube from one store to another.

    Parameters
    ----------
    cube
        Cube specification.
    src_store
        Source KV store.
    tgt_store
        Target KV store.
    overwrite
        Whether datasets that already exist in the target store should be overwritten.
    blocksize
        Number of keys to copy at once.
    datasets
        Datasets to copy; all of them must be part of the cube. May be either the result of
        :func:`~kartothek.api.discover.discover_datasets`, a list of Ktk_cube dataset IDs, or
        ``None`` (in which case the entire cube will be copied).

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that performs the given operation. May contain multiple partitions.
    """
    check_store_factory(src_store)
    check_store_factory(tgt_store)
    check_blocksize(blocksize)

    # Refuse to copy a cube onto itself.
    assert_stores_different(
        src_store, tgt_store, cube.ktk_dataset_uuid(cube.seed_dataset)
    )

    # Determine which keys have to be transferred (raises if targets exist and
    # ``overwrite`` is False).
    keys = get_copy_keys(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    # Build a lazy bag: keys are sorted for deterministic partitioning and copied
    # in chunks of ``blocksize`` per partition.
    return db.from_sequence(seq=sorted(keys), partition_size=blocksize).map_partitions(
        copy_keys, src_store=src_store, tgt_store=tgt_store
    )
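# Illustrative usage sketch (not part of the original module): the dask.bag
# variant builds a lazy copy pipeline and only transfers data when the bag is
# computed. As above, the cube spec and store URLs are assumptions for the
# example, not values taken from this code base.
def _example_copy_cube_bag():
    from functools import partial

    from storefact import get_store_from_url

    from kartothek.core.cube.cube import Cube

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="example_cube",  # assumed cube name
    )
    src_store = partial(get_store_from_url, "hfs:///path/to/source")
    tgt_store = partial(get_store_from_url, "hfs:///path/to/target")

    bag = copy_cube_bag(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        blocksize=100,  # keys copied per bag partition
        overwrite=False,
    )
    bag.compute()  # nothing is copied until the bag is computed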