Ejemplo n.º 1
0
def copy_cube(cube, src_store, tgt_store, overwrite=False, datasets=None):
    """
    Copy a cube from a source store into a target store.

    Either store argument may be given as a callable; in that case it is
    invoked (without arguments) and the result is used as the store.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    src_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Source KV store.
    tgt_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Target KV store.
    overwrite: bool
        If possibly existing datasets in the target store should be overwritten.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to copy, must all be part of the cube. May be either the result of :meth:`discover_datasets`, a list
        of Ktk_cube dataset ID or ``None`` (in which case entire cube will be copied).
    """
    # Resolve store factories to concrete stores (source first, then target).
    source = src_store() if callable(src_store) else src_store
    target = tgt_store() if callable(tgt_store) else tgt_store

    # The seed dataset's UUID is handed to the "stores differ" check.
    seed_uuid = cube.ktk_dataset_uuid(cube.seed_dataset)
    assert_stores_different(source, target, seed_uuid)

    keys_to_copy = get_copy_keys(
        cube=cube,
        src_store=source,
        tgt_store=target,
        overwrite=overwrite,
        datasets=datasets,
    )
    copy_keys(keys_to_copy, source, target)
Ejemplo n.º 2
0
def copy_cube_bag(
    cube,
    src_store: StoreFactory,
    tgt_store: StoreFactory,
    blocksize: int = 100,
    overwrite: bool = False,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
):
    """
    Build a bag that copies a cube from one store to another.

    The set of keys to copy is determined right away; the keys are then
    sorted and split into partitions of ``blocksize`` keys, each of which is
    handed to ``copy_keys`` together with both store factories.

    Parameters
    ----------
    cube
        Cube specification.
    src_store
        Source KV store.
    tgt_store
        Target KV store.
    blocksize
        Number of keys to copy at once.
    overwrite
        If possibly existing datasets in the target store should be overwritten.
    datasets
        Datasets to copy, must all be part of the cube. May be either the result of :func:`~kartothek.api.discover.discover_datasets`, a list
        of Ktk_cube dataset ID or ``None`` (in which case entire cube will be copied).

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that performs the given operation. May contain multiple partitions.
    """
    # Validate the inputs: source factory first, then target, then blocksize.
    for factory in (src_store, tgt_store):
        check_store_factory(factory)
    check_blocksize(blocksize)

    seed_uuid = cube.ktk_dataset_uuid(cube.seed_dataset)
    assert_stores_different(src_store, tgt_store, seed_uuid)

    keys = get_copy_keys(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    # Sort the keys so the partitioning of the sequence is deterministic.
    ordered_keys = sorted(keys)
    key_bag = db.from_sequence(seq=ordered_keys, partition_size=blocksize)
    return key_bag.map_partitions(
        copy_keys, src_store=src_store, tgt_store=tgt_store
    )