Example #1
def copy_cube(cube, src_store, tgt_store, overwrite=False, datasets=None):
    """
    Copy cube from one store to another.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    src_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Source KV store.
    tgt_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Target KV store.
    overwrite: bool
        Whether to overwrite datasets that may already exist in the target store.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to copy; all of them must be part of the cube. May be the result of
        :func:`~kartothek.api.discover.discover_datasets`, a list of Ktk_cube dataset
        IDs, or ``None`` (in which case the entire cube is copied).
    """
    if callable(src_store):
        src_store = src_store()
    if callable(tgt_store):
        tgt_store = tgt_store()
    assert_stores_different(
        src_store, tgt_store, cube.ktk_dataset_uuid(cube.seed_dataset)
    )

    keys = get_copy_keys(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )
    copy_keys(keys, src_store, tgt_store)
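
Below is a minimal usage sketch for this eager variant. The cube specification, column names, and store URLs are made-up placeholders, and it assumes a cube matching that specification was previously written to the source store. Store factories built with storefact's get_store_from_url are passed directly, since copy_cube calls them itself:

from functools import partial

from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube

# Hypothetical cube specification; it must match the cube that already
# exists in the source store.
cube = Cube(
    dimension_columns=["event_id"],
    partition_columns=["day"],
    uuid_prefix="my_cube",
)

# Factories are accepted as well as store instances.
src_store = partial(get_store_from_url, "hfs:///data/cubes/src")
tgt_store = partial(get_store_from_url, "hfs:///data/cubes/tgt")

copy_cube(cube=cube, src_store=src_store, tgt_store=tgt_store)

With the default overwrite=False, the copy refuses to touch datasets that already exist in the target store.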
Example #2
def copy_cube_bag(
    cube,
    src_store: StoreFactory,
    tgt_store: StoreFactory,
    blocksize: int = 100,
    overwrite: bool = False,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
):
    """
    Copy cube from one store to another.

    Parameters
    ----------
    cube
        Cube specification.
    src_store
        Source KV store.
    tgt_store
        Target KV store.
    blocksize
        Number of keys to copy at once.
    overwrite
        Whether to overwrite datasets that may already exist in the target store.
    datasets
        Datasets to copy; all of them must be part of the cube. May be the result of
        :func:`~kartothek.api.discover.discover_datasets`, a list of Ktk_cube dataset
        IDs, or ``None`` (in which case the entire cube is copied).

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that performs the given operation. May contain multiple partitions.
    """
    check_store_factory(src_store)
    check_store_factory(tgt_store)
    check_blocksize(blocksize)
    assert_stores_different(
        src_store, tgt_store, cube.ktk_dataset_uuid(cube.seed_dataset)
    )

    keys = get_copy_keys(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    return db.from_sequence(seq=sorted(keys), partition_size=blocksize).map_partitions(
        copy_keys, src_store=src_store, tgt_store=tgt_store
    )
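
A sketch of how the bag variant might be driven, under the same made-up cube specification and store URLs as above. Note that this function requires store factories rather than store instances, and that nothing is copied until the returned bag is computed:

from functools import partial

from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube

cube = Cube(
    dimension_columns=["event_id"],
    partition_columns=["day"],
    uuid_prefix="my_cube",
)

# check_store_factory rejects plain store instances, so pass factories.
src_store = partial(get_store_from_url, "hfs:///data/cubes/src")
tgt_store = partial(get_store_from_url, "hfs:///data/cubes/tgt")

bag = copy_cube_bag(
    cube=cube,
    src_store=src_store,
    tgt_store=tgt_store,
    blocksize=500,  # 500 keys per dask partition
)
bag.compute()  # the copy only happens when the bag is executed

The blocksize trades scheduling overhead against parallelism: larger blocks mean fewer dask tasks, while smaller blocks spread the key copies over more workers.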
Example #3
def copy_cube(
    cube: Cube,
    src_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    tgt_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    overwrite: bool = False,
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]] = None,
    renamed_cube_prefix: Optional[str] = None,
    renamed_datasets: Optional[Dict[str, str]] = None,
):
    """
    Copy cube from one store to another.

    .. warning::
        A failing copy operation cannot be rolled back if the `overwrite` flag is enabled
        and might leave the overwritten dataset in an inconsistent state.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    src_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Source KV store.
    tgt_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Target KV store.
    overwrite: bool
        Whether to overwrite datasets that may already exist in the target store.
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]]
        Datasets to copy; all of them must be part of the cube. May be the result of
        :func:`~kartothek.api.discover.discover_datasets`, a list of Ktk_cube dataset
        IDs, or ``None`` (in which case the entire cube is copied).
    renamed_cube_prefix: Optional[str]
        Optional new cube prefix. If specified, the cube will be renamed while copying.
    renamed_datasets: Optional[Dict[str, str]]
        Optional dict of ``{old dataset name: new dataset name}`` entries. If provided,
        the datasets are renamed accordingly during copying. If the ``datasets``
        parameter is also specified, the datasets to rename must be a subset of the
        datasets to copy.
    """
    if callable(src_store):
        src_store = src_store()
    if callable(tgt_store):
        tgt_store = tgt_store()
    assert_stores_different(
        src_store, tgt_store, cube.ktk_dataset_uuid(cube.seed_dataset)
    )
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, tgt_store)

    if renamed_datasets is None:
        new_seed_dataset = cube.seed_dataset
    else:
        new_seed_dataset = renamed_datasets.get(cube.seed_dataset, cube.seed_dataset)

    new_cube = Cube(
        dimension_columns=cube.dimension_columns,
        partition_columns=cube.partition_columns,
        uuid_prefix=renamed_cube_prefix or cube.uuid_prefix,
        seed_dataset=new_seed_dataset,
        index_columns=cube.index_columns,
        suppress_index_on=cube.suppress_index_on,
    )

    datasets_to_copy = get_datasets_to_copy(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    copied: Dict[str, DatasetMetadata] = {}
    for src_ds_name, src_ds_meta in datasets_to_copy.items():
        tgt_ds_uuid = _transform_uuid(
            src_uuid=src_ds_meta.uuid,
            cube_prefix=cube.uuid_prefix,
            renamed_cube_prefix=renamed_cube_prefix,
            renamed_datasets=renamed_datasets,
        )
        try:
            md_transformed = copy_dataset(
                source_dataset_uuid=src_ds_meta.uuid,
                store=src_store,
                target_dataset_uuid=tgt_ds_uuid,
                target_store=tgt_store,
            )
        except Exception as e:
            if overwrite:
                # We can't roll back safely if the target dataset has been partially overwritten.
                raise RuntimeError(e) from e
            # Without overwrite, no existing target data has been destroyed;
            # propagate the error instead of swallowing it.
            raise
        else:
            copied.update(md_transformed)

    # Validate the copied datasets only after every copy has succeeded; running
    # this inside the exception handler would hide the original error and skip
    # the checks on the success path.
    apply_postwrite_checks(
        datasets=copied,
        cube=new_cube,
        store=tgt_store,
        existing_datasets=existing_datasets,
    )
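
Finally, a usage sketch for this renaming variant. Names and URLs are again placeholders, and it assumes the cube's seed dataset carries the default name "seed":

from functools import partial

from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube

cube = Cube(
    dimension_columns=["event_id"],
    partition_columns=["day"],
    uuid_prefix="my_cube",
)

src_store = partial(get_store_from_url, "hfs:///data/cubes/src")
tgt_store = partial(get_store_from_url, "hfs:///data/cubes/tgt")

# Copy the cube under a new prefix and rename its seed dataset on the fly.
copy_cube(
    cube=cube,
    src_store=src_store,
    tgt_store=tgt_store,
    renamed_cube_prefix="my_cube_v2",
    renamed_datasets={"seed": "seed_v2"},
)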