def test_copy_same_source_and_target(dataset_to_copy, store):
    """
    Copying with identical source and target dataset in the same store must fail
    """
    with pytest.raises(ValueError):
        copy_dataset(
            source_dataset_uuid=SRC_DS_UUID,
            target_dataset_uuid=SRC_DS_UUID,
            store=store,
            target_store=store,
        )


def test_copy_eager_no_target_store(dataset_to_copy, store, store2):
    """
    Copies and renames DS within one store when no target store is given
    """
    copy_dataset(
        source_dataset_uuid=SRC_DS_UUID,
        target_dataset_uuid=TGT_DS_UUID,
        store=store,
        target_store=None,
    )
    assert_target_keys(store, SRC_DS_UUID, store, TGT_DS_UUID)
    assert_target_ktk_readable(store, TGT_DS_UUID)


def test_copy_eager_with_rename_different_store(dataset_to_copy, store, store2):
    """
    Copies and renames DS between stores
    """
    copy_dataset(
        source_dataset_uuid=SRC_DS_UUID,
        target_dataset_uuid=TGT_DS_UUID,
        store=store,
        target_store=store2,
    )
    assert_target_keys(store, SRC_DS_UUID, store2, TGT_DS_UUID)
    assert_target_ktk_readable(store2, TGT_DS_UUID)


def test_copy_rename_eager_same_store(dataset_to_copy, store):
    """
    Copies and renames DS within one store
    """
    copy_dataset(
        source_dataset_uuid=SRC_DS_UUID,
        target_dataset_uuid=TGT_DS_UUID,
        store=store,
        target_store=store,
    )
    assert_target_keys(store, SRC_DS_UUID, store, TGT_DS_UUID)
    assert_target_ktk_readable(store, TGT_DS_UUID)


def test_copy_eager_without_rename_different_store(dataset_to_copy, store, store2):
    """
    Copies DS between stores while keeping the name
    """
    copy_dataset(
        source_dataset_uuid=SRC_DS_UUID,
        target_dataset_uuid=SRC_DS_UUID,
        store=store,
        target_store=store2,
    )
    assert_target_keys(store, SRC_DS_UUID, store2, SRC_DS_UUID)
    assert_target_ktk_readable(store2, SRC_DS_UUID)


def side_effect(*args, **kwargs):
    # Let the first copy_dataset call succeed, then fail on every subsequent
    # call. side_effect.counter must be initialized to 0 before first use.
    if side_effect.counter == 0:
        side_effect.counter += 1
        return copy_dataset(*args, **kwargs)
    else:
        raise ValueError("Something unexpected happened during cube copy.")


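# A hedged sketch (not from the original suite) of how the helper above might
# be used: patch copy_dataset at the location where copy_cube resolves it, so
# the first dataset copies cleanly and the second call raises. With
# overwrite=True, copy_cube wraps the error in a RuntimeError. The patch target
# path, the `cube` fixture (a cube with at least two datasets), and `mock`
# being unittest.mock are assumptions for illustration only.
def test_copy_cube_fails_on_second_dataset(cube, store, store2):
    side_effect.counter = 0  # the helper dispatches on this attribute
    with mock.patch(
        "kartothek.utils.store.copy_dataset",  # assumed lookup path
        side_effect=side_effect,
    ):
        with pytest.raises(RuntimeError, match="Something unexpected"):
            copy_cube(cube=cube, src_store=store, tgt_store=store2, overwrite=True)

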
def copy_cube(
    cube: Cube,
    src_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    tgt_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    overwrite: bool = False,
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]] = None,
    renamed_cube_prefix: Optional[str] = None,
    renamed_datasets: Optional[Dict[str, str]] = None,
):
    """
    Copy cube from one store to another.

    .. warning::
        A failing copy operation cannot be rolled back if the ``overwrite`` flag
        is enabled and might leave the overwritten dataset in an inconsistent
        state.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    src_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Source KV store.
    tgt_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Target KV store.
    overwrite: bool
        Whether to overwrite datasets that may already exist in the target store.
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]]
        Datasets to copy; must all be part of the cube. May be either the result
        of :func:`~kartothek.api.discover.discover_datasets`, a list of Ktk_cube
        dataset IDs, or ``None`` (in which case the entire cube is copied).
    renamed_cube_prefix: Optional[str]
        Optional new cube prefix. If specified, the cube will be renamed while
        copying.
    renamed_datasets: Optional[Dict[str, str]]
        Optional dict of ``{old dataset name: new dataset name}`` entries. If
        provided, the datasets will be renamed accordingly during copying. When
        the parameter ``datasets`` is specified, the datasets to rename must be
        a subset of the datasets to copy.
    """
    if callable(src_store):
        src_store = src_store()
    if callable(tgt_store):
        tgt_store = tgt_store()
    assert_stores_different(
        src_store, tgt_store, cube.ktk_dataset_uuid(cube.seed_dataset)
    )
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, tgt_store)

    if renamed_datasets is None:
        new_seed_dataset = cube.seed_dataset
    else:
        new_seed_dataset = renamed_datasets.get(cube.seed_dataset, cube.seed_dataset)

    new_cube = Cube(
        dimension_columns=cube.dimension_columns,
        partition_columns=cube.partition_columns,
        uuid_prefix=renamed_cube_prefix or cube.uuid_prefix,
        seed_dataset=new_seed_dataset,
        index_columns=cube.index_columns,
        suppress_index_on=cube.suppress_index_on,
    )

    datasets_to_copy = get_datasets_to_copy(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    copied = {}  # type: Dict[str, DatasetMetadata]
    for src_ds_name, src_ds_meta in datasets_to_copy.items():
        tgt_ds_uuid = _transform_uuid(
            src_uuid=src_ds_meta.uuid,
            cube_prefix=cube.uuid_prefix,
            renamed_cube_prefix=renamed_cube_prefix,
            renamed_datasets=renamed_datasets,
        )
        try:
            md_transformed = copy_dataset(
                source_dataset_uuid=src_ds_meta.uuid,
                store=src_store,
                target_dataset_uuid=tgt_ds_uuid,
                target_store=tgt_store,
            )
        except Exception as e:
            if overwrite:
                # We can't roll back safely if the target dataset has been
                # partially overwritten.
                raise RuntimeError(e)
            else:
                # Without overwrite the target is still consistent; validate
                # the datasets copied so far against the target store.
                apply_postwrite_checks(
                    datasets=copied,
                    cube=new_cube,
                    store=tgt_store,
                    existing_datasets=existing_datasets,
                )
        else:
            copied.update(md_transformed)
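

# A hedged usage sketch for copy_cube: copy a cube between two stores while
# renaming its prefix and one dataset. The storefact URLs and the cube spec
# below are illustrative assumptions, not part of this module; the code is
# wrapped in a private function so nothing runs on import.
def _copy_cube_usage_sketch():  # pragma: no cover - documentation sketch
    import storefact

    cube = Cube(
        dimension_columns=["item"],
        partition_columns=["day"],
        uuid_prefix="sales_cube",
    )
    src_store = storefact.get_store_from_url("hfs:///tmp/cubes/src")
    tgt_store = storefact.get_store_from_url("hfs:///tmp/cubes/tgt")
    # Copy the whole cube, renaming the prefix and one (assumed) non-seed dataset.
    copy_cube(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        renamed_cube_prefix="sales_cube_backup",
        renamed_datasets={"enrich": "enrich_v2"},
    )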