def assert_target_keys(src_store, src_uuid, tgt_store, tgt_uuid):
    """
    Verify that a copied dataset mirrors its source key by key.

    For every key of the source dataset, the key obtained by substituting the
    source UUID with the target UUID must be part of the target dataset, and
    the payloads stored under both keys must be equal. The only exception is
    the ``by-dataset-metadata.json`` file: there the target payload must equal
    the source payload with the UUID substituted.
    """
    source_factory = DatasetFactory(
        dataset_uuid=src_uuid,
        store_factory=lazy_store(src_store),
    )
    source_keys = get_dataset_keys(source_factory.dataset_metadata)

    target_factory = DatasetFactory(
        dataset_uuid=tgt_uuid,
        store_factory=lazy_store(tgt_store),
    )
    target_keys = get_dataset_keys(target_factory.dataset_metadata)

    for source_key in source_keys:
        # Each source key must have a renamed counterpart in the target.
        target_key = source_key.replace(src_uuid, tgt_uuid)
        assert target_key in target_keys

        expected_payload = src_store.get(source_key)
        actual_payload = tgt_store.get(target_key)
        if target_key.endswith("by-dataset-metadata.json"):
            # The metadata file embeds the UUID, so the expectation is the
            # source payload with the UUID rewritten.
            expected_payload = (
                expected_payload.decode("utf-8")
                .replace(src_uuid, tgt_uuid)
                .encode("utf-8")
            )
        assert expected_payload == actual_payload
def test_partial_delete(driver, function_store):
    """
    Deleting selected datasets removes exactly the keys tracked by them.

    The datasets to delete are passed once as a list of dataset IDs and once
    as a dict of dataset metadata.
    """
    seed_frame = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    )
    enrich_frame_1 = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "a": [20, 21, 22, 23]}
    )
    enrich_frame_2 = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "b": [20, 21, 22, 23]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    built = build_cube(
        data={
            cube.seed_dataset: seed_frame,
            "enrich-1": enrich_frame_1,
            "enrich-2": enrich_frame_2,
        },
        cube=cube,
        store=function_store,
    )

    def tracked_keys(dataset_id):
        # Keys of a single cube dataset, looked up through discovery.
        discovered = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=function_store,
            filter_ktk_cube_dataset_ids=[dataset_id],
        )
        return get_dataset_keys(discovered[dataset_id])

    keys_enrich_1 = tracked_keys("enrich-1")
    keys_enrich_2 = tracked_keys("enrich-2")
    keys_before = set(function_store().keys())

    # Selection by dataset ID.
    driver(cube=cube, store=function_store, datasets=["enrich-1"])
    assert set(function_store().keys()) == keys_before - keys_enrich_1

    # Selection by dataset metadata.
    driver(cube=cube, store=function_store, datasets={"enrich-2": built["enrich-2"]})
    remaining = set(function_store().keys())
    assert remaining == keys_before - keys_enrich_1 - keys_enrich_2
def delete_cube(cube, store, datasets=None):
    """
    Remove a cube, or selected datasets of it, from the store.

    .. important::
        Only tracked files are deleted. Garbage and leftovers from old cubes
        and failed operations are NOT removed.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        KV store, or a callable that returns one.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to delete, must all be part of the cube. May be either the
        result of :meth:`discover_datasets`, a list of Ktk_cube dataset ID or
        ``None`` (in which case entire cube will be deleted).
    """
    kv_store = store() if callable(store) else store

    if isinstance(datasets, dict):
        selected = datasets
    else:
        # Anything that is not a dict (IDs or None) is resolved via discovery.
        selected = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=kv_store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    doomed_keys = set()
    for dataset_id in sorted(selected):
        doomed_keys = doomed_keys | get_dataset_keys(selected[dataset_id])

    # Delete in sorted order so the sequence of store operations is stable.
    for key in sorted(doomed_keys):
        kv_store.delete(key)
def test_partial_copy_exclude_pattern(
    cli, built_cube, skv, store, store2, exclude_pattern, copy_tables
):
    """
    Copying with ``--exclude`` leaves exactly the keys of ``copy_tables`` in
    the target store.
    """
    extra_frame = pd.DataFrame(
        {
            "x": [0, 1],
            "y": [0, 0],
            "p": 0,
            "q": ["a", "a"],
            "mycolumn": ["a", "b"],
        }
    )
    extend_cube(data={"mytable": extra_frame}, cube=built_cube, store=store)

    discovered = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=copy_tables,
    )
    # Union of the keys of every dataset that is expected to be copied.
    expected_keys = set()
    for table in copy_tables:
        expected_keys = expected_keys | get_dataset_keys(discovered[table])

    exclude_option = "--exclude=" + exclude_pattern
    result = cli(
        "--store=cubes", "my_cube", "copy", "--tgt_store=cubes2", exclude_option
    )

    assert result.exit_code == 0
    assert set(store2.keys()) == expected_keys
def get_keys_to_clean(cube_uuid_prefix, datasets, store):
    """
    Determine the keys that exist in the store but are not tracked by any dataset.

    Parameters
    ----------
    cube_uuid_prefix: str
        Prefix that, followed by ``KTK_CUBE_UUID_SEPARATOR``, selects the
        store keys to inspect.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets to scan for keys.
    store: simplekv.KeyValueStore
        KV store.

    Returns
    -------
    keys: Set[str]
        Keys to delete.
    """
    # Keys that are referenced by the given datasets and must be kept.
    tracked = set()
    for ds in datasets.values():
        tracked = tracked.union(get_dataset_keys(ds))

    # Keys that are physically present below the cube prefix.
    stored = set(store.iter_keys(cube_uuid_prefix + KTK_CUBE_UUID_SEPARATOR))

    return stored.difference(tracked)
def test_ignores_untracked(self, function_store, ds):
    """Keys put into the store after the snapshot are not reported for the dataset."""
    tracked_before = set(function_store().keys())

    # irrelevant content
    store = function_store()
    store.put(ds.uuid + ".foo", b"")

    assert get_dataset_keys(ds) == tracked_before
def test_partial_copy_dataset_dict(
    driver, function_store, function_store2, cube, built_cube
):
    """
    Copying datasets given as a dict of metadata transfers exactly the keys of
    those datasets to the target store.
    """
    copied_ids = ("seed", "enrich")

    driver(
        cube=cube,
        src_store=function_store,
        tgt_store=function_store2,
        datasets={name: built_cube[name] for name in copied_ids},
    )

    discovered = discover_datasets_unchecked(
        uuid_prefix=cube.uuid_prefix,
        store=function_store,
        filter_ktk_cube_dataset_ids=list(copied_ids),
    )
    expected_keys = set()
    for name in copied_ids:
        expected_keys = expected_keys | get_dataset_keys(discovered[name])

    assert expected_keys == set(function_store2().keys())
def test_partial_copy_include_pattern_nomatch(cli, built_cube, skv, store, store2):
    """
    An ``--include`` pattern that matches no dataset makes the copy command
    exit with code 2 and report the unmatched pattern.
    """
    discovered = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=["source"],
    )
    # The result is unused; the lookup itself requires that "source" was discovered.
    copy_keys = get_dataset_keys(discovered["source"])  # noqa

    cli_args = (
        "--store=cubes",
        "my_cube",
        "copy",
        "--tgt_store=cubes2",
        "--include=x*,source",
    )
    result = cli(*cli_args)

    assert result.exit_code == 2
    assert "Error: Could not find dataset x*" in result.output
def test_partial_delete_exclude_pattern(
    cli, built_cube, skv, store, exclude_pattern, delete_tables
):
    """
    Deleting with ``--exclude`` removes exactly the keys of ``delete_tables``
    from the store.
    """
    discovered = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=delete_tables,
    )
    # Union of the keys of every dataset that is expected to vanish.
    expected_removed = set()
    for table in delete_tables:
        expected_removed = expected_removed | get_dataset_keys(discovered[table])

    keys_before = set(store.keys())

    exclude_option = "--exclude=" + exclude_pattern
    result = cli("--store=cubes", "my_cube", "delete", exclude_option)

    assert result.exit_code == 0
    assert set(store.keys()) == keys_before - expected_removed
def delete_cube_bag(
    cube: Cube,
    store: StoreFactory,
    blocksize: int = 100,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
):
    """
    Build a dask bag that deletes a cube, or selected datasets of it, from the store.

    .. important::
        Only tracked files are deleted. Garbage and leftovers from old cubes
        and failed operations are NOT removed.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store.
    blocksize
        Number of keys to delete at once.
    datasets
        Datasets to delete, must all be part of the cube. May be either the
        result of :func:`~kartothek.api.discover.discover_datasets`, a list of
        Ktk_cube dataset ID or ``None`` (in which case entire cube will be
        deleted).

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that performs the given operation. May contain multiple
        partitions.
    """
    check_store_factory(store)
    check_blocksize(blocksize)

    if isinstance(datasets, dict):
        selected = datasets
    else:
        # Anything that is not a dict (IDs or None) is resolved via discovery.
        selected = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    doomed_keys = set()
    for dataset_id in sorted(selected):
        doomed_keys = doomed_keys | get_dataset_keys(selected[dataset_id])

    # One bag partition holds up to ``blocksize`` keys, in sorted key order.
    key_bag = db.from_sequence(seq=sorted(doomed_keys), partition_size=blocksize)
    return key_bag.map_partitions(_delete, store=store)
def get_copy_keys(
    cube: Cube,
    src_store: Union[Callable[[], KeyValueStore], KeyValueStore],
    tgt_store: Union[Callable[[], KeyValueStore], KeyValueStore],
    overwrite: bool,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
):
    """
    Collect the keys that have to be copied from the source to the target store.

    Parameters
    ----------
    cube:
        Cube specification.
    src_store:
        Source KV store.
    tgt_store:
        Target KV store.
    overwrite:
        If possibly existing datasets in the target store should be overwritten.
    datasets:
        Datasets to copy, must all be part of the cube. May be either the
        result of :func:`~kartothek.api.discover.discover_datasets`, an
        iterable of Ktk_cube dataset ID or ``None`` (in which case entire cube
        will be copied).

    Returns
    -------
    keys: Set[str]
        Set of keys to copy.

    Raises
    ------
    RuntimeError:
        In case the copy would not pass successfully or if there is no cube in
        ``src_store``.
    """
    # Selection and validation of the datasets is delegated to the helper.
    to_copy = get_datasets_to_copy(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    copy_keys = set()
    for dataset_id in sorted(to_copy):
        copy_keys = copy_keys | get_dataset_keys(to_copy[dataset_id])
    return copy_keys
def get_copy_keys(cube, src_store, tgt_store, overwrite, datasets=None):
    """
    Collect and validate the keys that have to be copied from the source to
    the target store.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    src_store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        Source KV store.
    tgt_store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        Target KV store.
    overwrite: bool
        If possibly existing datasets in the target store should be overwritten.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to copy, must all be part of the cube. May be either the
        result of :meth:`discover_datasets`, an iterable of Ktk_cube dataset
        ID or ``None`` (in which case entire cube will be copied).

    Returns
    -------
    keys: Set[str]
        Set of keys to copy.

    Raises
    ------
    RuntimeError:
        In case the copy would not pass successfully or if there is no cube in
        ``src_store``.
    """
    if isinstance(datasets, dict):
        # Already-discovered metadata is used as given.
        new_datasets = datasets
    else:
        new_datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=src_store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    if datasets is None:
        # Whole-cube copy: there must be something to copy at all.
        if not new_datasets:
            raise RuntimeError("{} not found in source store".format(cube))
    else:
        # Explicit selection: every requested dataset must have been found.
        unknown_datasets = set(datasets).difference(new_datasets)
        if unknown_datasets:
            raise RuntimeError(
                "{cube}, datasets {datasets} do not exist in source store".format(
                    cube=cube, datasets=unknown_datasets
                )
            )

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, tgt_store)

    if not overwrite:
        clashing_ids = [
            dataset_id
            for dataset_id in sorted(new_datasets)
            if dataset_id in existing_datasets
        ]
        if clashing_ids:
            # Report the first clash in sorted dataset-ID order.
            raise RuntimeError(
                'Dataset "{uuid}" exists in target store but overwrite was set to False'.format(
                    uuid=new_datasets[clashing_ids[0]].uuid
                )
            )

    # Validate the target as it would look after the copy: the datasets that
    # are already there, overlaid with the ones that are about to be copied.
    merged_datasets = copy(existing_datasets)
    merged_datasets.update(new_datasets)
    check_datasets(merged_datasets, cube)

    copy_keys = set()
    for dataset_id in sorted(new_datasets):
        copy_keys = copy_keys | get_dataset_keys(new_datasets[dataset_id])
    return copy_keys
def copy_dataset(
    source_dataset_uuid: str,
    store: KeyValueStore,
    target_dataset_uuid: Optional[str] = None,
    target_store: Optional[KeyValueStore] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Copies and optionally renames a dataset, either from one store to another
    or within one store.

    Parameters
    ----------
    source_dataset_uuid: str
        UUID of source dataset
    store: simplekv.KeyValueStore
        Source store
    target_dataset_uuid: Optional[str]
        UUID of target dataset. May be the same as ``source_dataset_uuid``, if
        ``store`` and ``target_store`` are different. If empty,
        ``source_dataset_uuid`` is used
    target_store: Optional[simplekv.KeyValueStore]
        Target store. May be the same as ``store``, if ``source_dataset_uuid``
        and ``target_dataset_uuid`` are different. If empty, value from
        parameter ``store`` is used

    Returns
    -------
    Dict[str, DatasetMetadata]
        Single-entry mapping from the metadata key of the target dataset to
        the dataset metadata rebuilt with the target UUID.

    Raises
    ------
    ValueError
        If the target UUID equals the source UUID and the target store
        compares equal to the source store.
    """
    if target_dataset_uuid is None:
        target_dataset_uuid = source_dataset_uuid
    if target_store is None:
        target_store = store

    # Boolean ``and`` (instead of bitwise ``&``) short-circuits and does not
    # depend on the store's ``__eq__`` returning a plain bool.
    if source_dataset_uuid == target_dataset_uuid and store == target_store:
        raise ValueError(
            "Cannot copy to a dataset with the same UUID within the same store!"
        )

    ds_factory_source = _ensure_factory(
        dataset_uuid=source_dataset_uuid,
        store=store,
        factory=None,
        load_dataset_metadata=True,
    )

    # Create a dict of {source key: target key} entries
    # NOTE(review): ``str.replace`` substitutes every occurrence of the source
    # UUID inside a key, not only a leading one - confirm that the UUID cannot
    # reappear further down a key path.
    keys = get_dataset_keys(ds_factory_source.dataset_metadata)
    mapped_keys = {
        source_key: source_key.replace(source_dataset_uuid, target_dataset_uuid)
        for source_key in keys
    }

    # Create a dict of metadata which has to be changed. This is only the
    # <uuid>.by-dataset-metadata.json file
    md_transformed = {
        f"{target_dataset_uuid}{METADATA_BASE_SUFFIX}{METADATA_FORMAT_JSON}": DatasetMetadataBuilder.from_dataset(
            ds_factory_source.dataset_metadata
        )
        .modify_uuid(target_dataset_uuid)
        .to_dataset()
    }

    # Copy the keys from one store to another
    copy_rename_keys(mapped_keys, store, target_store, md_transformed)

    return md_transformed
def test_all_indices_loaded(self, function_store, ds):
    """With all indices loaded, the dataset keys equal the full set of store keys."""
    ds_with_indices = ds.load_all_indices(function_store())
    reported_keys = get_dataset_keys(ds_with_indices)
    assert reported_keys == set(function_store().keys())
def test_simple(self, function_store, ds):
    """The dataset keys equal the full set of store keys."""
    reported_keys = get_dataset_keys(ds)
    assert reported_keys == set(function_store().keys())