Code Example #1
def assert_target_keys(src_store, src_uuid, tgt_store, tgt_uuid):
    """
    Check that the expected keys exist in the target dataset and that the
    corresponding values equal those in the source dataset (or are modified as
    expected).
    """
    df_source = DatasetFactory(
        dataset_uuid=src_uuid, store_factory=lazy_store(src_store),
    )
    src_keys = get_dataset_keys(df_source.dataset_metadata)
    df_target = DatasetFactory(
        dataset_uuid=tgt_uuid, store_factory=lazy_store(tgt_store),
    )
    tgt_keys = get_dataset_keys(df_target.dataset_metadata)

    for src_key in src_keys:
        # check for each source key if the corresponding target key exists
        tgt_key = src_key.replace(src_uuid, tgt_uuid)
        assert tgt_key in tgt_keys

        # check if the files for source and target key are equal (exception:
        # metadata => here the target must contain the modified metadata)
        b1 = src_store.get(src_key)
        b2 = tgt_store.get(tgt_key)

        if tgt_key.endswith("by-dataset-metadata.json"):
            b1_mod = b1.decode("utf-8").replace(src_uuid, tgt_uuid).encode("utf-8")
            assert b1_mod == b2
        else:
            assert b1 == b2
Code Example #2
def test_partial_delete(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v": [10, 11, 12, 13]
    })
    df_1 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "a": [20, 21, 22, 23]
    })
    df_2 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "b": [20, 21, 22, 23]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    datasets = build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich-1": df_1,
            "enrich-2": df_2
        },
        cube=cube,
        store=function_store,
    )
    enrich_1_keys = get_dataset_keys(
        discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=function_store,
            filter_ktk_cube_dataset_ids=["enrich-1"],
        )["enrich-1"])
    enrich_2_keys = get_dataset_keys(
        discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=function_store,
            filter_ktk_cube_dataset_ids=["enrich-2"],
        )["enrich-2"])
    all_keys = set(function_store().keys())
    driver(cube=cube, store=function_store, datasets=["enrich-1"])
    assert set(function_store().keys()) == all_keys - enrich_1_keys

    driver(cube=cube,
           store=function_store,
           datasets={"enrich-2": datasets["enrich-2"]})
    assert set(
        function_store().keys()) == all_keys - enrich_1_keys - enrich_2_keys
Code Example #3
def delete_cube(cube, store, datasets=None):
    """
    Delete cube from store.

    .. important::
        This routine only deletes tracked files. Garbage and leftovers from old cubes and failed operations are NOT
        removed.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        KV store.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to delete; these must all be part of the cube. May be either the result of :meth:`discover_datasets`, a list
        of Ktk_cube dataset IDs, or ``None`` (in which case the entire cube will be deleted).
    """
    if callable(store):
        store = store()

    if not isinstance(datasets, dict):
        datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    keys = set()
    for ktk_cube_dataset_id in sorted(datasets.keys()):
        ds = datasets[ktk_cube_dataset_id]
        keys |= get_dataset_keys(ds)

    for k in sorted(keys):
        store.delete(k)
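
A minimal, hedged usage sketch for the routine above: build a tiny cube in an in-memory store and then delete only one of its datasets. The storefact URL, the cube layout, and the module paths used for the setup are illustrative assumptions, not part of the original example.

import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube

# Assumption: "hmemory://" yields an in-memory store with the extended keyspace
# kartothek keys require.
store = get_store_from_url("hmemory://")

def store_factory():
    return store

cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="demo_cube")
build_cube(
    data={
        cube.seed_dataset: pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [10, 11]}),
        "enrich": pd.DataFrame({"x": [0, 1], "p": [0, 0], "a": [1, 2]}),
    },
    cube=cube,
    store=store_factory,
)

# Drop only the "enrich" dataset; the seed dataset and its tracked keys stay in place.
delete_cube(cube=cube, store=store_factory, datasets=["enrich"])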
Code Example #4
File: test_copy.py  Project: xhochy/kartothek
def test_partial_copy_exclude_pattern(
    cli, built_cube, skv, store, store2, exclude_pattern, copy_tables
):
    extend_cube(
        data={
            "mytable": pd.DataFrame(
                {
                    "x": [0, 1],
                    "y": [0, 0],
                    "p": 0,
                    "q": ["a", "a"],
                    "mycolumn": ["a", "b"],
                }
            )
        },
        cube=built_cube,
        store=store,
    )
    copied_datasets = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=copy_tables,
    )
    copy_keys = set()
    for name in copy_tables:
        copy_keys |= get_dataset_keys(copied_datasets[name])
    result = cli(
        "--store=cubes",
        "my_cube",
        "copy",
        "--tgt_store=cubes2",
        "--exclude=" + exclude_pattern,
    )
    assert result.exit_code == 0
    assert set(store2.keys()) == copy_keys
Code Example #5
def get_keys_to_clean(cube_uuid_prefix, datasets, store):
    """
    Get the keys that are present in the store but can be deleted.

    Parameters
    ----------
    cube_uuid_prefix: str
        UUID prefix of the cube whose keys should be scanned.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets to scan for keys.
    store: simplekv.KeyValueStore
        KV store.

    Returns
    -------
    keys: Set[str]
        Keys to delete.
    """
    keys_should = reduce(
        set.union, (get_dataset_keys(ds) for ds in datasets.values()), set()
    )

    keys_present = {
        k for k in store.iter_keys(cube_uuid_prefix + KTK_CUBE_UUID_SEPARATOR)
    }

    return keys_present - keys_should
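
A hedged sketch of a garbage-collection pass built on the helper above: build a tiny cube, plant an untracked blob under the cube prefix, and delete whatever the function reports. The store, the cube layout, and the "++" UUID separator in the planted key are illustrative assumptions.

import pandas as pd
from storefact import get_store_from_url

from kartothek.api.discover import discover_datasets_unchecked
from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube

store = get_store_from_url("hmemory://")  # assumed in-memory storefact store
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="demo_cube")
build_cube(
    data={cube.seed_dataset: pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [1, 2]})},
    cube=cube,
    store=lambda: store,
)

# Simulate a leftover from a failed write: a blob under the cube prefix that no
# dataset tracks ("++" is assumed here to be the cube UUID separator).
store.put("demo_cube++seed/orphan.parquet", b"")

datasets = discover_datasets_unchecked(uuid_prefix=cube.uuid_prefix, store=store)
for key in sorted(get_keys_to_clean(cube.uuid_prefix, datasets, store)):
    store.delete(key)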
Code Example #6
    def test_ignores_untracked(self, function_store, ds):
        keys = set(function_store().keys())

        # irrelevant content
        function_store().put(ds.uuid + ".foo", b"")

        assert get_dataset_keys(ds) == keys
Code Example #7
def test_partial_copy_dataset_dict(
    driver, function_store, function_store2, cube, built_cube
):
    driver(
        cube=cube,
        src_store=function_store,
        tgt_store=function_store2,
        datasets={"seed": built_cube["seed"], "enrich": built_cube["enrich"]},
    )
    all_datasets = discover_datasets_unchecked(
        uuid_prefix=cube.uuid_prefix,
        store=function_store,
        filter_ktk_cube_dataset_ids=["seed", "enrich"],
    )
    copied_ds_keys = set()
    copied_ds_keys |= get_dataset_keys(all_datasets["seed"])
    copied_ds_keys |= get_dataset_keys(all_datasets["enrich"])
    tgt_store_keys = set(function_store2().keys())
    assert copied_ds_keys == tgt_store_keys
Code Example #8
File: test_copy.py  Project: xhochy/kartothek
def test_partial_copy_include_pattern_nomatch(cli, built_cube, skv, store, store2):
    copied_datasets = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=["source"],
    )
    copy_keys = get_dataset_keys(copied_datasets["source"])  # noqa
    result = cli(
        "--store=cubes", "my_cube", "copy", "--tgt_store=cubes2", "--include=x*,source"
    )
    assert result.exit_code == 2

    assert "Error: Could not find dataset x*" in result.output
Code Example #9
def test_partial_delete_exclude_pattern(
    cli, built_cube, skv, store, exclude_pattern, delete_tables
):
    datasets = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=delete_tables,
    )
    delete_keys = set()
    for name in delete_tables:
        delete_keys |= get_dataset_keys(datasets[name])
    all_keys = set(store.keys())
    result = cli("--store=cubes", "my_cube", "delete", "--exclude=" + exclude_pattern)
    assert result.exit_code == 0
    assert set(store.keys()) == all_keys - delete_keys
Code Example #10
def delete_cube_bag(
    cube: Cube,
    store: StoreFactory,
    blocksize: int = 100,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
):
    """
    Delete cube from store.

    .. important::
        This routine only deletes tracked files. Garbage and leftovers from old cubes and failed operations are NOT
        removed.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store.
    blocksize
        Number of keys to delete at once.
    datasets
        Datasets to delete; these must all be part of the cube. May be either the result of :func:`~kartothek.api.discover.discover_datasets`, a list
        of Ktk_cube dataset IDs, or ``None`` (in which case the entire cube will be deleted).

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that performs the given operation. May contain multiple partitions.
    """
    check_store_factory(store)
    check_blocksize(blocksize)

    if not isinstance(datasets, dict):
        datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    keys = set()
    for ktk_cube_dataset_id in sorted(datasets.keys()):
        ds = datasets[ktk_cube_dataset_id]
        keys |= get_dataset_keys(ds)

    return db.from_sequence(seq=sorted(keys), partition_size=blocksize).map_partitions(
        _delete, store=store
    )
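
A hedged sketch of driving the function above: the returned dask bag only plans the deletion, and keys are removed when the bag is computed. The in-memory store factory and the single-threaded scheduler are illustrative choices (a shared in-process store would not survive a multiprocessing scheduler).

import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube

_store = get_store_from_url("hmemory://")

def store_factory():
    # the bag backend insists on a store *factory*, not a store instance
    return _store

cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="demo_cube")
build_cube(
    data={
        cube.seed_dataset: pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [1, 2]}),
        "enrich": pd.DataFrame({"x": [0, 1], "p": [0, 0], "a": [3, 4]}),
    },
    cube=cube,
    store=store_factory,
)

bag = delete_cube_bag(cube=cube, store=store_factory, datasets=["enrich"])  # lazy
bag.compute(scheduler="synchronous")  # keys are deleted only at compute time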
Code Example #11
def get_copy_keys(
    cube: Cube,
    src_store: Union[Callable[[], KeyValueStore], KeyValueStore],
    tgt_store: Union[Callable[[], KeyValueStore], KeyValueStore],
    overwrite: bool,
    datasets: Optional[Union[Iterable[str], Dict[str,
                                                 DatasetMetadata]]] = None,
):
    """
    Get and check keys that should be copied from one store to another.

    Parameters
    ----------
    cube:
        Cube specification.
    src_store:
        Source KV store.
    tgt_store:
        Target KV store.
    overwrite:
        If possibly existing datasets in the target store should be overwritten.
    datasets:
        Datasets to copy; these must all be part of the cube. May be either the result of :func:`~kartothek.api.discover.discover_datasets`, an
        iterable of Ktk_cube dataset IDs, or ``None`` (in which case the entire cube will be copied).

    Returns
    -------
    keys: Set[str]
        Set of keys to copy.

    Raises
    ------
    RuntimeError: In case the copy would not pass successfully or if there is no cube in ``src_store``.
    """
    new_datasets = get_datasets_to_copy(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    keys = set()
    for ktk_cube_dataset_id in sorted(new_datasets.keys()):
        ds = new_datasets[ktk_cube_dataset_id]
        keys |= get_dataset_keys(ds)

    return keys
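
A hedged sketch of consuming the returned key set: resolve all keys of a small cube and shuttle the raw blobs to a second store with plain simplekv get()/put() calls. The stores and cube layout are illustrative only; kartothek's own copy backends handle this transfer internally.

import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube

src_store = get_store_from_url("hmemory://")
tgt_store = get_store_from_url("hmemory://")

cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="demo_cube")
build_cube(
    data={cube.seed_dataset: pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [1, 2]})},
    cube=cube,
    store=lambda: src_store,
)

# Resolve every key the copy would have to transfer, then move the raw blobs.
keys = get_copy_keys(
    cube=cube, src_store=src_store, tgt_store=tgt_store, overwrite=False
)
for key in sorted(keys):
    tgt_store.put(key, src_store.get(key))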
Code Example #12
def get_copy_keys(cube, src_store, tgt_store, overwrite, datasets=None):
    """
    Get and check keys that should be copied from one store to another.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    src_store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        Source KV store.
    tgt_store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        Target KV store.
    overwrite: bool
        If possibly existing datasets in the target store should be overwritten.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to copy; these must all be part of the cube. May be either the result of :meth:`discover_datasets`, an
        iterable of Ktk_cube dataset IDs, or ``None`` (in which case the entire cube will be copied).

    Returns
    -------
    keys: Set[str]
        Set of keys to copy.

    Raises
    ------
    RuntimeError: In case the copy would not pass successfully or if there is no cube in ``src_store``.
    """
    if not isinstance(datasets, dict):
        new_datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=src_store,
            filter_ktk_cube_dataset_ids=datasets,
        )
    else:
        new_datasets = datasets

    if datasets is None:
        if not new_datasets:
            raise RuntimeError("{} not found in source store".format(cube))
    else:
        unknown_datasets = set(datasets) - set(new_datasets)
        if unknown_datasets:
            raise RuntimeError(
                "{cube}, datasets {datasets} do not exist in source store".
                format(cube=cube, datasets=unknown_datasets))

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix,
                                                    tgt_store)

    if not overwrite:
        for ktk_cube_dataset_id in sorted(new_datasets.keys()):
            if ktk_cube_dataset_id in existing_datasets:
                raise RuntimeError(
                    'Dataset "{uuid}" exists in target store but overwrite was set to False'
                    .format(uuid=new_datasets[ktk_cube_dataset_id].uuid))

    all_datasets = copy(existing_datasets)
    all_datasets.update(new_datasets)

    check_datasets(all_datasets, cube)

    keys = set()
    for ktk_cube_dataset_id in sorted(new_datasets.keys()):
        ds = new_datasets[ktk_cube_dataset_id]
        keys |= get_dataset_keys(ds)

    return keys
Code Example #13
def copy_dataset(
    source_dataset_uuid: str,
    store: KeyValueStore,
    target_dataset_uuid: Optional[str] = None,
    target_store: Optional[KeyValueStore] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Copies and optionally renames a dataset, either from one store to another
    or within one store.

    Parameters
    ----------
    source_dataset_uuid: str
        UUID of source dataset
    store: simplekv.KeyValueStore
        Source store
    target_dataset_uuid: Optional[str]
        UUID of the target dataset. May be the same as source_dataset_uuid if
        store and target_store are different. If not given, source_dataset_uuid
        is used.
    target_store: Optional[simplekv.KeyValueStore]
        Target store. May be the same as store if source_dataset_uuid and
        target_dataset_uuid are different. If not given, the value of the store
        parameter is used.
    """
    if target_dataset_uuid is None:
        target_dataset_uuid = source_dataset_uuid
    if target_store is None:
        target_store = store

    if (source_dataset_uuid == target_dataset_uuid) & (store == target_store):
        raise ValueError(
            "Cannot copy to a dataset with the same UUID within the same store!"
        )

    ds_factory_source = _ensure_factory(
        dataset_uuid=source_dataset_uuid,
        store=store,
        factory=None,
        load_dataset_metadata=True,
    )

    # Create a dict of {source key: target key} entries
    keys = get_dataset_keys(ds_factory_source.dataset_metadata)
    mapped_keys = {
        source_key: source_key.replace(source_dataset_uuid,
                                       target_dataset_uuid)
        for source_key in keys
    }

    # Create a dict of metadata which has to be changed. This is only the
    # <uuid>.by-dataset-metadata.json file

    md_transformed = {
        f"{target_dataset_uuid}{METADATA_BASE_SUFFIX}{METADATA_FORMAT_JSON}":
        DatasetMetadataBuilder.from_dataset(
            ds_factory_source.dataset_metadata).modify_uuid(
                target_dataset_uuid).to_dataset()
    }
    # Copy the keys from one store to another
    copy_rename_keys(mapped_keys, store, target_store, md_transformed)

    return md_transformed
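
A hedged sketch of calling the function above to duplicate a dataset under a new UUID within the same store. The in-memory store, the UUIDs, and the use of kartothek.io.eager.store_dataframes_as_dataset for the setup are assumptions for illustration.

import pandas as pd
from storefact import get_store_from_url

from kartothek.io.eager import store_dataframes_as_dataset

store = get_store_from_url("hmemory://")  # assumed in-memory storefact store
store_dataframes_as_dataset(
    store=store,
    dataset_uuid="my_dataset",
    dfs=[pd.DataFrame({"x": [0, 1], "v": [10, 11]})],
)

# Same store but a different UUID is allowed; the same UUID within the same
# store would raise a ValueError (see the check above).
new_md = copy_dataset(
    source_dataset_uuid="my_dataset",
    store=store,
    target_dataset_uuid="my_dataset_copy",
    target_store=store,
)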
Code Example #14
    def test_all_indices_loaded(self, function_store, ds):
        ds = ds.load_all_indices(function_store())

        assert get_dataset_keys(ds) == set(function_store().keys())
Code Example #15
    def test_simple(self, function_store, ds):
        assert get_dataset_keys(ds) == set(function_store().keys())