Code example #1
    def test_msgpack_efficiency(self, cube, function_store):
        """
        We should only iterate over the store once, even though we are looking for 2 suffixes.

        Furthermore, we must only load every dataset once.
        """
        store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({
                "x": [0],
                "y": [0],
                "p": [0],
                "q": [0]
            }),
            name=cube.seed_dataset,
            metadata_storage_format="msgpack",
        )
        store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({
                "x": [0],
                "y": [0],
                "p": [0],
                "q": [0]
            }),
            name=cube.seed_dataset,
            overwrite=True,
        )

        class StoreMock(KeyValueStore):
            def __init__(self, store):
                self._store = store
                self._iter_keys_called = 0
                self._iter_prefixes_called = 0
                self._get_called = Counter()

            def iter_keys(self, prefix=""):
                self._iter_keys_called += 1
                return self._store.iter_keys(prefix)

            def iter_prefixes(self, delimiter, prefix=""):
                self._iter_prefixes_called += 1
                return self._store.iter_prefixes(delimiter, prefix)

            def get(self, key):
                self._get_called[key] += 1
                return self._store.get(key)

        store = StoreMock(function_store())

        discover_datasets_unchecked(cube.uuid_prefix, store)

        assert store._iter_keys_called == 0
        assert store._iter_prefixes_called == 1
        assert max(store._get_called.values()) == 1
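
The tests above exercise discover_datasets_unchecked through fixtures. For orientation, here is a minimal standalone sketch of the same call; the import paths, the storefact-based store factory, and the temporary directory are assumptions and may need adjusting for your kartothek version.

import tempfile
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.api.discover import discover_datasets_unchecked
from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube

# Store factory backed by a temporary directory (assumed storefact "hfs://" scheme).
store_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())

cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")
build_cube(
    data=pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [10, 11]}),
    cube=cube,
    store=store_factory,
)

# Unchecked discovery returns every dataset sharing the cube's uuid_prefix,
# without validating that they form a consistent cube.
datasets = discover_datasets_unchecked(cube.uuid_prefix, store_factory)
print(sorted(datasets))  # e.g. ['seed']
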
Code example #2
def test_partial_delete(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v": [10, 11, 12, 13]
    })
    df_1 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "a": [20, 21, 22, 23]
    })
    df_2 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "b": [20, 21, 22, 23]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    datasets = build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich-1": df_1,
            "enrich-2": df_2
        },
        cube=cube,
        store=function_store,
    )
    enrich_1_keys = get_dataset_keys(
        discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=function_store,
            filter_ktk_cube_dataset_ids=["enrich-1"],
        )["enrich-1"])
    enrich_2_keys = get_dataset_keys(
        discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=function_store,
            filter_ktk_cube_dataset_ids=["enrich-2"],
        )["enrich-2"])
    all_keys = set(function_store().keys())
    driver(cube=cube, store=function_store, datasets=["enrich-1"])
    assert set(function_store().keys()) == all_keys - enrich_1_keys

    driver(cube=cube,
           store=function_store,
           datasets={"enrich-2": datasets["enrich-2"]})
    assert set(
        function_store().keys()) == all_keys - enrich_1_keys - enrich_2_keys
Code example #3
def cleanup_cube(cube, store):
    """
    Remove unused keys from cube datasets.

    .. important::
        All untracked keys which start with the cube's `uuid_prefix` followed by the `KTK_CUBE_UUID_SEPERATOR`
        (e.g. `my_cube_uuid++seed...`) will be deleted by this routine. These keys may be leftovers from past
        overwrites or index updates.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        KV store.
    """
    if callable(store):
        store = store()

    datasets = discover_datasets_unchecked(uuid_prefix=cube.uuid_prefix,
                                           store=store)
    keys = get_keys_to_clean(cube.uuid_prefix, datasets, store)

    for k in sorted(keys):
        store.delete(k)
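
A minimal usage sketch for cleanup_cube, not taken from the project; the import paths and the store setup are assumptions.

import tempfile
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube, cleanup_cube

store_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")

build_cube(
    data=pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [1, 2]}),
    cube=cube,
    store=store_factory,
)

# Plant an untracked key below the cube prefix to mimic a leftover from a failed write ...
store_factory().put(cube.ktk_dataset_uuid(cube.seed_dataset) + "/orphan", b"")

# ... cleanup_cube should remove it while leaving all tracked dataset files in place.
cleanup_cube(cube=cube, store=store_factory)
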
Code example #4
File: test_copy.py  Project: xhochy/kartothek
def test_partial_copy_exclude_pattern(
    cli, built_cube, skv, store, store2, exclude_pattern, copy_tables
):
    extend_cube(
        data={
            "mytable": pd.DataFrame(
                {
                    "x": [0, 1],
                    "y": [0, 0],
                    "p": 0,
                    "q": ["a", "a"],
                    "mycolumn": ["a", "b"],
                }
            )
        },
        cube=built_cube,
        store=store,
    )
    copied_datasets = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=copy_tables,
    )
    copy_keys = set()
    for name in copy_tables:
        copy_keys |= get_dataset_keys(copied_datasets[name])
    result = cli(
        "--store=cubes",
        "my_cube",
        "copy",
        "--tgt_store=cubes2",
        "--exclude=" + exclude_pattern,
    )
    assert result.exit_code == 0
    assert set(store2.keys()) == copy_keys
Code example #5
    def test_msgpack_clean(self, cube, function_store):
        expected = {
            cube.seed_dataset:
            store_data(
                cube=cube,
                function_store=function_store,
                df=pd.DataFrame({
                    "x": [0],
                    "y": [0],
                    "p": [0],
                    "q": [0]
                }),
                name=cube.seed_dataset,
            ),
            "enrich":
            store_data(
                cube=cube,
                function_store=function_store,
                df=pd.DataFrame({
                    "x": [0],
                    "y": [0],
                    "p": [0],
                    "q": [0]
                }),
                name="enrich",
                metadata_storage_format="msgpack",
            ),
        }

        actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)
        assert_datasets_equal(actual, expected)
Code example #6
    def test_filter_partial_datasets_found(self, cube, function_store):
        enrich_dataset = store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({
                "x": [0],
                "y": [0],
                "p": [0],
                "q": [0]
            }),
            name="enrich",
        )
        store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({
                "x": [0],
                "y": [0],
                "p": [0],
                "q": [0]
            }),
            name="mytable",
        )
        expected = {"enrich": enrich_dataset}
        actual = discover_datasets_unchecked(
            cube.uuid_prefix,
            function_store,
            filter_ktk_cube_dataset_ids=["enrich"])
        assert_dataset_issubset(actual, expected)
Code example #7
File: test_discover.py  Project: lr4d/kartothek
    def test_no_common_metadata(self, cube, function_store):
        expected = {
            cube.seed_dataset: store_data(
                cube=cube,
                function_store=function_store,
                df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
                name=cube.seed_dataset,
            )
        }

        store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
            name="enrich",
        )
        keys = set(function_store().keys())
        metadata_key = cube.ktk_dataset_uuid("enrich") + ".by-dataset-metadata.json"
        assert metadata_key in keys
        for k in keys:
            if (k != metadata_key) and k.startswith(cube.ktk_dataset_uuid("enrich")):
                function_store().delete(k)

        actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)
        assert_datasets_equal(actual, expected)
Code example #8
File: test_discover.py  Project: lr4d/kartothek
    def test_msgpack_priority(self, cube, function_store):
        """
        JSON metadata files have priority in kartothek, so the discovery should respect this.
        """
        store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v1": [0]}),
            name=cube.seed_dataset,
            metadata_storage_format="msgpack",
        )
        expected = {
            cube.seed_dataset: store_data(
                cube=cube,
                function_store=function_store,
                df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v2": [0]}),
                name=cube.seed_dataset,
                overwrite=True,
            )
        }
        store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v3": [0]}),
            name=cube.seed_dataset,
            metadata_storage_format="msgpack",
            overwrite=True,
        )

        actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)
        assert_datasets_equal(actual, expected)
Code example #9
def cleanup_cube_bag(cube, store, blocksize=100):
    """
    Remove unused keys from cube datasets.

    .. important::
        All untracked keys which start with the cube's `uuid_prefix` followed by the `KTK_CUBE_UUID_SEPERATOR`
        (e.g. `my_cube_uuid++seed...`) will be deleted by this routine. These keys may be leftovers from past
        overwrites or index updates.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        KV store.
    blocksize: int
        Number of keys to delete at once.

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that performs the given operation. May contain multiple partitions.
    """
    check_store_factory(store)
    check_blocksize(blocksize)

    store_obj = store()

    datasets = discover_datasets_unchecked(uuid_prefix=cube.uuid_prefix,
                                           store=store)
    keys = get_keys_to_clean(cube.uuid_prefix, datasets, store_obj)

    return db.from_sequence(
        seq=sorted(keys), partition_size=blocksize).map_partitions(_delete,
                                                                   store=store)
Code example #10
def delete_cube(cube, store, datasets=None):
    """
    Delete cube from store.

    .. important::
        This routine only deletes tracked files. Garbage and leftovers from old cubes and failed operations are NOT
        removed.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        KV store.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to delete, must all be part of the cube. May be either the result of :meth:`discover_datasets`, a list
        of Ktk_cube dataset IDs or ``None`` (in which case the entire cube will be deleted).
    """
    if callable(store):
        store = store()

    if not isinstance(datasets, dict):
        datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    keys = set()
    for ktk_cube_dataset_id in sorted(datasets.keys()):
        ds = datasets[ktk_cube_dataset_id]
        keys |= get_dataset_keys(ds)

    for k in sorted(keys):
        store.delete(k)
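
A minimal sketch of the partial-delete behavior described in the docstring (datasets restricted to a list of ktk_cube dataset IDs); import paths and store setup are assumptions.

import tempfile
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube, delete_cube

store_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")

build_cube(
    data={
        cube.seed_dataset: pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [1, 2]}),
        "enrich": pd.DataFrame({"x": [0, 1], "p": [0, 0], "a": [3, 4]}),
    },
    cube=cube,
    store=store_factory,
)

# Delete only the "enrich" dataset; datasets=None would delete the entire cube.
delete_cube(cube=cube, store=store_factory, datasets=["enrich"])
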
Code example #11
def collect_stats(cube, store, datasets=None):
    """
    Collect statistics for given cube.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: simplekv.KeyValueStore
        KV store that preserves the cube.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to query, must all be part of the cube. May be either the result of :meth:`discover_datasets`, a list
        of Ktk_cube dataset IDs or ``None`` (in which case auto-discovery will be used).

    Returns
    -------
    stats: Dict[str, Dict[str, int]]
        Statistics per ktk_cube dataset ID.
    """
    if callable(store):
        store = store()

    if not isinstance(datasets, dict):
        datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    all_metapartitions = get_metapartitions_for_stats(datasets)
    return reduce_stats([collect_stats_block(all_metapartitions, store)])
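
A minimal sketch of calling collect_stats on a freshly built cube; import paths and store setup are assumptions.

import tempfile
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube, collect_stats

store_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")

build_cube(
    data=pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [1, 2, 3, 4]}),
    cube=cube,
    store=store_factory,
)

# Nested dict: one entry per ktk_cube dataset ID containing per-dataset counters.
stats = collect_stats(cube=cube, store=store_factory)
print(stats)
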
Code example #12
File: test_discover.py  Project: lr4d/kartothek
    def test_no_seed(self, cube, function_store):
        expected = {
            "enrich": store_data(
                cube=cube,
                function_store=function_store,
                df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
                name="enrich",
            )
        }
        actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)
        assert_datasets_equal(actual, expected)
Code example #13
File: test_copy.py  Project: xhochy/kartothek
def test_partial_copy_include_pattern_nomatch(cli, built_cube, skv, store, store2):
    copied_datasets = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=["source"],
    )
    copy_keys = get_dataset_keys(copied_datasets["source"])  # noqa
    result = cli(
        "--store=cubes", "my_cube", "copy", "--tgt_store=cubes2", "--include=x*,source"
    )
    assert result.exit_code == 2

    assert "Error: Could not find dataset x*" in result.output
Code example #14
File: test_discover.py  Project: lr4d/kartothek
    def test_other_files(self, cube, function_store):
        expected = {
            cube.seed_dataset: store_data(
                cube=cube,
                function_store=function_store,
                df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
                name=cube.seed_dataset,
            )
        }

        function_store().put(cube.ktk_dataset_uuid("enrich") + "/foo", b"")

        actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)
        assert_datasets_equal(actual, expected)
Code example #15
def collect_stats_bag(
    cube: Cube,
    store: StoreFactory,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
    blocksize: int = 100,
):
    """
    Collect statistics for given cube.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store that preserves the cube.
    datasets
        Datasets to query, must all be part of the cube. May be either the result of :func:`~kartothek.api.discover.discover_datasets`, a list
        of Ktk_cube dataset IDs or ``None`` (in which case auto-discovery will be used).
    blocksize
        Number of partitions to scan at once.

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that returns a single result of the form ``Dict[str, Dict[str, int]]`` and contains statistics per
        ktk_cube dataset ID.
    """
    check_store_factory(store)
    check_blocksize(blocksize)

    if not isinstance(datasets, dict):
        datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    all_metapartitions = get_metapartitions_for_stats(datasets)

    return (
        db.from_sequence(seq=all_metapartitions, partition_size=blocksize)
        .map_partitions(collect_stats_block, store=store)
        .reduction(
            perpartition=_obj_to_list,
            aggregate=_reduce_stats,
            split_every=False,
            out_type=db.Bag,
        )
    )
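
A minimal sketch of the dask bag variant; the bag is lazy, so the statistics only materialize on compute(). Import paths and store setup are assumptions.

import tempfile
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.dask.bag_cube import collect_stats_bag
from kartothek.io.eager_cube import build_cube

store_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")

build_cube(
    data=pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [1, 2, 3, 4]}),
    cube=cube,
    store=store_factory,
)

# The returned bag has a single element: the statistics dict.
bag = collect_stats_bag(cube=cube, store=store_factory, blocksize=50)
(stats,) = bag.compute()
print(stats)
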
Code example #16
def test_partial_delete_exclude_pattern(
    cli, built_cube, skv, store, exclude_pattern, delete_tables
):
    datasets = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=delete_tables,
    )
    delete_keys = set()
    for name in delete_tables:
        delete_keys |= get_dataset_keys(datasets[name])
    all_keys = set(store.keys())
    result = cli("--store=cubes", "my_cube", "delete", "--exclude=" + exclude_pattern)
    assert result.exit_code == 0
    assert set(store.keys()) == all_keys - delete_keys
Code example #17
def delete_cube_bag(
    cube: Cube,
    store: StoreFactory,
    blocksize: int = 100,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
):
    """
    Delete cube from store.

    .. important::
        This routine only deletes tracked files. Garbage and leftovers from old cubes and failed operations are NOT
        removed.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store.
    blocksize
        Number of keys to delete at once.
    datasets
        Datasets to delete, must all be part of the cube. May be either the result of :func:`~kartothek.api.discover.discover_datasets`, a list
        of Ktk_cube dataset IDs or ``None`` (in which case the entire cube will be deleted).

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that performs the given operation. May contain multiple partitions.
    """
    check_store_factory(store)
    check_blocksize(blocksize)

    if not isinstance(datasets, dict):
        datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    keys = set()
    for ktk_cube_dataset_id in sorted(datasets.keys()):
        ds = datasets[ktk_cube_dataset_id]
        keys |= get_dataset_keys(ds)

    return db.from_sequence(seq=sorted(keys), partition_size=blocksize).map_partitions(
        _delete, store=store
    )
Code example #18
def test_partial_copy_dataset_dict(
    driver, function_store, function_store2, cube, built_cube
):
    driver(
        cube=cube,
        src_store=function_store,
        tgt_store=function_store2,
        datasets={"seed": built_cube["seed"], "enrich": built_cube["enrich"]},
    )
    all_datasets = discover_datasets_unchecked(
        uuid_prefix=cube.uuid_prefix,
        store=function_store,
        filter_ktk_cube_dataset_ids=["seed", "enrich"],
    )
    copied_ds_keys = set()
    copied_ds_keys |= get_dataset_keys(all_datasets["seed"])
    copied_ds_keys |= get_dataset_keys(all_datasets["enrich"])
    tgt_store_keys = set(function_store2().keys())
    assert copied_ds_keys == tgt_store_keys
Code example #19
def get_datasets_to_copy(
    cube: Cube,
    src_store: Union[Callable[[], KeyValueStore], KeyValueStore],
    tgt_store: Union[Callable[[], KeyValueStore], KeyValueStore],
    overwrite: bool,
    datasets: Optional[Union[Iterable[str], Dict[str,
                                                 DatasetMetadata]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Determine all dataset names of a given cube that should be copied and apply additional consistency checks.
    Copying only a specific set of datasets is possible by providing a list of dataset names via the parameter `datasets`.

    Parameters
    ----------
    cube:
        Cube specification.
    src_store:
        Source KV store.
    tgt_store:
        Target KV store.
    overwrite:
        If possibly existing datasets in the target store should be overwritten.
    datasets:
        Datasets to copy, must all be part of the cube. May be either the result of :func:`~kartothek.api.discover.discover_datasets`, an
        iterable of Ktk_cube dataset IDs or ``None`` (in which case the entire cube will be copied).

    Returns
    -------
    all_datasets: Dict[str, DatasetMetadata]
        All datasets that should be copied.
    """
    if not isinstance(datasets, dict):
        new_datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=src_store,
            filter_ktk_cube_dataset_ids=datasets,
        )
    else:
        new_datasets = datasets

    if datasets is None:
        if not new_datasets:
            raise RuntimeError("{} not found in source store".format(cube))
    else:
        unknown_datasets = set(datasets) - set(new_datasets)
        if unknown_datasets:
            raise RuntimeError(
                "{cube}, datasets {datasets} do not exist in source store".
                format(cube=cube, datasets=unknown_datasets))

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix,
                                                    tgt_store)

    if not overwrite:
        for ktk_cube_dataset_id in sorted(new_datasets.keys()):
            if ktk_cube_dataset_id in existing_datasets:
                raise RuntimeError(
                    'Dataset "{uuid}" exists in target store but overwrite was set to False'
                    .format(uuid=new_datasets[ktk_cube_dataset_id].uuid))

    all_datasets = copy(existing_datasets)
    all_datasets.update(new_datasets)

    check_datasets(all_datasets, cube)
    return new_datasets
Code example #20
def build_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask bag.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written; these must be specified in advance. If not provided, it is assumed that only the
        seed dataset will be written.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to build a cube returning the dict of dataset metadata objects.
        The bag has a single partition with a single element.
    """
    check_store_factory(store)

    if ktk_cube_dataset_ids is None:
        ktk_cube_dataset_ids = [cube.seed_dataset]
    else:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    prep_partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                                 partition_on)
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=set(),
            partition_on=prep_partition_on,
        ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
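
build_cube_from_bag_internal is the internal worker; the usual entry point is a public wrapper, assumed here to be kartothek.io.dask.bag_cube.build_cube_from_bag with a matching parameter surface. A hedged sketch of building a cube from a dask bag:

import tempfile
from functools import partial

import dask.bag as db
import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.dask.bag_cube import build_cube_from_bag  # assumed public wrapper

store_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")

# Each bag element maps ktk_cube dataset IDs to the dataframes of one chunk of work.
data = db.from_sequence(
    [
        {"seed": pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [1, 2]})},
        {"seed": pd.DataFrame({"x": [2, 3], "p": [1, 1], "v": [3, 4]})},
    ],
    npartitions=2,
)

bag = build_cube_from_bag(
    data=data,
    cube=cube,
    store=store_factory,
    ktk_cube_dataset_ids=["seed"],
)
# The bag yields a single element: the dict of dataset metadata objects.
(datasets,) = bag.compute()
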
Code example #21
File: test_discover.py  Project: lr4d/kartothek
    def test_filter_no_datasets_found(self, cube, function_store):
        actual = discover_datasets_unchecked(
            cube.uuid_prefix, function_store, filter_ktk_cube_dataset_ids=["enrich"]
        )
        assert actual == {}
Code example #22
def build_cube_from_dataframe(
    data: Union[dd.DataFrame, Dict[str, dd.DataFrame]],
    cube: Cube,
    store: StoreFactory,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    overwrite: bool = False,
    partition_on: Optional[Dict[str, Iterable[str]]] = None,
    shuffle: bool = False,
    num_buckets: int = 1,
    bucket_by: Optional[Iterable[str]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Delayed:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask dataframe.

    Parameters
    ----------
    data
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube
        Cube specification.
    store
        Store to which the data should be written to.
    metadata
        Metadata for every dataset.
    overwrite
        If possibly existing datasets should be overwritten.
    partition_on
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.delayed.Delayed
        A dask delayed object containing the compute graph to build a cube returning the dict of dataset metadata
        objects.
    """
    check_store_factory(store)
    if not isinstance(data, dict):
        data = {cube.seed_dataset: data}

    ktk_cube_dataset_ids = sorted(data.keys())

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    partition_on_checked = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    del partition_on

    dct = {}
    for table_name, ddf in data.items():
        check_user_df(table_name, ddf, cube, set(), partition_on_checked[table_name])

        indices_to_build = set(cube.index_columns) & set(ddf.columns)
        if table_name == cube.seed_dataset:
            indices_to_build |= set(cube.dimension_columns) - cube.suppress_index_on
        indices_to_build -= set(partition_on_checked[table_name])

        ddf = ddf.map_partitions(
            assert_dimesion_index_cols_notnull,
            ktk_cube_dataset_id=table_name,
            cube=cube,
            partition_on=partition_on_checked[table_name],
            meta=ddf._meta,
        )
        graph = store_dataset_from_ddf(
            ddf,
            dataset_uuid=cube.ktk_dataset_uuid(table_name),
            store=store,
            metadata=prepare_ktk_metadata(cube, table_name, metadata),
            partition_on=partition_on_checked[table_name],
            secondary_indices=sorted(indices_to_build),
            sort_partitions_by=sorted(
                (set(cube.dimension_columns) - set(cube.partition_columns))
                & set(ddf.columns)
            ),
            overwrite=overwrite,
            shuffle=shuffle,
            num_buckets=num_buckets,
            bucket_by=bucket_by,
            df_serializer=df_serializer,
        )
        dct[table_name] = graph

    return dask.delayed(apply_postwrite_checks)(
        dct, cube=cube, store=store, existing_datasets=existing_datasets
    )
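
A minimal sketch of building a cube from a dask dataframe; the module path for build_cube_from_dataframe and the store setup are assumptions.

import tempfile
from functools import partial

import dask.dataframe as dd
import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.dask.dataframe_cube import build_cube_from_dataframe  # assumed module path

store_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")

ddf = dd.from_pandas(
    pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [1, 2, 3, 4]}),
    npartitions=2,
)

# A single dask dataframe is treated as the seed dataset.
delayed = build_cube_from_dataframe(data=ddf, cube=cube, store=store_factory)
datasets = delayed.compute()  # dict of dataset metadata objects
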
Code example #23
def get_copy_keys(cube, src_store, tgt_store, overwrite, datasets=None):
    """
    Get and check keys that should be copied from one store to another.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    src_store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        Source KV store.
    tgt_store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        Target KV store.
    overwrite: bool
        If possibly existing datasets in the target store should be overwritten.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to copy, must all be part of the cube. May be either the result of :meth:`discover_datasets`, an
        iterable of Ktk_cube dataset IDs or ``None`` (in which case the entire cube will be copied).

    Returns
    -------
    keys: Set[str]
        Set of keys to copy.

    Raises
    ------
    RuntimeError: In case the copy would not pass successfully or if there is no cube in ``src_store``.
    """
    if not isinstance(datasets, dict):
        new_datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=src_store,
            filter_ktk_cube_dataset_ids=datasets,
        )
    else:
        new_datasets = datasets

    if datasets is None:
        if not new_datasets:
            raise RuntimeError("{} not found in source store".format(cube))
    else:
        unknown_datasets = set(datasets) - set(new_datasets)
        if unknown_datasets:
            raise RuntimeError(
                "{cube}, datasets {datasets} do not exist in source store".
                format(cube=cube, datasets=unknown_datasets))

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix,
                                                    tgt_store)

    if not overwrite:
        for ktk_cube_dataset_id in sorted(new_datasets.keys()):
            if ktk_cube_dataset_id in existing_datasets:
                raise RuntimeError(
                    'Dataset "{uuid}" exists in target store but overwrite was set to False'
                    .format(uuid=new_datasets[ktk_cube_dataset_id].uuid))

    all_datasets = copy(existing_datasets)
    all_datasets.update(new_datasets)

    check_datasets(all_datasets, cube)

    keys = set()
    for ktk_cube_dataset_id in sorted(new_datasets.keys()):
        ds = new_datasets[ktk_cube_dataset_id]
        keys |= get_dataset_keys(ds)

    return keys
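
If only the raw key set is needed, without the overwrite and consistency checks, roughly the same information can be assembled from the discovery helpers. A sketch, assuming discover_datasets_unchecked is importable from kartothek.api.discover and get_dataset_keys from kartothek.utils.ktk_adapters; copy_keys_for is a hypothetical helper, not part of kartothek.

from kartothek.api.discover import discover_datasets_unchecked
from kartothek.utils.ktk_adapters import get_dataset_keys


def copy_keys_for(cube, src_store, ktk_cube_dataset_ids=None):
    """Collect the keys of the datasets that would be copied (hypothetical helper)."""
    datasets = discover_datasets_unchecked(
        uuid_prefix=cube.uuid_prefix,
        store=src_store,
        filter_ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )
    keys = set()
    for ds in datasets.values():
        keys |= get_dataset_keys(ds)
    return keys


# Copying the selected keys verbatim between two stores could then look like:
# for k in sorted(copy_keys_for(cube, src_store)):
#     tgt_store.put(k, src_store.get(k))
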
Code example #24
def build_cube(data,
               cube,
               store,
               metadata=None,
               overwrite=False,
               partition_on=None):
    """
    Store given dataframes as Ktk_cube cube.

    ``data`` can be formatted in multiple ways:

    - single DataFrame::

          pd.DataFrame({
              'x': [0, 1, 2, 3],
              'p': [0, 0, 1, 1],
              'v': [42, 45, 20, 10],
          })

      In that case, the seed dataset will be written.

    - dictionary of DataFrames::

          {
              'seed': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              'enrich': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v2': [False, False, True, False],
              }),
          }

      In that case, multiple datasets can be written at the same time. Note that the seed dataset MUST be included.

    - list of anything above::

          [
              # seed data only
              pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              # seed data only, explicit way
              {
                  'seed': pd.DataFrame({
                      'x': [4, 5, 6, 7],
                      'p': [0, 0, 1, 1],
                      'v1': [12, 32, 22, 9],
                  }),
              },
              # multiple datasets
              {
                  'seed': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v1': [9, 2, 4, 11],
                  }),
                  'enrich': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v2': [True, True, False, False],
                  }),
              },
              # non-seed data only
              {
                  'enrich': pd.DataFrame({
                      'x': [1, 2, 3, 4],
                      'p': [0, 0, 1, 1],
                      'v2': [False, True, False, False],
                  }),
              },
          ]

      In that case, multiple datasets may be written. Note that at least a single list element must contain seed data.

    Extra metadata may be preserved with every dataset, e.g.::

        {
            'seed': {
                'source': 'db',
                'host': 'db1.cluster20.company.net',
                'last_event': '230c6edb-b69a-4d30-b56d-28f5dfe20948',
            },
            'enrich': {
                'source': 'python',
                'commit_hash': '8b5d717518439921e6d17c7495956bdad687bc54',
            },
        }

    Note that the given data must be JSON-serializable.

    If the cube already exists, the ``overwrite`` flag must be given. In that case, all datasets that are part of the
    existing cube must be overwritten. Partial overwrites are not allowed.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written to.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)
    ktk_cube_dataset_ids = set(data.keys())
    partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                            partition_on)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(data, cube, existing_datasets)

    # do all data preparation before writing anything
    data = _prepare_data_for_ktk_all(data=data,
                                     cube=cube,
                                     existing_payload=set(),
                                     partition_on=partition_on)

    datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        datasets[ktk_cube_dataset_id] = store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )

    return apply_postwrite_checks(datasets=datasets,
                                  cube=cube,
                                  store=store,
                                  existing_datasets=existing_datasets)
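
A minimal end-to-end sketch of build_cube with the dictionary input form and per-dataset metadata; import paths and store setup are assumptions.

import tempfile
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube

store_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())
cube = Cube(
    dimension_columns=["x"],
    partition_columns=["p"],
    uuid_prefix="my_cube",
    seed_dataset="seed",
)

datasets = build_cube(
    data={
        "seed": pd.DataFrame({
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [42, 45, 20, 10],
        }),
        "enrich": pd.DataFrame({
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v2": [False, False, True, False],
        }),
    },
    cube=cube,
    store=store_factory,
    metadata={"seed": {"source": "db"}, "enrich": {"source": "python"}},
)
print(sorted(datasets))  # ['enrich', 'seed']
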
Code example #25
def copy_cube(
    cube: Cube,
    src_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    tgt_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    overwrite: bool = False,
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]] = None,
    renamed_cube_prefix: Optional[str] = None,
    renamed_datasets: Optional[Dict[str, str]] = None,
):
    """
    Copy cube from one store to another.

    .. warning::
        A failing copy operation can not be rolled back if the `overwrite` flag is enabled
        and might leave the overwritten dataset in an inconsistent state.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    src_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Source KV store.
    tgt_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Target KV store.
    overwrite: bool
        If possibly existing datasets in the target store should be overwritten.
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]]
        Datasets to copy, must all be part of the cube. May be either the result of :func:`~kartothek.api.discover.discover_datasets`, a list
        of Ktk_cube dataset IDs or ``None`` (in which case the entire cube will be copied).
    renamed_cube_prefix: Optional[str]
        Optional new cube prefix. If specified, the cube will be renamed while copying.
    renamed_datasets: Optional[Dict[str, str]]
        Optional dict with {old dataset name: new dataset name} entries. If provided,
        the datasets will be renamed accordingly during copying. When the parameter
        datasets is specified, the datasets to rename must be a subset of the datasets
        to copy.
    """
    if callable(src_store):
        src_store = src_store()
    if callable(tgt_store):
        tgt_store = tgt_store()
    assert_stores_different(src_store, tgt_store,
                            cube.ktk_dataset_uuid(cube.seed_dataset))
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix,
                                                    tgt_store)

    if renamed_datasets is None:
        new_seed_dataset = cube.seed_dataset
    else:
        new_seed_dataset = renamed_datasets.get(cube.seed_dataset,
                                                cube.seed_dataset)

    new_cube = Cube(
        dimension_columns=cube.dimension_columns,
        partition_columns=cube.partition_columns,
        uuid_prefix=renamed_cube_prefix or cube.uuid_prefix,
        seed_dataset=new_seed_dataset,
        index_columns=cube.index_columns,
        suppress_index_on=cube.suppress_index_on,
    )

    datasets_to_copy = get_datasets_to_copy(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    copied = {}  # type: Dict[str, DatasetMetadata]
    for src_ds_name, src_ds_meta in datasets_to_copy.items():
        tgt_ds_uuid = _transform_uuid(
            src_uuid=src_ds_meta.uuid,
            cube_prefix=cube.uuid_prefix,
            renamed_cube_prefix=renamed_cube_prefix,
            renamed_datasets=renamed_datasets,
        )
        try:
            md_transformed = copy_dataset(
                source_dataset_uuid=src_ds_meta.uuid,
                store=src_store,
                target_dataset_uuid=tgt_ds_uuid,
                target_store=tgt_store,
            )
        except Exception as e:
            if overwrite:
                # We can't roll back safely if the target dataset has been partially overwritten.
                raise RuntimeError(e)
            else:
                apply_postwrite_checks(
                    datasets=copied,
                    cube=new_cube,
                    store=tgt_store,
                    existing_datasets=existing_datasets,
                )
        else:
            copied.update(md_transformed)
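
A minimal sketch of copying a cube between two stores with copy_cube; import paths and the two temporary-directory stores are assumptions.

import tempfile
from functools import partial

import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube, copy_cube

src_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())
tgt_factory = partial(get_store_from_url, "hfs://" + tempfile.mkdtemp())

cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")
build_cube(
    data=pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [1, 2]}),
    cube=cube,
    store=src_factory,
)

# Copy the whole cube; passing a list of dataset IDs via ``datasets`` restricts the
# copy, and renamed_cube_prefix / renamed_datasets rename it on the way.
copy_cube(cube=cube, src_store=src_factory, tgt_store=tgt_factory)
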