def test_raises_unspecified_partition_columns(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({
             "x": [0],
             "y": [0],
             "p": [0],
             "q": [0]
         }),
         name=cube.seed_dataset,
         partition_on=["p", "q"],
     )
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({
             "x": [0],
             "y": [0],
             "p": [0],
             "q": [0],
             "v1": [0]
         }),
         name="enrich",
         partition_on=["q"],
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (str(exc.value) ==
             "Unspecified but provided partition columns in enrich: p")
Example 2
 def test_raises_other_index_missing(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data={
                 SINGLE_TABLE: pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]})
             },
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ).build_indices(["x", "y"]),
         name=cube.seed_dataset,
     )
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data={
                 SINGLE_TABLE: pd.DataFrame(
                     {"x": [0], "y": [0], "p": [0], "q": [0], "i1": [1337]}
                 )
             },
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ),
         name="enrich",
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (
         str(exc.value)
         == 'ExplicitSecondaryIndex or PartitionIndex "i1" is missing in dataset "enrich".'
     )
 def test_raises_no_dimension_columns(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({
             "x": [0],
             "y": [0],
             "p": [0],
             "q": [0],
             "v1": 100
         }),
         name=cube.seed_dataset,
     )
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({
             "p": [0],
             "q": [0],
             "v2": 100
         }),
         name="enrich",
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (
         str(exc.value) ==
         'Dataset "enrich" must have at least 1 of the following dimension columns: x, y'
     )
 def test_raises_dtypes(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({
             "x": [0],
             "y": [0],
             "p": [0],
             "q": [0]
         }),
         name=cube.seed_dataset,
     )
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({
             "x": [0],
             "y": [0.0],
             "p": [0],
             "q": [0],
             "v1": 100
         }),
         name="enrich",
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert 'Found incompatible entries for column "y"' in str(exc.value)
 def test_raises_partition_on_overlap(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({
             "x": [0],
             "y": [0],
             "p": [0],
             "q": [0],
             "v1": 100
         }),
         name=cube.seed_dataset,
     )
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({
             "x": [0],
             "y": [0],
             "v1": 100
         }),
         name="enrich",
         partition_on=["v1"],
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert "Found columns present in multiple datasets" in str(exc.value)
Example 6
 def test_filter_empty(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
         name=cube.seed_dataset,
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store, {})
     assert str(exc.value) == 'Seed data ("myseed") is missing.'
Example 7
 def test_raises_no_datasets_found(self, cube, function_store):
     with pytest.raises(ValueError) as exc:
         discover_datasets(
             cube,
             function_store,
             filter_ktk_cube_dataset_ids=["enrich", "non_existing_table"],
         )
     assert (
         str(exc.value)
         == "Could not find the following requested datasets: enrich, non_existing_table"
     )
Example 8
def test_all(driver, function_store, existing_cube):
    result = driver(cube=existing_cube, store=function_store)

    assert set(result.keys()) == {"source", "enrich"}

    ds_source = result["source"]
    ds_enrich = result["enrich"]

    assert len(ds_source.partitions) == 0
    assert len(ds_enrich.partitions) == 0

    discover_datasets(existing_cube, function_store)
Example 9
 def test_filter_missing(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
         name=cube.seed_dataset,
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store, {"myseed", "enrich"})
     assert (
         str(exc.value) == "Could not find the following requested datasets: enrich"
     )
Example 10
 def test_raises_wrong_partition_on_seed_other(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({"x": [0], "y": [0], "p": [0]}),
         name=cube.seed_dataset,
         partition_on=["p"],
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (
         str(exc.value) == 'Seed dataset "myseed" has missing partition columns: q'
     )
Example 11
 def test_accepts_partition_index_for_index(self, cube, function_store):
     expected = {
         cube.seed_dataset: store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v1": [0]}),
             name=cube.seed_dataset,
         ),
         "enrich": store_data(
             cube=cube,
             function_store=function_store,
             df=MetaPartition(
                 label=gen_uuid(),
                 data={
                     SINGLE_TABLE: pd.DataFrame(
                         {"x": [0], "y": [0], "i1": [1337], "v2": [42]}
                     )
                 },
                 metadata_version=KTK_CUBE_METADATA_VERSION,
             ),
             name="enrich",
             partition_on=["i1"],
         ),
     }
     actual = discover_datasets(cube, function_store)
     assert_datasets_equal(actual, expected)
Example 12
 def test_filter_ignores_invalid(self, cube, function_store):
     expected = {
         cube.seed_dataset: store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
             name=cube.seed_dataset,
         ),
         "enrich": store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v1": 100}),
             name="enrich",
         ),
     }
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame(
             {
                 "x": [0],
                 "y": [0],
                 "p": [0],
                 "q": [0],
                 "v1": 100,  # overlapping payload
             }
         ),
         name="foo",
     )
     actual = discover_datasets(cube, function_store, {"myseed", "enrich"})
     assert_datasets_equal(actual, expected)
 def test_partition_on_nonseed_no_part(self, cube, function_store):
     expected = {
         cube.seed_dataset:
         store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({
                 "x": [0],
                 "y": [0],
                 "p": [0],
                 "q": [0]
             }),
             name=cube.seed_dataset,
         ),
         "enrich":
         store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({
                 "x": [0],
                 "y": [0],
                 "v1": [0]
             }),
             name="enrich",
             partition_on=[],
         ),
     }
     actual = discover_datasets(cube, function_store)
     assert_datasets_equal(actual, expected)
 def test_msgpack(self, cube, function_store):
     expected = {
         cube.seed_dataset:
         store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({
                 "x": [0],
                 "y": [0],
                 "p": [0],
                 "q": [0]
             }),
             name=cube.seed_dataset,
         ),
         "enrich":
         store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({
                 "x": [0],
                 "y": [0],
                 "p": [0],
                 "q": [0],
                 "v1": 100
             }),
             name="enrich",
             metadata_storage_format="msgpack",
         ),
     }
     actual = discover_datasets(cube, function_store)
     assert_datasets_equal(actual, expected)
Example 15
 def test_raises_missing_dimension_columns(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data={SINGLE_TABLE: pd.DataFrame({"x": [0], "p": [0], "q": [0]})},
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ).build_indices(["x"]),
         name=cube.seed_dataset,
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (
         str(exc.value) == 'Seed dataset "myseed" has missing dimension columns: y'
     )
Example 16
 def test_raises_wrong_table(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data={"foo": pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]})},
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ),
         name=cube.seed_dataset,
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (
         str(exc.value)
         == "Invalid datasets because table is wrong. Expected table: myseed (foo)"
     )
Example 17
 def test_raises_partial_datasets_found(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
         name="enrich",
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(
             cube,
             function_store,
             filter_ktk_cube_dataset_ids=["enrich", "non_existing_table"],
         )
     assert (
         str(exc.value)
         == "Could not find the following requested datasets: non_existing_table"
     )
 def test_raises_dimension_index_missing(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data=pd.DataFrame({
                 "x": [0],
                 "y": [0],
                 "p": [0],
                 "q": [0]
             }),
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ),
         name=cube.seed_dataset,
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (str(exc.value) ==
             'ExplicitSecondaryIndex "x" is missing in dataset "myseed".')
Example 19
 def test_seed_only(self, cube, function_store):
     expected = {
         cube.seed_dataset: store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
             name=cube.seed_dataset,
         )
     }
     actual = discover_datasets(cube, function_store)
     assert_datasets_equal(actual, expected)
 def test_accepts_projected_datasets(self, cube, function_store):
     expected = {
         cube.seed_dataset:
         store_data(
             cube=cube,
             function_store=function_store,
             df=MetaPartition(
                 label=gen_uuid(),
                 data=pd.DataFrame({
                     "x": [0],
                     "y": [0],
                     "p": [0],
                     "q": [0]
                 }),
                 metadata_version=KTK_CUBE_METADATA_VERSION,
             ).build_indices(["x", "y"]),
             name=cube.seed_dataset,
         ),
         "x":
         store_data(
             cube=cube,
             function_store=function_store,
             df=MetaPartition(
                 label=gen_uuid(),
                 data=pd.DataFrame({
                     "x": [0],
                     "p": [0],
                     "q": [0],
                     "v1": [42]
                 }),
                 metadata_version=KTK_CUBE_METADATA_VERSION,
             ),
             name="x",
         ),
         "y":
         store_data(
             cube=cube,
             function_store=function_store,
             df=MetaPartition(
                 label=gen_uuid(),
                 data=pd.DataFrame({
                     "y": [0],
                     "p": [0],
                     "q": [0],
                     "v2": [42]
                 }),
                 metadata_version=KTK_CUBE_METADATA_VERSION,
             ),
             name="y",
         ),
     }
     actual = discover_datasets(cube, function_store)
     assert_datasets_equal(actual, expected)
 def test_empty_dataset(self, cube, function_store):
     expected = {
         cube.seed_dataset:
         store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({
                 "x": [0],
                 "y": [0],
                 "p": [0],
                 "q": [0]
             }),
             name=cube.seed_dataset,
         ),
         "enrich":
         store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({
                 "x": [0],
                 "y": [0],
                 "p": [0],
                 "q": [0],
                 "v1": 100
             }),
             name="enrich",
             metadata_storage_format="msgpack",
         ),
     }
     expected = {
         filter_ktk_cube_dataset_id:
         update_dataset_from_dataframes([],
                                        store=function_store,
                                        dataset_uuid=ds.uuid,
                                        delete_scope=[{}])
         for filter_ktk_cube_dataset_id, ds in expected.items()
     }
     actual = discover_datasets(cube, function_store)
     assert_datasets_equal(actual, expected)
Example 22
def extend_cube(data,
                cube,
                store,
                metadata=None,
                overwrite=False,
                partition_on=None):
    """
    Store given dataframes into an existing Kartothek cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        Whether to overwrite possibly existing datasets.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)
    ktk_cube_dataset_ids = set(data.keys())
    partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                            partition_on)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    check_datasets_preextend(data, cube)

    existing_datasets = discover_datasets(cube, store)
    if overwrite:
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in data
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    # do all data preparation before writing anything
    data = _prepare_data_for_ktk_all(
        data=data,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=partition_on,
    )

    datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        datasets[ktk_cube_dataset_id] = store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )

    return apply_postwrite_checks(datasets=datasets,
                                  cube=cube,
                                  store=store,
                                  existing_datasets=existing_datasets)
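A minimal usage sketch for extend_cube (not part of the original listing): it assumes a cube previously created with build_cube, with dimension columns x/y and partition columns p/q; the dataset ID "enrich" and the column "v2" are illustrative only.

import pandas as pd

df_enrich = pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v2": [42]})

datasets = extend_cube(
    data={"enrich": df_enrich},  # dict maps ktk_cube dataset ID -> dataframe
    cube=cube,                   # existing Cube spec (assumed)
    store=store,                 # simplekv.KeyValueStore (assumed)
)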
Example 23
def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".


    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube returning the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    # existing_payload is set to empty because we're not checking against any existing payload. ktk will account for the
    # compat check within 1 dataset
    existing_payload: Set[str] = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=ktk_cube_dataset_ids)

    if remove_conditions is not None:
        remove_metapartitions = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids,
            existing_datasets)
        delete_scopes = {
            k: delete_scope
            for k, (_, _, delete_scope) in remove_metapartitions.items()
        }
    else:
        delete_scopes = {}

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
                _multiplex_prepare_data_for_ktk,
                cube=cube,
                existing_payload=existing_payload,
                partition_on=partition_on,
            ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
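A rough sketch of how a dask-bag backend might drive this internal helper; the cube, the store factory and the "enrich" dataset ID are assumptions, and in practice the public wrappers in kartothek.io.dask.bag_cube would be used instead.

import dask.bag as db
import pandas as pd

bag = db.from_sequence(
    [{"enrich": pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v1": [1]})}],
    npartitions=1,
)

graph = append_to_cube_from_bag_internal(
    data=bag,
    cube=cube,                        # existing Cube spec (assumed)
    store=store_factory,              # store *factory*, see check_store_factory
    ktk_cube_dataset_ids=["enrich"],
    metadata=None,
    remove_conditions=None,           # keep all existing physical partitions
)
datasets = graph.compute()[0]         # the bag has a single partition with a single element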
Example 24
def extend_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: KeyValueStore,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that extends a cube by the data supplied from a dask bag.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes (see :func:`~kartothek.io.eager_cube.build_cube` for possible format and types).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store:
        Store to which the data should be written.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset.
    overwrite:
        Whether to overwrite possibly existing datasets.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to extend a cube returning the dict of dataset metadata objects.
        The bag has a single partition with a single element.
    """
    check_store_factory(store)
    check_datasets_preextend(ktk_cube_dataset_ids, cube)
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    prep_partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                                 partition_on)

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)
    if overwrite:
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in ktk_cube_dataset_ids
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=prep_partition_on,
        ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
Example 25
def plan_query(
    conditions,
    cube,
    datasets,
    dimension_columns,
    partition_by,
    payload_columns,
    store,
):
    """
    Plan cube query execution.

    .. important::
        If the intention does not contain a partition-by, the query is partitioned by the cube partition columns to
        speed up execution on parallel backends. In that case, the backend must concat and check the resulting
        dataframes before passing them to the user.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to query, must all be part of the cube.
    dimension_columns: Optional[Iterable[str]]
        Dimension columns of the query, may result in projection.
    partition_by: Optional[Iterable[str]]
        By which column logical partitions should be formed.
    payload_columns: Optional[Iterable[str]]
        Which columns apart from ``dimension_columns`` and ``partition_by`` should be returned.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store to query from.

    Returns
    -------
    intent: QueryIntention
        Query intention.
    empty_df: pandas.DataFrame
        Empty DataFrame representing the output types.
    groups: Tuple[QueryGroup]
        Tuple of query groups. May be empty.
    """
    if callable(store):
        store = store()

    if not isinstance(datasets, dict):
        datasets = discover_datasets(cube=cube,
                                     store=store,
                                     filter_ktk_cube_dataset_ids=datasets)
    else:
        datasets = check_datasets(datasets, cube)

    datasets = {
        ktk_cube_dataset_id: ds.load_partition_indices()
        for ktk_cube_dataset_id, ds in datasets.items()
    }
    indexed_columns = _get_indexed_columns(datasets)

    intention = determine_intention(
        cube=cube,
        datasets=datasets,
        dimension_columns=dimension_columns,
        partition_by=partition_by,
        conditions=conditions,
        payload_columns=payload_columns,
        indexed_columns=indexed_columns,
    )

    datasets = _load_required_explicit_indices(datasets, intention, store)

    restrictive_dataset_ids = _determine_restrictive_dataset_ids(
        cube=cube, datasets=datasets, intention=intention)

    load_columns = _dermine_load_columns(cube=cube,
                                         datasets=datasets,
                                         intention=intention)

    datasets = _filter_relevant_datasets(datasets=datasets,
                                         load_columns=load_columns)

    empty_df = {
        ktk_cube_dataset_id: _reduce_empty_dtype_sizes(
            empty_dataframe_from_schema(
                schema=ds.schema,
                columns=sorted(
                    get_dataset_columns(ds)
                    & set(load_columns[ktk_cube_dataset_id])),
            ))
        for ktk_cube_dataset_id, ds in datasets.items()
    }

    empty_df_single = empty_df[cube.seed_dataset].copy()
    for k, df in empty_df.items():
        if k == cube.seed_dataset:
            continue
        if empty_df_single is None:
            empty_df_single = df.copy()
        else:
            empty_df_single = empty_df_single.merge(df)
    empty_df_single = empty_df_single[list(intention.output_columns)]

    groups = regroup(
        intention,
        cube=cube,
        datasets=datasets,
        empty_df=empty_df,
        indexed_columns=indexed_columns,
        load_columns=load_columns,
        restrictive_dataset_ids=restrictive_dataset_ids,
    )
    return intention, empty_df_single, groups
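A sketch of how a query backend might call plan_query; the cube, the store and the column names are assumptions, and C is assumed to be the condition helper from kartothek.core.cube.conditions.

from kartothek.core.cube.conditions import C

intention, empty_df, groups = plan_query(
    conditions=C("p") >= 1,   # restrict on a partition/index column
    cube=cube,                # existing Cube spec (assumed)
    datasets=None,            # discover all datasets of the cube
    dimension_columns=["x"],  # may result in a projection
    partition_by=None,        # falls back to the cube partition columns
    payload_columns=["v1"],
    store=store,
)
# The backend then executes one task per QueryGroup in `groups` and concatenates
# the resulting dataframes before returning them to the user.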
Example 26
def remove_partitions(cube,
                      store,
                      conditions=None,
                      ktk_cube_dataset_ids=None,
                      metadata=None):
    """
    Remove given partition range from cube using a transaction.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[Union[Str, Bytes]], Union[Str, Bytes]]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for the datasets, optional. Only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for (
            ktk_cube_dataset_id,
        (ds, mp, delete_scope),
    ) in prepare_metapartitions_for_removal_action(
            cube=cube,
            store=store_instance,
            conditions=conditions,
            ktk_cube_dataset_ids=ktk_cube_dataset_ids,
            existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(ds,
                                                   with_schema=True,
                                                   store=store_factory)

        existing_datasets[
            ktk_cube_dataset_id] = update_dataset_from_partitions(
                mp,
                store_factory=store_factory,
                dataset_uuid=ds.uuid,
                ds_factory=ds_factory,
                metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id,
                                              metadata),
                metadata_merger=None,
                delete_scope=delete_scope,
            )

    return existing_datasets
Example 27
def remove_partitions(
    cube: Cube,
    store: Union[simplekv.KeyValueStore, StoreFactory],
    conditions: Union[None, Condition, Sequence[Condition],
                      Conjunction] = None,
    ktk_cube_dataset_ids: Optional[Union[Sequence[str], str]] = None,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """
    Remove given partition range from cube using a transaction.

    Remove the partitions selected by ``conditions``. If no ``conditions`` are given,
    remove all partitions. For each considered dataset, only the subset of
    ``conditions`` that refers to the partition columns of the respective dataset
    is used. In particular, a dataset that is not partitioned at all is always considered
    selected by ``conditions``.

    Parameters
    ----------
    cube
        Cube spec.
    store
        Store.
    conditions
        Select the partitions to be removed. Must be a condition only on partition columns.
    ktk_cube_dataset_ids
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata
        Metadata for the datasets, optional. Only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for (
            ktk_cube_dataset_id,
        (ds, mp, delete_scope),
    ) in prepare_metapartitions_for_removal_action(
            cube=cube,
            store=store_instance,
            conditions=conditions,
            ktk_cube_dataset_ids=ktk_cube_dataset_ids,
            existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(ds,
                                                   with_schema=True,
                                                   store=store_factory)

        existing_datasets[
            ktk_cube_dataset_id] = update_dataset_from_partitions(
                mp,
                store_factory=store_factory,
                dataset_uuid=ds.uuid,
                ds_factory=ds_factory,
                metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id,
                                              metadata),
                metadata_merger=None,
                delete_scope=delete_scope,
            )

    return existing_datasets
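A minimal sketch, assuming an existing cube with partition column "p" and an "enrich" dataset; it removes the p == 0 partitions from "enrich" only. C is assumed to be the condition helper from kartothek.core.cube.conditions.

from kartothek.core.cube.conditions import C

datasets = remove_partitions(
    cube=cube,                        # existing Cube spec (assumed)
    store=store_factory,              # store or store factory (assumed)
    conditions=C("p") == 0,           # condition on a partition column
    ktk_cube_dataset_ids=["enrich"],  # limit the removal to one dataset
)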
Example 28
def append_to_cube(data, cube, store, metadata=None):
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are overwritten), you should use
        :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)

    existing_datasets = discover_datasets(cube, store)
    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=set(data.keys()))

    # do all data preparation before writing anything
    # existing_payload is set to empty because we're not checking against any existing payload. ktk will account for the
    # compat check within 1 dataset
    data = _prepare_data_for_ktk_all(data=data,
                                     cube=cube,
                                     existing_payload=set(),
                                     partition_on=partition_on)

    # update_dataset_from_dataframes requires a store factory, so create one
    # if not provided
    if not callable(store):

        def store_factory():
            return store

    else:
        store_factory = store

    updated_datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        updated_datasets[ktk_cube_dataset_id] = update_dataset_from_dataframes(
            store=store_factory,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            df_list=part,
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
        )

    return apply_postwrite_checks(
        datasets=updated_datasets,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
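A minimal sketch for append_to_cube, assuming a cube built with dimension columns x/y and partition columns p/q; passing a single dataframe appends to the seed dataset.

import pandas as pd

df_new = pd.DataFrame({"x": [1], "y": [1], "p": [1], "q": [1]})

datasets = append_to_cube(
    data=df_new,  # single dataframe -> treated as the seed dataset
    cube=cube,    # existing Cube spec (assumed)
    store=store,  # simplekv.KeyValueStore (assumed)
)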
Example 29
def append_to_cube_from_bag_internal(data, cube, store, ktk_cube_dataset_ids,
                                     metadata):
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are overwritten), you should use
        :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data: dask.Bag
        Bag containing dataframes
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Callable[[], simplekv.KeyValueStore]
        Store to which the data should be written.
    ktk_cube_dataset_ids: Optional[Iterable[str]]
        Datasets that will be written, must be specified in advance.
    metadata: Dict[str, Dict[str, Any]]
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube returning the dict of dataset metadata
        objects. The bag has a single partition with a single element.
    """
    check_store_factory(store)
    ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets(cube, store)
    # existing_payload is set to empty because we're not checking against any existing payload. ktk will account for the
    # compat check within 1 dataset
    existing_payload = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=ktk_cube_dataset_ids)

    data = (data.map(multiplex_user_input, cube=cube).map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids).map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=partition_on,
        ))

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id:
            prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data