Example #1
def test_collect_dataset_metadata_concat(store_factory):
    """Smoke-test concatenation of empty and non-empty dataset metadata collections."""
    df = pd.DataFrame(data={"A": [1, 1, 1, 1], "b": [1, 1, 2, 2]})
    store_dataframes_as_dataset(store=store_factory,
                                dataset_uuid="dataset_uuid",
                                dfs=[df],
                                partition_on=["A"])
    df_stats1 = collect_dataset_metadata(
        store=store_factory,
        dataset_uuid="dataset_uuid",
        table_name="table",
    ).compute()

    # Remove all partitions of the dataset
    update_dataset_from_dataframes([],
                                   store=store_factory,
                                   dataset_uuid="dataset_uuid",
                                   delete_scope=[{
                                       "A": 1
                                   }])

    df_stats2 = collect_dataset_metadata(
        store=store_factory,
        dataset_uuid="dataset_uuid",
        table_name="table",
    ).compute()
    pd.concat([df_stats1, df_stats2])
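The final pd.concat call is only a smoke check that concatenating the two metadata frames does not raise. A minimal, self-contained sketch of the same idiom with stand-in frames (not part of the original test):

import pandas as pd

df_a = pd.DataFrame({"x": [1, 2, 3]})  # stands in for the non-empty df_stats1
df_b = df_a.iloc[0:0]                  # stands in for the empty df_stats2

combined = pd.concat([df_a, df_b])
# Concatenating a non-empty and an empty frame with matching columns keeps
# the rows and columns of the non-empty frame.
assert len(combined) == len(df_a)
assert list(combined.columns) == list(df_a.columns)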
Example #2
def test_collect_dataset_metadata_delete_dataset(store_factory):
    """Collecting metadata after all partitions were deleted yields an empty, correctly typed frame."""
    df = pd.DataFrame(data={"A": [1, 1, 1, 1], "b": [1, 1, 2, 2]})
    store_dataframes_as_dataset(store=store_factory,
                                dataset_uuid="dataset_uuid",
                                dfs=[df],
                                partition_on=["A"])
    # Remove all partitions of the dataset
    update_dataset_from_dataframes([],
                                   store=store_factory,
                                   dataset_uuid="dataset_uuid",
                                   delete_scope=[{
                                       "A": 1
                                   }])

    df_stats = collect_dataset_metadata(
        store=store_factory,
        dataset_uuid="dataset_uuid",
    ).compute()
    expected = pd.DataFrame(columns=_METADATA_SCHEMA)
    expected = expected.astype(_METADATA_SCHEMA)
    pd.testing.assert_frame_equal(expected, df_stats)
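The expected value uses a general pandas idiom: a zero-row frame built from a column-to-dtype mapping and then cast so the dtypes match. A minimal sketch with a hypothetical mapping (standing in for kartothek's _METADATA_SCHEMA, whose exact columns are not shown here):

import numpy as np
import pandas as pd

# Hypothetical column -> dtype mapping; _METADATA_SCHEMA plays this role above.
schema = {"partition_label": np.dtype("O"), "row_count": np.dtype("int64")}

# Zero-row frame whose columns and dtypes follow the mapping.
empty = pd.DataFrame(columns=list(schema)).astype(schema)

assert empty.empty
assert dict(empty.dtypes) == schema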
Example #3
def test_empty_dataset(self, cube, function_store):
    """Cube datasets stay discoverable after all of their partitions have been deleted."""
    expected = {
        cube.seed_dataset: store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
            name=cube.seed_dataset,
        ),
        "enrich": store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v1": 100}),
            name="enrich",
            metadata_storage_format="msgpack",
        ),
    }
    # Remove all partitions from both datasets; the returned (now empty)
    # dataset metadata becomes the expected result.
    expected = {
        filter_ktk_cube_dataset_id: update_dataset_from_dataframes(
            [],
            store=function_store,
            dataset_uuid=ds.uuid,
            delete_scope=[{}],
        )
        for filter_ktk_cube_dataset_id, ds in expected.items()
    }
    actual = discover_datasets(cube, function_store)
    assert_datasets_equal(actual, expected)
Example #4
def append_to_cube(data, cube, store, metadata=None):
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only individual rows within a physical partition are
        updated, the old data in that partition is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are overwritten), you should use
        :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)

    existing_datasets = discover_datasets(cube, store)
    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=set(data.keys()))

    # Do all data preparation before writing anything.
    # existing_payload is set to empty because we are not checking against any
    # existing payload here; kartothek performs the compatibility check within
    # each individual dataset.
    data = _prepare_data_for_ktk_all(data=data,
                                     cube=cube,
                                     existing_payload=set(),
                                     partition_on=partition_on)

    # update_dataset_from_dataframes requires a store factory, so create one
    # if not provided
    if not callable(store):

        def store_factory():
            return store

    else:
        store_factory = store

    updated_datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        updated_datasets[ktk_cube_dataset_id] = update_dataset_from_dataframes(
            store=store_factory,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            df_list=part,
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
        )

    return apply_postwrite_checks(
        datasets=updated_datasets,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )