Example #1
def _map_ktk_mps_to_groups(cube, datasets, label2gp):
    """
    Map Kartothek metapartitions to groups.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are processed by the regrouper.
    label2gp: Dict[str, Dict[str, List[Tuple[int, int]]]]
        Maps "dataset ID -> (label -> list of (group ID, partition ID) pairs)".

    Returns
    -------
    groups: Dict[int, Dict[int, Dict[str, List[kartothek.io_components.metapartition.MetaPartition]]]]
        Maps "group ID -> (partition ID -> (dataset ID -> list of MetaPartitions))".
    """
    groups = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for ktk_cube_dataset_id, ds in datasets.items():
        label2gp_sub = label2gp[ktk_cube_dataset_id]
        for mp in dispatch_metapartitions_from_factory(
                dataset_factory=metadata_factory_from_dataset(ds)):
            # FIXME: can this be simplified?
            if mp.label not in label2gp_sub:
                # filtered out by pre-condition
                continue
            for group_id, partition_id in label2gp_sub[mp.label]:
                groups[group_id][partition_id][ktk_cube_dataset_id].append(mp)

    return groups
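
A hedged usage sketch of the example above: walking the nested result group by group. ``cube``, ``datasets``, and ``label2gp`` are assumed to come from the surrounding query-planning code and are not defined in this snippet.

# cube, datasets and label2gp are assumed inputs produced elsewhere.
groups = _map_ktk_mps_to_groups(cube, datasets, label2gp)
for group_id, by_partition in sorted(groups.items()):
    for partition_id, by_dataset in sorted(by_partition.items()):
        for ktk_cube_dataset_id, mps in by_dataset.items():
            # mps is the list of MetaPartitions of one dataset that belong to
            # this (group ID, partition ID) cell.
            print(group_id, partition_id, ktk_cube_dataset_id, len(mps))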
Example #2
def _multiplex_store_dataset_from_partitions_flat(mpss, cube, metadata, update,
                                                  store, existing_datasets):
    # Flatten the nested list of {ktk_cube_dataset_id: MetaPartition} dicts and
    # regroup the MetaPartitions by ktk_cube dataset ID.
    dct = defaultdict(list)
    for sublist in mpss:
        for mp in sublist:
            for k, v in mp.items():
                dct[k].append(v)

    # Store (or, in update mode, update) one kartothek dataset per ktk_cube dataset ID.
    result = {}
    for k, v in dct.items():
        if update:
            ds_factory = metadata_factory_from_dataset(existing_datasets[k],
                                                       with_schema=True,
                                                       store=store)
            result[k] = update_dataset_from_partitions(
                v,
                dataset_uuid=cube.ktk_dataset_uuid(k),
                delete_scope=[],
                ds_factory=ds_factory,
                metadata=metadata[k],
                metadata_merger=None,
                store_factory=store,
            )
        else:
            result[k] = store_dataset_from_partitions(
                v,
                dataset_metadata=metadata[k],
                dataset_uuid=cube.ktk_dataset_uuid(k),
                metadata_merger=None,
                metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
                store=store,
            )

    # list required for dask.bag
    return [result]
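
A hedged sketch of how a flat store step like this might be wired into a dask.bag pipeline, motivated by the trailing "list required for dask.bag" comment. The bag name, the argument values, and the reduction wiring are assumptions for illustration, not the library's actual call site.

import dask.bag as db
from functools import partial

# written_mps_bag is assumed to be a dask.bag whose elements are dicts of
# {ktk_cube_dataset_id: MetaPartition}. perpartition=list collects each bag
# partition into a sublist (yielding the nested ``mpss`` structure) and the
# aggregate step stores all datasets in one final task.
store_step = partial(
    _multiplex_store_dataset_from_partitions_flat,
    cube=cube,
    metadata=metadata,
    update=False,
    store=store_factory,
    existing_datasets=existing_datasets,
)
result_bag = written_mps_bag.reduction(
    perpartition=list,
    aggregate=store_step,
    split_every=False,
    out_type=db.Bag,
)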
Example #3
def test_metadata_factory_from_dataset_no_store(function_store, ds, load_schema):
    ds2 = DatasetMetadata.load_from_store(
        "uuid", function_store(), load_schema=load_schema
    )
    factory = metadata_factory_from_dataset(ds2, with_schema=load_schema)
    assert factory.dataset_metadata is ds2

    store = factory.store
    with pytest.raises(NotImplementedError):
        store.get("foo")
Example #4
def test_metadata_factory_from_dataset_with_store(function_store, ds, load_schema):
    ds2 = DatasetMetadata.load_from_store(
        "uuid", function_store(), load_schema=load_schema
    )
    factory = metadata_factory_from_dataset(
        ds2, with_schema=load_schema, store=function_store
    )
    assert factory.dataset_metadata is ds2

    store = factory.store
    store.put("foo", b"bar")
    assert store.get("foo") == b"bar"
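
Outside the tests, a hedged usage sketch of the factory helper that Examples #3 and #4 exercise: wrap an already-loaded DatasetMetadata so helpers that expect a dataset factory (such as dispatch_metapartitions_from_factory in the other examples) can reuse it without re-reading metadata from the store. ``my_store_factory`` and the dataset UUID are assumptions; imports of the kartothek helpers are omitted, matching the excerpts.

from kartothek.core.dataset import DatasetMetadata

# my_store_factory is assumed to be a zero-argument callable returning a
# simplekv store that already contains the dataset "uuid".
ds = DatasetMetadata.load_from_store("uuid", my_store_factory(), load_schema=True)
factory = metadata_factory_from_dataset(ds, with_schema=True, store=my_store_factory)
mps = list(dispatch_metapartitions_from_factory(dataset_factory=factory))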
Example #5
def get_metapartitions_for_stats(datasets):
    """
    Get all metapartitions that need to be scanned to gather cube stats.

    Parameters
    ----------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.

    Returns
    -------
    metapartitions: List[Tuple[str, Tuple[kartothek.io_components.metapartition.MetaPartition, ...]]]
        Pre-aligned metapartitions (grouped by primary index / physical partitions) together with the ktk_cube dataset ID they belong to.
    """
    all_metapartitions = []
    for ktk_cube_dataset_id, ds in datasets.items():
        dataset_factory = metadata_factory_from_dataset(ds)
        for mp in dispatch_metapartitions_from_factory(
                dataset_factory=dataset_factory,
                dispatch_by=dataset_factory.partition_keys):
            all_metapartitions.append((ktk_cube_dataset_id, mp))
    return all_metapartitions
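
A hedged follow-up sketch of consuming the pairs returned above, e.g. counting how many metapartitions would have to be scanned per ktk_cube dataset ID; the counting itself is illustrative and not part of the actual stats code.

from collections import defaultdict

def count_metapartitions_per_dataset(datasets):
    # Each entry is a (ktk_cube_dataset_id, metapartition) pair, so counting
    # the entries per dataset ID gives the number of scans required.
    counts = defaultdict(int)
    for ktk_cube_dataset_id, _mp in get_metapartitions_for_stats(datasets):
        counts[ktk_cube_dataset_id] += 1
    return dict(counts)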
Example #6
def remove_partitions(cube,
                      store,
                      conditions=None,
                      ktk_cube_dataset_ids=None,
                      metadata=None):
    """
    Remove given partition range from cube using a transaction.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[Union[str, bytes]], str, bytes]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for all datasets, optional. Only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for (
            ktk_cube_dataset_id,
        (ds, mp, delete_scope),
    ) in prepare_metapartitions_for_removal_action(
            cube=cube,
            store=store_instance,
            conditions=conditions,
            ktk_cube_dataset_ids=ktk_cube_dataset_ids,
            existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(ds,
                                                   with_schema=True,
                                                   store=store_factory)

        existing_datasets[
            ktk_cube_dataset_id] = update_dataset_from_partitions(
                mp,
                store_factory=store_factory,
                dataset_uuid=ds.uuid,
                ds_factory=ds_factory,
                metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id,
                                              metadata),
                metadata_merger=None,
                delete_scope=delete_scope,
            )

    return existing_datasets
Example #7
def remove_partitions(
    cube: Cube,
    store: Union[simplekv.KeyValueStore, StoreFactory],
    conditions: Union[None, Condition, Sequence[Condition],
                      Conjunction] = None,
    ktk_cube_dataset_ids: Optional[Union[Sequence[str], str]] = None,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """
    Remove given partition range from cube using a transaction.

    Remove the partitions selected by ``conditions``. If no ``conditions`` are given,
    remove all partitions. For each considered dataset, only the subset of
    ``conditions`` that refers to the partition columns of the respective dataset
    is used. In particular, a dataset that is not partitioned at all is always considered
    selected by ``conditions``.

    Parameters
    ----------
    cube
        Cube spec.
    store
        Store.
    conditions
        Select the partitions to be removed. Must be a condition only on partition columns.
    ktk_cube_dataset_ids
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata
        Metadata for all datasets, optional. Only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for (
            ktk_cube_dataset_id,
        (ds, mp, delete_scope),
    ) in prepare_metapartitions_for_removal_action(
            cube=cube,
            store=store_instance,
            conditions=conditions,
            ktk_cube_dataset_ids=ktk_cube_dataset_ids,
            existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(ds,
                                                   with_schema=True,
                                                   store=store_factory)

        existing_datasets[
            ktk_cube_dataset_id] = update_dataset_from_partitions(
                mp,
                store_factory=store_factory,
                dataset_uuid=ds.uuid,
                ds_factory=ds_factory,
                metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id,
                                              metadata),
                metadata_merger=None,
                delete_scope=delete_scope,
            )

    return existing_datasets
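
As a hedged usage sketch of the typed variant above: removing one day of data from a single dataset of a cube partitioned by "day". The cube layout, the store factory, and the dataset ID "seed" are assumptions; ``C`` is kartothek's condition shorthand.

from kartothek.core.cube.conditions import C
from kartothek.core.cube.cube import Cube

cube = Cube(
    dimension_columns=["x"],
    partition_columns=["day"],
    uuid_prefix="my_cube",
)

# my_store_factory is assumed to be a zero-argument callable returning a
# simplekv store that already contains the cube's datasets.
datasets = remove_partitions(
    cube=cube,
    store=my_store_factory,
    conditions=C("day") == "2021-01-01",
    ktk_cube_dataset_ids=["seed"],
    metadata={"seed": {"removed_by": "cleanup-job"}},
)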