def test_prepare_ktk_metadata_no_source(cube):
    """Check the metadata prepared for dataset ID ``"no_source"`` without user metadata."""
    expected = {
        KTK_CUBE_METADATA_DIMENSION_COLUMNS: ["x"],
        KTK_CUBE_METADATA_PARTITION_COLUMNS: ["p"],
        KTK_CUBE_METADATA_KEY_IS_SEED: False,
    }

    assert prepare_ktk_metadata(cube, "no_source", None) == expected
def test_prepare_ktk_metadata_simple(cube):
    """Check the metadata prepared for dataset ID ``"source"`` without user metadata."""
    expected = {
        KTK_CUBE_METADATA_DIMENSION_COLUMNS: ["x"],
        KTK_CUBE_METADATA_PARTITION_COLUMNS: ["p"],
        KTK_CUBE_METADATA_KEY_IS_SEED: True,
        KTK_CUBE_METADATA_SUPPRESS_INDEX_ON: [],
    }

    assert prepare_ktk_metadata(cube, "source", None) == expected
def test_prepare_ktk_metadata_suppress_index_on(cube):
    """Check that ``suppress_index_on`` of the cube shows up in the prepared metadata."""
    # NOTE(review): the metadata is requested for "no_source" although the
    # expectation contains the suppress-index key -- confirm that this dataset
    # ID is the intended one.
    modified_cube = cube.copy(suppress_index_on=["x"])
    expected = {
        KTK_CUBE_METADATA_DIMENSION_COLUMNS: ["x"],
        KTK_CUBE_METADATA_PARTITION_COLUMNS: ["p"],
        KTK_CUBE_METADATA_KEY_IS_SEED: False,
        KTK_CUBE_METADATA_SUPPRESS_INDEX_ON: ["x"],
    }

    assert prepare_ktk_metadata(modified_cube, "no_source", None) == expected
def test_prepare_ktk_metadata_usermeta(cube):
    """Check that only the user metadata of the requested dataset is merged in."""
    user_metadata = {
        "source": {"bla": "blub"},
        "no_source": {"user_key0": "value0"},
    }
    expected = {
        KTK_CUBE_METADATA_DIMENSION_COLUMNS: ["x"],
        KTK_CUBE_METADATA_PARTITION_COLUMNS: ["p"],
        KTK_CUBE_METADATA_KEY_IS_SEED: False,
        "user_key0": "value0",
    }

    assert prepare_ktk_metadata(cube, "no_source", user_metadata) == expected
def build_cube(data, cube, store, metadata=None, overwrite=False, partition_on=None):
    """
    Write the given dataframes to ``store`` as a Ktk_cube cube.

    ``data`` is accepted in several shapes:

    - a single DataFrame, which is written as the seed dataset::

          pd.DataFrame({
              'x': [0, 1, 2, 3],
              'p': [0, 0, 1, 1],
              'v': [42, 45, 20, 10],
          })

    - a dictionary of DataFrames, which writes several datasets at once. The
      seed dataset MUST be part of it::

          {
              'seed': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              'enrich': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v2': [False, False, True, False],
              }),
          }

    - a list mixing any of the above. At least one list element must contain
      seed data::

          [
              # seed data only
              pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              # seed data only, explicit way
              {
                  'seed': pd.DataFrame({
                      'x': [4, 5, 6, 7],
                      'p': [0, 0, 1, 1],
                      'v1': [12, 32, 22, 9],
                  }),
              },
              # multiple datasets
              {
                  'seed': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v1': [9, 2, 4, 11],
                  }),
                  'enrich': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v2': [True, True, False, False],
                  }),
              },
              # non-seed data only
              {
                  'enrich': pd.DataFrame({
                      'x': [1, 2, 3, 4],
                      'p': [0, 0, 1, 1],
                      'v2': [False, True, False, False],
                  }),
              },
          ]

    Extra metadata may be stored with every dataset. It must be
    JSON-serializable, e.g.::

        {
            'seed': {
                'source': 'db',
                'host': 'db1.cluster20.company.net',
                'last_event': '230c6edb-b69a-4d30-b56d-28f5dfe20948',
            },
            'enrich': {
                'source': 'python',
                'commit_hash': '8b5d717518439921e6d17c7495956bdad687bc54',
            },
        }

    If the cube already exists, the ``overwrite`` flag must be given. In that
    case all datasets of the existing cube must be overwritten; partial
    overwrites are not allowed.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. A single dataframe is assumed
        to be the seed dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store the data is written to.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        Whether possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).

        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    normalized = _normalize_user_input(data, cube)
    dataset_ids = set(normalized.keys())
    partition_on = prepare_ktk_partition_on(cube, dataset_ids, partition_on)
    metadata = check_provided_metadata_dict(metadata, dataset_ids)

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(normalized, cube, existing_datasets)

    # All preparation happens up front so that nothing is written when it fails.
    prepared = _prepare_data_for_ktk_all(
        data=normalized,
        cube=cube,
        existing_payload=set(),
        partition_on=partition_on,
    )

    written = {
        dataset_id: store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, dataset_id, metadata),
            partition_on=list(partition_on[dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )
        for dataset_id, part in prepared.items()
    }

    return apply_postwrite_checks(
        datasets=written,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def append_to_cube(data, cube, store, metadata=None):
    """
    Append data to an existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows
        within a physical partition are updated, the old data is treated as
        "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which
        partitions are overwritten), use :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. A single dataframe is assumed
        to be the seed dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store the data is written to. A callable is used as store factory
        as-is; anything else is wrapped into one.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. For every dataset, only given
        keys are updated/replaced. Deletion of metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    normalized = _normalize_user_input(data, cube)

    existing_datasets = discover_datasets(cube, store)
    # The partitioning of an append is dictated by what is already stored.
    partition_on = {
        dataset_id: dataset.partition_keys
        for dataset_id, dataset in existing_datasets.items()
    }

    check_existing_datasets(
        existing_datasets=existing_datasets,
        ktk_cube_dataset_ids=set(normalized.keys()),
    )

    # All preparation happens before anything is written. The existing
    # payload is left empty because no check against existing payload is done
    # here; ktk accounts for the compatibility check within one dataset.
    prepared = _prepare_data_for_ktk_all(
        data=normalized,
        cube=cube,
        existing_payload=set(),
        partition_on=partition_on,
    )

    # update_dataset_from_dataframes requires a store factory.
    if callable(store):
        store_factory = store
    else:

        def store_factory():
            return store

    updated = {
        dataset_id: update_dataset_from_dataframes(
            store=store_factory,
            dataset_uuid=cube.ktk_dataset_uuid(dataset_id),
            df_list=part,
            partition_on=list(partition_on[dataset_id]),
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            metadata=prepare_ktk_metadata(cube, dataset_id, metadata),
        )
        for dataset_id, part in prepared.items()
    }

    return apply_postwrite_checks(
        datasets=updated,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def remove_partitions(cube, store, conditions=None, ktk_cube_dataset_ids=None, metadata=None):
    """
    Remove the given partition range from the cube using a transaction.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store, either as instance or as factory.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[Union[Str, Bytes]], Union[Str, Bytes]]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults
        to "all".
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for the datasets, optional. Only given keys are
        updated/replaced. Deletion of metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    # Both flavours are needed below: an instance and a factory.
    if callable(store):
        store_factory = store
        store_instance = store()
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    removal_plan = prepare_metapartitions_for_removal_action(
        cube=cube,
        store=store_instance,
        conditions=conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        existing_datasets=existing_datasets,
    )

    for dataset_id, (dataset, metapartition, delete_scope) in removal_plan.items():
        stored_mp = metapartition.store_dataframes(
            store=store_instance,
            dataset_uuid=dataset.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )
        ds_factory = metadata_factory_from_dataset(
            dataset, with_schema=True, store=store_factory
        )

        # Replace the entry so that the returned mapping reflects the update.
        existing_datasets[dataset_id] = update_dataset_from_partitions(
            stored_mp,
            store_factory=store_factory,
            dataset_uuid=dataset.uuid,
            ds_factory=ds_factory,
            metadata=prepare_ktk_metadata(cube, dataset_id, metadata),
            metadata_merger=None,
            delete_scope=delete_scope,
        )

    return existing_datasets
def extend_cube(data, cube, store, metadata=None, overwrite=False, partition_on=None):
    """
    Store the given dataframes into an existing Kartothek cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. A single dataframe is assumed
        to be the seed dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store the data is written to.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        Whether possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).

        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    normalized = _normalize_user_input(data, cube)
    dataset_ids = set(normalized.keys())
    partition_on = prepare_ktk_partition_on(cube, dataset_ids, partition_on)
    metadata = check_provided_metadata_dict(metadata, dataset_ids)

    check_datasets_preextend(normalized, cube)

    existing_datasets = discover_datasets(cube, store)
    # Datasets that are about to be overwritten do not contribute to the
    # payload the new data is checked against.
    payload_datasets = existing_datasets
    if overwrite:
        payload_datasets = {
            dataset_id: dataset
            for dataset_id, dataset in existing_datasets.items()
            if dataset_id not in normalized
        }
    existing_payload = get_cube_payload(payload_datasets, cube)

    # All preparation happens up front so that nothing is written when it fails.
    prepared = _prepare_data_for_ktk_all(
        data=normalized,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=partition_on,
    )

    written = {
        dataset_id: store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, dataset_id, metadata),
            partition_on=list(partition_on[dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )
        for dataset_id, part in prepared.items()
    }

    return apply_postwrite_checks(
        datasets=written,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def build_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create a dask computation graph that builds a cube from the data of a dask bag.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store the data is written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance. If left
        unprovided, it is assumed that only the seed dataset will be written.
    metadata:
        Metadata for every dataset.
    overwrite:
        Whether possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to build a cube
        returning the dict of dataset metadata objects. The bag has a single
        partition with a single element.
    """
    check_store_factory(store)

    ktk_cube_dataset_ids = (
        [cube.seed_dataset]
        if ktk_cube_dataset_ids is None
        else sorted(ktk_cube_dataset_ids)
    )

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    prep_partition_on = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    # Per bag element: split user input by dataset, validate the dataset IDs,
    # then prepare the frames for writing.
    multiplexed = data.map(multiplex_user_input, cube=cube)
    validated = multiplexed.map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids
    )
    prepared = validated.map(
        _multiplex_prepare_data_for_ktk,
        cube=cube,
        existing_payload=set(),
        partition_on=prep_partition_on,
    )

    dataset_metadata = {
        dataset_id: prepare_ktk_metadata(cube, dataset_id, metadata)
        for dataset_id in ktk_cube_dataset_ids
    }
    stored = _store_bag_as_dataset_parallel(
        bag=prepared,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=dataset_metadata,
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    return stored.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Append data to an existing cube.

    For details on ``data`` and ``metadata``, see
    :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows
        within a physical partition are updated, the old data is treated as
        "removed".

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store the data is written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance. A falsy
        value is treated as an empty list.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given
        keys are updated/replaced. Deletion of metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube
        returning the dict of dataset metadata objects. The bag has a single
        partition with a single element.
    """
    check_store_factory(store)

    ktk_cube_dataset_ids = (
        sorted(ktk_cube_dataset_ids) if ktk_cube_dataset_ids else []
    )

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    # No check against existing payload is done here; ktk accounts for the
    # compatibility check within one dataset.
    existing_payload: Set[str] = set()

    # The partitioning of an append is dictated by what is already stored.
    partition_on = {
        dataset_id: dataset.partition_keys
        for dataset_id, dataset in existing_datasets.items()
    }

    check_existing_datasets(
        existing_datasets=existing_datasets,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )

    delete_scopes = {}
    if remove_conditions is not None:
        remove_metapartitions = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids, existing_datasets
        )
        # Only the delete scope of each removal entry is needed for the write.
        delete_scopes = {
            dataset_id: entry[2]
            for dataset_id, entry in remove_metapartitions.items()
        }

    multiplexed = data.map(multiplex_user_input, cube=cube)
    validated = multiplexed.map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids
    )
    filled = validated.map(
        _fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids
    )
    prepared = filled.map(
        _multiplex_prepare_data_for_ktk,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=partition_on,
    )

    dataset_metadata = {
        dataset_id: prepare_ktk_metadata(cube, dataset_id, metadata)
        for dataset_id in ktk_cube_dataset_ids
    }
    stored = _store_bag_as_dataset_parallel(
        bag=prepared,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=dataset_metadata,
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    return stored.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def extend_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: KeyValueStore,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create a dask computation graph that extends a cube by the data of a dask bag.

    For details on ``data`` and ``metadata``, see
    :func:`~kartothek.io.eager_cube.build_cube`.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes (see
        :func:`~kartothek.io.eager_cube.build_cube` for possible format and
        types).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store:
        Store the data is written to. It is validated with
        ``check_store_factory``.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance. A falsy
        value (``None`` or an empty collection) is treated as an empty list.
    metadata:
        Metadata for every dataset.
    overwrite:
        Whether possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to extend a cube
        returning the dict of dataset metadata objects. The bag has a single
        partition with a single element.
    """
    check_store_factory(store)

    # Normalize to a sorted list BEFORE any check sees the IDs: the argument
    # may be ``None`` or a one-shot iterable, and every later step must work
    # on the same, complete list.
    ktk_cube_dataset_ids = (
        sorted(ktk_cube_dataset_ids) if ktk_cube_dataset_ids else []
    )
    check_datasets_preextend(ktk_cube_dataset_ids, cube)

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    prep_partition_on = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    if overwrite:
        # Datasets that are about to be overwritten do not contribute to the
        # payload the new data is checked against.
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in ktk_cube_dataset_ids
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    # Per bag element: split user input by dataset, validate the dataset IDs,
    # then prepare the frames for writing.
    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=prep_partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(
                cube, ktk_cube_dataset_id, metadata
            )
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data
def build_cube_from_dataframe(
    data: Union[dd.DataFrame, Dict[str, dd.DataFrame]],
    cube: Cube,
    store: StoreFactory,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    overwrite: bool = False,
    partition_on: Optional[Dict[str, Iterable[str]]] = None,
    shuffle: bool = False,
    num_buckets: int = 1,
    bucket_by: Optional[Iterable[str]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Delayed:
    """
    Create a dask computation graph that builds a cube from dask dataframes.

    Parameters
    ----------
    data
        Data that should be written to the cube. A single dataframe is assumed
        to be the seed dataset.
    cube
        Cube specification.
    store
        Store the data is written to.
    metadata
        Metadata for every dataset.
    overwrite
        Whether possibly existing datasets should be overwritten.
    partition_on
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).
    shuffle
        Forwarded unchanged to ``store_dataset_from_ddf``.
    num_buckets
        Forwarded unchanged to ``store_dataset_from_ddf``.
    bucket_by
        Forwarded unchanged to ``store_dataset_from_ddf``.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.delayed.Delayed
        A dask delayed object containing the compute graph to build a cube
        returning the dict of dataset metadata objects.
    """
    check_store_factory(store)

    if not isinstance(data, dict):
        # A bare dataframe is taken to be the seed dataset.
        data = {cube.seed_dataset: data}

    ktk_cube_dataset_ids = sorted(data.keys())

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    partition_on_checked = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    # Drop the raw argument so that only the checked mapping can be used below.
    del partition_on

    graphs = {}
    for table_name, ddf in data.items():
        table_partition_on = partition_on_checked[table_name]
        check_user_df(table_name, ddf, cube, set(), table_partition_on)

        # Secondary indices: the cube's index columns present in this frame,
        # plus (seed dataset only) the dimension columns whose index is not
        # suppressed, minus everything that is partitioned on.
        secondary_indices = set(cube.index_columns) & set(ddf.columns)
        if table_name == cube.seed_dataset:
            secondary_indices |= set(cube.dimension_columns) - cube.suppress_index_on
        secondary_indices -= set(table_partition_on)

        checked_ddf = ddf.map_partitions(
            assert_dimesion_index_cols_notnull,
            ktk_cube_dataset_id=table_name,
            cube=cube,
            partition_on=table_partition_on,
            meta=ddf._meta,
        )

        sort_columns = (
            set(cube.dimension_columns) - set(cube.partition_columns)
        ) & set(checked_ddf.columns)

        graphs[table_name] = store_dataset_from_ddf(
            checked_ddf,
            dataset_uuid=cube.ktk_dataset_uuid(table_name),
            store=store,
            metadata=prepare_ktk_metadata(cube, table_name, metadata),
            partition_on=table_partition_on,
            secondary_indices=sorted(secondary_indices),
            sort_partitions_by=sorted(sort_columns),
            overwrite=overwrite,
            shuffle=shuffle,
            num_buckets=num_buckets,
            bucket_by=bucket_by,
            df_serializer=df_serializer,
        )

    postwrite_checks = dask.delayed(apply_postwrite_checks)
    return postwrite_checks(
        graphs, cube=cube, store=store, existing_datasets=existing_datasets
    )
def remove_partitions(
    cube: Cube,
    store: Union[simplekv.KeyValueStore, StoreFactory],
    conditions: Union[None, Condition, Sequence[Condition], Conjunction] = None,
    ktk_cube_dataset_ids: Optional[Union[Sequence[str], str]] = None,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """
    Remove the given partition range from the cube using a transaction.

    The partitions selected by ``conditions`` are removed. If no
    ``conditions`` are given, all partitions are removed.

    For each considered dataset, only the subset of ``conditions`` that refers
    to the partition columns of the respective dataset is used. In particular,
    a dataset that is not partitioned at all is always considered selected by
    ``conditions``.

    Parameters
    ----------
    cube
        Cube spec.
    store
        Store, either as instance or as factory.
    conditions
        Select the partitions to be removed. Must be a condition only on
        partition columns.
    ktk_cube_dataset_ids
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults
        to "all".
    metadata
        Metadata for the datasets, optional. Only given keys are
        updated/replaced. Deletion of metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    # Both flavours are needed below: an instance and a factory.
    if callable(store):
        store_factory = store
        store_instance = store()
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    removal_plan = prepare_metapartitions_for_removal_action(
        cube=cube,
        store=store_instance,
        conditions=conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        existing_datasets=existing_datasets,
    )

    for dataset_id, (dataset, metapartition, delete_scope) in removal_plan.items():
        stored_mp = metapartition.store_dataframes(
            store=store_instance,
            dataset_uuid=dataset.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )
        ds_factory = metadata_factory_from_dataset(
            dataset, with_schema=True, store=store_factory
        )

        # Replace the entry so that the returned mapping reflects the update.
        existing_datasets[dataset_id] = update_dataset_from_partitions(
            stored_mp,
            store_factory=store_factory,
            dataset_uuid=dataset.uuid,
            ds_factory=ds_factory,
            metadata=prepare_ktk_metadata(cube, dataset_id, metadata),
            metadata_merger=None,
            delete_scope=delete_scope,
        )

    return existing_datasets
def append_to_cube_from_bag_internal(data, cube, store, ktk_cube_dataset_ids, metadata):
    """
    Append data to an existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows
        within a physical partition are updated, the old data is treated as
        "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which
        partitions are overwritten), use :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data: dask.Bag
        Bag containing dataframes
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Callable[[], simplekv.KeyValueStore]
        Store the data is written to.
    ktk_cube_dataset_ids: Optional[Iterable[str]]
        Datasets that will be written, must be specified in advance. A falsy
        value (``None`` or an empty collection) is treated as an empty list.
    metadata: Dict[str, Dict[str, Any]]
        Metadata for every dataset, optional. For every dataset, only given
        keys are updated/replaced. Deletion of metadata keys is not possible.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube
        returning the dict of dataset metadata objects. The bag has a single
        partition with a single element.
    """
    check_store_factory(store)

    # The parameter is documented as optional, but ``sorted(None)`` raises
    # ``TypeError``; map a missing value to "no datasets" instead.
    ktk_cube_dataset_ids = (
        sorted(ktk_cube_dataset_ids) if ktk_cube_dataset_ids else []
    )

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets(cube, store)

    # No check against existing payload is done here; ktk accounts for the
    # compatibility check within one dataset.
    existing_payload = set()

    # The partitioning of an append is dictated by what is already stored.
    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(
        existing_datasets=existing_datasets,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )

    # Per bag element: split user input by dataset, validate the dataset IDs,
    # then prepare the frames for writing.
    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(
                cube, ktk_cube_dataset_id, metadata
            )
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data