def build_cube(data, cube, store, metadata=None, overwrite=False, partition_on=None):
    """
    Store the given dataframes as a Ktk_cube cube.

    ``data`` can be formatted in multiple ways:

    - single DataFrame::

        pd.DataFrame({
            'x': [0, 1, 2, 3],
            'p': [0, 0, 1, 1],
            'v': [42, 45, 20, 10],
        })

      In that case, the seed dataset will be written.

    - dictionary of DataFrames::

        {
            'seed': pd.DataFrame({
                'x': [0, 1, 2, 3],
                'p': [0, 0, 1, 1],
                'v1': [42, 45, 20, 10],
            }),
            'enrich': pd.DataFrame({
                'x': [0, 1, 2, 3],
                'p': [0, 0, 1, 1],
                'v2': [False, False, True, False],
            }),
        }

      In that case, multiple datasets can be written at the same time. Note that the seed dataset MUST be included.

    - list of anything above::

        [
            # seed data only
            pd.DataFrame({
                'x': [0, 1, 2, 3],
                'p': [0, 0, 1, 1],
                'v1': [42, 45, 20, 10],
            }),

            # seed data only, explicit way
            {
                'seed': pd.DataFrame({
                    'x': [4, 5, 6, 7],
                    'p': [0, 0, 1, 1],
                    'v1': [12, 32, 22, 9],
                }),
            },

            # multiple datasets
            {
                'seed': pd.DataFrame({
                    'x': [8, 9, 10, 11],
                    'p': [0, 0, 1, 1],
                    'v1': [9, 2, 4, 11],
                }),
                'enrich': pd.DataFrame({
                    'x': [8, 9, 10, 11],
                    'p': [0, 0, 1, 1],
                    'v2': [True, True, False, False],
                }),
            },

            # non-seed data only
            {
                'enrich': pd.DataFrame({
                    'x': [1, 2, 3, 4],
                    'p': [0, 0, 1, 1],
                    'v2': [False, True, False, False],
                }),
            },
        ]

      In that case, multiple datasets may be written. Note that at least a single list element must contain seed data.

    Extra metadata may be preserved w/ every dataset, e.g.::

        {
            'seed': {
                'source': 'db',
                'host': 'db1.cluster20.company.net',
                'last_event': '230c6edb-b69a-4d30-b56d-28f5dfe20948',
            },
            'enrich': {
                'source': 'python',
                'commit_hash': '8b5d717518439921e6d17c7495956bdad687bc54',
            },
        }

    Note that the given metadata must be JSON-serializable.

    If the cube already exists, the ``overwrite`` flag must be given. In that case, all datasets that are part of the
    existing cube must be overwritten. Partial overwrites are not allowed.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
""" data = _normalize_user_input(data, cube) ktk_cube_dataset_ids = set(data.keys()) partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids, partition_on) metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids) existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store) check_datasets_prebuild(data, cube, existing_datasets) # do all data preparation before writing anything data = _prepare_data_for_ktk_all(data=data, cube=cube, existing_payload=set(), partition_on=partition_on) datasets = {} for ktk_cube_dataset_id, part in data.items(): datasets[ktk_cube_dataset_id] = store_dataframes_as_dataset( store=store, dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id), dfs=part, metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata), partition_on=list(partition_on[ktk_cube_dataset_id]), metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT, metadata_version=KTK_CUBE_METADATA_VERSION, df_serializer=KTK_CUBE_DF_SERIALIZER, overwrite=overwrite, ) return apply_postwrite_checks(datasets=datasets, cube=cube, store=store, existing_datasets=existing_datasets)
def extend_cube(data, cube, store, metadata=None, overwrite=False, partition_on=None):
    """
    Store given dataframes into an existing Kartothek cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)
    ktk_cube_dataset_ids = set(data.keys())
    partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids, partition_on)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    check_datasets_preextend(data, cube)

    existing_datasets = discover_datasets(cube, store)
    if overwrite:
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in data
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    # do all data preparation before writing anything
    data = _prepare_data_for_ktk_all(
        data=data,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=partition_on,
    )

    datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        datasets[ktk_cube_dataset_id] = store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )

    return apply_postwrite_checks(
        datasets=datasets, cube=cube, store=store, existing_datasets=existing_datasets
    )
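
# Usage sketch (hypothetical): extending an already-built cube with an additional "enrich"
# dataset keyed by the same dimension/partition columns as the seed. The column names and
# the "enrich" dataset ID are assumptions for illustration; the cube and store are expected
# to come from a prior ``build_cube`` call such as the sketch above.
def _example_extend_cube(cube, store):
    import pandas as pd

    # The seed dataset must not be part of the payload here; only non-seed datasets
    # can be added via extend_cube.
    return extend_cube(
        data={
            "enrich": pd.DataFrame(
                {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "flag": [True, False, True, False]}
            )
        },
        cube=cube,
        store=store,
    )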
def append_to_cube(data, cube, store, metadata=None):
    """
    Append data to an existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated,
        the old data is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are overwritten), you should use
        :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)

    existing_datasets = discover_datasets(cube, store)
    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(
        existing_datasets=existing_datasets, ktk_cube_dataset_ids=set(data.keys())
    )

    # do all data preparation before writing anything
    # existing_payload is set to empty because we're not checking against any existing payload;
    # ktk will account for the compat check within a single dataset
    data = _prepare_data_for_ktk_all(
        data=data, cube=cube, existing_payload=set(), partition_on=partition_on
    )

    # update_dataset_from_dataframes requires a store factory, so create one if not provided
    if not callable(store):

        def store_factory():
            return store

    else:
        store_factory = store

    updated_datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        updated_datasets[ktk_cube_dataset_id] = update_dataset_from_dataframes(
            store=store_factory,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            df_list=part,
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
        )

    return apply_postwrite_checks(
        datasets=updated_datasets,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
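
# Usage sketch (hypothetical): appending rows to the seed dataset of an existing cube.
# Because physical partitions are replaced as a whole, this example only touches a new
# partition value (p=2); all values below are illustrative assumptions.
def _example_append_to_cube(cube, store):
    import pandas as pd

    # A single dataframe is interpreted as seed data; it must contain the cube's
    # dimension and partition columns.
    return append_to_cube(
        data=pd.DataFrame({"x": [4, 5], "p": [2, 2], "v": [7, 8]}),
        cube=cube,
        store=store,
    )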
def copy_cube(
    cube: Cube,
    src_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    tgt_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    overwrite: bool = False,
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]] = None,
    renamed_cube_prefix: Optional[str] = None,
    renamed_datasets: Optional[Dict[str, str]] = None,
):
    """
    Copy a cube from one store to another.

    .. warning::

        A failing copy operation cannot be rolled back if the ``overwrite`` flag is enabled and might leave the
        overwritten dataset in an inconsistent state.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    src_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Source KV store.
    tgt_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Target KV store.
    overwrite: bool
        If possibly existing datasets in the target store should be overwritten.
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]]
        Datasets to copy, must all be part of the cube. May be either the result of
        :func:`~kartothek.api.discover.discover_datasets`, a list of Ktk_cube dataset IDs or ``None`` (in which case
        the entire cube will be copied).
    renamed_cube_prefix: Optional[str]
        Optional new cube prefix. If specified, the cube will be renamed while copying.
    renamed_datasets: Optional[Dict[str, str]]
        Optional dict with {old dataset name: new dataset name} entries. If provided, the datasets will be renamed
        accordingly during copying. When the parameter ``datasets`` is specified, the datasets to rename must be a
        subset of the datasets to copy.
    """
    if callable(src_store):
        src_store = src_store()
    if callable(tgt_store):
        tgt_store = tgt_store()
    assert_stores_different(
        src_store, tgt_store, cube.ktk_dataset_uuid(cube.seed_dataset)
    )
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, tgt_store)

    if renamed_datasets is None:
        new_seed_dataset = cube.seed_dataset
    else:
        new_seed_dataset = renamed_datasets.get(cube.seed_dataset, cube.seed_dataset)

    new_cube = Cube(
        dimension_columns=cube.dimension_columns,
        partition_columns=cube.partition_columns,
        uuid_prefix=renamed_cube_prefix or cube.uuid_prefix,
        seed_dataset=new_seed_dataset,
        index_columns=cube.index_columns,
        suppress_index_on=cube.suppress_index_on,
    )

    datasets_to_copy = get_datasets_to_copy(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    copied = {}  # type: Dict[str, DatasetMetadata]
    for src_ds_name, src_ds_meta in datasets_to_copy.items():
        tgt_ds_uuid = _transform_uuid(
            src_uuid=src_ds_meta.uuid,
            cube_prefix=cube.uuid_prefix,
            renamed_cube_prefix=renamed_cube_prefix,
            renamed_datasets=renamed_datasets,
        )
        try:
            md_transformed = copy_dataset(
                source_dataset_uuid=src_ds_meta.uuid,
                store=src_store,
                target_dataset_uuid=tgt_ds_uuid,
                target_store=tgt_store,
            )
        except Exception as e:
            if overwrite:
                # We can't roll back safely if the target dataset has been partially overwritten.
                raise RuntimeError(e)
            else:
                apply_postwrite_checks(
                    datasets=copied,
                    cube=new_cube,
                    store=tgt_store,
                    existing_datasets=existing_datasets,
                )
        else:
            copied.update(md_transformed)
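
# Usage sketch (hypothetical): copying a cube between two distinct stores while renaming
# its prefix. The prefix "my_cube_copy" is an illustrative assumption; ``cube`` is expected
# to already exist in ``src_store``, and the two stores must be different.
def _example_copy_cube(cube, src_store, tgt_store):
    copy_cube(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        renamed_cube_prefix="my_cube_copy",
    )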