def _multiplex_store_dataset_from_partitions_flat(
    mpss, cube, metadata, update, store, existing_datasets
):
    dct = defaultdict(list)
    for sublist in mpss:
        for mp in sublist:
            for k, v in mp.items():
                dct[k].append(v)

    result = {}
    for k, v in dct.items():
        if update:
            ds_factory = metadata_factory_from_dataset(
                existing_datasets[k], with_schema=True, store=store
            )
            result[k] = update_dataset_from_partitions(
                v,
                dataset_uuid=cube.ktk_dataset_uuid(k),
                delete_scope=[],
                ds_factory=ds_factory,
                metadata=metadata[k],
                metadata_merger=None,
                store_factory=store,
            )
        else:
            result[k] = store_dataset_from_partitions(
                v,
                dataset_metadata=metadata[k],
                dataset_uuid=cube.ktk_dataset_uuid(k),
                metadata_merger=None,
                metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
                store=store,
            )

    # list required for dask.bag
    return [result]
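# Minimal self-contained sketch of the regrouping step in
# `_multiplex_store_dataset_from_partitions_flat` above: `mpss` is a list of
# sublists of dicts mapping ktk_cube dataset IDs to metapartitions, and the
# defaultdict merges the values per dataset ID. The IDs "seed"/"enrich" and
# the string values standing in for MetaPartition objects are hypothetical.
def _example_multiplex_regrouping():
    from collections import defaultdict

    mpss = [[{"seed": "mp1"}, {"enrich": "mp2"}], [{"seed": "mp3"}]]
    dct = defaultdict(list)
    for sublist in mpss:
        for mp in sublist:
            for k, v in mp.items():
                dct[k].append(v)
    assert dict(dct) == {"seed": ["mp1", "mp3"], "enrich": ["mp2"]}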
def remove_partitions(cube, store, conditions=None, ktk_cube_dataset_ids=None, metadata=None):
    """
    Remove given partition range from cube using a transaction.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[Union[Str, Bytes]], Union[Str, Bytes]]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for all datasets, optional. Only given keys are updated/replaced. Deletion of metadata keys is
        not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for (
        ktk_cube_dataset_id,
        (ds, mp, delete_scope),
    ) in prepare_metapartitions_for_removal_action(
        cube=cube,
        store=store_instance,
        conditions=conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(
            ds, with_schema=True, store=store_factory
        )
        existing_datasets[ktk_cube_dataset_id] = update_dataset_from_partitions(
            mp,
            store_factory=store_factory,
            dataset_uuid=ds.uuid,
            ds_factory=ds_factory,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            metadata_merger=None,
            delete_scope=delete_scope,
        )

    return existing_datasets
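# Hedged usage sketch for `remove_partitions` above: without `conditions`,
# every partition of the selected datasets is removed. The cube spec, store
# URL, and dataset ID "enrich" are illustrative assumptions, not part of the
# original code.
def _example_remove_all_partitions():
    from functools import partial

    import storefact
    from kartothek.core.cube.cube import Cube

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="my_cube",
    )
    store = partial(storefact.get_store_from_url, url="hfs://my_store")

    return remove_partitions(
        cube=cube,
        store=store,
        ktk_cube_dataset_ids=["enrich"],
    )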
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_generator: Iterable[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """
    if load_dynamic_metadata is not True:
        warnings.warn(
            "The keyword `load_dynamic_metadata` has no use and will be removed soon",
            DeprecationWarning,
        )

    if central_partition_metadata is not True:
        warnings.warn(
            "The keyword `central_partition_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    if sort_partitions_by:
        # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(
            sort_values_categorical, column=sort_partitions_by
        )

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df,
            metadata_version=metadata_version,
            expected_secondary_indices=secondary_indices,
        )

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, df_serializer=df_serializer, dataset_uuid=dataset_uuid
        )
        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_generator: Iterable[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        load_dynamic_metadata=load_dynamic_metadata,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    if sort_partitions_by:
        # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(
            sort_values_categorical, column=sort_partitions_by
        )

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df, metadata_version=metadata_version)

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store,
            df_serializer=df_serializer,
            dataset_uuid=dataset_uuid,
            store_metadata=not central_partition_metadata,
        )
        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
    table_name: str = SINGLE_TABLE,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_generator:
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    if sort_partitions_by:
        # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(
            sort_values_categorical, columns=sort_partitions_by
        )

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df,
            metadata_version=metadata_version,
            table_name=table_name,
        )

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, df_serializer=df_serializer, dataset_uuid=dataset_uuid
        )
        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
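# Hedged usage sketch for the iterator-based update above: stream chunks of a
# large dataset into the store without materialising all of them in memory.
# The store URL, dataset UUID, and column names are illustrative assumptions.
def _example_update_dataset_iteratively():
    from functools import partial

    import pandas as pd
    import storefact

    def df_generator():
        # Each yielded chunk becomes one (or more) new partitions.
        for start in range(0, 100, 10):
            yield pd.DataFrame({"p": 1, "x": range(start, start + 10)})

    store = partial(storefact.get_store_from_url, url="hfs://my_store")
    return update_dataset_from_dataframes__iter(
        df_generator(),
        store=store,
        dataset_uuid="dataset_uuid",
        partition_on=["p"],
    )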
def update_dataset_from_dataframes(
    df_list,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """
    if load_dynamic_metadata is not True:
        warnings.warn(
            "The keyword `load_dynamic_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if central_partition_metadata is not True:
        warnings.warn(
            "The keyword `central_partition_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=secondary_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(partial(sort_values_categorical, column=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if secondary_indices:
        mp = mp.build_indices(secondary_indices)

    mp = mp.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
def commit_dataset(
    store=None,
    dataset_uuid=None,
    new_partitions=NoDefault(),
    output_dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
    secondary_indices=None,
):
    """
    Update an existing dataset with new, already written partitions. This should be used in combination with
    :func:`~kartothek.io.eager.write_single_partition` and :func:`~kartothek.io.eager.create_empty_dataset_header`.

    .. note::

        It is highly recommended to use the full pipelines whenever possible. This functionality should be used
        with caution and should only be necessary in cases where traditional pipeline scheduling is not an option.

    Example:

    .. code::

        import storefact
        import pandas as pd
        from functools import partial
        from kartothek.io.eager import write_single_partition, commit_dataset

        store = partial(storefact.get_store_from_url, url="hfs://my_store")

        new_data = {
            "data": {
                "table_1": pd.DataFrame({'column': [1, 2]}),
                "table_2": pd.DataFrame({'other_column': ['a', 'b']}),
            }
        }

        # The partition writing can be done concurrently and distributed if wanted.
        # Only the information about what partitions have been written is required for the commit.
        new_partitions = [
            write_single_partition(
                store=store,
                dataset_uuid='dataset_uuid',
                data=new_data,
            )
        ]

        new_dataset = commit_dataset(
            store=store,
            dataset_uuid='dataset_uuid',
            new_partitions=new_partitions,
        )

    Parameters
    ----------
    new_partitions: List[kartothek.io_components.metapartition.MetaPartition]
        Input partitions to be committed.
    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )
    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if isinstance(new_partitions, NoDefault):
        raise TypeError("The parameter `new_partitions` is not optional")

    store = _make_callable(store)
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(
        new_partitions, metadata_version=metadata_version
    )

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
def commit_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    new_partitions: Optional[Iterable[MetaPartition]] = None,
    output_dataset_uuid: Optional[str] = None,
    delete_scope: Optional[Iterable[Dict[str, Any]]] = None,
    metadata: Dict = None,
    df_serializer: DataFrameSerializer = None,
    metadata_merger: Callable[[List[Dict]], Dict] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[Iterable[str]] = None,
    factory: Optional[DatasetFactory] = None,
    secondary_indices: Optional[Iterable[str]] = None,
):
    """
    Commit new state to an existing dataset. This can be used for three distinct operations:

    1. Add previously written partitions to this dataset

       If for some reason the existing pipelines are not sufficient but you need more control, you can write the
       files outside of a kartothek pipeline and commit them whenever you choose to.

       This should be used in combination with
       :func:`~kartothek.io.eager.write_single_partition` and :func:`~kartothek.io.eager.create_empty_dataset_header`.

       .. code::

           import pandas as pd
           from kartothek.io.eager import write_single_partition, commit_dataset

           store = "hfs://my_store"

           # The partition writing can be done concurrently and distributed if wanted.
           # Only the information about what partitions have been written is required for the commit.
           new_partitions = [
               write_single_partition(
                   store=store,
                   dataset_uuid='dataset_uuid',
                   data=pd.DataFrame({'column': [1, 2]}),
               )
           ]

           new_dataset = commit_dataset(
               store=store,
               dataset_uuid='dataset_uuid',
               new_partitions=new_partitions,
           )

    2. Simple delete of partitions

       If you want to remove some partitions, this is one of the simplest ways of doing so. Providing a
       ``delete_scope`` removes the references to these files in an atomic commit.

       .. code::

           commit_dataset(
               store=store,
               dataset_uuid='dataset_uuid',
               delete_scope=[
                   {
                       "partition_column": "part_value_to_be_removed"
                   }
               ],
           )

    3. Add additional metadata

       To add new metadata to an existing dataset

       .. code::

           commit_dataset(
               store=store,
               dataset_uuid='dataset_uuid',
               metadata={"new": "user_metadata"},
           )

       .. note::

           If you do not want the new metadata to be merged with the existing one, provide a custom
           ``metadata_merger``.

    Parameters
    ----------
    new_partitions:
        Input partitions to be committed.
    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )
    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not new_partitions and not metadata and not delete_scope:
        raise ValueError(
            "Need to provide either new data, new metadata or a delete scope. None of it was provided."
        )

    store = lazy_store(store)
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(
        new_partitions, metadata_version=metadata_version
    )

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps_list = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps_list,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
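# Hedged sketch, assuming the three operations documented above compose in a
# single atomic commit: add previously written partitions, drop a partition
# via `delete_scope`, and update user metadata in the same transaction. The
# store URL, dataset UUID, and the partition column/value are illustrative
# assumptions, not part of the original code.
def _example_commit_replace_partition(new_partitions):
    return commit_dataset(
        store="hfs://my_store",
        dataset_uuid="dataset_uuid",
        new_partitions=new_partitions,
        delete_scope=[{"partition_column": "stale_value"}],
        metadata={"last_replaced_partition": "stale_value"},
    )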
def update_dataset_from_dataframes(
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]],
    store: Optional[KeyValueStore] = None,
    dataset_uuid: Optional[str] = None,
    delete_scope=None,
    metadata=None,
    df_serializer: Optional[ParquetSerializer] = None,
    metadata_merger: Callable = None,
    central_partition_metadata: bool = True,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    load_dynamic_metadata: bool = True,
    sort_partitions_by: Optional[str] = None,
    secondary_indices: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
) -> DatasetMetadata:
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_list:
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory, secondary_indices)
    del secondary_indices

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=inferred_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(partial(sort_values_categorical, columns=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if inferred_indices:
        mp = mp.build_indices(inferred_indices)

    mp = mp.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
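# Hedged usage sketch for the eager update above: append one dataframe,
# partitioned by "p" and with a secondary index on "x". The store URL,
# dataset UUID, and column names are illustrative assumptions.
def _example_update_dataset_eagerly():
    import pandas as pd
    import storefact

    df = pd.DataFrame({"p": [1, 1, 2], "x": [3, 1, 2]})
    return update_dataset_from_dataframes(
        [df],
        store=storefact.get_store_from_url("hfs://my_store"),
        dataset_uuid="dataset_uuid",
        partition_on=["p"],
        secondary_indices=["x"],
    )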
def _commit_update_from_reduction(df_mps, **kwargs):
    # Flatten the dataframe of MetaPartitions produced by the reduction and
    # drop empty cells before committing.
    partitions = pd.Series(df_mps.values.flatten()).dropna()
    return update_dataset_from_partitions(
        partition_list=partitions,
        **kwargs,
    )
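# Minimal self-contained sketch of the flatten-and-drop step above: the
# reduction result is a dataframe whose cells hold MetaPartition objects or
# NaN; flattening and dropping NaN yields the plain partition list. Plain
# strings stand in for MetaPartition objects here (hypothetical values).
def _example_flatten_reduction_result():
    import numpy as np
    import pandas as pd

    df_mps = pd.DataFrame([["mp1", np.nan], ["mp2", "mp3"]])
    partitions = pd.Series(df_mps.values.flatten()).dropna()
    assert list(partitions) == ["mp1", "mp2", "mp3"]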
def remove_partitions(
    cube: Cube,
    store: Union[simplekv.KeyValueStore, StoreFactory],
    conditions: Union[None, Condition, Sequence[Condition], Conjunction] = None,
    ktk_cube_dataset_ids: Optional[Union[Sequence[str], str]] = None,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """
    Remove given partition range from cube using a transaction.

    Remove the partitions selected by ``conditions``. If no ``conditions`` are given, remove all partitions.

    For each considered dataset, only the subset of ``conditions`` that refers to the partition columns of the
    respective dataset is used. In particular, a dataset that is not partitioned at all is always considered
    selected by ``conditions``.

    Parameters
    ----------
    cube
        Cube spec.
    store
        Store.
    conditions
        Select the partitions to be removed. Must be a condition only on partition columns.
    ktk_cube_dataset_ids
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    metadata
        Metadata for all datasets, optional. Only given keys are updated/replaced. Deletion of metadata keys is
        not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets, updated.
    """
    if callable(store):
        store_instance = store()
        store_factory = store
    else:
        store_instance = store

        def store_factory():
            return store

    existing_datasets = discover_datasets(cube, store)

    for (
        ktk_cube_dataset_id,
        (ds, mp, delete_scope),
    ) in prepare_metapartitions_for_removal_action(
        cube=cube,
        store=store_instance,
        conditions=conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        existing_datasets=existing_datasets,
    ).items():
        mp = mp.store_dataframes(
            store=store_instance,
            dataset_uuid=ds.uuid,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
        )

        ds_factory = metadata_factory_from_dataset(
            ds, with_schema=True, store=store_factory
        )
        existing_datasets[ktk_cube_dataset_id] = update_dataset_from_partitions(
            mp,
            store_factory=store_factory,
            dataset_uuid=ds.uuid,
            ds_factory=ds_factory,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            metadata_merger=None,
            delete_scope=delete_scope,
        )

    return existing_datasets
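# Hedged sketch of a condition-scoped removal with the typed `remove_partitions`
# above: only partitions matching the condition on the partition column "p"
# are removed; per the docstring, each dataset applies only the subset of
# conditions on its own partition columns. The cube spec, store URL, and
# condition value are illustrative assumptions.
def _example_remove_partitions_by_condition():
    from functools import partial

    import storefact
    from kartothek.core.cube.conditions import C
    from kartothek.core.cube.cube import Cube

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="my_cube",
    )
    store = partial(storefact.get_store_from_url, url="hfs://my_store")

    # `C("p") == value` builds an equality condition on partition column "p".
    return remove_partitions(
        cube=cube,
        store=store,
        conditions=C("p") == "2020-01-01",
    )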