def test_input_to_metaframes_dict():
    df_input = {
        "label": "cluster_1",
        "data": [
            ("some_file", pd.DataFrame({"A": [1]})),
            ("some_other_file", pd.DataFrame({"B": [2]})),
        ],
    }
    mp = parse_input_to_metapartition(obj=df_input)
    assert isinstance(mp, MetaPartition)
    assert len(mp.data) == 2
    assert len(mp.files) == 0
    assert mp.label == "cluster_1"

    data = mp.data

    df = data["some_file"]
    pdt.assert_frame_equal(
        df, pd.DataFrame({"A": [1]}), check_dtype=False, check_like=True
    )

    df2 = data["some_other_file"]
    pdt.assert_frame_equal(
        df2, pd.DataFrame({"B": [2]}), check_dtype=False, check_like=True
    )

def write_partition(
    partition_df: MetaPartitionInput,
    secondary_indices: List[str],
    sort_partitions_by: List[str],
    dataset_uuid: str,
    partition_on: List[str],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    metadata_version: int,
    dataset_table_name: str = SINGLE_TABLE,
) -> MetaPartition:
    """
    Write a dataframe to store, performing all necessary preprocessing tasks
    like partitioning, bucketing (NotImplemented), indexing, etc. in the correct order.
    """
    store = ensure_store(store_factory)

    # I don't have access to the group values
    mps = parse_input_to_metapartition(
        partition_df,
        metadata_version=metadata_version,
        table_name=dataset_table_name,
    )

    if sort_partitions_by:
        mps = mps.apply(partial(sort_values_categorical, columns=sort_partitions_by))
    if partition_on:
        mps = mps.partition_on(partition_on)
    if secondary_indices:
        mps = mps.build_indices(secondary_indices)
    return mps.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

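# Illustrative usage sketch (not part of the library code): invoking ``write_partition``
# for a single in-memory dataframe. The temporary store location, dataset UUID and
# column names are assumptions made for this example; a store factory is assumed to be
# any zero-argument callable returning a simplekv store, here built via ``storefact``.
from functools import partial
from tempfile import TemporaryDirectory

import pandas as pd
from storefact import get_store_from_url

_tmp = TemporaryDirectory()
store_factory = partial(get_store_from_url, "hfs://" + _tmp.name)

mp = write_partition(
    partition_df=pd.DataFrame({"A": [1, 2], "part": ["a", "b"]}),
    secondary_indices=["A"],
    sort_partitions_by=[],
    dataset_uuid="example_dataset_uuid",  # hypothetical UUID
    partition_on=["part"],
    store_factory=store_factory,
    df_serializer=None,  # falls back to the default Parquet serializer
    metadata_version=4,
)
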
def test_parse_nested_input_schema_compatible_but_different():
    # Ensure that input can be parsed even though the schemas are not identical but compatible
    df_input = [[pd.DataFrame({"A": [None]}), pd.DataFrame({"A": ["str"]})]]
    mp = parse_input_to_metapartition(df_input, metadata_version=4)
    expected_schema = make_meta(pd.DataFrame({"A": ["str"]}), origin="expected")
    assert mp.schema == expected_schema

def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store `pd.DataFrame` s iteratively as a partitioned dataset with multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df, metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
        )

        # Add `kartothek.io_components.metapartition.MetaPartition` object to list to track partitions
        new_partitions.append(mp)

    # Store metadata and return `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )

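# Hedged usage sketch for ``store_dataframes_as_dataset__iter`` (the temporary store,
# dataset UUID and generated frames are assumptions for the example). Feeding a
# generator keeps roughly one chunk in memory at a time, which is the point of the
# iterative variant.
from functools import partial
from tempfile import TemporaryDirectory

import pandas as pd
from storefact import get_store_from_url

_tmp = TemporaryDirectory()
store_factory = partial(get_store_from_url, "hfs://" + _tmp.name)


def _chunk_generator(n_chunks=3):
    # Yield one small dataframe per chunk; in practice these would be read lazily.
    for i in range(n_chunks):
        yield pd.DataFrame({"chunk": [i] * 5, "value": list(range(5))})


dataset = store_dataframes_as_dataset__iter(
    _chunk_generator(),
    store=store_factory,
    dataset_uuid="iter_example_uuid",  # hypothetical UUID
    partition_on=["chunk"],
)
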
def _store_partition(
    df: pd.DataFrame,
    secondary_indices: List[str],
    sort_partitions_by: Optional[str],
    table: str,
    dataset_uuid: str,
    partition_on: Optional[List[str]],
    store_factory: StoreFactoryType,
    df_serializer: DataFrameSerializer,
    metadata_version: int,
    unpacked_meta: pd.DataFrame,
) -> MetaPartition:
    df = unpack_payload_pandas(df, unpacked_meta)
    if _KTK_HASH_BUCKET in df:
        df = df.drop(_KTK_HASH_BUCKET, axis=1)
    store = store_factory()

    # I don't have access to the group values
    mps = parse_input_to_metapartition(
        {"data": {table: df}}, metadata_version=metadata_version
    )

    # delete reference to enable release after partition_on; before index build
    del df

    if sort_partitions_by:
        mps = mps.apply(partial(sort_values_categorical, column=sort_partitions_by))
    if partition_on:
        mps = mps.partition_on(partition_on)
    if secondary_indices:
        mps = mps.build_indices(secondary_indices)
    return mps.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

def _write_dataframe_partitions(
    ddf: dd.DataFrame,
    store: StoreFactory,
    dataset_uuid: str,
    table: str,
    secondary_indices: List[str],
    shuffle: bool,
    repartition_ratio: Optional[SupportsFloat],
    num_buckets: int,
    sort_partitions_by: List[str],
    df_serializer: Optional[DataFrameSerializer],
    metadata_version: int,
    partition_on: List[str],
    bucket_by: List[str],
) -> dd.Series:
    if repartition_ratio and ddf is not None:
        ddf = ddf.repartition(
            npartitions=int(np.ceil(ddf.npartitions / repartition_ratio))
        )

    if ddf is None:
        mps = dd.from_pandas(
            pd.Series(
                [
                    parse_input_to_metapartition(
                        None,
                        metadata_version=metadata_version,
                        table_name=table,
                    )
                ]
            ),
            npartitions=1,
        )
    else:
        if shuffle:
            mps = shuffle_store_dask_partitions(
                ddf=ddf,
                table=table,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                num_buckets=num_buckets,
                sort_partitions_by=sort_partitions_by,
                bucket_by=bucket_by,
            )
        else:
            mps = ddf.map_partitions(
                write_partition,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                sort_partitions_by=sort_partitions_by,
                dataset_table_name=table,
                meta=(MetaPartition),
            )
    return mps

def test_input_to_metaframes_simple():
    df_input = pd.DataFrame({"A": [1]})
    mp = parse_input_to_metapartition(obj=df_input)

    assert isinstance(mp, MetaPartition)
    assert len(mp.data) == 1
    assert len(mp.files) == 0

    df = list(mp.data.values())[0]
    pdt.assert_frame_equal(df, df_input)

    assert isinstance(mp.label, six.string_types)

def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed,
        it will be stored as the `core` table.

    Returns
    -------
    The stored dataset
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version)

    if partition_on:
        mp = MetaPartition.partition_on(mp, partition_on)

    mps = mp.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )

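# Illustrative sketch (assumptions flagged inline): storing a dict of two tables
# eagerly with ``store_dataframes_as_dataset`` as defined above. The store location,
# dataset UUID and table names are made up for the example.
from functools import partial
from tempfile import TemporaryDirectory

import pandas as pd
from storefact import get_store_from_url

_tmp = TemporaryDirectory()
store_factory = partial(get_store_from_url, "hfs://" + _tmp.name)

dataset = store_dataframes_as_dataset(
    store=store_factory,
    dataset_uuid="eager_example_uuid",  # hypothetical UUID
    dfs={
        "core": pd.DataFrame({"A": [1, 2]}),
        "extension": pd.DataFrame({"B": [3, 4]}),
    },
)
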
def test_input_to_metaframes_empty():
    mp = parse_input_to_metapartition(obj=[None])
    assert mp == MetaPartition(label=None)

    mp = parse_input_to_metapartition(obj=[])
    assert mp == MetaPartition(label=None)

def commit_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    new_partitions: Optional[Iterable[MetaPartition]] = None,
    output_dataset_uuid: Optional[str] = None,
    delete_scope: Optional[Iterable[Dict[str, Any]]] = None,
    metadata: Dict = None,
    df_serializer: DataFrameSerializer = None,
    metadata_merger: Callable[[List[Dict]], Dict] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[Iterable[str]] = None,
    factory: Optional[DatasetFactory] = None,
    secondary_indices: Optional[Iterable[str]] = None,
):
    """
    Commit new state to an existing dataset. This can be used for three distinct operations:

    1. Add previously written partitions to this dataset

       If, for some reason, the existing pipelines are not sufficient and you need more control,
       you can write the files outside of a kartothek pipeline and commit them whenever you choose to.

       This should be used in combination with
       :func:`~kartothek.io.eager.write_single_partition` and
       :func:`~kartothek.io.eager.create_empty_dataset_header`.

       .. code::

           import pandas as pd
           from kartothek.io.eager import write_single_partition, commit_dataset

           store = "hfs://my_store"

           # The partition writing can be done concurrently and distributed if wanted.
           # Only the information about what partitions have been written is required for the commit.
           new_partitions = [
               write_single_partition(
                   store=store,
                   dataset_uuid='dataset_uuid',
                   data=pd.DataFrame({'column': [1, 2]}),
               )
           ]

           new_dataset = commit_dataset(
               store=store,
               dataset_uuid='dataset_uuid',
               new_partitions=new_partitions,
           )

    2. Simple delete of partitions

       If you want to remove some partitions, this is one of the simplest ways of doing so.
       By simply providing a ``delete_scope``, this removes the references to these files
       in an atomic commit.

       .. code::

           commit_dataset(
               store=store,
               dataset_uuid='dataset_uuid',
               delete_scope=[
                   {
                       "partition_column": "part_value_to_be_removed"
                   }
               ],
           )

    3. Add additional metadata

       To add new metadata to an existing dataset:

       .. code::

           commit_dataset(
               store=store,
               dataset_uuid='dataset_uuid',
               metadata={"new": "user_metadata"},
           )

       .. note::

           If you do not want the new metadata to be merged with the existing one,
           provide a custom ``metadata_merger``.

    Parameters
    ----------
    new_partitions:
        Input partition to be committed.
    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not new_partitions and not metadata and not delete_scope:
        raise ValueError(
            "Need to provide either new data, new metadata or a delete scope. None of it was provided."
        )

    store = lazy_store(store)

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(new_partitions, metadata_version=metadata_version)

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps_list = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps_list,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd

def update_dataset_from_ddf(
    ddf,
    store=None,
    dataset_uuid=None,
    table=None,
    secondary_indices=None,
    shuffle=False,
    repartition_ratio=None,
    num_buckets=1,
    sort_partitions_by=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
):
    """
    Update a dataset from a dask.dataframe.

    .. admonition:: Behavior with ``shuffle==False``

        Without ``partition_on``, every dask partition is mapped to a single kartothek partition.

        With ``partition_on``, every dask partition is mapped to N kartothek partitions, where N
        depends on the content of the respective partition, such that every resulting kartothek
        partition has only a single value in the respective ``partition_on`` columns.

    .. admonition:: Behavior with ``shuffle==True``

        ``partition_on`` is mandatory.

        Perform a data shuffle to ensure that every primary key will have at most ``num_buckets`` files.

        .. note::
            The number of allowed buckets will have an impact on the required resources and runtime.
            Using a larger number of allowed buckets will usually reduce resource consumption and in
            some cases also improve runtime performance.

        :Example:

            >>> partition_on="primary_key"
            >>> num_buckets=2  # doctest: +SKIP
            primary_key=1/bucket1.parquet
            primary_key=1/bucket2.parquet

    .. note:: This can only be used for datasets with a single table!

    Parameters
    ----------
    ddf: Union[dask.dataframe.DataFrame, None]
        The dask.DataFrame to be used to calculate the new partitions from. If this parameter is
        `None`, the update pipeline will only delete partitions without creating new ones.
    shuffle: bool
        If True and ``partition_on`` is requested, shuffle the data to reduce the number of output
        partitions.
    repartition_ratio: Optional[Union[int, float]]
        If provided, repartition the dataframe before calculation starts to
        ``ceil(ddf.npartitions / repartition_ratio)``.
    num_buckets: int
        If provided, the output partitioning will have ``num_buckets`` files per primary key
        partitioning. This effectively splits up the execution ``num_buckets`` times. Setting this
        parameter may be helpful when scaling. This only has an effect if ``shuffle==True``.
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    if shuffle and not partition_on:
        raise ValueError(
            "If ``shuffle`` is requested, at least one ``partition_on`` column needs to be provided."
        )
    if ds_factory is not None:
        check_single_table_dataset(ds_factory, table)

    if repartition_ratio and ddf is not None:
        ddf = ddf.repartition(
            npartitions=int(np.ceil(ddf.npartitions / repartition_ratio))
        )

    if ddf is None:
        mps = [
            parse_input_to_metapartition(None, metadata_version=default_metadata_version)
        ]
    else:
        secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

        if shuffle and partition_on:
            mps = _update_dask_partitions_shuffle(
                ddf=ddf,
                table=table,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                num_buckets=num_buckets,
                sort_partitions_by=sort_partitions_by,
            )
        else:
            delayed_tasks = ddf.to_delayed()
            delayed_tasks = [{"data": {table: task}} for task in delayed_tasks]
            mps = _update_dask_partitions_one_to_one(
                delayed_tasks=delayed_tasks,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                sort_partitions_by=sort_partitions_by,
            )
    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )

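# Hedged sketch of driving ``update_dataset_from_ddf``; the store location, dataset
# UUID, table name and columns are assumptions for the example. The function returns
# a ``dask.delayed`` object, so nothing is written until ``.compute()`` is called.
from functools import partial
from tempfile import TemporaryDirectory

import dask.dataframe as dd
import pandas as pd
from storefact import get_store_from_url

_tmp = TemporaryDirectory()
store_factory = partial(get_store_from_url, "hfs://" + _tmp.name)

df = pd.DataFrame({"part": ["a", "a", "b", "b"], "value": [1, 2, 3, 4]})
ddf = dd.from_pandas(df, npartitions=2)

delayed_update = update_dataset_from_ddf(
    ddf,
    store=store_factory,
    dataset_uuid="ddf_example_uuid",  # hypothetical UUID
    table="table",                    # assumed single-table name
    partition_on=["part"],
    shuffle=True,
    num_buckets=2,
)
new_dataset_metadata = delayed_update.compute()
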
def update_dataset_from_dataframes(
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]],
    store: Optional[KeyValueStore] = None,
    dataset_uuid: Optional[str] = None,
    delete_scope=None,
    metadata=None,
    df_serializer: Optional[ParquetSerializer] = None,
    metadata_merger: Callable = None,
    central_partition_metadata: bool = True,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    load_dynamic_metadata: bool = True,
    sort_partitions_by: Optional[str] = None,
    secondary_indices: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
) -> DatasetMetadata:
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_list:
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory, secondary_indices)
    del secondary_indices

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=inferred_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(partial(sort_values_categorical, columns=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if inferred_indices:
        mp = mp.build_indices(inferred_indices)

    mp = mp.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )

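# Minimal hedged sketch for ``update_dataset_from_dataframes``; the store location,
# dataset UUID and columns are assumptions. It appends one partition and builds a
# secondary index on column "A"; for an already existing dataset, the partition_on
# and index configuration must match what is stored.
from functools import partial
from tempfile import TemporaryDirectory

import pandas as pd
from storefact import get_store_from_url

_tmp = TemporaryDirectory()
store_factory = partial(get_store_from_url, "hfs://" + _tmp.name)

new_dataset_metadata = update_dataset_from_dataframes(
    [pd.DataFrame({"A": [5, 6], "part": ["c", "c"]})],
    store=store_factory,
    dataset_uuid="update_example_uuid",  # hypothetical UUID
    partition_on=["part"],
    secondary_indices=["A"],
)
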
def test_dataframe_input_to_metapartition():
    with pytest.raises(ValueError):
        parse_input_to_metapartition(tuple([1]))
    with pytest.raises(ValueError):
        parse_input_to_metapartition("abc")