def test_raises_other_index_missing(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=MetaPartition(
            label=gen_uuid(),
            data={
                SINGLE_TABLE: pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]})
            },
            metadata_version=KTK_CUBE_METADATA_VERSION,
        ).build_indices(["x", "y"]),
        name=cube.seed_dataset,
    )
    store_data(
        cube=cube,
        function_store=function_store,
        df=MetaPartition(
            label=gen_uuid(),
            data={
                SINGLE_TABLE: pd.DataFrame(
                    {"x": [0], "y": [0], "p": [0], "q": [0], "i1": [1337]}
                )
            },
            metadata_version=KTK_CUBE_METADATA_VERSION,
        ),
        name="enrich",
    )
    with pytest.raises(ValueError) as exc:
        discover_datasets(cube, function_store)
    assert (
        str(exc.value)
        == 'ExplicitSecondaryIndex or PartitionIndex "i1" is missing in dataset "enrich".'
    )

def test_accepts_projected_datasets(self, cube, function_store):
    expected = {
        cube.seed_dataset: store_data(
            cube=cube,
            function_store=function_store,
            df=MetaPartition(
                label=gen_uuid(),
                data=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
                metadata_version=KTK_CUBE_METADATA_VERSION,
            ).build_indices(["x", "y"]),
            name=cube.seed_dataset,
        ),
        "x": store_data(
            cube=cube,
            function_store=function_store,
            df=MetaPartition(
                label=gen_uuid(),
                data=pd.DataFrame({"x": [0], "p": [0], "q": [0], "v1": [42]}),
                metadata_version=KTK_CUBE_METADATA_VERSION,
            ),
            name="x",
        ),
        "y": store_data(
            cube=cube,
            function_store=function_store,
            df=MetaPartition(
                label=gen_uuid(),
                data=pd.DataFrame({"y": [0], "p": [0], "q": [0], "v2": [42]}),
                metadata_version=KTK_CUBE_METADATA_VERSION,
            ),
            name="y",
        ),
    }
    actual = discover_datasets(cube, function_store)
    assert_datasets_equal(actual, expected)

def test_accepts_partition_index_for_index(self, cube, function_store):
    expected = {
        cube.seed_dataset: store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v1": [0]}),
            name=cube.seed_dataset,
        ),
        "enrich": store_data(
            cube=cube,
            function_store=function_store,
            df=MetaPartition(
                label=gen_uuid(),
                data={
                    SINGLE_TABLE: pd.DataFrame(
                        {"x": [0], "y": [0], "i1": [1337], "v2": [42]}
                    )
                },
                metadata_version=KTK_CUBE_METADATA_VERSION,
            ),
            name="enrich",
            partition_on=["i1"],
        ),
    }
    actual = discover_datasets(cube, function_store)
    assert_datasets_equal(actual, expected)

def test_store_dataframes_as_dataset_mp(metadata_version, store):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={"core": df, "helper": df2},
        metadata_version=metadata_version,
    )

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs=mp,
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 1
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset

def test_store_dataframes_as_dataset_mp_partition_on_none(
    metadata_version, store, store_factory, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={"core": df, "helper": df2},
        metadata_version=metadata_version,
    )

    df_list = [None, mp]
    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        partition_on=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert dataset.partition_keys == ["P"]
    assert len(dataset.partitions) == 10
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    assert dataset == stored_dataset

def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store `pd.DataFrame` s iteratively as a partitioned dataset with multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df, metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
        )

        # Add `kartothek.io_components.metapartition.MetaPartition` object to list to track partitions
        new_partitions.append(mp)

    # Store metadata and return `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )

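# Illustrative usage sketch (not part of the library): stream DataFrame chunks
# into one partitioned dataset without materializing them all in memory.
# ``store`` is assumed to be any simplekv-compatible store (or factory)
# supplied by the caller; the generator below is a made-up example.
def _example_store_dataframes_iteratively(store):
    import pandas as pd

    def chunk_generator(n_chunks=3):
        # One DataFrame per yield; each yield becomes (at least) one file.
        for i in range(n_chunks):
            yield pd.DataFrame({"P": [i] * 4, "value": list(range(4))})

    return store_dataframes_as_dataset__iter(
        chunk_generator(),
        store=store,
        dataset_uuid="streamed_dataset",
        partition_on=["P"],
        secondary_indices=["value"],
    )
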
def store_delayed_as_dataset(
    delayed_tasks: List[Delayed],
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    table_name: str = SINGLE_TABLE,
    secondary_indices=None,
) -> Delayed:
    """
    Transform and store a list of dictionaries containing
    dataframes to a kartothek dataset in store.

    Parameters
    ----------
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(
        parse_input_to_metapartition,
        metadata_version=metadata_version,
        table_name=table_name,
    )
    mps = map_delayed(input_to_mps, delayed_tasks)

    if partition_on:
        mps = map_delayed(MetaPartition.partition_on, mps, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(MetaPartition.build_indices, mps, columns=secondary_indices)

    mps = map_delayed(
        MetaPartition.store_dataframes,
        mps,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

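# Illustrative usage sketch (not part of the library): each delayed task yields
# one partition's DataFrame, and the returned Delayed writes the dataset
# metadata when computed. ``store_factory`` is assumed to be a user-provided
# store factory; names and values are made up for demonstration.
def _example_store_delayed(store_factory):
    import pandas as pd
    from dask import delayed as dask_delayed

    tasks = [
        dask_delayed(pd.DataFrame)({"P": [i], "value": [i * 10.0]}) for i in range(3)
    ]
    graph = store_delayed_as_dataset(
        tasks,
        store=store_factory,
        dataset_uuid="delayed_dataset",
        partition_on=["P"],
    )
    # Nothing is written until the delayed graph is computed.
    return graph.compute()
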
def store_data(
    cube,
    function_store,
    df,
    name,
    partition_on="default",
    metadata_version=KTK_CUBE_METADATA_VERSION,
    metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
    metadata=None,
    overwrite=False,
    new_ktk_cube_metadata=True,
    write_suppress_index_on=True,
):
    if partition_on == "default":
        partition_on = cube.partition_columns

    if isinstance(df, pd.DataFrame):
        mp = MetaPartition(
            label=gen_uuid(),
            data={SINGLE_TABLE: df},
            metadata_version=metadata_version,
        )

        indices_to_build = set(cube.index_columns) & set(df.columns)
        if name == cube.seed_dataset:
            indices_to_build |= set(cube.dimension_columns) - set(
                cube.suppress_index_on
            )
        mp = mp.build_indices(indices_to_build)
        dfs = mp
    else:
        assert isinstance(df, MetaPartition)
        assert df.metadata_version == metadata_version
        dfs = df

    if metadata is None:
        metadata = {
            KTK_CUBE_METADATA_DIMENSION_COLUMNS: cube.dimension_columns,
            KTK_CUBE_METADATA_KEY_IS_SEED: (name == cube.seed_dataset),
        }
        if new_ktk_cube_metadata:
            metadata.update(
                {KTK_CUBE_METADATA_PARTITION_COLUMNS: cube.partition_columns}
            )
        if write_suppress_index_on:
            metadata.update(
                {KTK_CUBE_METADATA_SUPPRESS_INDEX_ON: list(cube.suppress_index_on)}
            )

    return store_dataframes_as_dataset(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid(name),
        dfs=dfs,
        partition_on=list(partition_on) if partition_on else None,
        metadata_storage_format=metadata_storage_format,
        metadata_version=metadata_version,
        df_serializer=KTK_CUBE_DF_SERIALIZER,
        metadata=metadata,
        overwrite=overwrite,
    )

def parse_input_to_metapartition(
    obj: MetaPartitionInput,
    table_name: str = SINGLE_TABLE,
    metadata_version: Optional[int] = None,
) -> MetaPartition:
    """
    Parse the given user input and return a MetaPartition.

    The expected input is a :class:`pandas.DataFrame` or a list of
    :class:`pandas.DataFrame`.

    Every element of the list will be treated as a dedicated user input and
    will result in a physical file, if not specified otherwise.

    Parameters
    ----------
    obj
    table_name
        The table name assigned to the partitions
    metadata_version
        The kartothek dataset specification version
    """
    if obj is None:
        obj = []
    if isinstance(obj, list):
        if len(obj) == 0:
            return MetaPartition(label=None, metadata_version=metadata_version)
        first_element = obj[0]
        mp = parse_input_to_metapartition(
            obj=first_element,
            metadata_version=metadata_version,
            table_name=table_name,
        )
        for mp_in in obj[1:]:
            mp = mp.add_metapartition(
                parse_input_to_metapartition(
                    obj=mp_in,
                    metadata_version=metadata_version,
                    table_name=table_name,
                )
            )
    elif isinstance(obj, pd.DataFrame):
        mp = MetaPartition(
            label=gen_uuid(),
            data=obj,
            metadata_version=metadata_version,
            table_name=table_name,
        )
    elif isinstance(obj, MetaPartition):
        return obj
    else:
        raise ValueError(
            f"Unexpected type during parsing encountered: ({type(obj)}, {obj})"
        )

    return mp

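# Illustrative sketch (not part of the library): a single DataFrame becomes one
# MetaPartition, while a list is merged into one MetaPartition with one
# sub-partition (physical file) per element. ``metadata_version=4`` is assumed
# here purely for the example.
def _example_parse_input():
    import pandas as pd

    single = parse_input_to_metapartition(
        pd.DataFrame({"x": [1]}), metadata_version=4
    )
    multiple = parse_input_to_metapartition(
        [pd.DataFrame({"x": [1]}), pd.DataFrame({"x": [2]})], metadata_version=4
    )
    # ``multiple`` carries one sub-partition per input DataFrame.
    return single, multiple
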
def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset
    with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed,
        it will be stored as the `core` table.

    Returns
    -------
    The stored dataset
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version=metadata_version)

    if partition_on:
        mp = MetaPartition.partition_on(mp, partition_on)

    mps = mp.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )

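# Illustrative usage sketch (not part of the library): store two named tables
# for the same logical partition in one eager call. ``store`` is assumed to be
# a user-provided simplekv-compatible store; table names and data are made up.
def _example_store_small_dataset(store):
    import pandas as pd

    dfs = {
        "core": pd.DataFrame({"P": [1, 2], "value": [10.0, 20.0]}),
        "helper": pd.DataFrame({"P": [1, 2], "info": ["a", "b"]}),
    }
    return store_dataframes_as_dataset(
        store=store,
        dataset_uuid="small_dataset",
        dfs=dfs,
        partition_on=["P"],
    )
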
def test_raises_missing_dimension_columns(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=MetaPartition(
            label=gen_uuid(),
            data={SINGLE_TABLE: pd.DataFrame({"x": [0], "p": [0], "q": [0]})},
            metadata_version=KTK_CUBE_METADATA_VERSION,
        ).build_indices(["x"]),
        name=cube.seed_dataset,
    )
    with pytest.raises(ValueError) as exc:
        discover_datasets(cube, function_store)
    assert (
        str(exc.value) == 'Seed dataset "myseed" has missing dimension columns: y'
    )

def test_raises_wrong_table(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=MetaPartition(
            label=gen_uuid(),
            data={"foo": pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]})},
            metadata_version=KTK_CUBE_METADATA_VERSION,
        ),
        name=cube.seed_dataset,
    )
    with pytest.raises(ValueError) as exc:
        discover_datasets(cube, function_store)
    assert (
        str(exc.value)
        == "Invalid datasets because table is wrong. Expected table: myseed (foo)"
    )

def test_raises_dimension_index_missing(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=MetaPartition(
            label=gen_uuid(),
            data=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
            metadata_version=KTK_CUBE_METADATA_VERSION,
        ),
        name=cube.seed_dataset,
    )
    with pytest.raises(ValueError) as exc:
        discover_datasets(cube, function_store)
    assert (
        str(exc.value)
        == 'ExplicitSecondaryIndex "x" is missing in dataset "myseed".'
    )

def prepare_data_for_ktk(
    df, ktk_cube_dataset_id, cube, existing_payload, partition_on, consume_df=False
):
    """
    Prepare data so it can be handed over to Kartothek.

    Some checks will be applied to the data to ensure it is sane.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame to be passed to Kartothek.
    ktk_cube_dataset_id: str
        Ktk_cube dataset UUID (w/o cube prefix).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    existing_payload: Set[str]
        Existing payload columns.
    partition_on: Iterable[str]
        Partition-on attribute for given dataset.
    consume_df: bool
        Whether the incoming DataFrame can be destroyed while processing it.

    Returns
    -------
    mp: kartothek.io_components.metapartition.MetaPartition
        Kartothek-ready MetaPartition, may be sentinel (aka empty and w/o label).

    Raises
    ------
    ValueError
        In case anything is fishy.
    """
    check_user_df(ktk_cube_dataset_id, df, cube, existing_payload, partition_on)

    if (df is None) or df.empty:
        # fast-path for empty DF
        return MetaPartition(
            label=None,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            partition_keys=list(partition_on),
        )

    # TODO: find a more elegant solution that works w/o copy
    df_orig = df
    df = df.copy()
    if consume_df:
        # the original df is still referenced in the parent scope, so drop it
        df_orig.drop(columns=df_orig.columns, index=df_orig.index, inplace=True)
    df_columns = list(df.columns)
    df_columns_set = set(df_columns)

    # normalize value order and reset index
    sort_keys = [
        col
        for col in itertools.chain(cube.partition_columns, cube.dimension_columns)
        if col in df_columns_set
    ]
    df = sort_dataframe(df=df, columns=sort_keys)

    # check duplicate cells
    _check_duplicates(ktk_cube_dataset_id, df, sort_keys, cube)

    # check+convert column names to unicode strings
    df.rename(columns={c: converter_str(c) for c in df_columns}, inplace=True)

    # create MetaPartition object for easier handling
    mp = MetaPartition(
        label=gen_uuid(),
        data=df,
        metadata_version=KTK_CUBE_METADATA_VERSION,
    )
    del df

    # partition data
    mp = mp.partition_on(list(partition_on))

    # reset indices again (because partition_on breaks it)
    for mp2 in mp:
        mp2.data.reset_index(drop=True, inplace=True)
        del mp2

    # calculate indices
    indices_to_build = set(cube.index_columns) & df_columns_set
    if ktk_cube_dataset_id == cube.seed_dataset:
        indices_to_build |= set(cube.dimension_columns) - set(cube.suppress_index_on)
    indices_to_build -= set(partition_on)

    mp = mp.build_indices(indices_to_build)

    return mp

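# Illustrative sketch (not part of the library): prepare a user DataFrame for a
# small, hypothetical cube before writing. The Cube construction and column
# names below are assumptions made only for this example.
def _example_prepare_for_ktk():
    import pandas as pd
    from kartothek.core.cube.cube import Cube

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="example_cube",
    )
    df = pd.DataFrame({"x": [1, 0], "p": [0, 0], "v": [10, 20]})
    # Expected result: a MetaPartition partitioned on "p", with rows sorted by
    # (p, x) and the seed-dataset indices built.
    return prepare_data_for_ktk(
        df=df,
        ktk_cube_dataset_id=cube.seed_dataset,
        cube=cube,
        existing_payload=set(),
        partition_on=cube.partition_columns,
    )
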
def store_delayed_as_dataset(
    delayed_tasks,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a list of dictionaries containing
    dataframes to a kartothek dataset in store.

    Parameters
    ----------
    delayed_tasks: list of dask.delayed
        Every delayed object represents a partition and should be accepted by
        :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`

    Returns
    -------
    A dask.delayed dataset object.
    """
    _check_callable(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    input_to_mps = partial(
        parse_input_to_metapartition, metadata_version=metadata_version
    )
    mps = map_delayed(delayed_tasks, input_to_mps)

    if partition_on:
        mps = map_delayed(mps, MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(mps, MetaPartition.build_indices, columns=secondary_indices)

    mps = map_delayed(
        mps,
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a dask.bag of dictionaries containing
    dataframes to a kartothek dataset in store.

    This is the dask.bag-equivalent of
    :func:`~kartothek.io.dask.delayed.store_delayed_as_dataset`. See there
    for more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag.Bag
        A dask bag containing dictionaries of dataframes or dataframes.
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(
        parse_input_to_metapartition, metadata_version=metadata_version
    )
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

    return mps.reduction(perpartition=list, aggregate=aggregate, split_every=False)

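# Illustrative usage sketch (not part of the library): wrap per-partition
# DataFrames into a dask.bag and persist them as a single dataset.
# ``store_factory`` is assumed to be a user-provided store factory; the data
# below is made up for demonstration.
def _example_store_bag(store_factory):
    import dask.bag as db
    import pandas as pd

    bag = db.from_sequence(
        [pd.DataFrame({"P": [i], "value": [i * 10.0]}) for i in range(3)],
        npartitions=3,
    )
    graph = store_bag_as_dataset(
        bag,
        store=store_factory,
        dataset_uuid="bag_dataset",
        partition_on=["P"],
    )
    # The write happens only when the resulting dask graph is computed.
    return graph.compute()
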
def _f(b_c):
    b, c = b_c
    df = pd.DataFrame({"a": [1, 1], "b": [b, b], "c": c, "d": [b, b + 1]})
    return {"label": gen_uuid(), "data": [(table_name, df)]}