def test_build_indices(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]},
        {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]},
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    expected = {1: ["cluster_1"], 2: ["cluster_1", "cluster_2"], 3: ["cluster_2"]}
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)
def test_update_dataset_with_partitions_no_index_input_info(
    store_factory, metadata_version, bound_update_dataset, store
):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})},
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})},
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    # The input information doesn't explicitly provide index information.
    # Since the dataset has an index, it must be updated either way.
    part3 = {"label": "cluster_3", "data": [("core", pd.DataFrame({"p": [3]}))]}
    dataset_updated = bound_update_dataset(
        [part3],
        store=store_factory,
        dataset_uuid=dataset.uuid,
        delete_scope=[{"p": 1}],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_all_indices(store)
    assert 3 in dataset_updated.indices["p"].to_dict()
def test_metadata_version(
    store_factory,
    bound_update_dataset,
    mock_default_metadata_version,
    backend_identifier,
):
    if backend_identifier in ("dask.dataframe", "dask.delayed"):
        pytest.skip()  # TODO: fix `io.dask.*.test_update._update_dataset`

    dataset_uuid = "dataset_uuid"
    partitions = [
        {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]},
        {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]},
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=DEFAULT_METADATA_VERSION,
    )

    with pytest.raises(AssertionError, match="Traversed through mock"):
        # Try to commit data to the dataset using a different metadata version
        # and a different data format (the format is mocked).
        # This does not raise when the `parse_input_to_metapartition`
        # argument is `default_metadata_version` instead of `metadata_version`.
        new_partitions = ("core", pd.DataFrame({"p": [2, 3]}))
        bound_update_dataset(
            new_partitions,
            store=store_factory,
            dataset_uuid=dataset_uuid,
            default_metadata_version=mock_default_metadata_version,
        )

    mps = read_dataset_as_metapartitions(store=store_factory, dataset_uuid=dataset_uuid)
    assert len(mps) == len(dataset.partitions)
def test_dispatch_metapartitions_sorted_dispatch_by(store):
    df = pd.DataFrame(
        {"p": np.random.randint(high=100000, low=-100000, size=(100,)), "x": 0}
    )

    # A set of small integers may iterate in sorted order (likely a side
    # effect of CPython's integer hashing, where hash(i) == i).
    # Verify this is not happening here; otherwise we would immediately get a
    # sorted index, which is nice in this case but not generally true, of course.
    arr = set(df["p"].unique())
    assert list(arr) != sorted(arr)

    dataset = store_dataframes_as_dataset(
        dfs=[df], dataset_uuid="test", store=store, secondary_indices=["p", "x"]
    )

    wout_preds = list(dispatch_metapartitions(dataset.uuid, store, dispatch_by="p"))

    last = -math.inf
    for mps in wout_preds:
        for mp in mps:
            current = mp.logical_conjunction
            assert len(current) == 1
            current = current[0][2]
            assert current > last
        last = current
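# A minimal sketch (not from the original test suite) of the set-ordering
# quirk guarded against above: in CPython, hash(i) == i for small non-negative
# ints, so a set of them tends to iterate in sorted slot order. Drawing the
# test data from a wide random range keeps the `list(arr) != sorted(arr)`
# precondition meaningful. The function name below is hypothetical.
def _demo_small_int_set_ordering():  # pragma: no cover
    small = set(range(20))
    # CPython implementation detail, not a language guarantee:
    assert list(small) == sorted(small)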
def test_commit_dataset_delete_all(store, metadata_version):
    partitions = [pd.DataFrame({"p": [1]})]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=lambda: store,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        secondary_indices="p",
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)
    assert len(dataset.partitions) == 1

    delete_scope = [{"p": 1}]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=None,
        delete_scope=delete_scope,
        partition_on=None,
    )
    assert len(updated_dataset.partitions) == 0
    assert updated_dataset.explicit_partitions is True
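# Note on `delete_scope` as used above: it is a list of dicts mapping column
# names to values. Each dict describes one conjunction of equality
# constraints, and a partition matching any dict in the list is removed as
# part of the commit. `[{"p": 1}]` therefore drops every partition whose
# indexed "p" value equals 1.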
def build_cube(data, cube, store, metadata=None, overwrite=False, partition_on=None):
    """
    Store given dataframes as Ktk_cube cube.

    ``data`` can be formatted in multiple ways:

    - single DataFrame::

          pd.DataFrame({
              'x': [0, 1, 2, 3],
              'p': [0, 0, 1, 1],
              'v': [42, 45, 20, 10],
          })

      In that case, the seed dataset will be written.

    - dictionary of DataFrames::

          {
              'seed': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              'enrich': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v2': [False, False, True, False],
              }),
          }

      In that case, multiple datasets can be written at the same time. Note that the seed dataset MUST be included.

    - list of anything above::

          [
              # seed data only
              pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),

              # seed data only, explicit way
              {
                  'seed': pd.DataFrame({
                      'x': [4, 5, 6, 7],
                      'p': [0, 0, 1, 1],
                      'v1': [12, 32, 22, 9],
                  }),
              },

              # multiple datasets
              {
                  'seed': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v1': [9, 2, 4, 11],
                  }),
                  'enrich': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v2': [True, True, False, False],
                  }),
              },

              # non-seed data only
              {
                  'enrich': pd.DataFrame({
                      'x': [1, 2, 3, 4],
                      'p': [0, 0, 1, 1],
                      'v2': [False, True, False, False],
                  }),
              },
          ]

      In that case, multiple datasets may be written. Note that at least a single list element must contain seed data.

    Extra metadata may be preserved with every dataset, e.g.::

        {
            'seed': {
                'source': 'db',
                'host': 'db1.cluster20.company.net',
                'last_event': '230c6edb-b69a-4d30-b56d-28f5dfe20948',
            },
            'enrich': {
                'source': 'python',
                'commit_hash': '8b5d717518439921e6d17c7495956bdad687bc54',
            },
        }

    Note that the given metadata must be JSON-serializable.

    If the cube already exists, the ``overwrite`` flag must be given. In that case, all datasets that are part of the
    existing cube must be overwritten. Partial overwrites are not allowed.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
""" data = _normalize_user_input(data, cube) ktk_cube_dataset_ids = set(data.keys()) partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids, partition_on) metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids) existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store) check_datasets_prebuild(data, cube, existing_datasets) # do all data preparation before writing anything data = _prepare_data_for_ktk_all(data=data, cube=cube, existing_payload=set(), partition_on=partition_on) datasets = {} for ktk_cube_dataset_id, part in data.items(): datasets[ktk_cube_dataset_id] = store_dataframes_as_dataset( store=store, dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id), dfs=part, metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata), partition_on=list(partition_on[ktk_cube_dataset_id]), metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT, metadata_version=KTK_CUBE_METADATA_VERSION, df_serializer=KTK_CUBE_DF_SERIALIZER, overwrite=overwrite, ) return apply_postwrite_checks(datasets=datasets, cube=cube, store=store, existing_datasets=existing_datasets)
def extend_cube(data, cube, store, metadata=None, overwrite=False, partition_on=None):
    """
    Store given dataframes into an existing Kartothek cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)
    ktk_cube_dataset_ids = set(data.keys())
    partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids, partition_on)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    check_datasets_preextend(data, cube)

    existing_datasets = discover_datasets(cube, store)
    if overwrite:
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in data
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    # do all data preparation before writing anything
    data = _prepare_data_for_ktk_all(
        data=data,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=partition_on,
    )

    datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        datasets[ktk_cube_dataset_id] = store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )

    return apply_postwrite_checks(
        datasets=datasets, cube=cube, store=store, existing_datasets=existing_datasets
    )
def test_update_dataset_with_partitions__reducer_partitions(
    store_factory, frozen_time_em, bound_update_dataset
):
    assert set(store_factory().keys()) == set()

    df1 = pd.DataFrame(
        {"P": [1, 2, 3, 1, 2, 3], "L": [1, 1, 1, 1, 1, 1], "TARGET": np.arange(10, 16)}
    )
    df2 = df1.copy(deep=True)
    df2.L = 2
    df2.TARGET += 2
    df_list = [
        {
            "label": "cluster_1",
            "data": [("core", df1)],
            "indices": {"L": {k: ["cluster_1"] for k in df1["L"].unique()}},
        },
        {
            "label": "cluster_2",
            "data": [("core", df2)],
            "indices": {"L": {k: ["cluster_2"] for k in df2["L"].unique()}},
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        partition_on=["P"],
        metadata_version=4,
    )
    dataset_loadedidx = dataset.load_all_indices(store=store_factory())
    cluster_1_label = (
        dataset_loadedidx.indices["L"].eval_operator(op="==", value=1).pop()
    )
    cluster_1_label = cluster_1_label.split("/")[-1]
    cluster_2_label = (
        dataset_loadedidx.indices["L"].eval_operator(op="==", value=2).pop()
    )
    cluster_2_label = cluster_2_label.split("/")[-1]

    df3 = df2.copy(deep=True)
    df3.TARGET -= 5
    part3 = {
        "label": "cluster_3",
        "data": {"core": df3},
        "indices": {"L": {k: ["cluster_3"] for k in df3["L"].unique()}},
    }
    dataset_updated = bound_update_dataset(
        [part3],
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{"L": 2}],
        metadata={"extra": "metadata"},
        partition_on=["P"],
        secondary_indices=["L"],
    )
    dataset_updated_loadedidx = dataset_updated.load_all_indices(store=store_factory())
    cluster_3_labels = dataset_updated_loadedidx.indices["L"].eval_operator(
        op="==", value=2
    )
    cluster_3_label = {c3_label.split("/")[-1] for c3_label in cluster_3_labels}
    assert len(cluster_3_label) == 1
    cluster_3_label = cluster_3_label.pop()

    exp_partitions = [
        "P=1/{}".format(cluster_1_label),
        "P=1/{}".format(cluster_3_label),
        "P=2/{}".format(cluster_1_label),
        "P=2/{}".format(cluster_3_label),
        "P=3/{}".format(cluster_1_label),
        "P=3/{}".format(cluster_3_label),
    ]
    assert sorted(exp_partitions) == sorted(dataset_updated.partitions.keys())

    updated_idx_keys = sorted(dataset_updated.indices.keys())
    assert sorted(dataset.indices.keys()) == updated_idx_keys

    expected_new_idx = {}
    for k, v in dataset_loadedidx.indices["P"].index_dct.items():
        val = [pl.replace(cluster_2_label, cluster_3_label) for pl in v]
        expected_new_idx[k] = val

    updated_P_idx_dct = dataset_updated_loadedidx.indices["P"].index_dct
    assert sorted(expected_new_idx.keys()) == sorted(updated_P_idx_dct.keys())
    for k, v in updated_P_idx_dct.items():
        assert sorted(expected_new_idx[k]) == sorted(v)
def test_update_dataset_with_partitions__reducer_delete_only(
    store, metadata_version, frozen_time_em, bound_update_dataset
):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})},
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})},
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=lambda: store,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)

    empty_part = []
    dataset_updated = bound_update_dataset(
        [empty_part],
        store=lambda: store,
        dataset_uuid="dataset_uuid",
        delete_scope=[{"p": 1}],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)

    assert sorted(dataset.partitions) == ["cluster_1", "cluster_2"]
    assert list(dataset_updated.partitions) == ["cluster_2"]

    store_files = list(store.keys())
    # 1 dataset metadata file, 1 index file, and 2 partition files.
    # Note: the update writes a new index file, but due to frozen_time it gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 4
    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    assert dataset.indices["p"].index_dct == {1: ["cluster_1"], 2: ["cluster_2"]}
    assert dataset_updated.indices["p"].index_dct == {2: ["cluster_2"]}

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset
def test_overwrite_rollback_ktk(driver, function_store):
    """
    Checks that failures which require a rollback (like overlapping columns)
    recover the former state correctly.
    """
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3", "i4"],
    )

    df_source1 = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
            "i1": [10, 11, 12, 13],
        }
    )
    df_enrich1 = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "i2": [20, 21, 22, 23],
            "v1": [20, 21, 22, 23],
        }
    )
    store_dataframes_as_dataset(
        dfs=[{"ktk_source": df_source1, "ktk_enrich": df_enrich1}],
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid(cube.seed_dataset),
        metadata_version=KTK_CUBE_METADATA_VERSION,
        secondary_indices=["i1", "i2"],
    )

    df_source2 = pd.DataFrame(
        {
            "x": [10, 11],
            "p": [10, 10],
            "v1": [10.0, 11.0],  # also use another dtype here (was int)
            "i3": [10, 11],
        }
    )
    df_enrich2 = pd.DataFrame(
        {
            "x": [10, 11],
            "p": [10, 10],
            "v1": [20.0, 21.0],  # also use another dtype here (was int)
            "i4": [20, 21],
        }
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source2, "enrich": df_enrich2},
            cube=cube,
            store=function_store,
            overwrite=True,
        )
    cause = exc_info.value.__cause__
    assert str(cause).startswith("Found columns present in multiple datasets:")

    ds_source = DatasetMetadata.load_from_store(
        uuid=cube.ktk_dataset_uuid(cube.seed_dataset), store=function_store()
    ).load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert len(ds_source.partitions) == 1
    assert ds_source.table_meta["ktk_source"].field("v1").type == pa.int64()
    assert ds_source.table_meta["ktk_enrich"].field("v1").type == pa.int64()
def test_dispatch_metapartitions_complex_or_predicates(store_factory):
    dataset_uuid = "test"
    df = pd.DataFrame({"A": range(10), "B": ["A", "B"] * 5, "C": range(-10, 0)})
    store_dataframes_as_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        dfs=[df],
        partition_on=["A", "B"],
    )

    predicates = [[("A", "<", 3)], [("B", "==", "B")]]
    mps = [
        mp.load_dataframes(store_factory)
        for mp in dispatch_metapartitions(
            dataset_uuid, store_factory, predicates=predicates
        )
    ]
    actual = pd.concat([mp.data["table"] for mp in mps])
    actual = actual.sort_values(by="A", ignore_index=True)
    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 3, 5, 7, 9],
            "B": ["A", "B", "A", "B", "B", "B", "B"],
            "C": [-10, -9, -8, -7, -5, -3, -1],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)

    predicates = [[("A", "<", 3)], [("B", "==", "notthere")]]
    mps = [
        mp.load_dataframes(store_factory)
        for mp in dispatch_metapartitions(
            dataset_uuid, store_factory, predicates=predicates
        )
    ]
    actual = pd.concat([mp.data["table"] for mp in mps])
    actual = actual.sort_values(by="A", ignore_index=True)
    expected = pd.DataFrame(
        data={"A": [0, 1, 2], "B": ["A", "B", "A"], "C": [-10, -9, -8]}
    )
    pd.testing.assert_frame_equal(actual, expected)

    predicates = [[("A", "<", 3), ("B", "==", "A")], [("B", "==", "B"), ("A", ">", 2)]]
    mps = [
        mp.load_dataframes(store_factory)
        for mp in dispatch_metapartitions(
            dataset_uuid, store_factory, predicates=predicates
        )
    ]
    actual = pd.concat([mp.data["table"] for mp in mps])
    actual = actual.sort_values(by="A", ignore_index=True)
    expected = pd.DataFrame(
        data={
            "A": [0, 2, 3, 5, 7, 9],
            "B": ["A", "A", "B", "B", "B", "B"],
            "C": [-10, -8, -7, -5, -3, -1],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)

    predicates = [[("A", "<", 3)], [("B", "==", "B"), ("A", ">", 2)]]
    mps = [
        mp.load_dataframes(store_factory)
        for mp in dispatch_metapartitions(
            dataset_uuid, store_factory, predicates=predicates
        )
    ]
    actual = pd.concat([mp.data["table"] for mp in mps])
    actual = actual.sort_values(by="A", ignore_index=True)
    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 3, 5, 7, 9],
            "B": ["A", "B", "A", "B", "B", "B", "B"],
            "C": [-10, -9, -8, -7, -5, -3, -1],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)
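# Note on the predicate format used above: predicates are given in disjunctive
# normal form. The outer list ORs its elements; each inner list ANDs its
# `(column, operator, value)` triples. `[[("A", "<", 3)], [("B", "==", "B")]]`
# therefore selects rows where A < 3 OR B == "B".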
def assert_hive_compat(df, store_factory, uuid, **kwargs):
    TABLE_NAME = uuid  # Hive table name

    dm = store_dataframes_as_dataset(
        store=store_factory, dataset_uuid=uuid, dfs=[df], **kwargs
    )
    store = store_factory()

    print(f"Dataset location: {VOLUME_LOCATION}")

    # Use PyHive to query Hive
    conn = hive.Connection(host="hive-server", port=10000)
    cursor = conn.cursor()

    # TODO: test partitioned dataset
    for filepath in store.iter_keys():
        if filepath.endswith(".parquet"):
            parquet_file_parentdir = f"{VOLUME_LOCATION}/{os.path.dirname(filepath)}"
            break

    if kwargs.get("partition_on"):
        # Get the parent directory of the parquet file for each column it is
        # partitioned on.
        # Note: a parquet filepath looks like
        # `/tmp/uuid/table/partition_col1=x/partition_col2=y/1300dadda3.parquet`
        for _ in kwargs.get("partition_on"):
            parquet_file_parentdir = os.path.dirname(parquet_file_parentdir)

    # Create the Hive table.
    ## Non-nested columns not included: `np.uint64` (max value is too large for `BIGINT`)
    ## The `null` column can be specified as multiple types (at least `STRING` and `FLOAT`)
    # TODO: have a mapping from kartothek/arrow dtypes to Hive dtypes
    selected_columns_and_dtypes = """\
        bool BOOLEAN,
        bytes BINARY,
        date_ DATE,
        datetime64 BIGINT,
        float32 FLOAT,
        float64 DOUBLE,
        int8 TINYINT,
        int16 SMALLINT,
        int32 INT,
        int64 BIGINT,
        uint8 SMALLINT,
        uint16 INT,
        uint32 BIGINT,
        unicode STRING,
        null_ FLOAT"""

    # Hive allows us to select only a subset of the columns stored in the Parquet file
    hive_query = f"""
      CREATE external table {TABLE_NAME} (
        {selected_columns_and_dtypes}
      ) STORED AS PARQUET LOCATION "{parquet_file_parentdir}"
    """
    print(f"Hive query: {hive_query}")
    cursor.execute(hive_query)

    # Get the column names from the query substring
    selected_columns = [
        line.strip().split(" ")[0]
        for line in selected_columns_and_dtypes.splitlines()
    ]

    # Read the Hive table into pandas
    hive_df = pd.read_sql(f"SELECT * FROM {TABLE_NAME}", conn)
    hive_df.columns = selected_columns

    # Pyarrow stores timestamps as microseconds since epoch; convert to datetime
    hive_df["datetime64"] = pd.to_datetime(
        hive_df.loc[:, "datetime64"] * 1000, unit="ns"
    )
    # Output from Hive is a string; parse it to a date
    hive_df["date_"] = pd.to_datetime(
        hive_df.loc[:, "date_"], format="%Y-%m-%d"
    ).apply(lambda x: x.date())

    # Ignore dtype for numeric comparisons (e.g. int32 with int64)
    pdt.assert_frame_equal(df[selected_columns], hive_df, check_dtype=False)
    print(f"Test completed for the following data types: {selected_columns}")
def test_indices_uints(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"

    # min uint64
    p1 = 0

    # max uint64 => cannot even be cast to float32
    p2 = int(~np.uint64(0))

    # a number that would be truncated if converted to float64 and back
    p3 = 17128351978467489013

    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": pd.Series([p1], dtype=np.uint64)}))],
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": pd.Series([p2], dtype=np.uint64)}))],
        },
        {
            "label": "cluster_3",
            "data": [("core", pd.DataFrame({"p": pd.Series([p3], dtype=np.uint64)}))],
        },
    ]
    expected = {p1: ["cluster_1"], p2: ["cluster_2"], p3: ["cluster_3"]}

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)

    # Re-create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly re-created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)
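# A tiny illustrative sketch (not from the original test suite) of why p3
# above is interesting: it exceeds 2**53, so a float64 round-trip silently
# changes the value, while uint64 preserves it exactly. The function name is
# hypothetical.
def _demo_uint64_float64_roundtrip():  # pragma: no cover
    p3 = 17128351978467489013
    assert int(float(p3)) != p3  # precision is lost in float64
    assert int(np.uint64(p3)) == p3  # the value survives in uint64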