def test_table_meta(store):
    """Schema inferred at construction matches the expected empty-frame meta and survives storing."""
    frame = pd.DataFrame(
        {
            "i32": np.array([1, 2, 3, 1, 2, 3], dtype="int32"),
            "float": np.array([1, 1, 1, 2, 2, 2], dtype="float64"),
        }
    )
    mp = MetaPartition(label="label_1", data=frame, metadata_version=4)
    assert mp.schema is not None

    # The stored schema is derived from dtypes only, so an empty frame with the
    # same column dtypes produces the expected reference meta.
    expected_meta = make_meta(
        pd.DataFrame(
            {
                "i32": np.array([], dtype="int32"),
                "float": np.array([], dtype="float64"),
            }
        ),
        origin="1",
    )
    assert mp.schema == expected_meta

    # Persisting the partition must not alter the schema.
    mp = mp.store_dataframes(store, "dataset_uuid")
    assert mp.schema == expected_meta
def test_store_single_dataframe_as_partition(store, metadata_version):
    """Storing one DataFrame writes exactly one parquet file under the expected key."""
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(label="test_label", data=df, metadata_version=metadata_version)
    stored = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
    )

    # After storing, the in-memory payload is dropped; only the file reference remains.
    assert stored.data is None
    expected_key = "dataset_uuid/table/test_label.parquet"
    assert stored.file == expected_key
    assert stored.label == "test_label"

    keys = list(store.keys())
    assert len(keys) == 1

    # Round-trip: the persisted parquet must deserialize back to the original frame.
    restored = DataFrameSerializer.restore_dataframe(store=store, key=expected_key)
    pdt.assert_frame_equal(df, restored)

    # The single key in the store is exactly the expected one.
    keys.remove(expected_key)
    assert not keys
def test_load_dataframe_logical_conjunction(store, metadata_version):
    """The partition's logical conjunction (P > 4) filters rows on load, combined with predicates."""
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(
        label="cluster_1",
        data=df,
        metadata_version=metadata_version,
        logical_conjunction=[("P", ">", 4)],
    )
    stored = mp.store_dataframes(
        store=store, df_serializer=None, dataset_uuid="dataset_uuid"
    )

    # No extra predicates: only the conjunction filters rows.
    loaded = stored.load_dataframes(store=store, predicates=None)
    expected = pd.DataFrame(
        {"P": [5, 6, 7, 8, 9], "L": [5, 6, 7, 8, 9], "TARGET": [15, 16, 17, 18, 19]}
    ).set_index(np.arange(5, 10))
    pdt.assert_frame_equal(loaded.data, expected)

    # A single predicate list is AND-ed with the conjunction.
    loaded = stored.load_dataframes(
        store=store, predicates=[[("L", ">", 6), ("TARGET", "<", 18)]]
    )
    expected = pd.DataFrame({"P": [7], "L": [7], "TARGET": [17]}).set_index(
        np.array([7])
    )
    pdt.assert_frame_equal(loaded.data, expected)

    # Multiple predicate lists are OR-ed; each is still AND-ed with the conjunction.
    loaded = stored.load_dataframes(
        store=store,
        predicates=[[("L", ">", 2), ("TARGET", "<", 17)], [("TARGET", "==", 19)]],
    )
    expected = pd.DataFrame(
        {"P": [5, 6, 9], "L": [5, 6, 9], "TARGET": [15, 16, 19]}
    ).set_index(np.array([5, 6, 9]))
    pdt.assert_frame_equal(loaded.data, expected)
def test_get_parquet_metadata_empty_df(store):
    """Parquet metadata for an empty DataFrame reports a single empty row group."""
    df = pd.DataFrame()
    mp = MetaPartition(label="test_label", data=df)
    meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid")

    actual = meta_partition.get_parquet_metadata(store=store)
    # Size columns vary with serializer/compression details, so they are excluded
    # from the comparison. The original passed both `columns=` and `axis=1` to
    # `drop`; `axis` is redundant (ignored) when `columns=` is given, and the
    # discouraged `inplace=True` is replaced by rebinding the result.
    actual = actual.drop(
        columns=[
            "serialized_size",
            "row_group_compressed_size",
            "row_group_uncompressed_size",
        ]
    )

    expected = pd.DataFrame(
        {
            "partition_label": ["test_label"],
            "row_group_id": 0,
            "number_rows_total": 0,
            "number_row_groups": 1,
            "number_rows_per_row_group": 0,
        }
    )
    pd.testing.assert_frame_equal(actual, expected)
def test_get_parquet_metadata_row_group_size(store):
    """A chunk_size of 5 splits 10 rows into two row groups of 5 rows each."""
    df = pd.DataFrame({"P": np.arange(0, 10), "L": np.arange(0, 10)})
    mp = MetaPartition(label="test_label", data=df)
    ps = ParquetSerializer(chunk_size=5)
    meta_partition = mp.store_dataframes(
        store=store, dataset_uuid="dataset_uuid", df_serializer=ps
    )

    actual = meta_partition.get_parquet_metadata(store=store)
    # Size columns vary with serializer/compression details, so they are excluded
    # from the comparison. The original passed both `columns=` and `axis=1` to
    # `drop`; `axis` is redundant (ignored) when `columns=` is given, and the
    # discouraged `inplace=True` is replaced by rebinding the result.
    actual = actual.drop(
        columns=[
            "serialized_size",
            "row_group_compressed_size",
            "row_group_uncompressed_size",
        ]
    )

    expected = pd.DataFrame(
        {
            "partition_label": ["test_label", "test_label"],
            "row_group_id": [0, 1],
            "number_rows_total": [10, 10],
            "number_row_groups": [2, 2],
            "number_rows_per_row_group": [5, 5],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)
def test_store_single_dataframe_as_partition_no_metadata(store, metadata_version):
    """With store_metadata=False only the parquet payload is written — no metadata file."""
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(
        label="test_label", data={"core": df}, metadata_version=metadata_version
    )
    partition = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=False,
    )

    assert len(partition.data) == 0
    expected_file = "dataset_uuid/core/test_label.parquet"
    assert partition.files == {"core": expected_file}
    assert partition.label == "test_label"

    # Exactly one object in the store: the parquet file itself (metadata suppressed).
    assert len(list(store.keys())) == 1

    restored = DataFrameSerializer.restore_dataframe(store=store, key=expected_file)
    pdt.assert_frame_equal(df, restored)
def test_store_single_dataframe_as_partition(
    store, metadata_storage_format, metadata_version, expected_key
):
    """Storing a single-table partition yields exactly the fixture-provided key in the store."""
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(
        label="test_label", data={"core": df}, metadata_version=metadata_version
    )
    stored = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=True,
        metadata_storage_format=metadata_storage_format,
    )

    # Payload is dropped after storing; only the file mapping remains.
    assert len(stored.data) == 0
    assert stored.files == {"core": expected_key}
    assert stored.label == "test_label"

    keys = list(store.keys())
    assert len(keys) == 1

    # Round-trip: the persisted parquet must deserialize back to the original frame.
    restored = DataFrameSerializer.restore_dataframe(store=store, key=expected_key)
    pdt.assert_frame_equal(df, restored)

    # The single key in the store is exactly the expected one.
    keys.remove(expected_key)
    assert not keys
def test_table_meta(store):
    """Per-table meta for the 'core' table matches the expected empty-frame meta and survives storing."""
    core_frame = pd.DataFrame(
        {
            "i32": np.array([1, 2, 3, 1, 2, 3], dtype="int32"),
            "float": np.array([1, 1, 1, 2, 2, 2], dtype="float64"),
        }
    )
    mp = MetaPartition(label="label_1", data={"core": core_frame}, metadata_version=4)

    # Exactly one table registered, keyed by its name.
    assert len(mp.table_meta) == 1
    assert "core" in mp.table_meta

    # Meta is dtype-derived, so an empty frame with identical dtypes is the reference.
    expected_meta = make_meta(
        pd.DataFrame(
            {
                "i32": np.array([], dtype="int32"),
                "float": np.array([], dtype="float64"),
            }
        ),
        origin="1",
    )
    assert mp.table_meta["core"] == expected_meta

    # Persisting the partition must not alter the table meta.
    mp = mp.store_dataframes(store, "dataset_uuid")
    assert mp.table_meta["core"] == expected_meta
def _multiplex_store(data, cube, store):
    """Store every entry of ``data`` as its own ktk dataset and return the results keyed alike.

    Entries are popped and deleted as they are processed so large payloads are
    released eagerly; ``sorted(...)`` snapshots the keys, making the pops safe.
    """
    stored = {}
    for key in sorted(data):
        payload = data.pop(key)
        stored[key] = MetaPartition.store_dataframes(
            payload,
            dataset_uuid=cube.ktk_dataset_uuid(key),
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            store=store,
        )
        del payload
    return stored
def test_store_multiple_dataframes_as_partition(
    store, metadata_storage_format, metadata_version
):
    """A two-table partition writes one parquet file per table under the expected keys."""
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    df_2 = pd.DataFrame({"P": np.arange(0, 10), "info": string.ascii_lowercase[:10]})
    mp = MetaPartition(
        label="cluster_1",
        data={"core": df, "helper": df_2},
        metadata_version=metadata_version,
    )
    stored = mp.store_dataframes(
        store=store,
        df_serializer=None,
        dataset_uuid="dataset_uuid",
        store_metadata=True,
        metadata_storage_format=metadata_storage_format,
    )

    expected_file = "dataset_uuid/core/cluster_1.parquet"
    expected_file_helper = "dataset_uuid/helper/cluster_1.parquet"
    assert stored.files == {"core": expected_file, "helper": expected_file_helper}
    assert stored.label == "cluster_1"

    keys = list(store.keys())
    assert len(keys) == 2

    # Round-trip both tables and confirm the store contains exactly those two keys.
    restored = DataFrameSerializer.restore_dataframe(store=store, key=expected_file)
    pdt.assert_frame_equal(df, restored)
    keys.remove(expected_file)

    restored = DataFrameSerializer.restore_dataframe(
        store=store, key=expected_file_helper
    )
    pdt.assert_frame_equal(df_2, restored)
    keys.remove(expected_file_helper)
def _multiplex_store(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """Store every entry of ``data`` as its own ktk dataset and return the results keyed alike.

    Falls back to ``KTK_CUBE_DF_SERIALIZER`` when no serializer is supplied.
    Entries are popped and deleted as they are processed so large payloads are
    released eagerly; ``sorted(...)`` snapshots the keys, making the pops safe.

    NOTE(review): the ``db.Bag`` annotations look suspect — the body uses
    mapping operations (``.keys()``/``.pop()``), so ``data`` is presumably a
    dict. Confirm against callers before changing the hints.
    """
    stored = {}
    for key in sorted(data.keys()):
        payload = data.pop(key)
        stored[key] = MetaPartition.store_dataframes(
            payload,
            dataset_uuid=cube.ktk_dataset_uuid(key),
            df_serializer=df_serializer or KTK_CUBE_DF_SERIALIZER,
            store=store,
        )
        del payload
    return stored