def test_read_dataset_as_dataframes_partition_size(store_factory, metadata_version):
    cluster1 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [1, 2], "Content": ["cluster1", "cluster1"]}
    )
    cluster2 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [2, 3], "Content": ["cluster2", "cluster2"]}
    )
    cluster3 = pd.DataFrame({"A": [1], "B": [20], "C": [1], "Content": ["cluster3"]})
    cluster4 = pd.DataFrame(
        {"A": [2, 2], "B": [10, 10], "C": [1, 2], "Content": ["cluster4", "cluster4"]}
    )
    clusters = [cluster1, cluster2, cluster3, cluster4]
    partitions = [{"data": [("data", c)]} for c in clusters]

    store_dataframes_as_dataset__iter(
        df_generator=partitions,
        store=store_factory,
        dataset_uuid="partitioned_uuid",
        metadata_version=metadata_version,
    )

    for func in [read_dataset_as_dataframe_bag, read_dataset_as_metapartitions_bag]:
        # Without a partition_size, each stored partition becomes one bag partition.
        bag = func(
            dataset_uuid="partitioned_uuid", store=store_factory, partition_size=None
        )
        assert bag.npartitions == 4

        # With partition_size=2, the four stored partitions are packed into two.
        bag = func(
            dataset_uuid="partitioned_uuid", store=store_factory, partition_size=2
        )
        assert bag.npartitions == 2
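
# Illustration (not from the original suite): the partition_size grouping
# asserted above mirrors dask.bag's own behaviour, where packing four items
# with partition_size=2 yields two bag partitions. A minimal standalone check:
def test_dask_bag_partition_size_sketch():
    import dask.bag as db

    bag = db.from_sequence(range(4), partition_size=2)
    assert bag.npartitions == 2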
def dataset_dispatch_by(
    metadata_version, store_session_factory, dataset_dispatch_by_uuid
):
    cluster1 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [1, 2], "Content": ["cluster1", "cluster1"]}
    )
    cluster2 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [2, 3], "Content": ["cluster2", "cluster2"]}
    )
    cluster3 = pd.DataFrame({"A": [1], "B": [20], "C": [1], "Content": ["cluster3"]})
    cluster4 = pd.DataFrame(
        {"A": [2, 2], "B": [10, 10], "C": [1, 2], "Content": ["cluster4", "cluster4"]}
    )
    clusters = [cluster1, cluster2, cluster3, cluster4]
    partitions = [{"data": [("data", c)]} for c in clusters]

    store_dataframes_as_dataset__iter(
        df_generator=partitions,
        store=store_session_factory,
        dataset_uuid=dataset_dispatch_by_uuid,
        metadata_version=metadata_version,
        partition_on=["A", "B"],
        secondary_indices=["C"],
    )
    return pd.concat(clusters).sort_values(["A", "B", "C"]).reset_index(drop=True)
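
# Sketch (added for illustration): with the clusters above, the unique (A, B)
# combinations are (1, 10), (1, 20) and (2, 10), so dispatching by ["A", "B"]
# yields three groups. A quick pandas cross-check of that count:
def test_dispatch_by_group_count_sketch():
    combined = pd.concat(
        [
            pd.DataFrame({"A": [1, 1], "B": [10, 10]}),
            pd.DataFrame({"A": [1, 1], "B": [10, 10]}),
            pd.DataFrame({"A": [1], "B": [20]}),
            pd.DataFrame({"A": [2, 2], "B": [10, 10]}),
        ],
        ignore_index=True,
    )
    assert combined.groupby(["A", "B"]).ngroups == 3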
def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    secondary_indices=None,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset
    with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The stored dataset.
    """
    if isinstance(dfs, (pd.DataFrame, dict)):
        dfs = [dfs]
        warnings.warn(
            "Passing a single dataframe instead of an iterable is deprecated and may "
            "be removed in the next major release.",
            DeprecationWarning,
        )

    return store_dataframes_as_dataset__iter(
        dfs,
        store=store,
        dataset_uuid=dataset_uuid,
        metadata=metadata,
        partition_on=partition_on,
        df_serializer=df_serializer,
        overwrite=overwrite,
        secondary_indices=secondary_indices,
        metadata_storage_format=metadata_storage_format,
        metadata_version=metadata_version,
    )
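
# Usage sketch for the helper above (illustration only; the storefact
# dependency and the "hmemory://" in-memory store URL are assumptions of this
# example, not requirements of the function):
def _example_store_dataframes_as_dataset():  # pragma: no cover
    from functools import partial

    from storefact import get_store_from_url

    store_factory = partial(get_store_from_url, "hmemory://")
    df1 = pd.DataFrame({"A": [1, 1], "B": [10, 20]})
    df2 = pd.DataFrame({"A": [2, 2], "B": [10, 20]})
    # One partition per input dataframe; files additionally split by column "A".
    return store_dataframes_as_dataset(
        store=store_factory,
        dataset_uuid="example_uuid",
        dfs=[df1, df2],
        partition_on=["A"],
    )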
def test_read_dataset_as_dataframes_dispatch_by_multi_col(
    store_factory,
    bound_load_dataframes,
    backend_identifier,
    output_type,
    metadata_version,
):
    if output_type == "table":
        pytest.skip()
    cluster1 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [1, 2], "Content": ["cluster1", "cluster1"]}
    )
    cluster2 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [2, 3], "Content": ["cluster2", "cluster2"]}
    )
    cluster3 = pd.DataFrame({"A": [1], "B": [20], "C": [1], "Content": ["cluster3"]})
    cluster4 = pd.DataFrame(
        {"A": [2, 2], "B": [10, 10], "C": [1, 2], "Content": ["cluster4", "cluster4"]}
    )
    clusters = [cluster1, cluster2, cluster3, cluster4]
    partitions = [{"data": [("data", c)]} for c in clusters]

    store_dataframes_as_dataset__iter(
        df_generator=partitions,
        store=store_factory,
        dataset_uuid="partitioned_uuid",
        metadata_version=metadata_version,
        partition_on=["A", "B"],
        secondary_indices=["C"],
    )
    for dispatch_by in permutations(("A", "B", "C"), 2):
        dispatched = bound_load_dataframes(
            dataset_uuid="partitioned_uuid",
            store=store_factory,
            dispatch_by=dispatch_by,
        )
        unique_rows = []
        for part in dispatched:
            if isinstance(part, MetaPartition):
                data = part.data["data"]
            else:
                data = part["data"]
            # Each dispatched partition must hold exactly one combination of
            # the dispatch columns.
            unique_dispatch = data[list(dispatch_by)].drop_duplicates()
            assert len(unique_dispatch) == 1
            unique_rows.append(unique_dispatch)
        # A combination must appear in exactly one dispatched partition.
        uniques = pd.concat(unique_rows, ignore_index=True)
        assert not any(uniques.duplicated())
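
# Note (illustration only): permutations(("A", "B", "C"), 2) in the test above
# exercises all six ordered column pairs, so both the choice and the order of
# the dispatch columns are covered:
def test_dispatch_by_pairs_sketch():
    pairs = list(permutations(("A", "B", "C"), 2))
    assert pairs == [
        ("A", "B"),
        ("A", "C"),
        ("B", "A"),
        ("B", "C"),
        ("C", "A"),
        ("C", "B"),
    ]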
def test_read_dataset_as_dataframes_dispatch_by_single_col(
    store_factory,
    bound_load_dataframes,
    backend_identifier,
    dispatch_by,
    output_type,
    metadata_version,
):
    if output_type == "table":
        pytest.skip()
    cluster1 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [1, 2], "Content": ["cluster1", "cluster1"]}
    )
    cluster2 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [2, 3], "Content": ["cluster2", "cluster2"]}
    )
    cluster3 = pd.DataFrame({"A": [1], "B": [20], "C": [1], "Content": ["cluster3"]})
    cluster4 = pd.DataFrame(
        {"A": [2, 2], "B": [10, 10], "C": [1, 2], "Content": ["cluster4", "cluster4"]}
    )
    clusters = [cluster1, cluster2, cluster3, cluster4]
    partitions = [{"data": [("data", c)]} for c in clusters]

    store_dataframes_as_dataset__iter(
        df_generator=partitions,
        store=store_factory,
        dataset_uuid="partitioned_uuid",
        metadata_version=metadata_version,
        partition_on=["A", "B"],
        secondary_indices=["C"],
    )
    # Dispatch by the single (parametrized) column under test.
    dispatched = bound_load_dataframes(
        dataset_uuid="partitioned_uuid", store=store_factory, dispatch_by=[dispatch_by]
    )
    unique_values = set()
    for part in dispatched:
        if isinstance(part, MetaPartition):
            data = part.data["data"]
        else:
            data = part["data"]
        unique_dispatch = data[dispatch_by].unique()
        # Each partition holds exactly one value of the dispatch column, and
        # no value may appear in more than one partition.
        assert len(unique_dispatch) == 1
        assert unique_dispatch[0] not in unique_values
        unique_values.add(unique_dispatch[0])
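
# Sketch (illustration only): the unique values per dispatch column for the
# clusters above, i.e. the sets that the uniqueness check in the test
# accumulates for dispatch_by in ("A", "B", "C"):
def test_dispatch_by_single_col_expected_values_sketch():
    combined = pd.concat(
        [
            pd.DataFrame({"A": [1, 1], "B": [10, 10], "C": [1, 2]}),
            pd.DataFrame({"A": [1, 1], "B": [10, 10], "C": [2, 3]}),
            pd.DataFrame({"A": [1], "B": [20], "C": [1]}),
            pd.DataFrame({"A": [2, 2], "B": [10, 10], "C": [1, 2]}),
        ],
        ignore_index=True,
    )
    assert sorted(combined["A"].unique()) == [1, 2]
    assert sorted(combined["B"].unique()) == [10, 20]
    assert sorted(combined["C"].unique()) == [1, 2, 3]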