def metadata_factory_from_dataset(dataset, with_schema=True, store=None):
    """
    Create :py:class:`DatasetFactory` from :py:class:`DatasetMetadata`.

    Parameters
    ----------
    dataset: DatasetMetadata
        Already loaded dataset.
    with_schema: bool
        Whether the dataset was loaded with ``load_schema``.
    store: Optional[Callable[[], simplekv.KeyValueStore]]
        Optional store factory.

    Returns
    -------
    factory: DatasetFactory
        Metadata factory with caches pre-filled.
    """
    factory = DatasetFactory(
        dataset_uuid=dataset.uuid,
        store_factory=store or _dummy_store_factory,
        load_schema=with_schema,
    )
    factory._cache_metadata = dataset
    factory.is_loaded = True
    return factory
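# A minimal usage sketch for ``metadata_factory_from_dataset`` (assumptions:
# ``store`` already contains a dataset named "dataset_uuid" and
# ``DatasetMetadata.load_from_store`` is importable from this codebase). It
# wraps already-loaded metadata so code that expects a ``DatasetFactory`` can
# reuse the cache instead of re-reading from storage:
def _example_metadata_factory(store):
    dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    factory = metadata_factory_from_dataset(dataset, store=lambda: store)
    assert factory.is_loaded
    assert factory.dataset_metadata is dataset  # served from the pre-filled cache
    return factory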
def assert_target_keys(src_store, src_uuid, tgt_store, tgt_uuid):
    """
    Check that the expected keys exist in the target dataset and that the
    corresponding values are equal to those of the source dataset (or
    modified as expected).
    """
    df_source = DatasetFactory(
        dataset_uuid=src_uuid,
        store_factory=lazy_store(src_store),
    )
    src_keys = get_dataset_keys(df_source.dataset_metadata)
    df_target = DatasetFactory(
        dataset_uuid=tgt_uuid,
        store_factory=lazy_store(tgt_store),
    )
    tgt_keys = get_dataset_keys(df_target.dataset_metadata)

    for src_key in src_keys:
        # Check for each source key that the corresponding target key exists.
        tgt_key = src_key.replace(src_uuid, tgt_uuid)
        assert tgt_key in tgt_keys

        # Check that the blobs for source and target key are equal (exception:
        # metadata => here the target must contain the modified metadata).
        b1 = src_store.get(src_key)
        b2 = tgt_store.get(tgt_key)
        if tgt_key.endswith("by-dataset-metadata.json"):
            b1_mod = b1.decode("utf-8").replace(src_uuid, tgt_uuid).encode("utf-8")
            assert b1_mod == b2
        else:
            assert b1 == b2
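# Hypothetical usage of ``assert_target_keys`` after copying a dataset between
# stores. The ``copy_dataset`` call is an assumption for illustration only;
# substitute whatever helper performs the copy in this codebase:
def _example_copy_roundtrip(src_store, tgt_store):
    copy_dataset(  # hypothetical helper, see lead-in comment
        source_dataset_uuid="src_uuid",
        store=src_store,
        target_dataset_uuid="tgt_uuid",
        target_store=tgt_store,
    )
    assert_target_keys(src_store, "src_uuid", tgt_store, "tgt_uuid")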
def test_indices_uints(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"

    # min uint64
    p1 = 0

    # max uint64 => cannot even be cast to float32
    p2 = int(~np.uint64(0))

    # number that would lose precision if converted to float64 and back
    p3 = 17128351978467489013

    partitions = [
        pd.DataFrame({"p": pd.Series([p1], dtype=np.uint64)}),
        pd.DataFrame({"p": pd.Series([p2], dtype=np.uint64)}),
        pd.DataFrame({"p": pd.Series([p3], dtype=np.uint64)}),
    ]

    def assert_expected(index_dct):
        assert len(index_dct) == 3
        referenced_partitions = []
        for val in index_dct.values():
            referenced_partitions.extend(val)
        assert len(referenced_partitions) == 3

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_expected(dataset_factory.indices["p"].index_dct)
    first_run = dataset_factory.indices["p"].index_dct.copy()

    # Re-create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_index_dct_equal(first_run, dataset_factory.indices["p"].index_dct)
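# Why these probe values matter (a minimal illustration, not part of the test
# suite): uint64 extremes cannot survive a round trip through float64, so an
# index implementation that coerced values to float would corrupt them.
import numpy as np

p_max = int(~np.uint64(0))        # 18446744073709551615, max uint64
assert float(p_max) == 2.0 ** 64  # rounds up: float64 has only a 53-bit mantissa
assert int(float(p_max)) != p_max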
def test_indices_uints(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"

    # min uint64
    p1 = 0

    # max uint64 => cannot even be cast to float32
    p2 = int(~np.uint64(0))

    # number that would lose precision if converted to float64 and back
    p3 = 17128351978467489013

    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": pd.Series([p1], dtype=np.uint64)}))],
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": pd.Series([p2], dtype=np.uint64)}))],
        },
        {
            "label": "cluster_3",
            "data": [("core", pd.DataFrame({"p": pd.Series([p3], dtype=np.uint64)}))],
        },
    ]
    expected = {p1: ["cluster_1"], p2: ["cluster_2"], p3: ["cluster_3"]}

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)

    # Re-create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)
def test_repr(store_factory):
    factory = DatasetFactory(
        dataset_uuid="dataset_uuid",
        store_factory=store_factory,  # does not exist
    )
    assert repr(factory) == "<DatasetFactory: uuid=dataset_uuid is_loaded=False>"
def dispatch_metapartitions(
    dataset_uuid: str,
    store: StoreInput,
    load_dataset_metadata: bool = True,
    keep_indices: bool = True,
    keep_table_meta: bool = True,
    label_filter: Optional[Callable] = None,
    concat_partitions_on_primary_index: bool = False,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
    dispatch_metadata: bool = False,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    dataset_factory = DatasetFactory(
        dataset_uuid=dataset_uuid,
        store_factory=store,
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=load_dataset_metadata,
    )

    return dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        store=None,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        dispatch_metadata=dispatch_metadata,
    )
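# Usage sketch for ``dispatch_metapartitions`` (hedged: assumes a dataset named
# "dataset_uuid" with an indexed column "p" exists under ``store_factory``).
# Predicates on indexed columns prune partitions before any payload data is
# read; the same predicates are then pushed into the actual load:
def _example_dispatch(store_factory):
    predicates = [[("p", "==", 1)]]
    for mp in dispatch_metapartitions("dataset_uuid", store_factory, predicates=predicates):
        mp = mp.load_dataframes(store=store_factory(), predicates=predicates)
        # mp.data now holds the filtered payload of this physical partition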
def test_add_column_to_existing_index(
    store_factory, metadata_version, bound_build_dataset_indices
):
    dataset_uuid = "dataset_uuid"
    partitions = [
        pd.DataFrame({"p": [1, 2], "x": [100, 4500]}),
        pd.DataFrame({"p": [4, 3], "x": [500, 10]}),
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
        secondary_indices="p",
    )
    assert dataset.load_all_indices(store=store_factory()).indices.keys() == {"p"}

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["x"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert dataset_factory.indices.keys() == {"p", "x"}
def test_empty_partitions(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        pd.DataFrame({"p": pd.Series([], dtype=np.int8)}),
        pd.DataFrame({"p": pd.Series([1], dtype=np.int8)}),
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert len(dataset_factory.indices["p"].index_dct) == 1
def test_build_indices(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        pd.DataFrame({"p": [1, 2]}),
        pd.DataFrame({"p": [2, 3]}),
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    index_dct = dataset_factory.indices["p"].index_dct

    assert len(index_dct[1]) == 1
    assert len(index_dct[2]) == 2
    assert len(index_dct[3]) == 1

    assert len(set(index_dct[3]) & set(index_dct[2])) == 1
    assert len(set(index_dct[1]) & set(index_dct[2])) == 1
    assert len(set(index_dct[1]) & set(index_dct[3])) == 0
def test_empty_partitions(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": pd.Series([], dtype=np.int8)}))],
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": pd.Series([1], dtype=np.int8)}))],
        },
    ]
    expected = {1: ["cluster_2"]}

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)
def dispatch_metapartitions(
    dataset_uuid,
    store,
    load_dataset_metadata=True,
    keep_indices=True,
    keep_table_meta=True,
    label_filter=None,
    concat_partitions_on_primary_index=False,
    predicates=None,
    dispatch_by=None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    dataset_factory = DatasetFactory(
        dataset_uuid=dataset_uuid,
        store_factory=_make_callable(store),
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=load_dataset_metadata,
    )

    return dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
    )
def dataset_with_index_factory(dataset_with_index, store_session_factory):
    return DatasetFactory(
        dataset_uuid=dataset_with_index.uuid,
        store_factory=store_session_factory,
        load_schema=True,
        load_all_indices=False,
    )
def test_add_column_to_existing_index(
    store_factory, metadata_version, bound_build_dataset_indices
):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1, 2], "x": [100, 4500]}))],
            "indices": {
                "p": ExplicitSecondaryIndex(
                    "p", index_dct={1: ["cluster_1"], 2: ["cluster_1"]}
                )
            },
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [4, 3], "x": [500, 10]}))],
            "indices": {
                "p": ExplicitSecondaryIndex(
                    "p", index_dct={4: ["cluster_2"], 3: ["cluster_2"]}
                )
            },
        },
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    assert dataset.load_all_indices(store=store_factory()).indices.keys() == {"p"}

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["x"])

    # Assert indices are properly created
    mps = read_dataset_as_metapartitions(store=store_factory, dataset_uuid=dataset_uuid)
    for column_name in ["p", "x"]:
        assert all([mp.indices[column_name] for mp in mps])

    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert dataset_factory.indices.keys() == {"p", "x"}
def dataset_partition_keys_factory(dataset_partition_keys, store_session_factory):
    return DatasetFactory(
        dataset_uuid=dataset_partition_keys.uuid,
        store_factory=store_session_factory,
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=True,
    )
def dataset_factory_alternative_table_name(dataset_alternative_table_name, store_factory):
    return DatasetFactory(
        dataset_uuid=dataset_alternative_table_name.uuid,
        store_factory=store_factory,
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=True,
    )
def test_pickle(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    assert factory._cache_store is None
    assert factory._cache_metadata is None

    factory.store
    factory.dataset_metadata
    assert factory._cache_store is not None
    assert factory._cache_metadata is not None

    factory2 = pickle.loads(pickle.dumps(factory, pickle.HIGHEST_PROTOCOL))
    assert factory2._cache_store is None
    assert factory2._cache_metadata is None
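# The behavior asserted above (caches dropped during serialization) is what
# makes factories cheap to ship to distributed workers: each worker re-opens
# its own store connection instead of inheriting a pickled one. A minimal
# sketch of how a class can achieve this, not necessarily the library's exact
# implementation:
class _CacheDroppingFactory:
    def __init__(self, store_factory):
        self._store_factory = store_factory  # picklable callable
        self._cache_store = None             # connection-bound, not picklable

    def __getstate__(self):
        # Serialize everything except the cached store handle.
        state = self.__dict__.copy()
        state["_cache_store"] = None
        return state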
def test_get_metadata(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    store = factory.store
    assert store.get_count == 0

    metadata = factory.dataset_metadata
    assert hasattr(metadata, "metadata")
    initial_count = store.get_count

    # second access should be served from the cache
    metadata = factory.dataset_metadata
    assert store.get_count == initial_count
def test_dill(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    assert factory._cache_store is None
    assert factory._cache_metadata is None

    factory.store
    factory.dataset_metadata
    assert factory._cache_store is not None
    assert factory._cache_metadata is not None

    factory2 = dill.loads(dill.dumps(factory))
    assert factory2._cache_store is None
    assert factory2._cache_metadata is None
def dispatch_metapartitions(
    dataset_uuid: str,
    store: StoreInput,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    dataset_factory = DatasetFactory(
        dataset_uuid=dataset_uuid,
        store_factory=store,
        load_schema=True,
        load_all_indices=False,
    )

    return dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )
def test_store_init(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    assert count_store.count == 0

    store = factory.store
    assert hasattr(store, "get")
    assert count_store.count == 1
    assert count_store.last == store
    assert store.get_count == 0

    # second access should be served from the cache
    store = factory.store
    assert count_store.count == 1
    assert count_store.last == store
def test_build_indices(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]},
        {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]},
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    expected = {2: ["cluster_1", "cluster_2"], 3: ["cluster_2"], 1: ["cluster_1"]}
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)
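# The ``index_dct`` asserted above is an inverted index: each observed cell
# value maps to the labels of all partitions containing it. A minimal
# illustration of how such a mapping answers an equality predicate:
index_dct = {1: ["cluster_1"], 2: ["cluster_1", "cluster_2"], 3: ["cluster_2"]}
assert index_dct.get(2, []) == ["cluster_1", "cluster_2"]  # partitions to read
assert index_dct.get(42, []) == []  # value never written => nothing to read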
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """
    :meta private:
    """

    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [((""), base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        metadata_version=dataset_factory.metadata_version,
                        schema=dataset_factory.schema,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                        table_name=dataset_factory.table_name,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                metadata_version=dataset_factory.metadata_version,
                schema=dataset_factory.schema,
                partition_keys=dataset_factory.partition_keys,
                table_name=dataset_factory.table_name,
            )
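# Usage sketch for ``dispatch_metapartitions_from_factory`` (hedged: assumes a
# dataset indexed on column "p" exists under ``store_factory``). With
# ``dispatch_by``, all partitions sharing a value of "p" are yielded together
# as one list, which is the building block for grouped reads:
def _example_dispatch_by(store_factory):
    factory = DatasetFactory("dataset_uuid", store_factory)
    for mps in dispatch_metapartitions_from_factory(factory, dispatch_by=["p"]):
        assert isinstance(mps, list)  # one list per distinct value of "p"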
def test_uuid(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    assert factory.dataset_uuid == "dataset_uuid"
def test_update_shuffle_buckets(
    store_factory,
    metadata_version,
    unique_primaries,
    unique_secondaries,
    num_buckets,
    repartition,
    npartitions,
    bucket_by,
):
    """
    Assert that certain properties always hold for the output dataset,
    no matter what the input data distribution looks like.

    Properties to assert:

    * All partitions have a unique value for their corresponding primary key
    * The number of partitions is at least one per unique partition value and
      at most ``num_buckets`` per primary partition value
    * If we require a column to be sorted, it is monotonic within each partition
    """
    primaries = np.arange(unique_primaries)
    secondary = np.arange(unique_secondaries)
    num_rows = 100
    primaries = np.repeat(primaries, np.ceil(num_rows / unique_primaries))[:num_rows]
    secondary = np.repeat(secondary, np.ceil(num_rows / unique_secondaries))[:num_rows]
    # ensure that there is an unsorted column uncorrelated
    # to the primary and secondary columns which can be sorted later on per partition
    unsorted_column = np.repeat(np.arange(100 / 10), 10)
    np.random.shuffle(unsorted_column)
    np.random.shuffle(primaries)
    np.random.shuffle(secondary)

    df = pd.DataFrame(
        {"primary": primaries, "secondary": secondary, "sorted_column": unsorted_column}
    )
    secondary_indices = ["secondary"]
    expected_num_indices = 2  # One primary

    # used for tests later on
    if bucket_by:
        secondary_indices.append(bucket_by)
        expected_num_indices = 3

    # shuffle all rows. properties of result should be reproducible
    df = df.sample(frac=1).reset_index(drop=True)
    ddf = dd.from_pandas(df, npartitions=npartitions)

    dataset_comp = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        secondary_indices=secondary_indices,
        shuffle=True,
        bucket_by=bucket_by,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        partition_on=["primary"],
    )

    s = pickle.dumps(dataset_comp, pickle.HIGHEST_PROTOCOL)
    dataset_comp = pickle.loads(s)

    dataset = dataset_comp.compute()
    dataset = dataset.load_all_indices(store_factory())

    assert len(dataset.partitions) <= num_buckets * unique_primaries
    assert len(dataset.partitions) >= unique_primaries

    assert len(dataset.indices) == expected_num_indices

    assert set(dataset.indices["primary"].index_dct.keys()) == set(
        range(unique_primaries)
    )
    assert (
        list(map(lambda x: len(x), dataset.indices["primary"].index_dct.values()))
        <= [num_buckets] * unique_primaries
    )

    assert set(dataset.indices["secondary"].index_dct.keys()) == set(
        range(unique_secondaries)
    )

    assert set(dataset.table_meta["core"].names) == {
        "primary",
        "secondary",
        "sorted_column",
    }

    factory = DatasetFactory("output_dataset_uuid", store_factory)
    factory.load_all_indices()
    if bucket_by:
        ind_df = factory.get_indices_as_dataframe(["primary", bucket_by])
        assert not ind_df.duplicated().any()

    for data_dct in read_dataset_as_dataframes__iterator(
        dataset_uuid=dataset.uuid, store=store_factory
    ):
        df = data_dct["core"]
        assert len(df.primary.unique()) == 1
        assert df.sorted_column.is_monotonic

    # update the dataset
    # do not use partition_on since it should be inferred from the existing dataset
    tasks = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    updated_dataset = tasks.compute()

    assert len(updated_dataset.partitions) == 2 * len(dataset.partitions)

    # Not allowed to use different partition_on
    with pytest.raises(
        ValueError, match="Incompatible set of partition keys encountered."
    ):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            repartition_ratio=repartition,
            partition_on=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Not allowed to update with indices which do not yet exist in the dataset
    with pytest.raises(ValueError, match="indices"):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            partition_on=["primary"],
            repartition_ratio=repartition,
            secondary_indices=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Check that delayed objects are allowed as delete scope.
    tasks = update_dataset_from_ddf(
        None,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        delete_scope=dask.delayed(_return_none)(),
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)
    tasks.compute()
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    label_filter: Optional[Callable] = None,
    concat_partitions_on_primary_index: bool = False,
    predicates: PredicatesType = None,
    store: Optional[StoreInput] = None,
    dispatch_by: Optional[List[str]] = None,
    dispatch_metadata: bool = False,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """
    :meta private:
    """

    if dispatch_by is not None and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. "
        )
    if concat_partitions_on_primary_index:
        dispatch_by = dataset_factory.partition_keys

    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if label_filter:
        base_df = base_df[base_df.index.map(label_filter)]

    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [((""), base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata
                        if dispatch_metadata
                        else None,
                        indices=indices_to_dispatch if dispatch_metadata else None,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata
                if dispatch_metadata
                else None,
                indices=indices_to_dispatch if dispatch_metadata else None,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )
def test_update_shuffle_buckets(
    store_factory,
    unique_primaries,
    unique_secondaries,
    num_buckets,
    repartition,
    npartitions,
    bucket_by,
    func,
):
    """
    Assert that certain properties always hold for the output dataset,
    no matter what the input data distribution looks like.

    Properties to assert:

    * All partitions have a unique value for their corresponding primary key
    * The number of partitions is at least one per unique partition value and
      at most ``num_buckets`` per primary partition value
    * If we require a column to be sorted, it is monotonic within each partition
    """
    primaries = np.arange(unique_primaries)
    secondary = np.arange(unique_secondaries)
    num_rows = 100
    primaries = np.repeat(primaries, np.ceil(num_rows / unique_primaries))[:num_rows]
    secondary = np.repeat(secondary, np.ceil(num_rows / unique_secondaries))[:num_rows]
    # ensure that there is an unsorted column uncorrelated
    # to the primary and secondary columns which can be sorted later on per partition
    unsorted_column = np.repeat(np.arange(100 / 10), 10)
    np.random.shuffle(unsorted_column)
    np.random.shuffle(primaries)
    np.random.shuffle(secondary)

    df = pd.DataFrame(
        {"primary": primaries, "secondary": secondary, "sorted_column": unsorted_column}
    )
    secondary_indices = ["secondary"]
    expected_num_indices = 2  # One primary

    # used for tests later on
    if bucket_by:
        secondary_indices.append(bucket_by)
        expected_num_indices = 3

    # shuffle all rows. properties of result should be reproducible
    df = df.sample(frac=1).reset_index(drop=True)
    ddf = dd.from_pandas(df, npartitions=npartitions)

    dataset_comp = func(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        secondary_indices=secondary_indices,
        shuffle=True,
        bucket_by=bucket_by,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        partition_on=["primary"],
    )

    s = pickle.dumps(dataset_comp, pickle.HIGHEST_PROTOCOL)
    dataset_comp = pickle.loads(s)

    dataset = dataset_comp.compute()
    dataset = dataset.load_all_indices(store_factory())

    assert len(dataset.partitions) <= num_buckets * unique_primaries
    assert len(dataset.partitions) >= unique_primaries

    assert len(dataset.indices) == expected_num_indices

    assert set(dataset.indices["primary"].index_dct.keys()) == set(
        range(unique_primaries)
    )
    assert (
        list(map(lambda x: len(x), dataset.indices["primary"].index_dct.values()))
        <= [num_buckets] * unique_primaries
    )

    assert set(dataset.indices["secondary"].index_dct.keys()) == set(
        range(unique_secondaries)
    )

    assert set(dataset.schema.names) == {
        "primary",
        "secondary",
        "sorted_column",
    }

    factory = DatasetFactory("output_dataset_uuid", store_factory)
    factory.load_all_indices()
    if bucket_by:
        ind_df = factory.get_indices_as_dataframe(["primary", bucket_by])
        assert not ind_df.duplicated().any()

    for df in read_dataset_as_dataframes__iterator(
        dataset_uuid=dataset.uuid, store=store_factory
    ):
        assert len(df.primary.unique()) == 1
        assert df.sorted_column.is_monotonic
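# Rough sketch of the bucketing invariant these tests rely on (hedged: the real
# assignment lives in the dask shuffle implementation; this only illustrates
# the idea). Rows are first grouped by the primary key; within a group, hashing
# the ``bucket_by`` columns modulo ``num_buckets`` bounds the partition count
# between ``unique_primaries`` and ``num_buckets * unique_primaries``:
def _assign_bucket(bucket_value_hash: int, num_buckets: int) -> int:
    # Equal bucket_by values hash identically and therefore land in the same
    # bucket, keeping them within a single physical partition.
    return bucket_value_hash % num_buckets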