Example #1
def metadata_factory_from_dataset(dataset, with_schema=True, store=None):
    """
    Create :py:class:`DatasetFactory` from :py:class:`DatasetMetadata`.

    Parameters
    ----------
    dataset: DatasetMetadata
        Already loaded dataset.
    with_schema: bool
        Whether the dataset was loaded with ``load_schema``.
    store: Optional[Callable[[], simplekv.KeyValueStore]]
        Optional store factory.

    Returns
    -------
    factory: DatasetFactory
        Metadata factory with caches pre-filled.
    """
    factory = DatasetFactory(
        dataset_uuid=dataset.uuid,
        store_factory=store or _dummy_store_factory,
        load_schema=with_schema,
    )
    factory._cache_metadata = dataset
    factory.is_loaded = True
    return factory
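A minimal usage sketch for this helper, assuming the dataset was previously loaded via DatasetMetadata.load_from_store and that store_factory is a zero-argument store factory (both names are assumptions for illustration):

from kartothek.core.dataset import DatasetMetadata

# Wrap an already-loaded DatasetMetadata so that factory-based APIs can reuse it
# without reading the metadata from the store again.
dataset = DatasetMetadata.load_from_store("dataset_uuid", store_factory())
factory = metadata_factory_from_dataset(dataset, store=store_factory)
assert factory.dataset_metadata is dataset  # cache is pre-filled, no extra store access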
Example #2
def assert_target_keys(src_store, src_uuid, tgt_store, tgt_uuid):
    """
    Check that the expected keys exist in the target dataset and that the
    corresponding values are equal to those in the source dataset (or modified
    as expected).
    """
    df_source = DatasetFactory(
        dataset_uuid=src_uuid, store_factory=lazy_store(src_store),
    )
    src_keys = get_dataset_keys(df_source.dataset_metadata)
    df_target = DatasetFactory(
        dataset_uuid=tgt_uuid, store_factory=lazy_store(tgt_store),
    )
    tgt_keys = get_dataset_keys(df_target.dataset_metadata)

    for src_key in src_keys:
        # check for each source key if the corresponding target key exists
        tgt_key = src_key.replace(src_uuid, tgt_uuid)
        assert tgt_key in tgt_keys

        # check if the files for source and target key are equal (exception:
        # metadata => here the target must contain the modified metadata)
        b1 = src_store.get(src_key)
        b2 = tgt_store.get(tgt_key)

        if tgt_key.endswith("by-dataset-metadata.json"):
            b1_mod = b1.decode("utf-8").replace(src_uuid, tgt_uuid).encode("utf-8")
            assert b1_mod == b2
        else:
            assert b1 == b2
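A hedged sketch of a call site for this helper; the copy step shown via kartothek.io.eager.copy_dataset and the two store objects are assumptions for illustration, not part of the example above:

from kartothek.io.eager import copy_dataset

# Copy a dataset into another store under a new UUID, then verify that every key
# was carried over and only the dataset-metadata JSON reflects the new UUID.
copy_dataset(
    source_dataset_uuid="src_uuid",
    store=src_store,
    target_dataset_uuid="tgt_uuid",
    target_store=tgt_store,
)
assert_target_keys(src_store, "src_uuid", tgt_store, "tgt_uuid")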
Example #3
def test_indices_uints(store_factory, metadata_version,
                       bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"

    # min uint64
    p1 = 0

    # max uint64 => cannot even be cast to float32
    p2 = int(~np.uint64(0))

    # a value that would lose precision if converted to float64 and back
    p3 = 17128351978467489013

    partitions = [
        pd.DataFrame({"p": pd.Series([p1], dtype=np.uint64)}),
        pd.DataFrame({"p": pd.Series([p2], dtype=np.uint64)}),
        pd.DataFrame({"p": pd.Series([p3], dtype=np.uint64)}),
    ]

    def assert_expected(index_dct):
        assert len(index_dct) == 3
        referenced_partitions = []
        for val in index_dct.values():
            referenced_partitions.extend(val)
        assert len(referenced_partitions) == 3

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid,
                                     store_factory,
                                     load_all_indices=True)
    assert_expected(dataset_factory.indices["p"].index_dct)
    first_run = dataset_factory.indices["p"].index_dct.copy()

    # Re-create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid,
                                     store_factory,
                                     load_all_indices=True)
    assert_index_dct_equal(first_run, dataset_factory.indices["p"].index_dct)
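The assert_index_dct_equal helper used by these tests is not shown on this page; a minimal sketch of what such a comparison could look like (an assumption, not the original test utility):

def assert_index_dct_equal(expected, actual):
    # Compare two index dictionaries (value -> list of partition labels),
    # ignoring the order in which the partition labels are stored.
    assert set(expected) == set(actual)
    for value, labels in expected.items():
        assert sorted(labels) == sorted(actual[value])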
Example #4
def test_indices_uints(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"

    # min uint64
    p1 = 0

    # max uint64 => cannot even be cast to float32
    p2 = int(~np.uint64(0))

    # a value that would lose precision if converted to float64 and back
    p3 = 17128351978467489013

    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": pd.Series([p1], dtype=np.uint64)}))],
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": pd.Series([p2], dtype=np.uint64)}))],
        },
        {
            "label": "cluster_3",
            "data": [("core", pd.DataFrame({"p": pd.Series([p3], dtype=np.uint64)}))],
        },
    ]
    expected = {p1: ["cluster_1"], p2: ["cluster_2"], p3: ["cluster_3"]}

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)

    # Re-create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)
Example #5
def test_repr(store_factory):
    factory = DatasetFactory(
        dataset_uuid="dataset_uuid",
        store_factory=store_factory  # dataset does not exist in the store
    )
    assert repr(
        factory) == "<DatasetFactory: uuid=dataset_uuid is_loaded=False>"
Example #6
def dispatch_metapartitions(
    dataset_uuid: str,
    store: StoreInput,
    load_dataset_metadata: bool = True,
    keep_indices: bool = True,
    keep_table_meta: bool = True,
    label_filter: Optional[Callable] = None,
    concat_partitions_on_primary_index: bool = False,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
    dispatch_metadata: bool = False,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    dataset_factory = DatasetFactory(
        dataset_uuid=dataset_uuid,
        store_factory=store,
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=load_dataset_metadata,
    )

    return dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        store=None,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        dispatch_metadata=dispatch_metadata,
    )
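A sketch of consuming the returned iterator, assuming store_factory is a store factory for an existing dataset; without dispatch_by each item is a single MetaPartition, and MetaPartition.load_dataframes is used to materialize the payload:

# Iterate over the dispatched metapartitions and load their data lazily.
for mp in dispatch_metapartitions("dataset_uuid", store_factory):
    mp = mp.load_dataframes(store=store_factory())
    # mp.data now holds the loaded DataFrame(s) of this partition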
Example #7
def test_add_column_to_existing_index(store_factory, metadata_version,
                                      bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        pd.DataFrame({
            "p": [1, 2],
            "x": [100, 4500]
        }),
        pd.DataFrame({
            "p": [4, 3],
            "x": [500, 10]
        }),
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
        secondary_indices="p",
    )
    assert dataset.load_all_indices(store=store_factory()).indices.keys() == {
        "p"
    }

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["x"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid,
                                     store_factory,
                                     load_all_indices=True)
    assert dataset_factory.indices.keys() == {"p", "x"}
Example #8
def test_empty_partitions(store_factory, metadata_version,
                          bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"

    partitions = [
        pd.DataFrame({"p": pd.Series([], dtype=np.int8)}),
        pd.DataFrame({"p": pd.Series([1], dtype=np.int8)}),
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid,
                                     store_factory,
                                     load_all_indices=True)
    assert len(dataset_factory.indices["p"].index_dct) == 1
Example #9
def test_build_indices(store_factory, metadata_version,
                       bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        pd.DataFrame({"p": [1, 2]}),
        pd.DataFrame({"p": [2, 3]}),
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid,
                                     store_factory,
                                     load_all_indices=True)
    index_dct = dataset_factory.indices["p"].index_dct

    assert len(index_dct[1]) == 1
    assert len(index_dct[2]) == 2
    assert len(index_dct[3]) == 1

    assert len(set(index_dct[3]) & set(index_dct[2])) == 1
    assert len(set(index_dct[1]) & set(index_dct[2])) == 1
    assert len(set(index_dct[1]) & set(index_dct[3])) == 0
Example #10
def test_empty_partitions(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"

    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": pd.Series([], dtype=np.int8)}))],
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": pd.Series([1], dtype=np.int8)}))],
        },
    ]
    expected = {1: ["cluster_2"]}

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)
Example #11
def dispatch_metapartitions(
    dataset_uuid,
    store,
    load_dataset_metadata=True,
    keep_indices=True,
    keep_table_meta=True,
    label_filter=None,
    concat_partitions_on_primary_index=False,
    predicates=None,
    dispatch_by=None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    dataset_factory = DatasetFactory(
        dataset_uuid=dataset_uuid,
        store_factory=_make_callable(store),
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=load_dataset_metadata,
    )

    return dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
    )
Example #12
def dataset_with_index_factory(dataset_with_index, store_session_factory):
    return DatasetFactory(
        dataset_uuid=dataset_with_index.uuid,
        store_factory=store_session_factory,
        load_schema=True,
        load_all_indices=False,
    )
Example #13
def test_add_column_to_existing_index(store_factory, metadata_version,
                                      bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({
                "p": [1, 2],
                "x": [100, 4500]
            }))],
            "indices": {
                "p":
                ExplicitSecondaryIndex("p",
                                       index_dct={
                                           1: ["cluster_1"],
                                           2: ["cluster_1"]
                                       })
            },
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({
                "p": [4, 3],
                "x": [500, 10]
            }))],
            "indices": {
                "p":
                ExplicitSecondaryIndex("p",
                                       index_dct={
                                           4: ["cluster_2"],
                                           3: ["cluster_2"]
                                       })
            },
        },
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    assert dataset.load_all_indices(store=store_factory()).indices.keys() == {
        "p"
    }

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["x"])

    # Assert indices are properly created
    mps = read_dataset_as_metapartitions(store=store_factory,
                                         dataset_uuid=dataset_uuid)
    for column_name in ["p", "x"]:
        assert all([mp.indices[column_name] for mp in mps])

    dataset_factory = DatasetFactory(dataset_uuid,
                                     store_factory,
                                     load_all_indices=True)
    assert dataset_factory.indices.keys() == {"p", "x"}
Example #14
def dataset_partition_keys_factory(dataset_partition_keys, store_session_factory):
    return DatasetFactory(
        dataset_uuid=dataset_partition_keys.uuid,
        store_factory=store_session_factory,
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=True,
    )
Example #15
def dataset_factory_alternative_table_name(dataset_alternative_table_name,
                                           store_factory):
    return DatasetFactory(
        dataset_uuid=dataset_alternative_table_name.uuid,
        store_factory=store_factory,
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=True,
    )
Example #16
def test_pickle(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    assert factory._cache_store is None
    assert factory._cache_metadata is None

    factory.store  # property access populates the store cache
    factory.dataset_metadata  # property access populates the metadata cache
    assert factory._cache_store is not None
    assert factory._cache_metadata is not None

    factory2 = pickle.loads(pickle.dumps(factory, pickle.HIGHEST_PROTOCOL))
    assert factory2._cache_store is None
    assert factory2._cache_metadata is None
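Because the caches are dropped on serialization, a DatasetFactory can be shipped cheaply to distributed workers; a minimal sketch, assuming a picklable store factory built from functools.partial and storefact (both assumptions for illustration):

import pickle
from functools import partial

import storefact

from kartothek.core.factory import DatasetFactory

# The store factory itself must be picklable; a partial over a store URL is one option.
store_factory = partial(storefact.get_store_from_url, "hfs:///tmp/kartothek_example")
factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=store_factory)

roundtripped = pickle.loads(pickle.dumps(factory, pickle.HIGHEST_PROTOCOL))
# Caches are empty after the round trip; store and metadata are re-created lazily on access.
assert roundtripped._cache_store is None and roundtripped._cache_metadata is None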
Example #17
def test_get_metadata(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    store = factory.store
    assert store.get_count == 0

    metadata = factory.dataset_metadata
    assert hasattr(metadata, "metadata")

    initial_count = store.get_count

    # second access should use the cached metadata and not hit the store again
    metadata = factory.dataset_metadata
    assert store.get_count == initial_count
Example #18
def test_dill(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    assert factory._cache_store is None
    assert factory._cache_metadata is None

    factory.store
    factory.dataset_metadata
    assert factory._cache_store is not None
    assert factory._cache_metadata is not None

    factory2 = dill.loads(dill.dumps(factory))
    assert factory2._cache_store is None
    assert factory2._cache_metadata is None
Example #19
def dispatch_metapartitions(
    dataset_uuid: str,
    store: StoreInput,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    dataset_factory = DatasetFactory(
        dataset_uuid=dataset_uuid,
        store_factory=store,
        load_schema=True,
        load_all_indices=False,
    )

    return dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory, predicates=predicates, dispatch_by=dispatch_by,
    )
Example #20
def test_store_init(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    assert count_store.count == 0

    store = factory.store
    assert hasattr(store, "get")

    assert count_store.count == 1
    assert count_store.last == store

    assert store.get_count == 0

    # second access should return the cached store
    store = factory.store
    assert count_store.count == 1
    assert count_store.last == store
Example #21
def test_build_indices(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]},
        {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]},
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    expected = {2: ["cluster_1", "cluster_2"], 3: ["cluster_2"], 1: ["cluster_1"]}
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)
Example #22
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """

    :meta private:
    """

    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [((""), base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        metadata_version=dataset_factory.metadata_version,
                        schema=dataset_factory.schema,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                        table_name=dataset_factory.table_name,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                metadata_version=dataset_factory.metadata_version,
                schema=dataset_factory.schema,
                partition_keys=dataset_factory.partition_keys,
                table_name=dataset_factory.table_name,
            )
Example #23
def test_uuid(count_store, dataset_function):
    factory = DatasetFactory(dataset_uuid="dataset_uuid", store_factory=count_store)
    assert factory.dataset_uuid == "dataset_uuid"
Example #24
def test_update_shuffle_buckets(
    store_factory,
    metadata_version,
    unique_primaries,
    unique_secondaries,
    num_buckets,
    repartition,
    npartitions,
    bucket_by,
):
    """
    Assert that certain properties hold for the output dataset regardless of
    what the input data distribution looks like.

    Properties to assert:
    * All partitions have a unique value for their corresponding primary key.
    * The number of partitions is at least one per unique partition value and
      at most ``num_buckets`` per primary partition value.
    * If a column is requested to be sorted, it is monotonic within each partition.
    """
    primaries = np.arange(unique_primaries)
    secondary = np.arange(unique_secondaries)
    num_rows = 100
    primaries = np.repeat(primaries,
                          np.ceil(num_rows / unique_primaries))[:num_rows]
    secondary = np.repeat(secondary,
                          np.ceil(num_rows / unique_secondaries))[:num_rows]
    # ensure that there is an unsorted column uncorrelated
    # to the primary and secondary columns which can be sorted later on per partition
    unsorted_column = np.repeat(np.arange(100 / 10), 10)
    np.random.shuffle(unsorted_column)
    np.random.shuffle(primaries)
    np.random.shuffle(secondary)

    df = pd.DataFrame({
        "primary": primaries,
        "secondary": secondary,
        "sorted_column": unsorted_column
    })
    secondary_indices = ["secondary"]
    expected_num_indices = 2  # one primary index plus one secondary index

    # if bucketing is requested, the bucket column is indexed as well and checked later on
    if bucket_by:
        secondary_indices.append(bucket_by)
        expected_num_indices = 3

    # shuffle all rows. properties of result should be reproducible
    df = df.sample(frac=1).reset_index(drop=True)
    ddf = dd.from_pandas(df, npartitions=npartitions)

    dataset_comp = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        secondary_indices=secondary_indices,
        shuffle=True,
        bucket_by=bucket_by,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        partition_on=["primary"],
    )

    s = pickle.dumps(dataset_comp, pickle.HIGHEST_PROTOCOL)
    dataset_comp = pickle.loads(s)

    dataset = dataset_comp.compute()
    dataset = dataset.load_all_indices(store_factory())

    assert len(dataset.partitions) <= num_buckets * unique_primaries
    assert len(dataset.partitions) >= unique_primaries

    assert len(dataset.indices) == expected_num_indices

    assert set(dataset.indices["primary"].index_dct.keys()) == set(
        range(unique_primaries))
    assert (list(
        map(lambda x: len(x), dataset.indices["primary"].index_dct.values()))
            <= [num_buckets] * unique_primaries)

    assert set(dataset.indices["secondary"].index_dct.keys()) == set(
        range(unique_secondaries))

    assert set(dataset.table_meta["core"].names) == {
        "primary",
        "secondary",
        "sorted_column",
    }

    factory = DatasetFactory("output_dataset_uuid", store_factory)
    factory.load_all_indices()

    if bucket_by:
        ind_df = factory.get_indices_as_dataframe(["primary", bucket_by])

        assert not ind_df.duplicated().any()

    for data_dct in read_dataset_as_dataframes__iterator(
            dataset_uuid=dataset.uuid, store=store_factory):
        df = data_dct["core"]
        assert len(df.primary.unique()) == 1
        assert df.sorted_column.is_monotonic

    # update the dataset
    # do not use partition_on since it should be inferred from the existing dataset
    tasks = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    updated_dataset = tasks.compute()

    assert len(updated_dataset.partitions) == 2 * len(dataset.partitions)

    # Not allowed to use different partition_on
    with pytest.raises(
            ValueError,
            match="Incompatible set of partition keys encountered."):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            repartition_ratio=repartition,
            partition_on=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Not allowed to update with indices which do not yet exist in dataset
    with pytest.raises(ValueError, match="indices"):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            partition_on=["primary"],
            repartition_ratio=repartition,
            secondary_indices=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Check that delayed objects are allowed as delete scope.
    tasks = update_dataset_from_ddf(
        None,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        delete_scope=dask.delayed(_return_none)(),
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    tasks.compute()
Example #25
def dispatch_metapartitions_from_factory(
    dataset_factory: DatasetFactory,
    label_filter: Optional[Callable] = None,
    concat_partitions_on_primary_index: bool = False,
    predicates: PredicatesType = None,
    store: Optional[StoreInput] = None,
    dispatch_by: Optional[List[str]] = None,
    dispatch_metadata: bool = False,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    """

    :meta private:
    """

    if dispatch_by is not None and concat_partitions_on_primary_index:
        raise ValueError(
            "Both `dispatch_by` and `concat_partitions_on_primary_index` are provided, "
            "`concat_partitions_on_primary_index` is deprecated and will be removed in the next major release. "
            "Please only provide the `dispatch_by` argument. "
        )
    if concat_partitions_on_primary_index:
        dispatch_by = dataset_factory.partition_keys

    if dispatch_by is not None and not set(dispatch_by).issubset(
        set(dataset_factory.index_columns)
    ):
        raise RuntimeError(
            f"Dispatch columns must be indexed.\nRequested index: {dispatch_by} but available index columns: {sorted(dataset_factory.index_columns)}"
        )
    check_predicates(predicates)

    # Determine which indices need to be loaded.
    index_cols: Set[str] = set()
    if dispatch_by:
        index_cols |= set(dispatch_by)

    if predicates:
        predicate_cols = set(columns_in_predicates(predicates))
        predicate_index_cols = predicate_cols & set(dataset_factory.index_columns)
        index_cols |= predicate_index_cols

    for col in index_cols:
        dataset_factory.load_index(col)

    base_df = dataset_factory.get_indices_as_dataframe(
        list(index_cols), predicates=predicates
    )

    if label_filter:
        base_df = base_df[base_df.index.map(label_filter)]

    indices_to_dispatch = {
        name: ix.unload()
        for name, ix in dataset_factory.indices.items()
        if isinstance(ix, ExplicitSecondaryIndex)
    }

    if dispatch_by is not None:
        base_df = cast(pd.DataFrame, base_df)

        if len(dispatch_by) == 0:
            merged_partitions = [((""), base_df)]
        else:
            # Group the resulting MetaPartitions by partition keys or a subset of those keys
            merged_partitions = base_df.groupby(
                by=list(dispatch_by), sort=True, as_index=False
            )

        for group_name, group in merged_partitions:
            if not isinstance(group_name, tuple):
                group_name = (group_name,)  # type: ignore
            mps = []
            logical_conjunction = list(
                zip(dispatch_by, ["=="] * len(dispatch_by), group_name)
            )
            for label in group.index.unique():
                mps.append(
                    MetaPartition.from_partition(
                        partition=dataset_factory.partitions[label],
                        dataset_metadata=dataset_factory.metadata
                        if dispatch_metadata
                        else None,
                        indices=indices_to_dispatch if dispatch_metadata else None,
                        metadata_version=dataset_factory.metadata_version,
                        table_meta=dataset_factory.table_meta,
                        partition_keys=dataset_factory.partition_keys,
                        logical_conjunction=logical_conjunction,
                    )
                )
            yield mps
    else:
        for part_label in base_df.index.unique():
            part = dataset_factory.partitions[part_label]

            yield MetaPartition.from_partition(
                partition=part,
                dataset_metadata=dataset_factory.metadata
                if dispatch_metadata
                else None,
                indices=indices_to_dispatch if dispatch_metadata else None,
                metadata_version=dataset_factory.metadata_version,
                table_meta=dataset_factory.table_meta,
                partition_keys=dataset_factory.partition_keys,
            )
Example #26
def test_update_shuffle_buckets(
    store_factory,
    unique_primaries,
    unique_secondaries,
    num_buckets,
    repartition,
    npartitions,
    bucket_by,
    func,
):
    """
    Assert that certain properties hold for the output dataset regardless of
    what the input data distribution looks like.

    Properties to assert:
    * All partitions have a unique value for their corresponding primary key.
    * The number of partitions is at least one per unique partition value and
      at most ``num_buckets`` per primary partition value.
    * If a column is requested to be sorted, it is monotonic within each partition.
    """

    primaries = np.arange(unique_primaries)
    secondary = np.arange(unique_secondaries)
    num_rows = 100
    primaries = np.repeat(primaries, np.ceil(num_rows / unique_primaries))[:num_rows]
    secondary = np.repeat(secondary, np.ceil(num_rows / unique_secondaries))[:num_rows]
    # ensure that there is an unsorted column uncorrelated
    # to the primary and secondary columns which can be sorted later on per partition
    unsorted_column = np.repeat(np.arange(100 / 10), 10)
    np.random.shuffle(unsorted_column)
    np.random.shuffle(primaries)
    np.random.shuffle(secondary)

    df = pd.DataFrame(
        {"primary": primaries, "secondary": secondary, "sorted_column": unsorted_column}
    )
    secondary_indices = ["secondary"]
    expected_num_indices = 2  # one primary index plus one secondary index

    # if bucketing is requested, the bucket column is indexed as well and checked later on
    if bucket_by:
        secondary_indices.append(bucket_by)
        expected_num_indices = 3

    # shuffle all rows. properties of result should be reproducible
    df = df.sample(frac=1).reset_index(drop=True)
    ddf = dd.from_pandas(df, npartitions=npartitions)

    dataset_comp = func(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        secondary_indices=secondary_indices,
        shuffle=True,
        bucket_by=bucket_by,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        partition_on=["primary"],
    )

    s = pickle.dumps(dataset_comp, pickle.HIGHEST_PROTOCOL)
    dataset_comp = pickle.loads(s)

    dataset = dataset_comp.compute()
    dataset = dataset.load_all_indices(store_factory())

    assert len(dataset.partitions) <= num_buckets * unique_primaries
    assert len(dataset.partitions) >= unique_primaries

    assert len(dataset.indices) == expected_num_indices

    assert set(dataset.indices["primary"].index_dct.keys()) == set(
        range(unique_primaries)
    )
    assert (
        list(map(lambda x: len(x), dataset.indices["primary"].index_dct.values()))
        <= [num_buckets] * unique_primaries
    )

    assert set(dataset.indices["secondary"].index_dct.keys()) == set(
        range(unique_secondaries)
    )

    assert set(dataset.schema.names) == {
        "primary",
        "secondary",
        "sorted_column",
    }

    factory = DatasetFactory("output_dataset_uuid", store_factory)
    factory.load_all_indices()

    if bucket_by:
        ind_df = factory.get_indices_as_dataframe(["primary", bucket_by])

        assert not ind_df.duplicated().any()

    for df in read_dataset_as_dataframes__iterator(
        dataset_uuid=dataset.uuid, store=store_factory
    ):
        assert len(df.primary.unique()) == 1
        assert df.sorted_column.is_monotonic