Ejemplo n.º 1
0
def test_dispatch_metapartitions_concat_regression(store):
    """Regression test: two files in the same physical partition dispatch as
    two metapartitions unless grouped by the partition column."""
    frames = [
        pd.DataFrame({"p": [0], "x": [0]}),
        pd.DataFrame({"p": [0], "x": [1]}),
    ]
    dataset = store_dataframes_as_dataset(
        dfs=frames,
        dataset_uuid="test",
        store=store,
        partition_on=["p"],
    )

    # Without concatenation each stored file stays its own metapartition.
    ungrouped = list(
        dispatch_metapartitions(
            dataset.uuid, store, concat_partitions_on_primary_index=False
        )
    )
    assert len(ungrouped) == 2

    # The legacy flag still merges them but emits a deprecation warning.
    with pytest.deprecated_call():
        concatenated = list(
            dispatch_metapartitions(
                dataset.uuid, store, concat_partitions_on_primary_index=True
            )
        )
        assert len(concatenated) == 1

    # ``dispatch_by`` is the non-deprecated replacement for the flag above.
    grouped = list(dispatch_metapartitions(dataset.uuid, store, dispatch_by=["p"]))
    assert len(grouped) == 1
Ejemplo n.º 2
0
def test_dispatch_metapartitions_dups_with_predicates(store):
    """A predicate matching every indexed value must not change dispatch."""
    dataset = store_dataframes_as_dataset(
        dfs=[pd.DataFrame({"p": [0, 1], "x": 0})],
        dataset_uuid="test",
        store=store,
        secondary_indices=["p"],
    )

    unfiltered = list(dispatch_metapartitions(dataset.uuid, store))
    filtered = list(
        dispatch_metapartitions(
            dataset.uuid, store, predicates=[[("p", "in", [0, 1])]]
        )
    )

    # Both values of "p" are selected, so filtering is a no-op.
    assert unfiltered == filtered
Ejemplo n.º 3
0
def test_dispatch_metapartition_undefined_behaviour(
    dataset, store_session, predicates, error_msg
):
    """Malformed predicates must raise ``ValueError`` matching ``error_msg``."""
    with pytest.raises(ValueError, match=error_msg):
        generator = dispatch_metapartitions(
            dataset.uuid, store_session, predicates=predicates
        )
        # Dispatch is lazy; consuming the generator triggers the error.
        list(generator)
Ejemplo n.º 4
0
def test_dispatch_metapartitions_sorted_dispatch_by(store):
    """Groups produced via ``dispatch_by`` come back sorted by the key value."""
    values = np.random.randint(high=100000, low=-100000, size=(100,))
    df = pd.DataFrame({"p": values, "x": 0})
    # Integers are sorted when using too small values (maybe connected to the
    # singleton implementation of integers in CPython??)
    # Verify this is not happening, otherwise we'll get immediately a sorted
    # index (which is nice in this case but not generally true, of course)
    unique_values = set(df["p"].unique())
    assert list(unique_values) != sorted(unique_values)

    dataset = store_dataframes_as_dataset(
        dfs=[df],
        dataset_uuid="test",
        store=store,
        secondary_indices=["p", "x"],
    )

    dispatched = list(dispatch_metapartitions(dataset.uuid, store, dispatch_by="p"))
    previous = -math.inf
    for group in dispatched:
        for mp in group:
            conjunction = mp.logical_conjunction
            assert len(conjunction) == 1
            # Each predicate triple is (column, op, value); strictly
            # increasing values prove the groups are sorted by key.
            current = conjunction[0][2]
            assert current > previous
            previous = current
Ejemplo n.º 5
0
def test_dispatch_metapartitions_query_partition_on(
    dataset_partition_keys, store_session, predicates
):
    """Predicates on the partition column prune dispatch to the single match."""
    dispatched = list(
        dispatch_metapartitions(
            dataset_partition_keys.uuid, store_session, predicates=predicates
        )
    )
    assert len(dispatched) == 1
    assert dispatched[0].label == "P=2/cluster_2"
Ejemplo n.º 6
0
def test_dispatch_metapartition_undefined_behaviour(
    dataset, store_session, predicates
):
    """Predicates over an empty selection raise an explanatory ``ValueError``."""
    with pytest.raises(ValueError) as exc_info:
        generator = dispatch_metapartitions(
            dataset.uuid, store_session, predicates=predicates
        )
        # Dispatch is lazy; consuming the generator triggers the error.
        list(generator)
    assert "The behaviour on an empty" in str(exc_info.value)
Ejemplo n.º 7
0
def test_dispatch_metapartitions_query_no_effect(
    dataset_partition_keys, store_session, predicates
):
    """These predicates should still lead to loading the whole set of partitions."""
    dispatched = list(
        dispatch_metapartitions(
            dataset_partition_keys.uuid, store_session, predicates=predicates
        )
    )
    assert len(dispatched) == 2
Ejemplo n.º 8
0
def test_dispatch_metapartitions(dataset, store_session):
    """Dispatching lazily yields one ``MetaPartition`` per stored partition."""
    generator = dispatch_metapartitions(dataset.uuid, store_session)
    assert isinstance(generator, types.GeneratorType)

    metapartitions = list(generator)
    assert len(metapartitions) == 2
    # Labels must be unique across the dataset.
    assert len({mp.label for mp in metapartitions}) == 2
    for metapartition in metapartitions:
        assert isinstance(metapartition, MetaPartition)
        assert metapartition.table_name == SINGLE_TABLE
Ejemplo n.º 9
0
def test_dispatch_metapartitions_concat_regression(store):
    """Regression test: ``dispatch_by`` on the partition column merges the two
    files of a shared physical partition into a single metapartition."""
    frames = [
        pd.DataFrame({"p": [0], "x": [0]}),
        pd.DataFrame({"p": [0], "x": [1]}),
    ]
    dataset = store_dataframes_as_dataset(
        dfs=frames,
        dataset_uuid="test",
        store=store,
        partition_on=["p"],
    )

    # Plain dispatch keeps each stored file separate.
    ungrouped = list(dispatch_metapartitions(dataset.uuid, store))
    assert len(ungrouped) == 2

    # Grouping by the partition column collapses them into one.
    grouped = list(dispatch_metapartitions(dataset.uuid, store, dispatch_by=["p"]))
    assert len(grouped) == 1
Ejemplo n.º 10
0
def test_dispatch_metapartitions_label_filter(dataset, store_session):
    """``label_filter`` restricts dispatch to the matching partitions only."""

    def only_cluster_1(part_label):
        return "cluster_1" in part_label

    generator = dispatch_metapartitions(
        dataset.uuid, store_session, label_filter=only_cluster_1
    )
    assert isinstance(generator, types.GeneratorType)

    by_label = OrderedDict((mp.label, mp) for mp in generator)
    assert len(by_label) == 1
    assert isinstance(by_label["cluster_1"], MetaPartition)
Ejemplo n.º 11
0
def test_dispatch_metapartitions(dataset, store_session):
    """Both clusters dispatch as ``MetaPartition`` objects carrying the
    expected table schemas."""
    generator = dispatch_metapartitions(dataset.uuid, store_session)
    assert isinstance(generator, types.GeneratorType)

    by_label = OrderedDict((mp.label, mp) for mp in generator)
    assert len(by_label) == 2

    for label in ("cluster_1", "cluster_2"):
        assert isinstance(by_label[label], MetaPartition)

    # Schemas for both tables are attached to the metapartition.
    assert set(by_label["cluster_2"].table_meta.keys()) == {SINGLE_TABLE, "helper"}
Ejemplo n.º 12
0
def test_dispatch_metapartitions_without_dataset_metadata(dataset, store_session):
    """With ``load_dataset_metadata=False`` each metapartition carries empty
    dataset metadata."""
    generator = dispatch_metapartitions(
        dataset.uuid, store_session, load_dataset_metadata=False
    )
    assert isinstance(generator, types.GeneratorType)

    metapartitions = list(generator)
    assert len(metapartitions) == 2
    for mp in metapartitions:
        assert mp.dataset_metadata == {}
Ejemplo n.º 13
0
def test_get_physical_partition_stats(function_store, ds):
    """Per-physical-partition stats must match the blob sizes in the store."""
    dispatched = list(
        dispatch_metapartitions(
            dataset_uuid=ds.uuid, store=function_store, dispatch_by=["p"]
        )
    )
    assert len(dispatched) == 2

    for index, mps in enumerate(dispatched):
        stats = get_physical_partition_stats(mps, function_store)
        # Sum the sizes of all blobs belonging to this partition value.
        marker = "p={}".format(index)
        total_bytes = sum(
            len(function_store().get(key))
            for key in function_store().iter_keys()
            if marker in key
        )
        assert stats == {
            "partitions": 1,
            "files": 2,
            "rows": 2,
            "blobsize": total_bytes,
        }
Ejemplo n.º 14
0
def test_dispatch_metapartitions_complex_or_predicates(store_factory):
    """OR-connected predicate lists (disjunctions of conjunctions) must load
    exactly the union of the matching rows.

    The dispatch/load/concat/sort pipeline was copy-pasted four times in the
    original; it is extracted into ``_dispatch_and_load`` so each case reads
    as predicates + expected frame only.
    """
    dataset_uuid = "test"
    df = pd.DataFrame({"A": range(10), "B": ["A", "B"] * 5, "C": range(-10, 0)})

    store_dataframes_as_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        dfs=[df],
        partition_on=["A", "B"],
    )

    def _dispatch_and_load(predicates):
        # Dispatch with the given predicates, load all matching partitions
        # and return one frame sorted by "A" for deterministic comparison.
        mps = [
            mp.load_dataframes(store_factory)
            for mp in dispatch_metapartitions(
                dataset_uuid, store_factory, predicates=predicates
            )
        ]
        actual = pd.concat([mp.data["table"] for mp in mps])
        return actual.sort_values(by="A", ignore_index=True)

    # Two single-clause disjuncts: A < 3 OR B == "B".
    actual = _dispatch_and_load([[("A", "<", 3)], [("B", "==", "B")]])
    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 3, 5, 7, 9],
            "B": ["A", "B", "A", "B", "B", "B", "B"],
            "C": [-10, -9, -8, -7, -5, -3, -1],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)

    # Second disjunct matches nothing; only A < 3 contributes rows.
    actual = _dispatch_and_load([[("A", "<", 3)], [("B", "==", "notthere")]])
    expected = pd.DataFrame(
        data={"A": [0, 1, 2], "B": ["A", "B", "A"], "C": [-10, -9, -8]}
    )
    pd.testing.assert_frame_equal(actual, expected)

    # Multi-clause conjunctions: (A < 3 AND B == "A") OR (B == "B" AND A > 2).
    actual = _dispatch_and_load(
        [[("A", "<", 3), ("B", "==", "A")], [("B", "==", "B"), ("A", ">", 2)]]
    )
    expected = pd.DataFrame(
        data={
            "A": [0, 2, 3, 5, 7, 9],
            "B": ["A", "A", "B", "B", "B", "B"],
            "C": [-10, -8, -7, -5, -3, -1],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)

    # Mixed arity: A < 3 OR (B == "B" AND A > 2).
    actual = _dispatch_and_load([[("A", "<", 3)], [("B", "==", "B"), ("A", ">", 2)]])
    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 3, 5, 7, 9],
            "B": ["A", "B", "A", "B", "B", "B", "B"],
            "C": [-10, -9, -8, -7, -5, -3, -1],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)