Example #1
 def test_fails_colcond_scalar(self):
     c1 = C("foö")
     c2 = C("bar")
     cond = c2 == 42
     with pytest.raises(TypeError) as exc:
         c1 == cond
     assert str(exc.value) == "Cannot use nested conditions."
Example #2
 def test_fails_colconj(self):
     col1 = C("foö")
     col2 = C("bar")
     conj = (col2 == 42) & (col2 == 10)
     with pytest.raises(TypeError) as exc:
         col1.in_interval(10, conj)
     assert str(exc.value) == "Cannot use nested conditions."
Example #3
 def test_fails_colconj_list(self):
     c1 = C("foö")
     c2 = C("bar")
     conj = (c2 == 42) & (c2 == 10)
     with pytest.raises(TypeError) as exc:
         c1.isin([conj])
     assert str(exc.value) == "Cannot use nested conditions."
Example #4
 def test_filter_df_some(self):
     cond = (C("foö") == 42) & (C("bar") == 2)
     df = pd.DataFrame({
         "foö": [13, 42, 42, 100],
         "bar": [1, 2, 3, 4],
         "z": 0.0
     })
     df_actual = cond.filter_df(df)
     df_expected = df.loc[(df["foö"] == 42) & (df["bar"] == 2)]
     pdt.assert_frame_equal(df_actual, df_expected)
Example #5
def test_fail_missing_condition_columns(driver, module_store, test_cube, test_df):
    with pytest.raises(ValueError) as exc:
        driver(
            cube=test_cube,
            store=module_store,
            conditions=(C("foo") == 1) & (C("bar") == 2),
        )
    assert (
        "Following condition columns are required but are missing from the cube: bar, foo"
        in str(exc.value)
    )
Example #6
 def test_filter_df_nulls(self):
     cond = (C("foö") != 42.0) & (C("bar") != 2.0)
     df = pd.DataFrame({
         "foö": [13, 42, np.nan, np.nan],
         "bar": [1, 2, 3, np.nan],
         "z": np.nan
     })
     df_actual = cond.filter_df(df)
     df_expected = pd.DataFrame({
         "foö": [13.0],
         "bar": [1.0],
         "z": [np.nan]
     })
     pdt.assert_frame_equal(df_actual, df_expected)
Example #7
def test_stresstest_index_select_row(driver, function_store):
    n_indices = 100
    n_rows = 1000

    data = {"x": np.arange(n_rows), "p": 0}
    for i in range(n_indices):
        data["i{}".format(i)] = np.arange(n_rows)
    df = pd.DataFrame(data)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        index_columns=["i{}".format(i) for i in range(n_indices)],
    )

    build_cube(data=df, cube=cube, store=function_store)

    conditions = Conjunction([(C("i{}".format(i)) == 0) for i in range(n_indices)])

    result = driver(
        cube=cube,
        store=function_store,
        conditions=conditions,
        payload_columns=["p", "x"],
    )
    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.loc[df["x"] == 0].reindex(columns=["p", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)
Example #8
def test_single_rowgroup_when_df_serializer_is_not_passed_to_update_cube(
    driver, function_store
):
    """
    Test that each partition of the dataset is written as a single row group when no df_serializer is passed (the default path)
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df, cube=cube, store=function_store,
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {0: None, 1: None, 2: None}

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)
Example #9
def test_compression_is_compatible_on_update_cube(driver, function_store):
    """
    Test that partitions written with different compression algorithms are compatible

    The compression algorithms are not parametrized because their availability depends
    on the arrow build. 'SNAPPY' and 'GZIP' are already assumed to be available in parts
    of the code. A fully parametrized test would also increase runtime and test complexity
    unnecessarily.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="SNAPPY"),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="GZIP"),
    )
    dataset = result["seed"].load_all_indices(function_store())

    assert len(dataset.partitions) == 3
Example #10
 def test_fails_nocond(self):
     col = C("foö")
     cond1 = col < 10
     with pytest.raises(TypeError) as exc:
         cond1 & col
     assert str(exc.value) == "Can only build conjunction out of conditions."
Example #11
 def test_simple(self):
     cond = C("foö").in_interval(10, 20)
     assert isinstance(cond, InIntervalCondition)
     assert str(cond) == "foö.in_interval(10, 20)"
     assert cond.predicate_part == [("foö", ">=", 10), ("foö", "<", 20)]
     assert cond.active
     hash(cond)
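``in_interval`` builds a half-open interval: the lower bound is compared with ``>=`` and the upper bound with ``<``, as the ``predicate_part`` above shows. A minimal sketch of filtering a DataFrame with such a condition (hypothetical values, assuming ``InIntervalCondition`` supports ``filter_df`` like the simple conditions in the other examples):

import pandas as pd
from kartothek.core.cube.conditions import C

# Half-open interval [10, 20): 10 and 15 are kept, 20 is dropped by the exclusive upper bound.
cond = C("foö").in_interval(10, 20)
df = pd.DataFrame({"foö": [5, 10, 15, 20, 25]})
print(cond.filter_df(df)["foö"].tolist())  # expected: [10, 15]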
Example #12
def test_delayed_index_build_correction_restriction(driver, function_store):
    """
    Ensure that adding extra indices for dimension columns does not mark other datasets as restrictive.
    """
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3, 4, 5], "p": [0, 0, 1, 1, 2, 2]})
    df_extend = pd.DataFrame({"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 1, 2]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["x"],
    )

    results = driver(cube=cube, store=function_store, conditions=C("x") >= 0)
    assert len(results) == 1

    df_actual = results[0]
    df_expected = pd.DataFrame(
        {
            "x": [0, 1, 2, 3, 4, 5],
            "p": [0, 0, 1, 1, 2, 2],
            "v": [0, 1, 2, np.nan, np.nan, np.nan],
        },
        columns=["p", "v", "x"],
    )
    pdt.assert_frame_equal(df_actual, df_expected)
Example #13
def test_conditions(driver, function_store, existing_cube):
    parts_source1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions
    )
    parts_enrich1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        ).partitions
    )

    parts_source_to_delete = {part for part in parts_source1 if "p=0" not in part}

    result = driver(
        cube=existing_cube,
        store=function_store,
        ktk_cube_dataset_ids=["source"],
        conditions=C("p") > 0,
    )

    assert set(result.keys()) == {"source", "enrich"}

    ds_source = result["source"]
    ds_enrich = result["enrich"]

    parts_source2 = set(ds_source.partitions)
    parts_enrich2 = set(ds_enrich.partitions)

    assert parts_enrich1 == parts_enrich2
    assert parts_source1 - parts_source_to_delete == parts_source2
Example #14
 def test_op(self, f, t, op, value):
     c = C("foö")
     cond = f(c)
     assert isinstance(cond, t)
     assert cond.OP == op
     assert str(cond) == "foö {} {}".format(op, value)
     assert cond.predicate_part == [("foö", op, value)]
     assert cond.active
     hash(cond)
Example #15
 def test_nested_cond_conj(self):
     col = C("foö")
     cond1 = col < 10
     cond2 = col > 0
     cond3 = col != 10
     conj1 = cond2 & cond3
     conj2 = cond1 & conj1
     assert isinstance(conj2, Conjunction)
     assert conj2.conditions == (cond1, cond2, cond3)
Example #16
def test_wrong_condition_type(driver, function_store, driver_name):
    types = {
        "int": pd.Series([-1], dtype=np.int64),
        "uint": pd.Series([1], dtype=np.uint64),
        "float": pd.Series([1.3], dtype=np.float64),
        "bool": pd.Series([True], dtype=np.bool_),
        "str": pd.Series(["foo"], dtype=object),
    }
    cube = Cube(
        dimension_columns=["d_{}".format(t) for t in sorted(types.keys())],
        partition_columns=["p_{}".format(t) for t in sorted(types.keys())],
        uuid_prefix="typed_cube",
        index_columns=["i_{}".format(t) for t in sorted(types.keys())],
    )
    data = {
        "seed": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "v1"]
            }
        ),
        "enrich": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "i", "v2"]
            }
        ),
    }
    build_cube(data=data, store=function_store, cube=cube)

    df = pd.DataFrame(
        {
            "{}_{}".format(prefix, t): types[t]
            for t in sorted(types.keys())
            for prefix in ["d", "p", "i", "v1", "v2"]
        }
    )

    for col in df.columns:
        t1 = col.split("_")[1]

        for t2 in sorted(types.keys()):
            cond = C(col) == types[t2].values[0]

            if t1 == t2:
                result = driver(cube=cube, store=function_store, conditions=cond)
                assert len(result) == 1
                df_actual = result[0]
                df_expected = cond.filter_df(df).reset_index(drop=True)
                pdt.assert_frame_equal(df_actual, df_expected, check_like=True)
            else:
                with pytest.raises(TypeError) as exc:
                    driver(cube=cube, store=function_store, conditions=cond)
                assert "has wrong type" in str(exc.value)
Example #17
 def test_hash(self):
     col = C("foö")
     cond1 = col < 10
     cond2 = col > 0
     cond3 = col != 10
     conj1a = cond1 & cond2
     conj1b = cond1 & cond2
     conj2 = cond1 & cond3
     assert hash(conj1a) == hash(conj1b)
     assert hash(conj1a) != hash(conj2)
Example #18
 def test_nested_conj_conj(self):
     col = C("foö")
     cond1 = col < 10
     cond2 = col > 0
     cond3 = col != 10
     cond4 = col != 11
     conj1 = cond1 & cond2
     conj2 = cond3 & cond4
     conj3 = conj1 & conj2
     assert isinstance(conj3, Conjunction)
     assert conj3.conditions == (cond1, cond2, cond3, cond4)
Example #19
 def test_simple(self):
     col = C("foö")
     cond1 = col < 10
     cond2 = col > 0
     conj = cond1 & cond2
     assert isinstance(conj, Conjunction)
     assert conj.conditions == (cond1, cond2)
     assert str(conj) == "(foö < 10) & (foö > 0)"
     assert conj.columns == {"foö"}
     assert conj.predicate == [("foö", "<", 10), ("foö", ">", 0)]
     assert conj.split_by_column() == {"foö": conj}
Example #20
def test_condition_on_null(driver, function_store):
    df = pd.DataFrame({
        "x": pd.Series([0, 1, 2], dtype=np.int64),
        "p": pd.Series([0, 0, 1], dtype=np.int64),
        "v_f1": pd.Series([0, np.nan, 2], dtype=np.float64),
        "v_f2": pd.Series([0, 1, np.nan], dtype=np.float64),
        "v_f3": pd.Series([np.nan, np.nan, np.nan], dtype=np.float64),
        "v_s1": pd.Series(["a", None, "c"], dtype=object),
        "v_s2": pd.Series(["a", "b", None], dtype=object),
        "v_s3": pd.Series([None, None, None], dtype=object),
    })
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="nulled_cube",
        index_columns=[],
    )
    build_cube(data=df, store=function_store, cube=cube)

    for col in df.columns:
        # only iterate over the value columns (not the dimension / partition column):
        if not col.startswith("v"):
            continue

        # col_type will be either 'f' for float or 's' for string; see column
        # names above
        col_type = col.split("_")[1][0]
        if col_type == "f":
            value = 1.2
        elif col_type == "s":
            value = "foo"
        else:
            raise RuntimeError("unknown type")

        cond = C(col) == value

        df_expected = cond.filter_df(df).reset_index(drop=True)

        result = driver(cube=cube, store=function_store, conditions=cond)

        if df_expected.empty:
            assert len(result) == 0
        else:
            assert len(result) == 1
            df_actual = result[0]
            pdt.assert_frame_equal(df_actual, df_expected, check_like=True)
Example #21
    def test_frozen(self):
        cond = C("foö") == 42

        with pytest.raises(AttributeError):
            cond.column = "bar"

        with pytest.raises(AttributeError):
            cond.value = 1337

        with pytest.raises(AttributeError):
            cond.OP = "x"
Example #22
def test_fail_wrong_condition(driver, function_store, existing_cube):
    with pytest.raises(
        ValueError,
        match="Can only remove partitions with conditions concerning cubes physical partition columns.",
    ):
        driver(
            cube=existing_cube,
            store=function_store,
            ktk_cube_dataset_ids=["source"],
            conditions=C("v1") >= 0,
        )
Example #23
 def test_multicol(self):
     col1 = C("foö")
     col2 = C("bar")
     cond1 = col1 < 10
     cond2 = col1 > 0
     cond3 = col2 != 10
     conj1 = cond1 & cond2
     conj2 = conj1 & cond3
     assert isinstance(conj2, Conjunction)
     assert conj2.conditions == (cond1, cond2, cond3)
     assert str(conj2) == "(foö < 10) & (foö > 0) & (bar != 10)"
     assert conj2.columns == {"foö", "bar"}
     assert conj2.predicate == [
         ("foö", "<", 10),
         ("foö", ">", 0),
         ("bar", "!=", 10),
     ]
     assert conj2.split_by_column() == {
         "foö": conj1,
         "bar": Conjunction([cond3])
     }
Example #24
def test_update_partitions(driver, function_store, remove_partitions, new_partitions):
    df_source, cube = _write_cube(function_store)

    df_source_new = pd.DataFrame(
        {
            "i1": range(200, 200 + len(new_partitions)),
            "p": np.array(new_partitions, np.int64),
            "v1": range(300, 300 + len(new_partitions)),
            "x": range(100, 100 + len(new_partitions)),
        }
    )

    # what should remain of the old data:
    df_source_of_old = df_source.loc[~df_source["p"].isin(set(remove_partitions))]
    df_source_expected_after = pd.concat(
        [df_source_of_old, df_source_new], sort=False, ignore_index=True
    )

    remove_conditions = C("p").isin(remove_partitions)

    result = driver(
        data={"source": df_source_new},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids={"source"},
        metadata={"source": {"some_new_meta": 42}},
    )

    assert set(result.keys()) == {"source"}

    dm_source_after = DatasetMetadata.load_from_store(
        cube.ktk_dataset_uuid("source"), function_store(), load_all_indices=True
    )

    assert "some_new_meta" in dm_source_after.metadata
    assert "meta_at_create" in dm_source_after.metadata

    # check values for "p" are as expected:
    expected_p_source = (set(df_source["p"].unique()) - set(remove_partitions)) | set(
        new_partitions
    )
    assert set(dm_source_after.indices["p"].index_dct) == expected_p_source

    df_read = query_cube(cube, function_store)[0]

    assert set(df_read.columns) == set(df_source_expected_after.columns)

    for df in (df_read, df_source_expected_after):
        df.sort_values("x", inplace=True)
        df.reset_index(drop=True, inplace=True)

    pd.testing.assert_frame_equal(df_read, df_source_expected_after)
Example #25
def test_filter_select(driver, module_store, test_cube, test_df):
    result = driver(
        cube=test_cube,
        store=module_store,
        payload_columns=["v1", "v2"],
        conditions=(C("i3") >= 3),  # completely unrelated to the payload
    )
    assert len(result) == 1
    df_actual = result[0]
    df_expected = test_df.loc[
        test_df["i3"] >= 3, ["p", "q", "v1", "v2", "x", "y", "z"]
    ].reset_index(drop=True)
    pdt.assert_frame_equal(df_actual, df_expected)
Example #26
def test_remove_nonmatching_condition(driver, function_store, existing_cube):
    parts_source_before = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions
    )
    result = driver(
        cube=existing_cube,
        store=function_store,
        ktk_cube_dataset_ids=["source"],
        conditions=C("p") > 10000,
    )
    parts_source_after = set(result["source"].partitions)
    assert parts_source_before == parts_source_after
Example #27
 def test_nested_conj_cond(self):
     col = C("foö")
     cond1 = col < 10
     cond2 = col > 0
     cond3 = col != 10
     conj1 = cond1 & cond2
     conj2 = conj1 & cond3
     assert isinstance(conj2, Conjunction)
     assert conj2.conditions == (cond1, cond2, cond3)
     assert str(conj2) == "(foö < 10) & (foö > 0) & (foö != 10)"
     assert conj2.columns == {"foö"}
     assert conj2.predicate == [
         ("foö", "<", 10),
         ("foö", ">", 0),
         ("foö", "!=", 10),
     ]
     assert conj2.split_by_column() == {"foö": conj2}
Example #28
def test_rowgroups_are_applied_when_df_serializer_is_passed_to_update_cube(
    driver, function_store, chunk_size_build, chunk_size_update
):
    """
    Test that the dataset is split into row groups depending on the chunk size

    Partitions built with ``chunk_size=None`` should keep a single row group if they
    are not touched by the update. Partitions that are newly created or replaced with
    ``chunk_size>0`` should be split into row groups accordingly.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_build),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_update),
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {
        0: chunk_size_build,
        1: chunk_size_update,
        2: chunk_size_update,
    }

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)
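The row-group behaviour exercised above comes from ``ParquetSerializer`` itself: ``chunk_size=None`` writes each file as a single row group, while a positive ``chunk_size`` splits the data into row groups of at most that many rows. A minimal standalone sketch, assuming a simplekv in-memory store and that ``ParquetSerializer.store(store, key_prefix, df)`` returns the written key (the key prefix "rg-demo" is arbitrary):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from simplekv.memory import DictStore
from kartothek.serialization import ParquetSerializer

store = DictStore()
df = pd.DataFrame({"x": list(range(10))})

for chunk_size in (None, 2):
    # store() serializes the DataFrame to a Parquet blob in the store and returns its key
    key = ParquetSerializer(chunk_size=chunk_size).store(store, "rg-demo", df)
    num_row_groups = pq.ParquetFile(pa.BufferReader(store.get(key))).num_row_groups
    print(chunk_size, num_row_groups)  # expected: None -> 1, 2 -> 5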
Example #29
def test_update_respects_ktk_cube_dataset_ids(
    driver, function_store, ktk_cube_dataset_ids
):
    df_source, cube = _write_cube(function_store)
    df_ex = _extend_cube(cube, function_store)

    remove_conditions = C("p") == 0

    # This implicitly also tests that `data={}` behaves as expected and still deletes partitions
    # as requested via ktk_cube_dataset_ids and remove_conditions
    result = driver(
        data={},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )
    assert set(result) == ktk_cube_dataset_ids
    df_read = query_cube(cube, function_store)[0]

    # expected result: df_source left joined with df_ex; choosing the subset of p!=0 from each
    # that is in `ktk_cube_dataset_ids`:
    if "source" in ktk_cube_dataset_ids:
        df_source = df_source.loc[df_source["p"] != 0]
    if "ex" in ktk_cube_dataset_ids:
        df_ex = df_ex.loc[df_ex["p"] != 0]
    df_expected = df_source.merge(df_ex[["x", "a"]], how="left", on="x")
    df_expected = df_expected[sorted(df_expected.columns)]
    pd.testing.assert_frame_equal(df_read, df_expected)

    # test "ex" separately, because the test above based on the *left* merge does not tell us much about
    # "ex" in case the partitions were removed from "source"
    df_ex_read = read_table(cube.ktk_dataset_uuid("ex"), function_store)
    if "ex" in ktk_cube_dataset_ids:
        assert set(df_ex_read["p"]) == {1}
    else:
        assert set(df_ex_read["p"]) == {0, 1}
Example #30
def _regroup(df_aligned, intention, indexed_columns, datasets, cube):
    """
    Based on partition_by, form query groups.

    .. important::
        If the intention does not contain a partition-by, the data is partitioned by the cube partition columns to
        speed up the query on parallel backends. In that case, the backend must concatenate and check the resulting
        dataframes before passing them to the user.

    Parameters
    ----------
    df_aligned: pandas.DataFrame
        aligned DataFrame, taken from :meth:`_create_aligned_partition_df`
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are processed by the regrouper.
    cube: Cube
        Cube specification.

    Returns
    -------
    label2gp: Dict[str, Dict[str, Tuple[int, int]]]
        Maps "dataset ID -> (label -> (group ID, partition ID))".
    group2cond: Dict[int, kartothek.core.cube.conditions.Conjunction]
        Condition per group.
    """
    partition_by = intention.partition_by
    if not partition_by:
        # special code to speed up the query
        partition_by = cube.partition_columns

    label2gp = defaultdict(lambda: defaultdict(list))
    group2cond = {}
    # figure out which datasets are affected by which additional condition
    extra_conditions_target = {}
    for ktk_cube_dataset_id, cols in indexed_columns.items():
        if ktk_cube_dataset_id not in datasets:
            # may be irrelevant
            continue
        for col in cols & set(partition_by):
            extra_conditions_target[col] = ktk_cube_dataset_id

    # generate groups
    for g, df_g in df_aligned.groupby(list(partition_by), sort=True):
        gid = g
        if len(partition_by) == 1:
            g = (g, )

        conditions_g = copy(intention.conditions_post)
        for g_part, col in zip(g, partition_by):
            if col in cube.partition_columns:
                # we do not need predicate pushdown for physical partition columns
                continue

            ktk_cube_dataset_id = extra_conditions_target[col]
            conditions_g[ktk_cube_dataset_id] = conditions_g.get(
                ktk_cube_dataset_id, Conjunction([])) & (C(col) == g_part)

        _aligned_df_to_label2gp(df_g, datasets, gid, label2gp)
        group2cond[gid] = conditions_g

    return label2gp, group2cond
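The condition bookkeeping at the end of ``_regroup`` relies on ``Conjunction([])`` acting as a neutral element for ``&``. A minimal sketch of that accumulation pattern in isolation (hypothetical column names and values):

from kartothek.core.cube.conditions import C, Conjunction

# Mirror the `conditions_g` update above: start from an empty conjunction and
# AND-in one equality condition per partition-by column/value pair.
cond = Conjunction([])
for col, value in [("i0", 0), ("i1", 3)]:
    cond = cond & (C(col) == value)

print(str(cond))       # expected: (i0 == 0) & (i1 == 3)
print(cond.predicate)  # expected: [('i0', '==', 0), ('i1', '==', 3)]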