def test_fails_colcond_scalar(self):
    c1 = C("foö")
    c2 = C("bar")
    cond = c2 == 42
    with pytest.raises(TypeError) as exc:
        c1 == cond
    assert str(exc.value) == "Cannot use nested conditions."

def test_fails_colconj(self):
    col1 = C("foö")
    col2 = C("bar")
    conj = (col2 == 42) & (col2 == 10)
    with pytest.raises(TypeError) as exc:
        col1.in_interval(10, conj)
    assert str(exc.value) == "Cannot use nested conditions."

def test_fails_colconj_list(self):
    c1 = C("foö")
    c2 = C("bar")
    conj = (c2 == 42) & (c2 == 10)
    with pytest.raises(TypeError) as exc:
        c1.isin([conj])
    assert str(exc.value) == "Cannot use nested conditions."

def test_filter_df_some(self):
    cond = (C("foö") == 42) & (C("bar") == 2)
    df = pd.DataFrame({"foö": [13, 42, 42, 100], "bar": [1, 2, 3, 4], "z": 0.0})
    df_actual = cond.filter_df(df)
    df_expected = df.loc[(df["foö"] == 42) & (df["bar"] == 2)]
    pdt.assert_frame_equal(df_actual, df_expected)

def test_fail_missing_condition_columns(driver, module_store, test_cube, test_df):
    with pytest.raises(ValueError) as exc:
        driver(
            cube=test_cube,
            store=module_store,
            conditions=(C("foo") == 1) & (C("bar") == 2),
        )
    assert (
        "Following condition columns are required but are missing from the cube: bar, foo"
        in str(exc.value)
    )

def test_filter_df_nulls(self):
    cond = (C("foö") != 42.0) & (C("bar") != 2.0)
    df = pd.DataFrame(
        {"foö": [13, 42, np.nan, np.nan], "bar": [1, 2, 3, np.nan], "z": np.nan}
    )
    df_actual = cond.filter_df(df)
    df_expected = pd.DataFrame({"foö": [13.0], "bar": [1.0], "z": [np.nan]})
    pdt.assert_frame_equal(df_actual, df_expected)

def test_stresstest_index_select_row(driver, function_store):
    n_indices = 100
    n_rows = 1000

    data = {"x": np.arange(n_rows), "p": 0}
    for i in range(n_indices):
        data["i{}".format(i)] = np.arange(n_rows)
    df = pd.DataFrame(data)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        index_columns=["i{}".format(i) for i in range(n_indices)],
    )

    build_cube(data=df, cube=cube, store=function_store)

    conditions = Conjunction([(C("i{}".format(i)) == 0) for i in range(n_indices)])

    result = driver(
        cube=cube,
        store=function_store,
        conditions=conditions,
        payload_columns=["p", "x"],
    )

    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.loc[df["x"] == 0].reindex(columns=["p", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)

def test_single_rowgroup_when_df_serializer_is_not_passed_to_update_cube(
    driver, function_store
):
    """
    Test that each partition of the dataset is written as a single row group by default.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"])
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(data=df, cube=cube, store=function_store)

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"]
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {0: None, 1: None, 2: None}

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)

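# Illustrative sketch (not collected by pytest): the parquet "row group" notion the
# test above asserts on, shown with pyarrow directly. The helper name and the
# in-memory buffer are made up for illustration; ``assert_num_row_groups`` in this
# module performs the equivalent check against the cube's store.
def _sketch_default_single_row_group():
    import io

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    buf = io.BytesIO()
    table = pa.Table.from_pandas(pd.DataFrame({"x": [0, 1, 2], "p": [0, 0, 0]}))
    # without an explicit row-group/chunk size, a small table lands in one row group
    pq.write_table(table, buf)
    buf.seek(0)
    assert pq.ParquetFile(buf).num_row_groups == 1
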
def test_compression_is_compatible_on_update_cube(driver, function_store):
    """
    Test that partitions written with different compression algorithms are compatible.

    The compression algorithms are not parametrized because their availability
    depends on the arrow build. 'SNAPPY' and 'GZIP' are already assumed to be
    available in parts of the code. A fully parametrized test would also increase
    runtime and test complexity unnecessarily.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"])
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="SNAPPY"),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"]
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="GZIP"),
    )
    dataset = result["seed"].load_all_indices(function_store())

    assert len(dataset.partitions) == 3

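# Illustrative sketch (not collected by pytest): parquet records the compression codec
# per file, so files written with 'SNAPPY' and 'GZIP' can be read back side by side
# without any dataset-wide setting. This uses pyarrow directly and assumes both codecs
# are available in the local arrow build; the test above exercises the same property
# through ParquetSerializer.
def _sketch_mixed_compression_roundtrip():
    import io

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.Table.from_pandas(pd.DataFrame({"x": [0, 1]}))
    buffers = []
    for codec in ("SNAPPY", "GZIP"):
        buf = io.BytesIO()
        pq.write_table(table, buf, compression=codec)
        buf.seek(0)
        buffers.append(buf)
    # both files decode to the same data
    assert all(pq.read_table(buf).equals(table) for buf in buffers)
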
def test_fails_nocond(self):
    col = C("foö")
    cond1 = col < 10
    with pytest.raises(TypeError) as exc:
        cond1 & col
    assert str(exc.value) == "Can only build conjunction out of conditions."

def test_simple(self):
    cond = C("foö").in_interval(10, 20)
    assert isinstance(cond, InIntervalCondition)
    assert str(cond) == "foö.in_interval(10, 20)"
    assert cond.predicate_part == [("foö", ">=", 10), ("foö", "<", 20)]
    assert cond.active
    hash(cond)

def test_delayed_index_build_correction_restriction(driver, function_store):
    """
    Ensure that adding extra indices for dimension columns does not mark other
    datasets as restrictive.
    """
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3, 4, 5], "p": [0, 0, 1, 1, 2, 2]})
    df_extend = pd.DataFrame({"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 1, 2]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["x"],
    )

    results = driver(cube=cube, store=function_store, conditions=C("x") >= 0)

    assert len(results) == 1

    df_actual = results[0]
    df_expected = pd.DataFrame(
        {
            "x": [0, 1, 2, 3, 4, 5],
            "p": [0, 0, 1, 1, 2, 2],
            "v": [0, 1, 2, np.nan, np.nan, np.nan],
        },
        columns=["p", "v", "x"],
    )
    pdt.assert_frame_equal(df_actual, df_expected)

def test_conditions(driver, function_store, existing_cube):
    parts_source1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions
    )
    parts_enrich1 = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("enrich"), function_store()
        ).partitions
    )
    parts_source_to_delete = {part for part in parts_source1 if "p=0" not in part}

    result = driver(
        cube=existing_cube,
        store=function_store,
        ktk_cube_dataset_ids=["source"],
        conditions=C("p") > 0,
    )
    assert set(result.keys()) == {"source", "enrich"}

    ds_source = result["source"]
    ds_enrich = result["enrich"]

    parts_source2 = set(ds_source.partitions)
    parts_enrich2 = set(ds_enrich.partitions)

    assert parts_enrich1 == parts_enrich2
    assert parts_source1 - parts_source_to_delete == parts_source2

def test_op(self, f, t, op, value):
    c = C("foö")
    cond = f(c)
    assert isinstance(cond, t)
    assert cond.OP == op
    assert str(cond) == "foö {} {}".format(op, value)
    assert cond.predicate_part == [("foö", op, value)]
    assert cond.active
    hash(cond)

def test_nested_cond_conj(self):
    col = C("foö")
    cond1 = col < 10
    cond2 = col > 0
    cond3 = col != 10
    conj1 = cond2 & cond3
    conj2 = cond1 & conj1
    assert isinstance(conj2, Conjunction)
    assert conj2.conditions == (cond1, cond2, cond3)

def test_wrong_condition_type(driver, function_store, driver_name):
    types = {
        "int": pd.Series([-1], dtype=np.int64),
        "uint": pd.Series([1], dtype=np.uint64),
        "float": pd.Series([1.3], dtype=np.float64),
        "bool": pd.Series([True], dtype=np.bool_),
        "str": pd.Series(["foo"], dtype=object),
    }
    cube = Cube(
        dimension_columns=["d_{}".format(t) for t in sorted(types.keys())],
        partition_columns=["p_{}".format(t) for t in sorted(types.keys())],
        uuid_prefix="typed_cube",
        index_columns=["i_{}".format(t) for t in sorted(types.keys())],
    )
    data = {
        "seed": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "v1"]
            }
        ),
        "enrich": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "i", "v2"]
            }
        ),
    }
    build_cube(data=data, store=function_store, cube=cube)

    df = pd.DataFrame(
        {
            "{}_{}".format(prefix, t): types[t]
            for t in sorted(types.keys())
            for prefix in ["d", "p", "i", "v1", "v2"]
        }
    )

    for col in df.columns:
        t1 = col.split("_")[1]

        for t2 in sorted(types.keys()):
            cond = C(col) == types[t2].values[0]

            if t1 == t2:
                result = driver(cube=cube, store=function_store, conditions=cond)

                assert len(result) == 1
                df_actual = result[0]
                df_expected = cond.filter_df(df).reset_index(drop=True)
                pdt.assert_frame_equal(df_actual, df_expected, check_like=True)
            else:
                with pytest.raises(TypeError) as exc:
                    driver(cube=cube, store=function_store, conditions=cond)
                assert "has wrong type" in str(exc.value)

def test_hash(self):
    col = C("foö")
    cond1 = col < 10
    cond2 = col > 0
    cond3 = col != 10
    conj1a = cond1 & cond2
    conj1b = cond1 & cond2
    conj2 = cond1 & cond3
    assert hash(conj1a) == hash(conj1b)
    assert hash(conj1a) != hash(conj2)

def test_nested_conj_conj(self):
    col = C("foö")
    cond1 = col < 10
    cond2 = col > 0
    cond3 = col != 10
    cond4 = col != 11
    conj1 = cond1 & cond2
    conj2 = cond3 & cond4
    conj3 = conj1 & conj2
    assert isinstance(conj3, Conjunction)
    assert conj3.conditions == (cond1, cond2, cond3, cond4)

def test_simple(self):
    col = C("foö")
    cond1 = col < 10
    cond2 = col > 0
    conj = cond1 & cond2
    assert isinstance(conj, Conjunction)
    assert conj.conditions == (cond1, cond2)
    assert str(conj) == "(foö < 10) & (foö > 0)"
    assert conj.columns == {"foö"}
    assert conj.predicate == [("foö", "<", 10), ("foö", ">", 0)]
    assert conj.split_by_column() == {"foö": conj}

def test_condition_on_null(driver, function_store):
    df = pd.DataFrame(
        {
            "x": pd.Series([0, 1, 2], dtype=np.int64),
            "p": pd.Series([0, 0, 1], dtype=np.int64),
            "v_f1": pd.Series([0, np.nan, 2], dtype=np.float64),
            "v_f2": pd.Series([0, 1, np.nan], dtype=np.float64),
            "v_f3": pd.Series([np.nan, np.nan, np.nan], dtype=np.float64),
            "v_s1": pd.Series(["a", None, "c"], dtype=object),
            "v_s2": pd.Series(["a", "b", None], dtype=object),
            "v_s3": pd.Series([None, None, None], dtype=object),
        }
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="nulled_cube",
        index_columns=[],
    )
    build_cube(data=df, store=function_store, cube=cube)

    for col in df.columns:
        # only iterate over the value columns (not the dimension / partition column):
        if not col.startswith("v"):
            continue

        # col_type will be either 'f' for float or 's' for string; see column
        # names above
        col_type = col.split("_")[1][0]
        if col_type == "f":
            value = 1.2
        elif col_type == "s":
            value = "foo"
        else:
            raise RuntimeError("unknown type")

        cond = C(col) == value

        df_expected = cond.filter_df(df).reset_index(drop=True)

        result = driver(cube=cube, store=function_store, conditions=cond)

        if df_expected.empty:
            assert len(result) == 0
        else:
            assert len(result) == 1
            df_actual = result[0]
            pdt.assert_frame_equal(df_actual, df_expected, check_like=True)

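# Illustrative, pandas-only sketch (not collected by pytest) of the null semantics the
# test above relies on: an equality comparison never matches NaN/None entries, so
# ``filter_df`` drops those rows and an all-null column yields an empty expectation.
def _sketch_equality_never_matches_nulls():
    import numpy as np
    import pandas as pd

    s_float = pd.Series([0.0, np.nan, 2.0])
    s_str = pd.Series(["a", None, "c"], dtype=object)
    assert not (s_float == 1.2).any()  # NaN == 1.2 evaluates to False
    assert list(s_str == "a") == [True, False, False]  # None == "a" evaluates to False
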
def test_frozen(self):
    cond = C("foö") == 42

    with pytest.raises(AttributeError):
        cond.column = "bar"

    with pytest.raises(AttributeError):
        cond.value = 1337

    with pytest.raises(AttributeError):
        cond.OP = "x"

def test_fail_wrong_condition(driver, function_store, existing_cube):
    with pytest.raises(
        ValueError,
        match="Can only remove partitions with conditions concerning cubes physical partition columns.",
    ):
        driver(
            cube=existing_cube,
            store=function_store,
            ktk_cube_dataset_ids=["source"],
            conditions=C("v1") >= 0,
        )

def test_multicol(self):
    col1 = C("foö")
    col2 = C("bar")
    cond1 = col1 < 10
    cond2 = col1 > 0
    cond3 = col2 != 10
    conj1 = cond1 & cond2
    conj2 = conj1 & cond3
    assert isinstance(conj2, Conjunction)
    assert conj2.conditions == (cond1, cond2, cond3)
    assert str(conj2) == "(foö < 10) & (foö > 0) & (bar != 10)"
    assert conj2.columns == {"foö", "bar"}
    assert conj2.predicate == [
        ("foö", "<", 10),
        ("foö", ">", 0),
        ("bar", "!=", 10),
    ]
    assert conj2.split_by_column() == {"foö": conj1, "bar": Conjunction([cond3])}

def test_update_partitions(driver, function_store, remove_partitions, new_partitions):
    df_source, cube = _write_cube(function_store)

    df_source_new = pd.DataFrame(
        {
            "i1": range(200, 200 + len(new_partitions)),
            "p": np.array(new_partitions, np.int64),
            "v1": range(300, 300 + len(new_partitions)),
            "x": range(100, 100 + len(new_partitions)),
        }
    )

    # what should remain of the old data:
    df_source_of_old = df_source.loc[~df_source["p"].isin(set(remove_partitions))]
    df_source_expected_after = pd.concat(
        [df_source_of_old, df_source_new], sort=False, ignore_index=True
    )

    remove_conditions = C("p").isin(remove_partitions)

    result = driver(
        data={"source": df_source_new},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids={"source"},
        metadata={"source": {"some_new_meta": 42}},
    )

    assert set(result.keys()) == {"source"}

    dm_source_after = DatasetMetadata.load_from_store(
        cube.ktk_dataset_uuid("source"), function_store(), load_all_indices=True
    )

    assert "some_new_meta" in dm_source_after.metadata
    assert "meta_at_create" in dm_source_after.metadata

    # check values for "p" are as expected:
    expected_p_source = (set(df_source["p"].unique()) - set(remove_partitions)) | set(
        new_partitions
    )
    assert set(dm_source_after.indices["p"].index_dct) == expected_p_source

    df_read = query_cube(cube, function_store)[0]

    assert set(df_read.columns) == set(df_source_expected_after.columns)

    for df in (df_read, df_source_expected_after):
        df.sort_values("x", inplace=True)
        df.reset_index(drop=True, inplace=True)

    pd.testing.assert_frame_equal(df_read, df_source_expected_after)

def test_filter_select(driver, module_store, test_cube, test_df):
    result = driver(
        cube=test_cube,
        store=module_store,
        payload_columns=["v1", "v2"],
        conditions=(C("i3") >= 3),  # completely unrelated to the payload
    )
    assert len(result) == 1
    df_actual = result[0]
    df_expected = test_df.loc[
        test_df["i3"] >= 3, ["p", "q", "v1", "v2", "x", "y", "z"]
    ].reset_index(drop=True)
    pdt.assert_frame_equal(df_actual, df_expected)

def test_remove_nonmatching_condition(driver, function_store, existing_cube):
    parts_source_before = set(
        DatasetMetadata.load_from_store(
            existing_cube.ktk_dataset_uuid("source"), function_store()
        ).partitions
    )
    result = driver(
        cube=existing_cube,
        store=function_store,
        ktk_cube_dataset_ids=["source"],
        conditions=C("p") > 10000,
    )
    parts_source_after = set(result["source"].partitions)
    assert parts_source_before == parts_source_after

def test_nested_conj_cond(self):
    col = C("foö")
    cond1 = col < 10
    cond2 = col > 0
    cond3 = col != 10
    conj1 = cond1 & cond2
    conj2 = conj1 & cond3
    assert isinstance(conj2, Conjunction)
    assert conj2.conditions == (cond1, cond2, cond3)
    assert str(conj2) == "(foö < 10) & (foö > 0) & (foö != 10)"
    assert conj2.columns == {"foö"}
    assert conj2.predicate == [
        ("foö", "<", 10),
        ("foö", ">", 0),
        ("foö", "!=", 10),
    ]
    assert conj2.split_by_column() == {"foö": conj2}

def test_rowgroups_are_applied_when_df_serializer_is_passed_to_update_cube(
    driver, function_store, chunk_size_build, chunk_size_update
):
    """
    Test that the dataset is split into row groups depending on the chunk size.

    Partitions built with ``chunk_size=None`` should keep a single row group if
    they are not touched by the update. Partitions that are newly created or
    replaced with ``chunk_size>0`` should be split into row groups accordingly.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"])
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_build),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"]
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_update),
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {
        0: chunk_size_build,
        1: chunk_size_update,
        2: chunk_size_update,
    }

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)

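# Illustrative sketch (not collected by pytest) of the row-group arithmetic behind
# ``part_num_rows``/``part_chunk_size`` above: with ``chunk_size=None`` a partition
# stays in a single row group, otherwise it is split into ceil(num_rows / chunk_size)
# groups. The helper below is hypothetical and only mirrors that expectation,
# e.g. 2 rows with chunk_size=1 -> 2 row groups, 2 rows with chunk_size=None -> 1.
def _sketch_expected_num_row_groups(num_rows, chunk_size):
    import math

    if chunk_size is None:
        return 1
    return math.ceil(num_rows / chunk_size)
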
def test_update_respects_ktk_cube_dataset_ids(
    driver, function_store, ktk_cube_dataset_ids
):
    df_source, cube = _write_cube(function_store)
    df_ex = _extend_cube(cube, function_store)

    remove_conditions = C("p") == 0

    # This implicitly also tests that `data={}` behaves as expected and still deletes
    # partitions as requested via ktk_cube_dataset_ids and remove_conditions
    result = driver(
        data={},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )

    assert set(result) == ktk_cube_dataset_ids
    df_read = query_cube(cube, function_store)[0]

    # expected result: df_source left joined with df_ex; choosing the subset of p!=0
    # from each that is in `ktk_cube_dataset_ids`:
    if "source" in ktk_cube_dataset_ids:
        df_source = df_source.loc[df_source["p"] != 0]
    if "ex" in ktk_cube_dataset_ids:
        df_ex = df_ex.loc[df_ex["p"] != 0]
    df_expected = df_source.merge(df_ex[["x", "a"]], how="left", on="x")
    df_expected = df_expected[sorted(df_expected.columns)]
    pd.testing.assert_frame_equal(df_read, df_expected)

    # test "ex" separately, because the test above based on the *left* merge does not
    # tell us much about "ex" in case the partitions were removed from "source"
    df_ex_read = read_table(cube.ktk_dataset_uuid("ex"), function_store)
    if "ex" in ktk_cube_dataset_ids:
        assert set(df_ex_read["p"]) == {1}
    else:
        assert set(df_ex_read["p"]) == {0, 1}

def _regroup(df_aligned, intention, indexed_columns, datasets, cube):
    """
    Based on partition_by, form query groups.

    .. important::
        If the intention does not contain a partition-by, this partitions by the
        cube partition columns to speed up the query on parallel backends. In that
        case, the backend must concat and check the resulting dataframes before
        passing them to the user.

    Parameters
    ----------
    df_aligned: pandas.DataFrame
        Aligned DataFrame, taken from :meth:`_create_aligned_partition_df`.
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are processed by the regrouper.
    cube: Cube
        Cube specification.

    Returns
    -------
    label2gp: Dict[str, Dict[str, Tuple[int, int]]]
        Maps "dataset ID -> (label -> (group ID, partition ID))".
    group2cond: Dict[int, kartothek.core.cube.conditions.Conjunction]
        Condition per group.
    """
    partition_by = intention.partition_by
    if not partition_by:
        # special code to speed up the query
        partition_by = cube.partition_columns

    label2gp = defaultdict(lambda: defaultdict(list))
    group2cond = {}

    # figure out which datasets are affected by which additional condition
    extra_conditions_target = {}
    for ktk_cube_dataset_id, cols in indexed_columns.items():
        if ktk_cube_dataset_id not in datasets:
            # may be irrelevant
            continue
        for col in cols & set(partition_by):
            extra_conditions_target[col] = ktk_cube_dataset_id

    # generate groups
    for g, df_g in df_aligned.groupby(list(partition_by), sort=True):
        gid = g
        if len(partition_by) == 1:
            g = (g,)

        conditions_g = copy(intention.conditions_post)
        for g_part, col in zip(g, partition_by):
            if col in cube.partition_columns:
                # we do not need predicate pushdown for physical partition columns
                continue

            ktk_cube_dataset_id = extra_conditions_target[col]

            conditions_g[ktk_cube_dataset_id] = conditions_g.get(
                ktk_cube_dataset_id, Conjunction([])
            ) & (C(col) == g_part)

        _aligned_df_to_label2gp(df_g, datasets, gid, label2gp)
        group2cond[gid] = conditions_g

    return label2gp, group2cond

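# Illustrative, pandas-only sketch (not part of the query code): the group-key
# normalization used in ``_regroup`` above. ``DataFrame.groupby`` yields scalar keys
# when grouping by a single column but tuples for multiple columns, so a single key
# is wrapped into a 1-tuple before zipping it with ``partition_by``. Column names and
# values below are made up for illustration.
def _sketch_groupby_key_normalization():
    import pandas as pd

    df = pd.DataFrame({"p": [0, 0, 1], "q": [1, 2, 2], "x": [10, 11, 12]})

    for partition_by in (["p"], ["p", "q"]):
        for g, _df_g in df.groupby(list(partition_by), sort=True):
            if len(partition_by) == 1:
                g = (g,)  # normalize the scalar key to a tuple
            # the key now aligns element-wise with the grouping columns
            assert len(g) == len(partition_by)
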