import numpy as np
import pandas as pd
import pytest

# Imports below assume kartothek's public module layout; adjust to whatever the
# surrounding test module actually provides.
from kartothek.core.cube.conditions import C
from kartothek.core.cube.cube import Cube
from kartothek.core.dataset import DatasetMetadata
from kartothek.io.eager import read_table
from kartothek.io.eager_cube import query_cube


def assert_target_cube_readable(tgt_cube_uuid, tgt_store, df_seed, df_enrich):
    """Assert that the cube stored under ``tgt_cube_uuid`` in ``tgt_store`` is queryable
    and contains the expected seed (``v1``) and enrich (``v2``) data."""
    tgt_cube = Cube(
        dimension_columns=["x"], partition_columns=["p"], uuid_prefix=tgt_cube_uuid
    )
    tgt_cube_res = query_cube(cube=tgt_cube, store=tgt_store)[0]
    assert tgt_cube_res is not None
    assert tgt_cube_res[["x", "p", "v1"]].equals(df_seed)
    assert tgt_cube_res[["x", "p", "v2"]].equals(df_enrich)
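# `remove_partitions` and `new_partitions` are not defined in this excerpt; they are
# expected to be supplied by the test harness. A minimal parametrization sketch with
# purely illustrative values (not the original ones):
@pytest.mark.parametrize(
    "remove_partitions,new_partitions",
    [
        ([], [4, 5]),  # pure append of new partitions
        ([0], [0, 1, 4]),  # overwrite p=0 and add p=4
        ([0, 1], []),  # pure removal, no new data
    ],
)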
def test_update_partitions(driver, function_store, remove_partitions, new_partitions):
    df_source, cube = _write_cube(function_store)

    df_source_new = pd.DataFrame(
        {
            "i1": range(200, 200 + len(new_partitions)),
            "p": np.array(new_partitions, np.int64),
            "v1": range(300, 300 + len(new_partitions)),
            "x": range(100, 100 + len(new_partitions)),
        }
    )

    # what should remain of the old data:
    df_source_of_old = df_source.loc[~df_source["p"].isin(set(remove_partitions))]
    df_source_expected_after = pd.concat(
        [df_source_of_old, df_source_new], sort=False, ignore_index=True
    )

    remove_conditions = C("p").isin(remove_partitions)

    result = driver(
        data={"source": df_source_new},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids={"source"},
        metadata={"source": {"some_new_meta": 42}},
    )

    assert set(result.keys()) == {"source"}

    dm_source_after = DatasetMetadata.load_from_store(
        cube.ktk_dataset_uuid("source"), function_store(), load_all_indices=True
    )

    assert "some_new_meta" in dm_source_after.metadata
    assert "meta_at_create" in dm_source_after.metadata

    # check values for "p" are as expected:
    expected_p_source = (set(df_source["p"].unique()) - set(remove_partitions)) | set(
        new_partitions
    )
    assert set(dm_source_after.indices["p"].index_dct) == expected_p_source

    df_read = query_cube(cube, function_store)[0]

    assert set(df_read.columns) == set(df_source_expected_after.columns)

    for df in (df_read, df_source_expected_after):
        df.sort_values("x", inplace=True)
        df.reset_index(drop=True, inplace=True)

    pd.testing.assert_frame_equal(df_read, df_source_expected_after)
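# `ktk_cube_dataset_ids` is likewise expected to be parametrized by the harness.
# An illustrative sketch (assumed values, not the original ones) covering both
# datasets, only "source", and only "ex":
@pytest.mark.parametrize(
    "ktk_cube_dataset_ids",
    [{"source", "ex"}, {"source"}, {"ex"}],
)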
def test_update_respects_ktk_cube_dataset_ids(
    driver, function_store, ktk_cube_dataset_ids
):
    df_source, cube = _write_cube(function_store)
    df_ex = _extend_cube(cube, function_store)

    remove_conditions = C("p") == 0

    # This implicitly also tests that `data={}` behaves as expected and still deletes
    # partitions as requested via ktk_cube_dataset_ids and remove_conditions.
    result = driver(
        data={},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )

    assert set(result) == ktk_cube_dataset_ids

    df_read = query_cube(cube, function_store)[0]

    # Expected result: df_source left-joined with df_ex, keeping only the p != 0 subset
    # of each dataset that is listed in `ktk_cube_dataset_ids`:
    if "source" in ktk_cube_dataset_ids:
        df_source = df_source.loc[df_source["p"] != 0]
    if "ex" in ktk_cube_dataset_ids:
        df_ex = df_ex.loc[df_ex["p"] != 0]
    df_expected = df_source.merge(df_ex[["x", "a"]], how="left", on="x")
    df_expected = df_expected[sorted(df_expected.columns)]
    pd.testing.assert_frame_equal(df_read, df_expected)

    # Check "ex" separately, because the assertion above is based on a *left* merge and
    # therefore tells us little about "ex" when the partitions were removed from "source".
    df_ex_read = read_table(cube.ktk_dataset_uuid("ex"), function_store)
    if "ex" in ktk_cube_dataset_ids:
        assert set(df_ex_read["p"]) == {1}
    else:
        assert set(df_ex_read["p"]) == {0, 1}
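# `assert_target_cube_readable` above is intended for tests that copy a cube into a
# second store and then verify it there. A hypothetical usage sketch; the copy step,
# `tgt_store`, `df_seed`, and `df_enrich` are assumptions for illustration only:
#
#     copy_cube(cube=cube, src_store=function_store, tgt_store=tgt_store)
#     assert_target_cube_readable(cube.uuid_prefix, tgt_store, df_seed, df_enrich)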