Example #1
def assert_target_cube_readable(tgt_cube_uuid, tgt_store, df_seed, df_enrich):
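    """Assert that the target cube ``tgt_cube_uuid`` is readable from ``tgt_store``
    and contains the expected seed (``v1``) and enrich (``v2``) data."""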
    tgt_cube = Cube(
        dimension_columns=["x"], partition_columns=["p"], uuid_prefix=tgt_cube_uuid
    )
    tgt_cube_res = query_cube(cube=tgt_cube, store=tgt_store)[0]
    assert tgt_cube_res is not None
    assert tgt_cube_res[["x", "p", "v1"]].equals(df_seed)
    assert tgt_cube_res[["x", "p", "v2"]].equals(df_enrich)
Example #2
def test_update_partitions(driver, function_store, remove_partitions, new_partitions):
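    """Drop the partitions in ``remove_partitions`` from the "source" dataset and
    write ``new_partitions`` in their place, then verify metadata, the "p" index,
    and the queried cube data."""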
    df_source, cube = _write_cube(function_store)

    df_source_new = pd.DataFrame(
        {
            "i1": range(200, 200 + len(new_partitions)),
            "p": np.array(new_partitions, np.int64),
            "v1": range(300, 300 + len(new_partitions)),
            "x": range(100, 100 + len(new_partitions)),
        }
    )

    # what should remain of the old data:
    df_source_of_old = df_source.loc[~df_source["p"].isin(set(remove_partitions))]
    df_source_expected_after = pd.concat(
        [df_source_of_old, df_source_new], sort=False, ignore_index=True
    )

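    # condition selecting the partitions that should be dropped during the update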
    remove_conditions = C("p").isin(remove_partitions)

    result = driver(
        data={"source": df_source_new},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids={"source"},
        metadata={"source": {"some_new_meta": 42}},
    )

    assert set(result.keys()) == {"source"}

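    # load the dataset metadata (including indices) written for "source"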
    dm_source_after = DatasetMetadata.load_from_store(
        cube.ktk_dataset_uuid("source"), function_store(), load_all_indices=True
    )

    assert "some_new_meta" in dm_source_after.metadata
    assert "meta_at_create" in dm_source_after.metadata

    # check values for "p" are as expected:
    expected_p_source = (set(df_source["p"].unique()) - set(remove_partitions)) | set(
        new_partitions
    )
    assert set(dm_source_after.indices["p"].index_dct) == expected_p_source

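    # the queried cube must equal the remaining old rows plus the new rows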
    df_read = query_cube(cube, function_store)[0]

    assert set(df_read.columns) == set(df_source_expected_after.columns)

    for df in (df_read, df_source_expected_after):
        df.sort_values("x", inplace=True)
        df.reset_index(drop=True, inplace=True)

    pd.testing.assert_frame_equal(df_read, df_source_expected_after)
Example #3
def test_update_respects_ktk_cube_dataset_ids(
    driver, function_store, ktk_cube_dataset_ids
):
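    """Partition removal must only affect the datasets named in
    ``ktk_cube_dataset_ids``; datasets not listed there keep all their partitions."""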
    df_source, cube = _write_cube(function_store)
    df_ex = _extend_cube(cube, function_store)

    remove_conditions = C("p") == 0

    # This implicitly also tests that `data={}` behaves as expected and still deletes partitions
    # as requested via ktk_cube_dataset_ids and remove_conditions
    result = driver(
        data={},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )
    assert set(result) == ktk_cube_dataset_ids
    df_read = query_cube(cube, function_store)[0]

    # expected result: df_source left-joined with df_ex; for each dataset listed in
    # `ktk_cube_dataset_ids`, only the rows with p != 0 remain:
    if "source" in ktk_cube_dataset_ids:
        df_source = df_source.loc[df_source["p"] != 0]
    if "ex" in ktk_cube_dataset_ids:
        df_ex = df_ex.loc[df_ex["p"] != 0]
    df_expected = df_source.merge(df_ex[["x", "a"]], how="left", on="x")
    df_expected = df_expected[sorted(df_expected.columns)]
    pd.testing.assert_frame_equal(df_read, df_expected)

    # check "ex" separately: the *left* merge above reveals little about "ex" when the
    # partitions were removed from "source"
    df_ex_read = read_table(cube.ktk_dataset_uuid("ex"), function_store)
    if "ex" in ktk_cube_dataset_ids:
        assert set(df_ex_read["p"]) == {1}
    else:
        assert set(df_ex_read["p"]) == {0, 1}