# NOTE: the snippets below are excerpts from kartothek's cube test suite. They
# assume roughly the imports listed here; exact module paths can differ between
# kartothek versions, and names such as MultiTableCommitAborted, SINGLE_TABLE and
# _ensure_compatible_indices, as well as the pytest fixtures (driver, driver_name,
# function_store, none_first, skip_eager), are provided elsewhere in the suite.
from typing import Mapping, Tuple

import numpy as np
import pandas as pd
import pandas.testing as pdt
import pytest

from kartothek.core.cube.conditions import C
from kartothek.core.cube.constants import (
    KTK_CUBE_METADATA_DIMENSION_COLUMNS,
    KTK_CUBE_METADATA_KEY_IS_SEED,
    KTK_CUBE_METADATA_PARTITION_COLUMNS,
    KTK_CUBE_METADATA_SUPPRESS_INDEX_ON,
    KTK_CUBE_UUID_SEPARATOR,
)
from kartothek.core.cube.cube import Cube
from kartothek.core.dataset import DatasetMetadata, DatasetMetadataBase
from kartothek.core.index import ExplicitSecondaryIndex, PartitionIndex
from kartothek.io.eager import build_dataset_indices
from kartothek.io.eager_cube import build_cube


def test_split(driver, function_store):
    """
    Imagine the user has already split the data.
    """
    df_source1 = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v1": [10, 11]})
    df_source2 = pd.DataFrame({"x": [2, 3], "p": [1, 1], "v1": [12, 13]})
    df_enrich = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v2": [20, 21]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    result = driver(
        data=[{"source": df_source1, "enrich": df_enrich}, df_source2],
        cube=cube,
        store=function_store,
    )

    assert set(result.keys()) == {cube.seed_dataset, "enrich"}

    ds_source = result[cube.seed_dataset].load_all_indices(function_store())
    ds_enrich = result["enrich"].load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert ds_enrich.uuid == cube.ktk_dataset_uuid("enrich")

    assert len(ds_source.partitions) == 2
    assert len(ds_enrich.partitions) == 1
def test_fail_partition_on_4(driver, function_store):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(
        ValueError, match="Unspecified but provided partition columns in enrich: p"
    ):
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
            partition_on={"enrich": []},
        )
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_distinct_branches(driver, function_store):
    """
    Just check this actually works.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    result = driver(
        data=[{"source": df_source}, {"enrich": df_enrich}],
        cube=cube,
        store=function_store,
    )

    assert set(result.keys()) == {cube.seed_dataset, "enrich"}

    ds_source = result[cube.seed_dataset].load_all_indices(function_store())
    ds_enrich = result["enrich"].load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert ds_enrich.uuid == cube.ktk_dataset_uuid("enrich")

    assert len(ds_source.partitions) == 2
    assert len(ds_enrich.partitions) == 2
def test_fail_wrong_types(driver, function_store):
    """
    Might catch nasty pandas and other type bugs.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0.0, 1.0, 2.0, 3.0], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )

    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert 'Found incompatible entries for column "x"' in str(cause)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_copy_validates():
    cube1 = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube1")
    with pytest.raises(ValueError) as exc:
        cube1.copy(uuid_prefix="cube2++")
    assert (
        str(exc.value) == 'uuid_prefix ("cube2++") must not contain UUID separator ++'
    )
def test_fail_nondistinc_payload(driver, function_store):
    """
    This would lead to problems during the query phase.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert "Found columns present in multiple datasets" in str(cause)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_copy_simple():
    cube1 = Cube(dimension_columns=["x"],
                 partition_columns=["p"],
                 uuid_prefix="cube1")
    cube2 = cube1.copy(uuid_prefix="cube2")
    assert cube1.uuid_prefix == "cube1"
    assert cube2.uuid_prefix == "cube2"
def test_additional_files(driver, function_store):
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(data=df_seed, cube=cube, store=function_store)

    key_in_ds = cube.ktk_dataset_uuid(cube.seed_dataset) + "/foo"
    key_with_ds_prefix = cube.ktk_dataset_uuid(cube.seed_dataset) + ".foo"
    key_with_cube_prefix = cube.uuid_prefix + ".foo"
    key_with_cube_prefix_separator = cube.uuid_prefix + KTK_CUBE_UUID_SEPARATOR + ".foo"

    function_store().put(key_in_ds, b"")
    function_store().put(key_with_ds_prefix, b"")
    function_store().put(key_with_cube_prefix, b"")
    function_store().put(key_with_cube_prefix_separator, b"")

    driver(cube=cube, store=function_store)
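    # The operation must only touch keys that belong to the cube's datasets, i.e.
    # keys starting with "<uuid_prefix>++"; a key that merely shares the bare
    # uuid_prefix without the UUID separator has to survive.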
    assert key_in_ds not in set(function_store().keys())
    assert key_with_ds_prefix not in set(function_store().keys())
    assert key_with_cube_prefix in set(function_store().keys())
    assert key_with_cube_prefix_separator not in set(function_store().keys())
def test_projected_data(driver, function_store):
    """
    Projected dataset (useful for de-duplication).
    """
    df_source = pd.DataFrame(
        {
            "x": [0, 1, 0, 1],
            "y": [0, 0, 1, 1],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
        }
    )
    df_enrich = pd.DataFrame({"y": [0, 1], "p": [0, 1], "v2": [20, 21]})
    cube = Cube(
        dimension_columns=["x", "y"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    result = driver(
        data={"source": df_source, "enrich": df_enrich}, cube=cube, store=function_store
    )

    assert set(result.keys()) == {cube.seed_dataset, "enrich"}

    ds_source = result[cube.seed_dataset].load_all_indices(function_store())
    ds_enrich = result["enrich"].load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert ds_enrich.uuid == cube.ktk_dataset_uuid("enrich")

    assert len(ds_source.partitions) == 2
    assert len(ds_enrich.partitions) == 2
def test_delayed_index_build_correction_restriction(driver, function_store):
    """
    Ensure that adding extra indices for dimension columns does not mark other datasets as restrictive.
    """
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3, 4, 5], "p": [0, 0, 1, 1, 2, 2]})
    df_extend = pd.DataFrame({"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 1, 2]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["x"],
    )

    results = driver(cube=cube, store=function_store, conditions=C("x") >= 0)
    assert len(results) == 1

    df_actual = results[0]
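    # "extend" only covers x in {0, 1, 2}, so the result keeps all seed rows and
    # fills "v" with NaN where "extend" has no data.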
    df_expected = pd.DataFrame(
        {
            "x": [0, 1, 2, 3, 4, 5],
            "p": [0, 0, 1, 1, 2, 2],
            "v": [0, 1, 2, np.nan, np.nan, np.nan],
        },
        columns=["p", "v", "x"],
    )
    pdt.assert_frame_equal(df_actual, df_expected)
def test_delayed_index_build_partition_by(driver, function_store):
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]})
    df_extend = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [0, 0, 0, 1]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["v"],
    )

    results = driver(cube=cube, store=function_store, partition_by=["v"])
    assert len(results) == 2
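    # partition_by="v" relies on the secondary index built after the cube was
    # written; the query returns one dataframe per distinct value of "v" (0 and 1).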

    df_result1 = pd.DataFrame(
        data={"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 0, 0]}, columns=["p", "v", "x"]
    )
    df_result2 = pd.DataFrame(
        data={"x": [3], "p": [1], "v": [1]}, columns=["p", "v", "x"]
    )
    pdt.assert_frame_equal(results[0], df_result1)
    pdt.assert_frame_equal(results[1], df_result2)
def test_cube_blacklist_dimension_index(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "B": 1, "payload": ""})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["B"],
    )
    df_2 = pd.DataFrame({"A": range(10), "P": 1, "B": 2, "payload": ""})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["B"].observed_values()
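    # Although cube2 suppresses the index on "B", the index created by the initial
    # build is still maintained during the update, so it observes both writes.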

    assert sorted(obs_values) == [1, 2]
def test_nones(driver, function_store, none_first, driver_name):
    """
    Test what happens if the user passes None to ktk_cube.
    """
    if driver_name == "dask_dataframe":
        pytest.skip("user cannot create None-partitions with dask.dataframe")

    df = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]})
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    result = driver(
        data=[None, df] if none_first else [df, None], cube=cube, store=function_store
    )

    assert set(result.keys()) == {cube.seed_dataset}

    ds = list(result.values())[0]
    ds = ds.load_all_indices(function_store())

    assert ds.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert len(ds.partitions) == 2

    assert set(ds.indices.keys()) == {"p", "x"}
    assert isinstance(ds.indices["p"], PartitionIndex)
    assert isinstance(ds.indices["x"], ExplicitSecondaryIndex)

    assert set(ds.table_meta) == {SINGLE_TABLE}
def test_hash():
    cube1 = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    cube2 = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    cube3 = Cube(
        dimension_columns=["x", "y"], partition_columns=["p"], uuid_prefix="cube"
    )

    assert hash(cube1) == hash(cube2)
    assert hash(cube3) != hash(cube1)
def test_fails_null_dimension(driver, function_store):
    """
    Since we do not allow NULL values in queries, they should be banned from dimension columns in the first place.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, np.nan], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)

    assert 'Found NULL-values in dimension column "x" of dataset "seed"' in str(exc)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("seed"), function_store())
def test_fail_all_empty(driver, function_store):
    """
    Might happen due to DB-based filters.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)
    assert "Cannot write empty datasets: seed" in str(exc.value)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def ensure_valid_cube_indices(
    existing_datasets: Mapping[str, DatasetMetadataBase], cube: Cube
) -> Cube:
    """
    Parse all existing datasets and infer the required set of indices. We do not
    allow indices to be removed or added in update steps at the moment and
    need to make sure that existing ones are updated properly.
    The returned `Cube` instance will be a copy of the input with
    `index_columns` and `suppress_index_on` fields adjusted to reflect the
    existing datasets.
    """
    dataset_indices = []
    for ds in existing_datasets.values():
        for internal_table in ds.table_meta:
            dataset_columns = set(ds.table_meta[internal_table].names)
            table_indices = cube.index_columns & dataset_columns
            compatible_indices = _ensure_compatible_indices(ds, table_indices)
            if compatible_indices:
                dataset_indices.append(set(compatible_indices))
    required_indices = cube.index_columns.union(*dataset_indices)
    suppress_index_on = cube.suppress_index_on.difference(*dataset_indices)
    # Need to remove dimension columns since they *are* technically indices but
    # the cube interface class declares them as not indexed just to add them
    # later on, assuming they are not blacklisted.
    return cube.copy(
        index_columns=required_indices - set(cube.dimension_columns),
        suppress_index_on=suppress_index_on,
    )
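
# A minimal usage sketch (hypothetical helper and dataset names, not part of
# kartothek): before an update step, load the metadata of the datasets that
# already exist and derive a cube copy whose index settings match the store.
def _example_ensure_valid_cube_indices(store, cube):
    existing_datasets = {
        name: DatasetMetadata.load_from_store(cube.ktk_dataset_uuid(name), store)
        for name in ("source", "enrich")
    }
    return ensure_valid_cube_indices(existing_datasets, cube)
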
def test_init_fail_illegal_uuid_prefix_ktk():
    with pytest.raises(ValueError) as exc:
        Cube(dimension_columns=["x"],
             partition_columns=["p"],
             uuid_prefix="cu be")
    assert str(
        exc.value) == 'uuid_prefix ("cu be") is not compatible with kartothek'
def test_init_fail_illegal_uuid_prefix_sep():
    with pytest.raises(ValueError) as exc:
        Cube(dimension_columns=["x"],
             partition_columns=["p"],
             uuid_prefix="cu++be")
    assert str(exc.value
               ) == 'uuid_prefix ("cu++be") must not contain UUID separator ++'
def _get_cube(function_store, with_partition_on):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "q": 0, "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "q": 0, "v2": [10, 11, 12, 13]}
    )
    if with_partition_on:
        df_enrich.drop(columns=["p", "q"], inplace=True)
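        # With the physical partition columns removed, "enrich" can be written with
        # partition_on={"enrich": []} below, i.e. without partition keys of its own.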

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p", "q"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source, "enrich": df_enrich},
        cube=cube,
        store=function_store,
        metadata={"source": {"userkey1": "value1"}},
        partition_on={"enrich": []} if with_partition_on else None,
    )
    return cube
def test_converters():
    cube = Cube(
        dimension_columns=b"my_dim",
        partition_columns=b"my_part",
        uuid_prefix=b"my_prefix",
        seed_dataset=b"my_seed",
        index_columns=b"my_index",
        suppress_index_on=b"my_dim",
    )
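
    # The constructor coerces bytes/scalar input: column arguments become tuples of
    # str, uuid_prefix and seed_dataset become str, and index_columns as well as
    # suppress_index_on become sets of str.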

    assert cube.dimension_columns == ("my_dim", )
    assert all(isinstance(s, str) for s in cube.dimension_columns)

    assert cube.partition_columns == ("my_part", )
    assert all(isinstance(s, str) for s in cube.partition_columns)

    assert cube.uuid_prefix == "my_prefix"
    assert isinstance(cube.uuid_prefix, str)

    assert cube.seed_dataset == "my_seed"
    assert isinstance(cube.seed_dataset, str)

    assert cube.index_columns == {"my_index"}
    assert all(isinstance(s, str) for s in cube.index_columns)

    assert cube.suppress_index_on == {"my_dim"}
def existing_cube(function_store):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [10, 11, 12, 13]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source, "enrich": df_enrich},
        cube=cube,
        store=function_store,
    )
    return cube
def test_defaults():
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")

    assert cube.seed_dataset == "seed"
    assert isinstance(cube.seed_dataset, str)

    assert cube.index_columns == set()
def test_overwrite(driver, function_store):
    """
    Test overwrite behavior, i.e. calling the build function when the cube already exists.
    """
    df1 = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [10, 11]})
    df2 = pd.DataFrame({"x": [2, 3], "p": [1, 1], "v": [12, 13]})
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    driver(data=df1, cube=cube, store=function_store)

    # implicit overwrite fails
    keys = set(function_store().keys())
    with pytest.raises(RuntimeError) as exc:
        driver(data=df1, cube=cube, store=function_store)
    assert "already exists and overwrite is not permitted" in str(exc.value)
    assert set(function_store().keys()) == keys

    # explicit overwrite works
    result = driver(data=df2, cube=cube, store=function_store, overwrite=True)

    ds = list(result.values())[0]
    ds = ds.load_all_indices(function_store())

    assert len(ds.partitions) == 1

    assert set(ds.indices["p"].index_dct.keys()) == {1}
def test_metadata(driver, function_store):
    """
    Test auto- and user-generated metadata.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 4, 5], "p": [0, 0, 2, 2], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    result = driver(
        data={"source": df_source, "enrich": df_enrich},
        cube=cube,
        store=function_store,
        metadata={"enrich": {"foo": 1}},
    )

    assert set(result.keys()) == {cube.seed_dataset, "enrich"}

    ds_source = result[cube.seed_dataset]
    assert set(ds_source.metadata.keys()) == {
        "creation_time",
        KTK_CUBE_METADATA_DIMENSION_COLUMNS,
        KTK_CUBE_METADATA_KEY_IS_SEED,
        KTK_CUBE_METADATA_PARTITION_COLUMNS,
        KTK_CUBE_METADATA_SUPPRESS_INDEX_ON,
    }
    assert ds_source.metadata[KTK_CUBE_METADATA_DIMENSION_COLUMNS] == list(
        cube.dimension_columns
    )
    assert ds_source.metadata[KTK_CUBE_METADATA_KEY_IS_SEED] is True
    assert ds_source.metadata[KTK_CUBE_METADATA_PARTITION_COLUMNS] == list(
        cube.partition_columns
    )
    assert ds_source.metadata[KTK_CUBE_METADATA_SUPPRESS_INDEX_ON] == []

    ds_enrich = result["enrich"]
    assert set(ds_enrich.metadata.keys()) == {
        "creation_time",
        KTK_CUBE_METADATA_DIMENSION_COLUMNS,
        KTK_CUBE_METADATA_KEY_IS_SEED,
        KTK_CUBE_METADATA_PARTITION_COLUMNS,
        KTK_CUBE_METADATA_SUPPRESS_INDEX_ON,
        "foo",
    }
    assert ds_enrich.metadata[KTK_CUBE_METADATA_DIMENSION_COLUMNS] == list(
        cube.dimension_columns
    )
    assert ds_enrich.metadata[KTK_CUBE_METADATA_KEY_IS_SEED] is False
    assert ds_enrich.metadata[KTK_CUBE_METADATA_PARTITION_COLUMNS] == list(
        cube.partition_columns
    )
    assert ds_enrich.metadata["foo"] == 1
    assert ds_enrich.metadata[KTK_CUBE_METADATA_SUPPRESS_INDEX_ON] == []
def test_dimension_index_suppression(driver, function_store):
    """
    Test that suppress_index_on works as expected.
    """
    df_source = pd.DataFrame(
        {
            "x": [0, 0, 1, 1],
            "y": [10, 11, 12, 13],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
            "i1": [100, 101, 102, 103],
        }
    )
    cube = Cube(
        dimension_columns=["x", "y"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2"],
        suppress_index_on=["x"],
    )
    result = driver(data={"source": df_source}, cube=cube, store=function_store)

    ds_source = result[cube.seed_dataset].load_all_indices(function_store())
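    # "x" is listed in suppress_index_on, so no index is built for it; the other
    # dimension column "y", the partition column "p", and the present index column
    # "i1" are indexed ("i2" has no data and therefore no index).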

    assert set(ds_source.indices.keys()) == {"p", "i1", "y"}
    assert isinstance(ds_source.indices["p"], PartitionIndex)
    assert isinstance(ds_source.indices["i1"], ExplicitSecondaryIndex)
    assert isinstance(ds_source.indices["y"], ExplicitSecondaryIndex)
def _write_cube(function_store) -> Tuple[pd.DataFrame, Cube]:
    """
    Write a cube with dimension column "x" and partition column "p".

    Returns the "source" dataframe and the cube specification.
    """
    df_source = pd.DataFrame(
        {
            "i1": [10, 11, 12, 13],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
            "x": [0, 1, 2, 3],
        }
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source},
        cube=cube,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )
    return df_source, cube
def test_fail_no_store_factory(driver, function_store, skip_eager):
    df = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]})
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    store = function_store()
    with pytest.raises(TypeError) as exc:
        driver(data=df, cube=cube, store=store, no_run=True)
    assert str(exc.value) == "store must be a factory but is HFilesystemStore"
def cube():
    return Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube__uuid",
        seed_dataset="source",
    )
def test_fail_wrong_dataset_ids(driver, function_store, skip_eager, driver_name):
    if driver_name == "dask_dataframe":
        pytest.skip("not an interface for dask.dataframe")

    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(ValueError) as exc:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
            ktk_cube_dataset_ids=["source", "extra"],
        )

    assert (
        'Ktk_cube Dataset ID "enrich" is present during pipeline execution but was not '
        "specified in ktk_cube_dataset_ids (extra, source)." in str(exc.value)
    )