Example no. 1
def test_fail_nondistinc_payload(driver, function_store):
    """
    Payload columns that appear in more than one dataset would lead to
    problems during the query phase.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert "Found columns present in multiple datasets" in str(cause)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
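
These snippets are excerpted from kartothek's cube test suite and omit their imports and fixtures. A minimal sketch of the preamble they assume (the kartothek import paths below exist in released versions; the fixture descriptions are inferred from usage):

import numpy as np
import pandas as pd
import pytest

from kartothek.core.cube.cube import Cube
from kartothek.core.dataset import DatasetMetadata

# Not shown: `driver` (a cube-write function such as the eager
# build_cube/extend_cube, injected as a pytest fixture), `function_store`
# (a zero-argument factory returning a fresh simplekv store), `existing_cube`
# (a pre-built cube fixture), and `MultiTableCommitAborted` (the kartothek
# exception raised when a multi-dataset commit is rolled back; its import
# path differs between kartothek versions, so it is omitted here).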
Example no. 2
def test_fail_partition_on_4(driver, function_store):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(
        ValueError, match="Unspecified but provided partition columns in enrich: p"
    ):
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
            partition_on={"enrich": []},
        )
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
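
An override that is consistent with the data should pass. A sketch of the assumed-valid counterpart, inferred from the error message above (dropping "p" so that no unspecified partition column is provided):

# Assumed-valid counterpart (semantics inferred, not from the listing):
# "enrich" carries no partition column and partition_on opts it out of
# physical partitioning.
driver(
    data={"source": df_source, "enrich": df_enrich.drop(columns=["p"])},
    cube=cube,
    store=function_store,
    partition_on={"enrich": []},
)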
Example no. 3
def test_fail_wrong_types(driver, function_store):
    """
    Might catch nasty pandas and other type bugs.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0.0, 1.0, 2.0, 3.0], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )

    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert 'Found incompatible entries for column "x"' in str(cause)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
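
The incompatibility this test provokes is purely a dtype mismatch, visible with plain pandas:

# "x" is int64 in the seed frame but float64 in the enrich frame, so the
# schemas of the two datasets cannot be unified.
assert pd.DataFrame({"x": [0, 1, 2, 3]})["x"].dtype.name == "int64"
assert pd.DataFrame({"x": [0.0, 1.0, 2.0, 3.0]})["x"].dtype.name == "float64"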
Example no. 4
def test_fail_all_empty(driver, function_store):
    """
    Might happen due to DB-based filters.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)
    assert "Cannot write empty datasets: seed" in str(exc.value)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example no. 5
def validate_partition_keys(
    dataset_uuid,
    store,
    ds_factory,
    default_metadata_version,
    partition_on,
    **load_kwargs,
):
    """
    Check the requested ``partition_on`` against a dataset that may
    already exist.

    If the dataset exists, ``partition_on`` must match its partition keys
    (``ValueError`` otherwise) and defaults to them when not given; if it
    does not exist, the passed-in defaults are used. Returns a tuple of
    ``(ds_factory, ds_metadata_version, partition_on)``.
    """
    if ds_factory or DatasetMetadata.exists(dataset_uuid,
                                            _instantiate_store(store)):
        ds_factory = _ensure_factory(
            dataset_uuid=dataset_uuid,
            store=store,
            factory=ds_factory,
            load_dataset_metadata=load_kwargs.pop("load_dataset_metadata",
                                                  True),
        )

        ds_metadata_version = ds_factory.metadata_version
        if partition_on:
            if not isinstance(partition_on, list):
                partition_on = [partition_on]
            if partition_on != ds_factory.partition_keys:
                raise ValueError(
                    "Incompatible set of partition keys encountered. "
                    "Input partitioning was `{}` while actual dataset was `{}`"
                    .format(partition_on, ds_factory.partition_keys))
        else:
            partition_on = ds_factory.partition_keys
    else:
        ds_factory = None
        ds_metadata_version = default_metadata_version
    return ds_factory, ds_metadata_version, partition_on
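
Unlike the surrounding tests, this one is a library helper rather than a test. A hypothetical call, with all names assumed (metadata version 4 is kartothek's default):

ds_factory, metadata_version, partition_on = validate_partition_keys(
    dataset_uuid="my_dataset",   # assumed dataset id
    store=store_factory,         # assumed zero-argument store factory
    ds_factory=None,             # let the helper resolve the dataset itself
    default_metadata_version=4,
    partition_on=["p"],
)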
Example no. 6
def test_fails_projected_duplicates(driver, driver_name, function_store):
    """
    Test that the duplicate check also works with projected data
    (this was a regression).
    """
    if driver_name == "dask_dataframe":
        pytest.xfail(reason="Cannot guarantee duplicates for DDF")
    df_source = pd.DataFrame(
        {
            "x": [0, 1, 0, 1],
            "y": [0, 0, 1, 1],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
        }
    )
    df_enrich = pd.DataFrame(
        {"y": [0, 0, 1, 1], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23], "v3": 42}
    )
    cube = Cube(
        dimension_columns=["x", "y"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(ValueError) as exc:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
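    # NB: "Non-Idential" in the expected text below reproduces the
    # library's message verbatim.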
    msg = """
Found duplicate cells by [p, y] in dataset "enrich", example:

Keys:
p    0
y    0

Identical Payload:
v3    42

Non-Idential Payload:
   v2
0  20
1  21
""".strip()
    assert msg in str(exc.value)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example no. 7
def test_fail_all_empty(driver, driver_name, function_store):
    """
    Might happen due to DB-based filters.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")

    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(data=df, cube=cube, store=function_store)
    exc = exc_info.value.__cause__
    assert isinstance(exc, ValueError)
    assert "Cannot write empty datasets" in str(exc)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example no. 8
def test_fail_duplicates_local(driver, driver_name, function_store):
    """
    Might happen during DB queries.
    """
    if driver_name == "dask_dataframe":
        pytest.xfail(reason="Cannot guarantee duplicates for DDF")
    df = pd.DataFrame(
        {
            "x": [0, 0],
            "y": ["a", "a"],
            "z": [pd.Timestamp("2017"), pd.Timestamp("2017")],
            "p": [0, 0],
        }
    )
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)
    msg = """
Found duplicate cells by [p, x, y, z] in dataset "source", example:

Keys:
p                      0
x                      0
y                      a
z    2017-01-01 00:00:00

Identical Payload:
n/a

Non-Idential Payload:
n/a
""".strip()
    assert msg in str(exc.value)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
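
The duplicate pair the test constructs can be reproduced with plain pandas, continuing with `df` from above:

# Both rows share the same (p, x, y, z) cell, which is exactly what the
# cube-level duplicate check rejects.
assert df.duplicated(subset=["p", "x", "y", "z"], keep=False).all()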
Example no. 9
def test_fails_null_dimension(driver, function_store):
    """
    Since we do not allow NULL values in queries, they should be banned
    from dimension columns in the first place.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, np.nan], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)

    assert 'Found NULL-values in dimension column "x" of dataset "seed"' in str(
        exc.value
    )
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("seed"), function_store())
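
One concrete reason for the ban: NaN is a float, so a single NULL silently promotes an integer dimension column to float64, undermining dtype stability across partitions:

# The np.nan entry forces the whole column to float64.
assert pd.Series([0, 1, 2, np.nan]).dtype == np.float64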
Example no. 10
def test_fails_incompatible_dtypes(driver, function_store, existing_cube):
    """
    Should also cross-check with the seed dataset.
    """
    df = pd.DataFrame({
        "x": [0.0, 1.0, 2.0, 3.0],
        "p": [0, 0, 1, 1],
        "v3": [10, 11, 12, 13],
        "i3": [100, 101, 102, 103],
    })
    with pytest.raises(ValueError) as exc:
        driver(data={"extra": df}, cube=existing_cube, store=function_store)
    assert 'Found incompatible entries for column "x"' in str(exc.value)
    assert not DatasetMetadata.exists(existing_cube.ktk_dataset_uuid("extra"),
                                      function_store())
Example no. 11
def test_fail_all_empty(driver, function_store, existing_cube):
    """
    Might happen due to DB-based filters.
    """
    df = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v": [10, 11, 12, 13]
    }).loc[[]]

    with pytest.raises(ValueError) as exc:
        driver(data={"extra": df}, cube=existing_cube, store=function_store)
    assert "Cannot write empty datasets: extra" in str(exc.value)
    assert not DatasetMetadata.exists(existing_cube.ktk_dataset_uuid("extra"),
                                      function_store())
Example no. 12
def test_fail_all_empty(driver, function_store, existing_cube):
    """
    Might happen due to DB-based filters.
    """
    df = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v": [10, 11, 12, 13]
    }).loc[[]]

    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(data={"extra": df}, cube=existing_cube, store=function_store)
    exc = exc_info.value.__cause__
    assert isinstance(exc, ValueError)
    assert "Cannot write empty datasets: extra" in str(exc)
    assert not DatasetMetadata.exists(existing_cube.ktk_dataset_uuid("extra"),
                                      function_store())
Example no. 13
def test_fails_overlapping_payload_seed(driver, function_store, existing_cube):
    """
    Forbidden by the spec, as it results in problems during queries.
    """
    pre_keys = set(function_store().keys())
    df = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    with pytest.raises(ValueError) as exc:
        driver(data={"extra": df}, cube=existing_cube, store=function_store)
    assert 'Payload written in "extra" is already present in cube: v1' in str(
        exc.value)
    assert not DatasetMetadata.exists(existing_cube.ktk_dataset_uuid("extra"),
                                      function_store())

    post_keys = set(function_store().keys())
    assert pre_keys == post_keys
Example no. 14
def test_fails_no_dimension_columns(driver, function_store):
    """
    Ensure that we catch missing dimension columns early.
    """
    df_source = pd.DataFrame({"x": [0, 1], "y": [0, 1], "z": [0, 1], "p": 0})
    df_enrich = pd.DataFrame({"p": [0], "v1": 0})
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(ValueError) as exc:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
    assert (
        'Dataset "enrich" must have at least 1 of the following dimension columns: x, y'
        in str(exc.value)
    )
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example no. 15
def test_fails_overlapping_payload_partial(driver, function_store,
                                           existing_cube):
    """
    Forbidden by the spec, as it results in problems during queries.
    """
    pre_keys = set(function_store().keys())
    df1 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    df2 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v3": [10, 11, 12, 13]
    })
    with pytest.raises(ValueError) as exc:
        driver(
            data={
                "extra1": df1,
                "extra2": df2
            },
            cube=existing_cube,
            store=function_store,
        )
    assert 'Payload written in "extra1" is already present in cube: v2' in str(
        exc.value)

    assert not DatasetMetadata.exists(existing_cube.ktk_dataset_uuid("extra1"),
                                      function_store())
    # extra2 might exist, depending on the compute graph

    # extra2 keys might be present; only check that extra1's keys are absent
    post_keys = set(function_store().keys())
    extra_keys = post_keys - pre_keys
    extra1_keys = {k for k in extra_keys if "extra1" in k}
    assert extra1_keys == set()
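
For contrast with all of the failure tests above, a happy-path write with kartothek's eager backend might look like the following sketch (build_cube and storefact's get_store_from_url are real APIs; treat the "hmemory://" in-memory URL scheme as an assumption):

from functools import partial

import pandas as pd
from storefact import get_store_from_url
from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube

store_factory = partial(get_store_from_url, "hmemory://")  # in-memory store
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
build_cube(
    data=pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}),
    cube=cube,
    store=store_factory,
)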