def test_fail_nondistinc_payload(driver, function_store):
    """
    This would lead to problems during the query phase.
    """
    # Both datasets carry the same payload column "v1" (with different
    # values), which must be rejected at commit time.
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_extra = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_seed, "enrich": df_extra},
            cube=cube,
            store=function_store,
        )

    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert "Found columns present in multiple datasets" in str(cause)

    # The aborted commit must not leave any dataset behind.
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_fail_partition_on_4(driver, function_store):
    """
    An explicit empty ``partition_on`` override for a dataset that still
    provides the cube partition column must be rejected.
    """
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_extra = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(
        ValueError, match="Unspecified but provided partition columns in enrich: p"
    ):
        driver(
            data={"source": df_seed, "enrich": df_extra},
            cube=cube,
            store=function_store,
            partition_on={"enrich": []},
        )

    # Neither dataset may have been written.
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_fail_wrong_types(driver, function_store):
    """
    Might catch nasty pandas and other type bugs.
    """
    # "x" is int in the seed but float in the enrich dataset → incompatible
    # schemas for a shared dimension column.
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_extra = pd.DataFrame(
        {"x": [0.0, 1.0, 2.0, 3.0], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_seed, "enrich": df_extra},
            cube=cube,
            store=function_store,
        )

    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert 'Found incompatible entries for column "x"' in str(cause)

    # The aborted commit must not leave any dataset behind.
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_fail_all_empty(driver, function_store):
    """
    Might happen due to DB-based filters.
    """
    # .loc[[]] selects zero rows while keeping the schema intact.
    empty_df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")

    with pytest.raises(ValueError) as excinfo:
        driver(data=empty_df, cube=cube, store=function_store)
    assert "Cannot write empty datasets: seed" in str(excinfo.value)

    # Nothing may have been committed.
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def validate_partition_keys(
    dataset_uuid,
    store,
    ds_factory,
    default_metadata_version,
    partition_on,
    **load_kwargs,
):
    """
    Validate user-supplied partition keys against an existing dataset.

    If the dataset exists (or a factory was passed), the requested
    ``partition_on`` must match the dataset's actual partition keys exactly;
    when no ``partition_on`` is given, the dataset's keys are adopted.
    For a non-existing dataset the inputs are passed through unchanged.

    Returns a ``(ds_factory, metadata_version, partition_on)`` triple.

    Raises ``ValueError`` on a partition-key mismatch.
    """
    # Short-circuit: only probe the store when no factory was supplied.
    dataset_exists = ds_factory or DatasetMetadata.exists(
        dataset_uuid, _instantiate_store(store)
    )
    if not dataset_exists:
        return None, default_metadata_version, partition_on

    factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=ds_factory,
        load_dataset_metadata=load_kwargs.pop("load_dataset_metadata", True),
    )

    if not partition_on:
        # Adopt whatever the existing dataset is partitioned by.
        return factory, factory.metadata_version, factory.partition_keys

    # Normalize a scalar key to a single-element list before comparing.
    requested = partition_on if isinstance(partition_on, list) else [partition_on]
    if requested != factory.partition_keys:
        raise ValueError(
            "Incompatible set of partition keys encountered. "
            "Input partitioning was `{}` while actual dataset was `{}`".format(
                requested, factory.partition_keys
            )
        )
    return factory, factory.metadata_version, requested
def test_fails_projected_duplicates(driver, driver_name, function_store):
    """
    Test if duplicate check also works w/ projected data. (was a regression)
    """
    if driver_name == "dask_dataframe":
        pytest.xfail(reason="Cannot guarantee duplicates for DDF")

    df_seed = pd.DataFrame(
        {
            "x": [0, 1, 0, 1],
            "y": [0, 0, 1, 1],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
        }
    )
    # "enrich" omits "x", so it is projected onto [p, y] — duplicated there.
    df_extra = pd.DataFrame(
        {"y": [0, 0, 1, 1], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23], "v3": 42}
    )
    cube = Cube(
        dimension_columns=["x", "y"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(ValueError) as exc:
        driver(
            data={"source": df_seed, "enrich": df_extra},
            cube=cube,
            store=function_store,
        )

    # NOTE(review): exact whitespace of this expected message was reconstructed
    # from the duplicate-report layout — confirm against the library output.
    msg = """
Found duplicate cells by [p, y] in dataset "enrich", example:

Keys:
p    0
y    0

Identical Payload:
v3    42

Non-Idential Payload:
   v2
0  20
1  21
""".strip()
    assert msg in str(exc.value)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_fail_all_empty(driver, driver_name, function_store):
    """
    Might happen due to DB-based filters.
    """
    # .loc[[]] selects zero rows while keeping the schema intact.
    empty_df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")

    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(data=empty_df, cube=cube, store=function_store)

    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert "Cannot write empty datasets" in str(cause)

    # Nothing may have been committed.
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_fail_duplicates_local(driver, driver_name, function_store):
    """
    Might happen during DB queries.
    """
    if driver_name == "dask_dataframe":
        pytest.xfail(reason="Cannot guarantee duplicates for DDF")

    # Two identical rows across all dimension columns → duplicate cells.
    df = pd.DataFrame(
        {
            "x": [0, 0],
            "y": ["a", "a"],
            "z": [pd.Timestamp("2017"), pd.Timestamp("2017")],
            "p": [0, 0],
        }
    )
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)

    # NOTE(review): exact whitespace of this expected message was reconstructed
    # from the duplicate-report layout — confirm against the library output.
    msg = """
Found duplicate cells by [p, x, y, z] in dataset "source", example:

Keys:
p                      0
x                      0
y                      a
z    2017-01-01 00:00:00

Identical Payload:
n/a

Non-Idential Payload:
n/a
""".strip()
    assert msg in str(exc.value)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_fails_null_dimension(driver, function_store):
    """
    Since we do not allow NULL values in queries, it should be banned from
    dimension columns in the first place.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, np.nan], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)
    # Fix: assert on the exception value, not on the ExceptionInfo wrapper —
    # str(exc) depends on pytest's ExceptionInfo.__str__ (an implementation
    # detail) and the sibling tests in this file all use str(exc.value).
    assert 'Found NULL-values in dimension column "x" of dataset "seed"' in str(
        exc.value
    )

    # Nothing may have been committed.
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("seed"), function_store())
def test_fails_incompatible_dtypes(driver, function_store, existing_cube):
    """
    Should also cross check w/ seed dataset.
    """
    # "x" is float here — incompatible with the existing cube's dtype for it.
    df = pd.DataFrame(
        {
            "x": [0.0, 1.0, 2.0, 3.0],
            "p": [0, 0, 1, 1],
            "v3": [10, 11, 12, 13],
            "i3": [100, 101, 102, 103],
        }
    )

    with pytest.raises(ValueError) as excinfo:
        driver(data={"extra": df}, cube=existing_cube, store=function_store)
    assert 'Found incompatible entries for column "x"' in str(excinfo.value)

    # The new dataset must not have been written.
    assert not DatasetMetadata.exists(
        existing_cube.ktk_dataset_uuid("extra"), function_store()
    )
def test_fail_all_empty(driver, function_store, existing_cube):
    """
    Might happen due to DB-based filters.
    """
    # .loc[[]] selects zero rows while keeping the schema intact.
    empty_df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]

    with pytest.raises(ValueError) as excinfo:
        driver(data={"extra": empty_df}, cube=existing_cube, store=function_store)
    assert "Cannot write empty datasets: extra" in str(excinfo.value)

    # The new dataset must not have been written.
    assert not DatasetMetadata.exists(
        existing_cube.ktk_dataset_uuid("extra"), function_store()
    )
def test_fail_all_empty(driver, function_store, existing_cube):
    """
    Might happen due to DB-based filters.
    """
    # .loc[[]] selects zero rows while keeping the schema intact.
    empty_df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]

    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(data={"extra": empty_df}, cube=existing_cube, store=function_store)

    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert "Cannot write empty datasets: extra" in str(cause)

    # The new dataset must not have been written.
    assert not DatasetMetadata.exists(
        existing_cube.ktk_dataset_uuid("extra"), function_store()
    )
def test_fails_overlapping_payload_seed(driver, function_store, existing_cube):
    """
    Forbidden by spec, results in problems during query.
    """
    keys_before = set(function_store().keys())

    # "v1" already exists in the cube's seed dataset.
    df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )

    with pytest.raises(ValueError) as excinfo:
        driver(data={"extra": df}, cube=existing_cube, store=function_store)
    assert 'Payload written in "extra" is already present in cube: v1' in str(
        excinfo.value
    )

    assert not DatasetMetadata.exists(
        existing_cube.ktk_dataset_uuid("extra"), function_store()
    )

    # The store content must be completely unchanged.
    keys_after = set(function_store().keys())
    assert keys_before == keys_after
def test_fails_no_dimension_columns(driver, function_store):
    """
    Ensure that we catch missing dimension columns early.
    """
    df_seed = pd.DataFrame({"x": [0, 1], "y": [0, 1], "z": [0, 1], "p": 0})
    # "enrich" has no dimension column at all — only partition and payload.
    df_extra = pd.DataFrame({"p": [0], "v1": 0})
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(ValueError) as excinfo:
        driver(
            data={"source": df_seed, "enrich": df_extra},
            cube=cube,
            store=function_store,
        )
    assert (
        'Dataset "enrich" must have at least 1 of the following dimension columns: x, y'
        in str(excinfo.value)
    )

    # The offending dataset must not have been written.
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
def test_fails_overlapping_payload_partial(driver, function_store, existing_cube):
    """
    Forbidden by spec, results in problems during query.
    """
    keys_before = set(function_store().keys())

    # "v2" already exists in the cube; "v3" is new.
    df_overlap = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [10, 11, 12, 13]}
    )
    df_ok = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v3": [10, 11, 12, 13]}
    )

    with pytest.raises(ValueError) as excinfo:
        driver(
            data={"extra1": df_overlap, "extra2": df_ok},
            cube=existing_cube,
            store=function_store,
        )
    assert 'Payload written in "extra1" is already present in cube: v2' in str(
        excinfo.value
    )

    assert not DatasetMetadata.exists(
        existing_cube.ktk_dataset_uuid("extra1"), function_store()
    )

    # extra2 might exist, depending on the compute graph
    # extra2 keys might be present, only look that extra1 is absent
    keys_after = set(function_store().keys())
    new_keys = keys_after - keys_before
    assert {k for k in new_keys if "extra1" in k} == set()