def test_invalid_uuid():
    # A UUID containing "." is rejected.
    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid.",
        "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
    }
    with pytest.raises(ValueError):
        DatasetMetadata.from_dict(expected)

    # Non-ASCII characters are rejected as well.
    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "mañana",
        "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
    }
    with pytest.raises(ValueError):
        DatasetMetadata.from_dict(expected)

def test_existing_indices_are_added_when_missing_in_cube():
    """
    Test that indices already existing in the dataset are added to the
    validated cube.
    """
    source_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "source",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {
                "d1": {"1": ["part_1"]},
                "d2": {"1": ["part_1"]},
                "i1": {"1": ["part_1"]},
                "i2": {"1": ["part_1"]},
            },
        }
    )
    extra_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "extra",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {"i1": {"1": ["part_1"]}},
        }
    )
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1"],
    )

    validated_cube = ensure_valid_cube_indices(
        {"source": source_metadata, "extra": extra_metadata}, cube
    )

    assert validated_cube.index_columns == {"i1", "i2"}

def test_no_indices_are_suppressed_when_they_already_exist():
    """
    Test that indices marked as suppressed in the cube are not actually
    suppressed when they are already present in the dataset.
    """
    source_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "source",
            "dataset_metadata_version": 4,
            "schema": FakeSeedTableMetadata(),
            "partition_keys": ["p"],
            "indices": {
                "d1": {"1": ["part_1"]},
                "d2": {"1": ["part_1"]},
                "i1": {"1": ["part_1"]},
            },
        }
    )
    extra_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "extra",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {"i1": {"1": ["part_1"]}},
        }
    )
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["d1", "d2"],
    )

    validated_cube = ensure_valid_cube_indices(
        {"source": source_metadata, "extra": extra_metadata}, cube
    )

    assert validated_cube.suppress_index_on == frozenset()

def test_cube_with_valid_indices_is_not_modified_by_validation():
    """
    Test that a cube with valid indices is not modified by
    `ensure_valid_cube_indices`.
    """
    source_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "source",
            "dataset_metadata_version": 4,
            "schema": FakeSeedTableMetadata(),
            "partition_keys": ["p"],
            "indices": {
                "d1": {"1": ["part_1"]},
                "d2": {"1": ["part_1"]},
                "i1": {"1": ["part_1"]},
            },
        }
    )
    extra_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "extra",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {"i1": {"1": ["part_1"]}},
        }
    )
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1"],
    )

    validated_cube = ensure_valid_cube_indices(
        {"source": source_metadata, "extra": extra_metadata}, cube
    )

    assert validated_cube == cube

def test_raises_when_cube_defines_index_not_in_dataset():
    """
    Test that a `ValueError` is raised when the cube defines an index that is
    not part of a dataset.
    """
    source_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "source",
            "dataset_metadata_version": 4,
            "schema": FakeSeedTableMetadata(),
            "partition_keys": ["p"],
            "indices": {
                "d1": {"1": ["part_1"]},
                "d2": {"1": ["part_1"]},
                "i1": {"1": ["part_1"]},
            },
        }
    )
    extra_metadata = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "extra",
            "dataset_metadata_version": 4,
            "schema": FakeExtraTableMetadata(),
            "partition_keys": ["p"],
            "indices": {"i1": {"1": ["part_1"]}},
        }
    )
    cube = Cube(
        dimension_columns=["d1", "d2"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i2"],
    )

    with pytest.raises(ValueError):
        ensure_valid_cube_indices(
            {"source": source_metadata, "extra": extra_metadata}, cube
        )

def test_complicated_uuid():
    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
    }
    DatasetMetadata.from_dict(expected)

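# A minimal sketch, not part of the original suite: taken together,
# test_invalid_uuid and test_complicated_uuid suggest that dataset UUIDs may
# contain ASCII letters, digits, "+", "-", and "_", while "." and non-ASCII
# characters are rejected. The exact rule is owned by DatasetMetadata, so this
# hypothetical parametrization is illustrative rather than exhaustive.
@pytest.mark.parametrize(
    "uuid,valid",
    [
        ("uuid+namespace-attribute12_underscored", True),
        ("uuid.", False),
        ("mañana", False),
    ],
)
def test_uuid_validity_examples(uuid, valid):
    meta = {
        "dataset_metadata_version": 4,
        "dataset_uuid": uuid,
        "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
    }
    if valid:
        DatasetMetadata.from_dict(meta)
    else:
        with pytest.raises(ValueError):
            DatasetMetadata.from_dict(meta)
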
def test_load_all_indices(store, metadata_version):
    meta_dct = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "location_id=1/part_1": {
                "files": {
                    "core_data": "dataset_uuid/table/location_id=1/part_1.parquet"
                }
            }
        },
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_1"],
                "100": ["part_1"],
                "34": ["part_1"],
            }
        },
    }
    dmd = DatasetMetadata.from_dict(meta_dct)
    dmd.schema = make_meta(
        pd.DataFrame({"location_id": pd.Series([1], dtype=int)}), origin="core"
    )

    dmd = dmd.load_all_indices(store)

    assert "product_id" in dmd.indices
    assert isinstance(dmd.indices["product_id"], ExplicitSecondaryIndex)

    # "location_id" is encoded in the partition label ("location_id=1/part_1"),
    # so it is loaded as a PartitionIndex rather than an explicit secondary index.
    assert "location_id" in dmd.indices
    assert isinstance(dmd.indices["location_id"], PartitionIndex)

    assert len(dmd.indices) == 2

def test_load_indices_embedded(metadata_version):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {"part_1": {"files": {"core_data": "file.parquet"}}},
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_1"],
                "100": ["part_1"],
                "34": ["part_1"],
            }
        },
    }
    dmd = DatasetMetadata.from_dict(expected)
    assert "product_id" in dmd.indices

    # The index is embedded in the metadata, so no store access is required
    # (store=None) when loading it.
    with pytest.raises(KeyError):
        dmd.load_index("not there", store=None)

    dmd_loaded = dmd.load_index("product_id", store=None)
    assert "product_id" in dmd_loaded.indices

def test_raise_multitable(metadata_version):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid",
        "metadata": {},
        "partitions": {
            "part_1": {"files": {"tableA": "file.parquet", "tableB": "file.parquet"}}
        },
        "partition_keys": [],
    }
    with pytest.raises(
        RuntimeError,
        match=r"Dataset uuid has tables.*but read support for multi tabled dataset was dropped with kartothek 4\.0\.",
    ):
        DatasetMetadata.from_dict(expected)

def test_roundtrip_no_metadata(metadata_version, frozen_time):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid",
        "metadata": {"creation_time": "2000-01-01 01:01:01"},
        "partition_keys": [],
        "partitions": {"part_1": {"files": {"core": "file.parquet"}}},
    }
    result = DatasetMetadata.from_dict(expected).to_dict()
    assert expected == result

def test_builder_to_dataset(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {"part_2": {"files": {"core": "uuid/core/part_2.parquet"}}},
        "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
        "indices": {"col1": {"a": ["part1"], "b": ["part2"]}},
    }
    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    part_2 = Partition("part_2", {"core": "uuid/core/part_2.parquet"})
    builder.add_partition("part_2", part_2)
    builder.add_metadata("key", "value")
    builder.add_embedded_index(
        "col1", ExplicitSecondaryIndex("col1", {"a": ["part1"], "b": ["part2"]})
    )

    result = builder.to_dataset()
    expected_from_dict = DatasetMetadata.from_dict(expected)
    assert result == expected_from_dict

def test_roundtrip_empty(metadata_version):
    ds = DatasetMetadata(uuid="dataset_uuid", metadata_version=metadata_version)
    assert ds == ds.from_dict(ds.to_dict())

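# A minimal sketch, not part of the original suite: the two roundtrip tests
# above suggest that to_dict()/from_dict() are inverses of each other. This
# hypothetical helper just names that property so further roundtrip cases can
# reuse it; it assumes DatasetMetadata compares by value.
def _assert_roundtrip(dmd):
    assert dmd == DatasetMetadata.from_dict(dmd.to_dict())


def test_roundtrip_empty_via_helper(metadata_version):
    _assert_roundtrip(
        DatasetMetadata(uuid="dataset_uuid", metadata_version=metadata_version)
    )
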