def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions.

    This may be used in combination with
    :func:`~kartothek.io.eager.write_single_partition` to create implicitly
    partitioned datasets.

    .. note::
        The created dataset will **always** have ``explicit_partitions==False``

    .. warning::
        This function should only be used on very rare occasions. Usually
        you're better off using full end-to-end pipelines.

    Parameters
    ----------
    store
        Store, or factory returning a store, to write the header to.
    dataset_uuid: str
        Unique identifier of the dataset.
    table_meta: Dict[str, SchemaWrapper]
        Mapping of table name to the schema of that table.
    partition_on: Optional[List[str]]
        Column names the dataset will be partitioned on.
    metadata: Optional[Dict]
        Additional user metadata stored with the dataset.
    overwrite: bool
        If False, raise if a dataset with this UUID already exists.
    metadata_storage_format: str
        Either ``json`` or ``msgpack``.
    metadata_version: int
        The dataset metadata version.
    """
    store = _make_callable(store)()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema, origin=table, partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )
    return dataset_builder.to_dataset()
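# Hedged usage sketch (not part of the library): one plausible way to combine
# create_empty_dataset_header with kartothek.io.eager.write_single_partition,
# as referenced in the docstring above. The store URL, the frame `df`, and the
# uuid "empty_dataset" are illustrative; the `data` format follows kartothek's
# user documentation for the eager backend.
def _example_empty_header_then_single_partition():
    import pandas as pd
    from storefact import get_store_from_url

    from kartothek.io.eager import create_empty_dataset_header, write_single_partition

    store = get_store_from_url("hmemory://")
    df = pd.DataFrame({"x": [0]})

    # Write only the header (schemas + dataset metadata), no partitions yet.
    create_empty_dataset_header(
        store=store,
        dataset_uuid="empty_dataset",
        table_meta={"table": df.head(0)},  # an empty frame carries the schema
    )
    # Afterwards partitions can be added one at a time.
    write_single_partition(
        store=store,
        dataset_uuid="empty_dataset",
        data={"table": df},
    )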
def test_schema_roundtrip(df_all_types, store): expected_meta = make_meta(df_all_types, origin="df_all_types") store_schema_metadata( expected_meta, dataset_uuid="dataset_uuid", store=store, table="table" ) result = read_schema_metadata( dataset_uuid="dataset_uuid", store=store, table="table" ) assert result == expected_meta
def test_dynamic_partitions_quote(store, metadata_version): """ Do not specify partitions in metadata, but read them dynamically from store """ dataset_uuid = "uuid-namespace-attribute12_underscored" partition0_core = create_partition_key(dataset_uuid, "core", [("location", "München")], "data.parquet") partition1_core = create_partition_key(dataset_uuid, "core", [("location", "å\\ øß")], "data.parquet") metadata = { "dataset_metadata_version": metadata_version, "dataset_uuid": dataset_uuid, } expected_partitions = { "location=M%C3%BCnchen/data": { "files": { "core": partition0_core } }, "location=%C3%A5%5C%20%C3%B8%C3%9F/data": { "files": { "core": partition1_core } }, } expected_indices = { "location": { "München": ["location=M%C3%BCnchen/data"], "å\\ øß": ["location=%C3%A5%5C%20%C3%B8%C3%9F/data"], } } store.put(partition0_core, b"test") store.put(partition1_core, b"test") store_schema_metadata( make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"), dataset_uuid, store, "core", ) dmd = DatasetMetadata.load_from_dict(metadata, store) dmd = dmd.load_partition_indices() dmd_dict = dmd.to_dict() assert dmd_dict["partitions"] == expected_partitions assert dmd_dict["indices"] == expected_indices
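# Hedged sketch (standard library only, not part of the test suite): the
# expected labels in the test above are simply the UTF-8 percent-encoding of
# the partition values, which urllib.parse.quote reproduces.
def _example_partition_value_quoting():
    from urllib.parse import quote

    assert quote("München", safe="") == "M%C3%BCnchen"
    assert quote("å\\ øß", safe="") == "%C3%A5%5C%20%C3%B8%C3%9F"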
def test_overlap_keyspace(store, metadata_version):
    dataset_uuid1 = "uuid+namespace-attribute12_underscored"
    dataset_uuid2 = "uuid+namespace-attribute12_underscored_ext"
    table = "core"

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0 = "location=L-0"
        partition0_key = "{}/{}/{}/data.parquet".format(dataset_uuid, table, partition0)
        metadata = {
            "dataset_metadata_version": metadata_version,
            "dataset_uuid": dataset_uuid,
        }

        # write the metadata, one partition file and the schema for each dataset
        store.put(
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            simplejson.dumps(metadata).encode("utf-8"),
        )
        store.put(partition0_key, b"test")
        store_schema_metadata(
            make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
            dataset_uuid,
            store,
            "core",
        )

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0_label = "location=L-0/data"
        partition0_key = "{}/{}/{}.parquet".format(dataset_uuid, table, partition0_label)
        expected_partitions = {
            "location=L-0/data": {"files": {"core": partition0_key}}
        }
        expected_indices = {"location": {"L-0": ["location=L-0/data"]}}
        assert DatasetMetadata.storage_keys(dataset_uuid, store) == [
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            _get_common_metadata_key(dataset_uuid, "core"),
            partition0_key,
        ]
        dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
        dmd = dmd.load_partition_indices()
        dmd_dict = dmd.to_dict()
        assert dmd_dict["partitions"] == expected_partitions
        assert dmd_dict["indices"] == expected_indices
def test_store_schema_metadata(store, df_all_types): store_schema_metadata( schema=make_meta(df_all_types, origin="df_all_types"), dataset_uuid="some_uuid", store=store, table="some_table", ) key = "some_uuid/some_table/_common_metadata" assert key in store.keys() pq_file = pq.ParquetFile(store.open(key)) actual_schema = pq_file.schema.to_arrow_schema() fields = [ pa.field("array_float32", pa.list_(pa.float64())), pa.field("array_float64", pa.list_(pa.float64())), pa.field("array_int16", pa.list_(pa.int64())), pa.field("array_int32", pa.list_(pa.int64())), pa.field("array_int64", pa.list_(pa.int64())), pa.field("array_int8", pa.list_(pa.int64())), pa.field("array_uint16", pa.list_(pa.uint64())), pa.field("array_uint32", pa.list_(pa.uint64())), pa.field("array_uint64", pa.list_(pa.uint64())), pa.field("array_uint8", pa.list_(pa.uint64())), pa.field("array_unicode", pa.list_(pa.string())), pa.field("bool", pa.bool_()), pa.field("byte", pa.binary()), pa.field("date", pa.date32()), pa.field("datetime64", pa.timestamp("us")), pa.field("float32", pa.float64()), pa.field("float64", pa.float64()), pa.field("int16", pa.int64()), pa.field("int32", pa.int64()), pa.field("int64", pa.int64()), pa.field("int8", pa.int64()), pa.field("null", pa.null()), pa.field("uint16", pa.uint64()), pa.field("uint32", pa.uint64()), pa.field("uint64", pa.uint64()), pa.field("uint8", pa.uint64()), pa.field("unicode", pa.string()), ] if not ARROW_LARGER_EQ_0130: fields.append(pa.field("__index_level_0__", pa.int64())) expected_schema = pa.schema(fields) assert actual_schema.remove_metadata() == expected_schema
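# Hedged sketch (not part of the test suite): the 64-bit-only expected schema
# above comes from kartothek's type normalization in make_meta, which widens
# smaller integer/float widths before the schema is persisted. Column names
# here are illustrative.
def _example_make_meta_normalization():
    import pandas as pd

    from kartothek.core.common_metadata import make_meta

    df = pd.DataFrame(
        {
            "f32": pd.Series([1.0], dtype="float32"),
            "i8": pd.Series([1], dtype="int8"),
        }
    )
    schema = make_meta(df, origin="example")
    # Both fields should now report 64-bit types (float64 / int64).
    print(schema)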
def test_load_from_store_with_indices(store): meta_dct = { "dataset_metadata_version": 4, "dataset_uuid": "uuid", "partitions": { "product_id=1/part_1": { "files": { "core_data": "dataset_uuid/table/location_id=1/part_1.parquet" } } }, "indices": { "product_id": { "1": ["part_1"], "2": ["part_1"], "100": ["part_1"], "34": ["part_1"], } }, } store.put("uuid.by-dataset-metadata.json", simplejson.dumps(meta_dct).encode("utf-8")) df = pd.DataFrame({"index": [1], "location_id": [1], "product_id": [1]}) store_schema_metadata(make_meta(df, origin="core"), "uuid", store, "core_data") storage_key = "uuid/some_index.parquet" index2 = ExplicitSecondaryIndex( column="location_id", index_dct={ 1: ["part_1", "part_2"], 3: ["part_3"] }, index_storage_key=storage_key, dtype=pa.int64(), ) index2.store(store, "dataset_uuid") dmd = DatasetMetadata.load_from_store(store=store, uuid="uuid") assert "location_id" not in dmd.indices dmd = DatasetMetadata.load_from_store(store=store, uuid="uuid", load_all_indices=True) assert "location_id" in dmd.indices
def test_load_partition_indices_types(store): dataset_uuid = "uuid+namespace-attribute12_underscored" table = "table" index_name = "location_id" index_value = 1 meta_dct = { "dataset_metadata_version": 4, "dataset_uuid": dataset_uuid, "partitions": { "{index_name}={index_value}/part_1".format(index_name=index_name, index_value=index_value): { "files": { table: "{dataset_uuid}/{table}/location_id=1/part_1.parquet". format(dataset_uuid=dataset_uuid, table=table) } } }, } store.put( "{dataset_uuid}.by-dataset-metadata.json".format( dataset_uuid=dataset_uuid), simplejson.dumps(meta_dct).encode(), ) store_schema_metadata( make_meta( pd.DataFrame({index_name: pd.Series([index_value], dtype=int)}), origin="core", ), dataset_uuid, store, table, ) dmd = DatasetMetadata.load_from_store(store=store, uuid=dataset_uuid) dmd = dmd.load_partition_indices() assert len(dmd.indices) == 1 assert "location_id" in dmd.indices assert isinstance(dmd.indices["location_id"], PartitionIndex) idx = dmd.indices["location_id"] assert idx.dtype == pa.int64() assert idx.query(1) == ["location_id=1/part_1"]
def test_load_partition_keys(store): expected = { "dataset_metadata_version": 4, "dataset_uuid": "uuid", "partitions": { "part_1": { "files": { "core_data": "uuid/table/index=1/index2=2/file.parquet" } }, "part_2": { "files": { "core_data": "uuid/table/index=1/index2=2/file2.parquet" } }, }, "indices": { "product_id": { "1": ["part_1"], "2": ["part_2"], "100": ["part_1", "part_2"], "34": ["part_1"], }, "location_id": { "1": ["part_1"], "2": ["part_2"], "3": ["part_1"], "4": ["part_2"], }, }, } store.put("uuid.by-dataset-metadata.json", simplejson.dumps(expected).encode("utf-8")) df = pd.DataFrame({ "index": [1], "index2": [1], "product_id": [1], "location_id": [1] }) store_schema_metadata(make_meta(df, origin="core"), "uuid", store, "core_data") dmd = DatasetMetadata.load_from_store("uuid", store) assert dmd.partition_keys == ["index", "index2"]
def persist_common_metadata(partition_list, update_dataset, store, dataset_uuid): # hash the schemas for quick equality check with possible false negatives # (e.g. other pandas version or null schemas) tm_dct = defaultdict(set) for mp in partition_list: for tab, tm in mp.table_meta.items(): tm_dct[tab].add(tm) if update_dataset: if set(tm_dct.keys()) and set(update_dataset.tables) != set( tm_dct.keys()): raise ValueError(( "Input partitions for update have different tables than dataset:\n" "Input partition tables: {}\n" "Tables of existing dataset: {}").format( set(tm_dct.keys()), update_dataset.tables)) for table in update_dataset.tables: tm_dct[table].add( read_schema_metadata(dataset_uuid=dataset_uuid, store=store, table=table)) result = {} # sort tables and schemas to have reproducible error messages for table in sorted(tm_dct.keys()): schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin)) try: result[table] = validate_compatible(schemas) except ValueError as e: raise ValueError( "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}" .format(table=table, dataset_uuid=dataset_uuid, e=e)) validate_shared_columns(list(result.values())) for table, schema in result.items(): store_schema_metadata(schema=schema, dataset_uuid=dataset_uuid, store=store, table=table) return result
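# Hedged sketch (not part of the module): the failure mode that
# persist_common_metadata guards against. validate_compatible raises
# ValueError as soon as two schemas for the same table disagree on a column
# type; the column name and origins are illustrative.
def _example_validate_compatible_conflict():
    import pandas as pd

    from kartothek.core.common_metadata import make_meta, validate_compatible

    schema_int = make_meta(pd.DataFrame({"x": [1]}), origin="first")
    schema_str = make_meta(pd.DataFrame({"x": ["a"]}), origin="second")
    try:
        validate_compatible([schema_int, schema_str])
    except ValueError as e:
        print("schemas are incompatible:", e)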
def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF since some column types weren't supported by the old
    # write path anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (always null)
                and "8" not in c  # 8 bit types are cast to 64 bit
                and "16" not in c  # 16 bit types are cast to 64 bit
                and "32" not in c  # 32 bit types are cast to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)
def test_read_table_meta(store): meta_dct = { "dataset_metadata_version": 4, "dataset_uuid": "dataset_uuid", "partitions": { "location_id=1/part_1": { "files": { "table1": "dataset_uuid/table1/location_id=1/part_1.parquet" } } }, } df1 = pd.DataFrame({ "location_id": pd.Series([1], dtype=int), "x": pd.Series([True], dtype=bool) }) schema1 = make_meta(df1, origin="1") store_schema_metadata(schema1, "dataset_uuid", store, "table1") dmd = DatasetMetadata.load_from_dict(meta_dct, store) assert dmd.schema == schema1
def test_reconstruct_index_duplicates(store):
    ser = ParquetSerializer()
    df = pd.DataFrame({"index_col": [1, 1], "column": list("ab")})

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label=label,
        file=key,
        metadata_version=4,
        schema=schema,
        partition_keys=["index_col"],
    )
    mp = mp.load_dataframes(store)
    df_actual = mp.data
    df_expected = pd.DataFrame(
        OrderedDict([("index_col", [2, 2]), ("column", list("ab"))])
    )
    pdt.assert_frame_equal(df_actual, df_expected)
def test_partition_on_roundtrip(store):
    original_df = pd.DataFrame(
        OrderedDict([("test", [1, 2, 3]), ("some_values", [1, 2, 3])])
    )
    mp = MetaPartition(
        label="label_1",
        data={"core": original_df},
        dataset_metadata={"dataset": "metadata"},
        metadata_version=4,
    )

    new_mp = mp.partition_on(["test"])
    new_mp = new_mp.store_dataframes(store=store, dataset_uuid="some_uuid")
    store_schema_metadata(new_mp.table_meta["core"], "some_uuid", store, "core")
    # Check the data right away and then once more with freshly constructed
    # metapartitions to exercise the table meta reloading
    new_mp = new_mp.load_dataframes(store=store)
    assert len(new_mp.metapartitions) == 3
    dfs = []
    for internal_mp in new_mp:
        dfs.append(internal_mp.data["core"])
    actual_df = pd.concat(dfs).sort_values(by="test").reset_index(drop=True)
    pdt.assert_frame_equal(original_df, actual_df)

    for i in range(1, 4):
        # Check with fresh metapartitions
        new_mp = MetaPartition(
            label="test={}/label_1".format(i),
            files={"core": "some_uuid/core/test={}/label_1.parquet".format(i)},
            metadata_version=4,
        )
        new_mp = new_mp.load_dataframes(store=store)

        actual_df = new_mp.data["core"]
        expected_df = pd.DataFrame(OrderedDict([("test", [i]), ("some_values", [i])]))
        pdt.assert_frame_equal(expected_df, actual_df)
def test_reconstruct_date_index(store, metadata_version, dates_as_object):
    ser = ParquetSerializer()
    # Even if the parquet file includes the primary index column, the value
    # reconstructed from the partition path wins and the file content is ignored
    df = pd.DataFrame(
        {"index_col": [date(2018, 6, 1), date(2018, 6, 1)], "column": list("ab")}
    )

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2018-06-02/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label=label,
        file=key,
        metadata_version=metadata_version,
        schema=schema,
        partition_keys=["index_col"],
    )

    mp = mp.load_dataframes(store, dates_as_object=dates_as_object)
    df_actual = mp.data

    if dates_as_object:
        dt_constructor = date
    else:
        dt_constructor = datetime

    df_expected = pd.DataFrame(
        OrderedDict(
            [
                ("index_col", [dt_constructor(2018, 6, 2), dt_constructor(2018, 6, 2)]),
                ("column", list("ab")),
            ]
        )
    )
    pdt.assert_frame_equal(df_actual, df_expected)
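# Hedged sketch (plain Python, no kartothek): why the test above expects
# 2018-06-02 even though the dataframe written to the file contains 2018-06-01.
# The primary-index value is encoded in the partition segment of the storage
# key and is what gets reconstructed at load time.
def _example_index_value_from_key():
    from datetime import date

    key = "uuid/table/index_col=2018-06-02/dontcare"
    value = key.split("/")[2].split("=", 1)[1]
    assert date.fromisoformat(value) == date(2018, 6, 2)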
def test_dynamic_partitions(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    partition_suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    partition0_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/{}".format(partition_suffix): {
            "files": {"core": partition0_core, "extension": partition0_ext}
        },
        "location=L-1/{}".format(partition_suffix): {
            "files": {"core": partition1_core, "extension": partition1_ext}
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/{}".format(partition_suffix)],
        }
    }

    # put two partitions for two tables each to store
    store.put(
        "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
        simplejson.dumps(metadata).encode("utf-8"),
    )
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store.put(partition0_ext, b"test")
    store.put(partition1_ext, b"test")
    store_schema_metadata(
        make_meta(
            pd.DataFrame({"location": ["L-0/{}".format(partition_suffix)]}),
            origin="stored",
        ),
        dataset_uuid,
        store,
        "core",
    )

    # instantiate schemas to write the table metadata
    core_schema = make_meta(
        pd.DataFrame(
            {
                "column_0": pd.Series([1], dtype=int),
                "column_1": pd.Series([1], dtype=int),
                "location": pd.Series(["str"]),
            }
        ),
        origin="core",
    )
    extension_schema = make_meta(
        pd.DataFrame(
            {
                "column_77": pd.Series([1], dtype=int),
                "column_78": pd.Series([1], dtype=int),
                "location": pd.Series(["str"]),
            }
        ),
        origin="extension",
    )
    store_schema_metadata(core_schema, dataset_uuid, store, "core")
    store_schema_metadata(extension_schema, dataset_uuid, store, "extension")

    # load the dataset, which picks up the freshly written table metadata
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
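# Hedged illustration (plain Python, not part of the test suite): dynamic
# discovery works because storage keys encode everything needed, following
# "<dataset_uuid>/<table>/<column>=<value>/<filename>". The key below mirrors
# the ones built via create_partition_key in the test above.
def _example_decompose_storage_key():
    key = "uuid+namespace-attribute12_underscored/core/location=L-0/suffix.parquet"
    dataset_uuid, table, *partitions, filename = key.split("/")
    # the partition label is the key=value segments plus the filename stem
    label = "/".join(partitions + [filename.rsplit(".", 1)[0]])
    # the key=value segments also yield the primary index entries
    index_entries = dict(p.split("=", 1) for p in partitions)
    assert label == "location=L-0/suffix"
    assert index_entries == {"location": "L-0"}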
def test_query_indices_external(store, metadata_version):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "part_1": {"files": {"core_data": "file.parquet"}},
            "part_2": {"files": {"core_data": "file2.parquet"}},
        },
        "indices": {
            "product_id": "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put(
        "uuid+namespace-attribute12_underscored.by-dataset-metadata.json",
        simplejson.dumps(expected).encode("utf-8"),
    )
    df = pd.DataFrame(
        {
            "product_id": [1, 2, 100, 34],
            "partition": [
                np.array(["part_1"], dtype=object),
                np.array(["part_2"], dtype=object),
                np.array(["part_1", "part_2"], dtype=object),
                np.array(["part_1"], dtype=object),
            ],
        }
    )
    schema = pa.schema(
        [
            pa.field("partition", pa.list_(pa.string())),
            pa.field("product_id", pa.int64()),
        ]
    )
    table = pa.Table.from_pandas(df, schema=schema)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(
        "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
        buf.getvalue().to_pybytes(),
    )
    store_schema_metadata(
        make_meta(df, origin="core"),
        "uuid+namespace-attribute12_underscored",
        store,
        "core_data",
    )

    dmd = DatasetMetadata.load_from_store(
        "uuid+namespace-attribute12_underscored", store
    )

    dmd = dmd.load_index("product_id", store)
    assert dmd.query(product_id=2) == ["part_2"]
    dmd = dmd.load_all_indices(store)
    assert dmd.query(product_id=2, location_id=2) == ["part_2"]
    assert dmd.query(product_id=100, location_id=3) == ["part_1"]
    assert dmd.query(product_id=2, location_id=2, something_else="bla") == ["part_2"]

    additional_index = ExplicitSecondaryIndex.from_v2(
        "another_column", {"1": ["part_2", "part_3"]}
    )
    assert dmd.query(
        indices=[additional_index], another_column="1", product_id=2, location_id=2
    ) == ["part_2"]
def test_dynamic_partitions_with_garbage(store):
    """
    In case there are unknown files, dataset and indices still load correctly
    """
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition_suffix = "suffix"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(partition_suffix): {
            "files": {"core": partition0_core}
        },
        "location=L-1/product=P-0/{}".format(partition_suffix): {
            "files": {"core": partition1_core}
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(partition_suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(partition_suffix),
                "location=L-1/product=P-0/{}".format(partition_suffix),
            ]
        },
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    # the following files are garbage and should not interfere with the indices
    # and/or partitions
    for suffix in ["", ".json", ".msgpack", ".my_own_file_format"]:
        store.put("this_should_not_exist{}".format(suffix), b"ignore me")
        store.put(
            "{}/this_should_not_exist{}".format(dataset_uuid, suffix), b"ignore me"
        )
        store.put(
            "{}/{}/this_should_not_exist{}".format(dataset_uuid, "core", suffix),
            b"ignore me",
        )
        store.put(
            "{}/{}/location=L-0/this_should_not_exist{}".format(
                dataset_uuid, "core", suffix
            ),
            b"ignore me",
        )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # The order of labels within an index list is not guaranteed. That is fine
    # at runtime but makes this comparison flaky, so sort before asserting.
    sorted_result = {
        column: {label: sorted(x) for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
def test_dynamic_partitions_multiple_indices(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(suffix): {
            "files": {"core": partition0_core}
        },
        "location=L-1/product=P-0/{}".format(suffix): {
            "files": {"core": partition1_core}
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(suffix),
                "location=L-1/product=P-0/{}".format(suffix),
            ]
        },
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # The order of labels within an index list is not guaranteed. That is fine
    # at runtime but makes this comparison flaky, so sort before asserting.
    sorted_result = {
        column: {label: sorted(x) for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices