def from_dict(dct: Dict, explicit_partitions: bool = True):
    """
    Load dataset metadata from a dictionary.

    This must have no external references. Otherwise use
    ``load_from_dict`` to have them resolved automatically.
    """
    # Use the builder class for reconstruction to have a single point
    # for metadata version changes.
    builder = DatasetMetadataBuilder(
        uuid=dct[naming.UUID_KEY],
        metadata_version=dct[naming.METADATA_VERSION_KEY],
        explicit_partitions=explicit_partitions,
        partition_keys=dct.get("partition_keys", None),
        table_meta=dct.get("table_meta", None),
    )
    for key, value in dct.get("metadata", {}).items():
        builder.add_metadata(key, value)
    for partition_label, part_dct in dct.get("partitions", {}).items():
        builder.add_partition(
            partition_label, Partition.from_dict(partition_label, part_dct)
        )
    for column, index_dct in dct.get("indices", {}).items():
        if isinstance(index_dct, IndexBase):
            builder.add_embedded_index(column, index_dct)
        else:
            builder.add_embedded_index(
                column, ExplicitSecondaryIndex.from_v2(column, index_dct)
            )
    return builder.to_dataset()
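# A minimal usage sketch for ``from_dict`` (hedged, not part of the original
# source): it assumes the function is exposed as ``DatasetMetadata.from_dict``
# and that ``naming.UUID_KEY`` / ``naming.METADATA_VERSION_KEY`` resolve to the
# "dataset_uuid" / "dataset_metadata_version" keys used in the on-store JSON of
# the test below. Embedded indices are passed as plain dicts and converted via
# ``ExplicitSecondaryIndex.from_v2``; a dict containing external index file
# references would instead need ``load_from_dict`` to resolve them.
def _example_from_dict_usage():  # hypothetical helper, for illustration only
    metadata_dict = {
        "dataset_uuid": "example_dataset",  # naming.UUID_KEY (assumed value)
        "dataset_metadata_version": 4,  # naming.METADATA_VERSION_KEY (assumed value)
        "partitions": {"part_1": {"files": {"core_data": "part_1.parquet"}}},
        "indices": {"location_id": {"1": ["part_1"]}},  # embedded index as a plain dict
    }
    dmd = DatasetMetadata.from_dict(metadata_dict)
    # No store access is required since the dict has no external references.
    return dmd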
def test_query_indices_external(store, metadata_version):
    # Dataset metadata with one embedded index (location_id) and one external
    # index file reference (product_id).
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "part_1": {"files": {"core_data": "file.parquet"}},
            "part_2": {"files": {"core_data": "file2.parquet"}},
        },
        "indices": {
            "product_id": "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put(
        "uuid+namespace-attribute12_underscored.by-dataset-metadata.json",
        simplejson.dumps(expected).encode("utf-8"),
    )

    # Write the external product_id index as a Parquet file mapping each
    # product_id to the partitions containing it.
    df = pd.DataFrame(
        {
            "product_id": [1, 2, 100, 34],
            "partition": [
                np.array(["part_1"], dtype=object),
                np.array(["part_2"], dtype=object),
                np.array(["part_1", "part_2"], dtype=object),
                np.array(["part_1"], dtype=object),
            ],
        }
    )
    schema = pa.schema(
        [
            pa.field("partition", pa.list_(pa.string())),
            pa.field("product_id", pa.int64()),
        ]
    )
    table = pa.Table.from_pandas(df, schema=schema)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(
        "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
        buf.getvalue().to_pybytes(),
    )
    store_schema_metadata(
        make_meta(df, origin="core"),
        "uuid+namespace-attribute12_underscored",
        store,
        "core_data",
    )

    dmd = DatasetMetadata.load_from_store(
        "uuid+namespace-attribute12_underscored", store
    )

    # The external index must be loaded before it can be queried.
    dmd = dmd.load_index("product_id", store)
    assert dmd.query(product_id=2) == ["part_2"]

    dmd = dmd.load_all_indices(store)
    assert dmd.query(product_id=2, location_id=2) == ["part_2"]
    assert dmd.query(product_id=100, location_id=3) == ["part_1"]
    # Predicates on columns without an index do not restrict the result.
    assert dmd.query(product_id=2, location_id=2, something_else="bla") == ["part_2"]

    # Additional indices can be passed explicitly at query time.
    additional_index = ExplicitSecondaryIndex.from_v2(
        "another_column", {"1": ["part_2", "part_3"]}
    )
    assert dmd.query(
        indices=[additional_index], another_column="1", product_id=2, location_id=2
    ) == ["part_2"]