def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions. This may be used in
    combination with :func:`~kartothek.io.eager.write_single_partition` to
    create implicitly partitioned datasets.

    .. note::

        The created dataset will **always** have ``explicit_partitions==False``.

    .. warning::

        This function should only be used on very rare occasions. Usually
        you're better off using full end-to-end pipelines.

    Parameters
    ----------
    store
        Store, or factory returning a store, the header is written to.
    dataset_uuid
        UUID of the dataset to create.
    table_meta
        Mapping of table name to schema; each schema is normalized via
        ``make_meta`` before it is persisted.
    partition_on
        Column names the dataset is partitioned on.
    metadata
        Optional key/value metadata to attach to the dataset.
    overwrite
        If False, raise if a dataset with ``dataset_uuid`` already exists.
    metadata_storage_format
        Either ``"json"`` or ``"msgpack"``.
    metadata_version
        Kartothek metadata specification version.
    """
    store = lazy_store(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema, origin=table, partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )
    return dataset_builder.to_dataset()
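# Hedged usage sketch (not part of the library): create the header first, then
# append partitions via ``write_single_partition`` as the docstring suggests.
# The storefact in-memory store URL and the example schema DataFrame are
# assumptions for illustration, not verified API guarantees.
def _example_create_empty_header():
    import pandas as pd
    from storefact import get_store_from_url

    # A single in-memory store instance; ``lazy_store`` is assumed to wrap
    # plain stores into factories, so passing the store directly should work.
    store = get_store_from_url("hmemory://")

    # ``make_meta`` accepts a DataFrame and infers the schema from it.
    schema_df = pd.DataFrame({"L": [1], "P": [1], "value": [42.0]})
    dataset = create_empty_dataset_header(
        store=store,
        dataset_uuid="uuid",
        table_meta={"core": schema_df},
        partition_on=["L", "P"],
    )
    # The header is created without any partitions ...
    assert not dataset.explicit_partitions
    assert dataset.partitions == {}
    # ... and can now be filled partition by partition, e.g. with
    # ``write_single_partition(store=store, dataset_uuid="uuid", data=...)``.
    return dataset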
def test_builder_full(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "run_id=1/L=1/P=1/part_1": {
                "files": {
                    "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
                    "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
                }
            }
        },
        "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
        "indices": {
            "col1": {
                "a": ["run_id=1/L=1/P=1/part_1"],
                "b": ["run_id=2/L=1/P=1/part_1"],
            },
            "col2": "uuid.col2.by-dataset-index.parquet",
        },
        "partition_keys": ["L", "P"],
    }

    builder = DatasetMetadataBuilder(
        "uuid", metadata_version=metadata_version, partition_keys=["L", "P"]
    )
    partition = Partition(
        label="run_id=1/L=1/P=1/part_1",
        files={
            "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
            "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
        },
    )
    builder.add_partition("run_id=1/L=1/P=1/part_1", partition)
    builder.add_metadata("key", "value")
    builder.add_external_index("col2")
    builder.add_embedded_index(
        "col1",
        ExplicitSecondaryIndex(
            "col1",
            {"a": ["run_id=1/L=1/P=1/part_1"], "b": ["run_id=2/L=1/P=1/part_1"]},
        ),
    )

    key, result = builder.to_json()
    result = simplejson.loads(result)

    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected
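# Hedged companion sketch: ``create_empty_dataset_header`` above stores the
# builder via ``store.put(*builder.to_msgpack())``, so ``to_msgpack`` should,
# like ``to_json``, return a (storage key, serialized payload) pair. The exact
# key suffix is an assumption and is deliberately not asserted here.
def test_builder_to_msgpack_shape(metadata_version):
    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    key, payload = builder.to_msgpack()
    assert key.startswith("uuid")
    assert isinstance(payload, bytes)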
def test_builder_to_dataset(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {"part_2": {"files": {"core": "uuid/core/part_2.parquet"}}},
        "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
        "indices": {"col1": {"a": ["part1"], "b": ["part2"]}},
    }

    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    part_2 = Partition("part_2", {"core": "uuid/core/part_2.parquet"})
    builder.add_partition("part_2", part_2)
    builder.add_metadata("key", "value")
    builder.add_embedded_index(
        "col1", ExplicitSecondaryIndex("col1", {"a": ["part1"], "b": ["part2"]})
    )

    result = builder.to_dataset()
    expected_from_dict = DatasetMetadata.from_dict(expected)
    assert result == expected_from_dict
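# Hedged consistency sketch tying the two tests above together: the builder's
# JSON payload and its ``to_dataset()`` view should describe the same dataset.
# Only APIs already exercised above are used; the helper name itself is ours,
# and equality is assumed to hold for simple builders like those in these tests.
def assert_builder_views_consistent(builder):
    _, payload = builder.to_json()
    from_json = DatasetMetadata.from_dict(simplejson.loads(payload))
    assert from_json == builder.to_dataset()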