# Example 1
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions. This may be used in combination
    with :func:`~kartothek.io.eager.write_single_partition` to create implicitly partitioned datasets.

    .. note::

        The created dataset will **always** have explicit_partitions==False

    .. warning::

        This function should only be used in very rare occasions. Usually you're better off using
        full end-to-end pipelines.

    Parameters
    ----------
    store
        Store (or store factory) the header and schemas are written to;
        resolved via ``lazy_store``.
    dataset_uuid
        Unique identifier of the dataset.
    table_meta
        Mapping of table name to schema; each schema is normalized with
        ``make_meta`` (using the table name as origin) and persisted.
    partition_on
        Column names used as partition keys for the dataset.
    metadata
        Optional key/value metadata attached to the dataset header.
    overwrite
        If False, raise if a dataset with ``dataset_uuid`` already exists.
    metadata_storage_format
        Serialization format of the header; ``"json"`` or ``"msgpack"``
        (case-insensitive).
    metadata_version
        Kartothek metadata version used for the header.

    Returns
    -------
    The dataset metadata object produced by ``DatasetMetadataBuilder.to_dataset``.

    Raises
    ------
    ValueError
        If ``metadata_storage_format`` is neither ``"json"`` nor ``"msgpack"``.
    """
    store = lazy_store(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    # Normalize every table schema and persist it before building the header.
    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema,
                                      origin=table,
                                      partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    # Normalize the case once instead of calling .lower() per comparison.
    storage_format = metadata_storage_format.lower()
    if storage_format == "json":
        store.put(*dataset_builder.to_json())
    elif storage_format == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    return dataset_builder.to_dataset()
# Example 2
def test_builder_full(metadata_version, frozen_time):
    """Build a header with one partition, user metadata, an external index and
    an embedded index, then verify the serialized JSON key and payload."""
    builder = DatasetMetadataBuilder(
        "uuid", metadata_version=metadata_version, partition_keys=["L", "P"]
    )
    partition = Partition(
        label="run_id=1/L=1/P=1/part_1",
        files={
            "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
            "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
        },
    )
    builder.add_partition("run_id=1/L=1/P=1/part_1", partition)
    builder.add_metadata("key", "value")
    builder.add_external_index("col2")
    embedded_index = ExplicitSecondaryIndex(
        "col1",
        {
            "a": ["run_id=1/L=1/P=1/part_1"],
            "b": ["run_id=2/L=1/P=1/part_1"],
        },
    )
    builder.add_embedded_index("col1", embedded_index)

    storage_key, payload = builder.to_json()

    assert storage_key == "uuid.by-dataset-metadata.json"
    assert simplejson.loads(payload) == {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "run_id=1/L=1/P=1/part_1": {
                "files": {
                    "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
                    "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
                }
            }
        },
        "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
        "indices": {
            "col1": {
                "a": ["run_id=1/L=1/P=1/part_1"],
                "b": ["run_id=2/L=1/P=1/part_1"],
            },
            "col2": "uuid.col2.by-dataset-index.parquet",
        },
        "partition_keys": ["L", "P"],
    }
# Example 3
def test_builder_to_dataset(metadata_version, frozen_time):
    """A builder populated with one partition, user metadata and an embedded
    index must produce a dataset equal to the one built from the same dict."""
    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    builder.add_partition(
        "part_2", Partition("part_2", {"core": "uuid/core/part_2.parquet"})
    )
    builder.add_metadata("key", "value")
    builder.add_embedded_index(
        "col1",
        ExplicitSecondaryIndex("col1", {"a": ["part1"], "b": ["part2"]}),
    )

    expected_dataset = DatasetMetadata.from_dict(
        {
            "dataset_uuid": "uuid",
            "dataset_metadata_version": metadata_version,
            "partitions": {
                "part_2": {"files": {"core": "uuid/core/part_2.parquet"}}
            },
            "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
            "indices": {"col1": {"a": ["part1"], "b": ["part2"]}},
        }
    )
    assert builder.to_dataset() == expected_dataset