コード例 #1
0
ファイル: eager.py プロジェクト: nefta-kanilmaz-by/kartothek
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create an dataset header without any partitions. This may be used in combination
    with :func:`~kartothek.io.eager.write_single_partition` to create implicitly partitioned datasets.

    .. note::

        The created dataset will **always** have explicit_partition==False

    .. warning::

        This function should only be used in very rare occasions. Usually you're better off using
        full end-to-end pipelines.

    Parameters
    ----------
    """
    store = lazy_store(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema,
                                      origin=table,
                                      partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    return dataset_builder.to_dataset()
コード例 #2
0
ファイル: write.py プロジェクト: x-malet/kartothek
def store_dataset_from_partitions(
    partition_list,
    store,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = _instantiate_store(store)

    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    dataset_builder.explicit_partitions = True

    dataset_builder.table_meta = persist_common_metadata(
        partition_list, update_dataset, store, dataset_uuid)

    # We can only check for non unique partition labels here and if they occur we will
    # fail hard. The resulting dataset may be corrupted or file may be left in the store
    # without dataset metadata
    partition_labels = partition_labels_from_mps(partition_list)
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".
            format(", ".join(non_unique_labels)))

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(dataset_builder, metadata_merger,
                                      partition_list, dataset_metadata)
    dataset_builder = update_partitions(dataset_builder, partition_list,
                                        remove_partitions)
    dataset_builder = update_indices(dataset_builder, store, partition_list,
                                     remove_partitions)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    dataset = dataset_builder.to_dataset()
    return dataset
コード例 #3
0
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = ensure_store(store)

    schemas = set()
    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
        table_name = update_dataset.table_name
        schemas.add(update_dataset.schema)
    else:
        mp = next(iter(partition_list), None)

        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )
        table_name = mp.table_name
        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    for mp in partition_list:
        if mp.schema:
            schemas.add(mp.schema)

    dataset_builder.schema = persist_common_metadata(
        schemas=schemas,
        update_dataset=update_dataset,
        store=store,
        dataset_uuid=dataset_uuid,
        table_name=table_name,
    )

    # We can only check for non unique partition labels here and if they occur we will
    # fail hard. The resulting dataset may be corrupted or file may be left in the store
    # without dataset metadata
    partition_labels = partition_labels_from_mps(partition_list)

    # This could be safely removed since we do not allow to set this by the user
    # anymore. It has implications on tests if mocks are used
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".
            format(", ".join(non_unique_labels)))

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(dataset_builder, metadata_merger,
                                      dataset_metadata)
    dataset_builder = update_partitions(dataset_builder, partition_list,
                                        remove_partitions)
    dataset_builder = update_indices(dataset_builder, store, partition_list,
                                     remove_partitions)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    dataset = dataset_builder.to_dataset()
    return dataset