Example #1
0
def _rollback_transaction(existing_datasets, new_datasets, store):
    """
    Undo the changes made by a failed write process.

    Datasets created during the write are deleted; datasets that existed
    beforehand have their previous metadata written back to the store.

    Parameters
    ----------
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that existed before the write process started.
    new_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that were created / changed during the write process.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    """
    if callable(store):
        store = store()

    new_ids = set(new_datasets)
    old_ids = set(existing_datasets)

    # remove datasets that did not exist before the "transaction" began
    for dataset_id in sorted(new_ids - old_ids):
        key = metadata_key_from_uuid(new_datasets[dataset_id].uuid)
        store.delete(key)

    # restore the previous metadata of datasets touched by the "transaction"
    for dataset_id in sorted(new_ids & old_ids):
        previous = existing_datasets[dataset_id]
        store.put(*DatasetMetadataBuilder.from_dataset(previous).to_json())
        store_schema_metadata(
            schema=previous.schema,
            dataset_uuid=previous.uuid,
            store=store,
            table=previous.table_name,
        )
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    """
    Assemble dataset metadata from the given partitions and persist it.

    When ``update_dataset`` is given, the existing dataset's metadata is used
    as the starting point; otherwise a fresh dataset is created from the first
    partition in ``partition_list``. Raises ``ValueError`` for an empty
    partition list in create mode, for duplicated partition labels, or for an
    unknown ``metadata_storage_format``.
    """
    store = ensure_store(store)

    schemas = set()
    if update_dataset:
        # Update mode: seed the builder from the existing dataset.
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
        table_name = update_dataset.table_name
        schemas.add(update_dataset.schema)
    else:
        # Create mode: derive version / partition keys from the first partition.
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )
        table_name = mp.table_name
        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    schemas.update(mp.schema for mp in partition_list if mp.schema)

    dataset_builder.schema = persist_common_metadata(
        schemas=schemas,
        update_dataset=update_dataset,
        store=store,
        dataset_uuid=dataset_uuid,
        table_name=table_name,
    )

    # We can only check for non unique partition labels here and if they occur we will
    # fail hard. The resulting dataset may be corrupted or file may be left in the store
    # without dataset metadata
    partition_labels = partition_labels_from_mps(partition_list)

    # This could be safely removed since we do not allow to set this by the user
    # anymore. It has implications on tests if mocks are used
    non_unique_labels = extract_duplicates(partition_labels)
    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".
            format(", ".join(non_unique_labels)))

    if remove_partitions is None:
        remove_partitions = []
    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(
        dataset_builder, metadata_merger, dataset_metadata
    )
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )

    storage_format = metadata_storage_format.lower()
    if storage_format == "json":
        store.put(*dataset_builder.to_json())
    elif storage_format == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))

    return dataset_builder.to_dataset()
Example #3
0
def store_dataset_from_partitions(
    partition_list,
    store,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    """
    Build dataset metadata from the given partitions and write it to the store.

    Starts from ``update_dataset`` when given; otherwise creates a new dataset
    from the first partition in ``partition_list``. Raises ``ValueError`` for
    an empty partition list in create mode, for duplicated partition labels,
    or for an unknown ``metadata_storage_format``.
    """
    store = _instantiate_store(store)

    if update_dataset:
        # Update mode: seed the builder from the existing dataset.
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
    else:
        # Create mode: derive version / partition keys from the first partition.
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    dataset_builder.explicit_partitions = True

    dataset_builder.table_meta = persist_common_metadata(
        partition_list, update_dataset, store, dataset_uuid)

    # We can only check for non unique partition labels here and if they occur we will
    # fail hard. The resulting dataset may be corrupted or file may be left in the store
    # without dataset metadata
    non_unique_labels = extract_duplicates(
        partition_labels_from_mps(partition_list))
    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".
            format(", ".join(non_unique_labels)))

    if remove_partitions is None:
        remove_partitions = []
    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(
        dataset_builder, metadata_merger, partition_list, dataset_metadata
    )
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )

    storage_format = metadata_storage_format.lower()
    if storage_format == "json":
        store.put(*dataset_builder.to_json())
    elif storage_format == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))

    return dataset_builder.to_dataset()
Example #4
0
def copy_dataset(
    source_dataset_uuid: str,
    store: KeyValueStore,
    target_dataset_uuid: Optional[str] = None,
    target_store: Optional[KeyValueStore] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Copies and optionally renames a dataset, either from one store to another
    or within one store.

    Parameters
    ----------
    source_dataset_uuid: str
        UUID of source dataset
    store: simplekv.KeyValueStore
        Source store
    target_dataset_uuid: Optional[str]
        UUID of target dataset. May be the same as source_dataset_uuid, if
        store and target_store are different. If empty, source_dataset_uuid
        is used
    target_store: Optional[simplekv.KeyValueStore]
        Target Store. May be the same as store, if source_dataset_uuid and
        target_dataset_uuid are different. If empty, value from parameter
        store is used

    Returns
    -------
    Dict[str, DatasetMetadata]
        Mapping from the target metadata key to the transformed dataset
        metadata.

    Raises
    ------
    ValueError
        If the copy would overwrite the source (same UUID within the same
        store).
    """
    if target_dataset_uuid is None:
        target_dataset_uuid = source_dataset_uuid
    if target_store is None:
        target_store = store

    # Use logical `and` (short-circuiting) rather than bitwise `&` to combine
    # the two boolean conditions.
    if (source_dataset_uuid == target_dataset_uuid) and (store == target_store):
        raise ValueError(
            "Cannot copy to a dataset with the same UUID within the same store!"
        )

    ds_factory_source = _ensure_factory(
        dataset_uuid=source_dataset_uuid,
        store=store,
        factory=None,
        load_dataset_metadata=True,
    )

    # Create a dict of {source key: target key} entries
    keys = get_dataset_keys(ds_factory_source.dataset_metadata)
    mapped_keys = {
        source_key: source_key.replace(source_dataset_uuid,
                                       target_dataset_uuid)
        for source_key in keys
    }

    # Create a dict of metadata which has to be changed. This is only the
    # <uuid>.by-dataset-metadata.json file

    md_transformed = {
        f"{target_dataset_uuid}{METADATA_BASE_SUFFIX}{METADATA_FORMAT_JSON}":
        DatasetMetadataBuilder.from_dataset(
            ds_factory_source.dataset_metadata).modify_uuid(
                target_dataset_uuid).to_dataset()
    }
    # Copy the keys from one store to another
    copy_rename_keys(mapped_keys, store, target_store, md_transformed)

    return md_transformed