def _rollback_transaction(existing_datasets, new_datasets, store):
    """
    Rollback changes made during the write process.

    Newly created datasets are deleted outright; datasets that existed before
    the write are restored to their previous metadata and schema state.

    Parameters
    ----------
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that existed before the write process started.
    new_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that were created / changed during the write process.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    """
    # accept either a store factory or a store instance
    if callable(store):
        store = store()

    # delete newly created datasets that were not present before the "transaction"
    for ktk_cube_dataset_id in sorted(set(new_datasets) - set(existing_datasets)):
        store.delete(metadata_key_from_uuid(new_datasets[ktk_cube_dataset_id].uuid))

    # recover changes of old datasets: re-write the pre-transaction metadata
    # and schema so the stored state matches what existed before
    for ktk_cube_dataset_id in sorted(set(new_datasets) & set(existing_datasets)):
        ds = existing_datasets[ktk_cube_dataset_id]
        builder = DatasetMetadataBuilder.from_dataset(ds)
        store.put(*builder.to_json())
        store_schema_metadata(
            schema=ds.schema,
            dataset_uuid=ds.uuid,
            store=store,
            table=ds.table_name,
        )
def update_indices_from_partitions(partition_list, dataset_metadata_factory):
    """
    Take indices from a partition list and overwrite all indices in the
    dataset metadata provided by the dataset metadata factory.

    The same is done in the store dataset part. This is used in an additional
    build index step (by the build_dataset_indices__pipeline) which should be
    used after updating partitions of a dataset.

    Parameters
    ----------
    partition_list
        Partitions whose indices are merged and persisted.
    dataset_metadata_factory
        Factory providing ``store``, ``uuid``, ``indices`` and serialization
        for the dataset whose metadata is updated in place.

    Returns
    -------
    The mutated ``dataset_metadata_factory`` with refreshed indices.
    """
    dataset_indices = MetaPartition.merge_indices(partition_list)
    indices = persist_indices(
        store=dataset_metadata_factory.store,
        dataset_uuid=dataset_metadata_factory.uuid,
        indices=dataset_indices,
    )
    # six.iteritems is a Python-2 relic; the codebase elsewhere relies on
    # Python-3-only syntax, so plain dict.items() is equivalent and idiomatic.
    for column, storage_key in indices.items():
        dataset_metadata_factory.indices[column] = ExplicitSecondaryIndex(
            column=column, index_storage_key=storage_key
        )
    # persist the updated top-level metadata alongside the new indices
    dataset_metadata_factory.store.put(
        naming.metadata_key_from_uuid(dataset_metadata_factory.uuid),
        dataset_metadata_factory.to_json(),
    )
    return dataset_metadata_factory
def load_from_store(
    uuid: str,
    store: StoreInput,
    load_schema: bool = True,
    load_all_indices: bool = False,
) -> "DatasetMetadata":
    """
    Load a dataset from a storage.

    Tries the JSON metadata key first and falls back to the msgpack key.

    Parameters
    ----------
    uuid
        UUID of the dataset.
    store
        Object that implements the .get method for file/object loading.
    load_schema
        Load table schema
    load_all_indices
        Load all registered indices into memory.

    Returns
    -------
    dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
        Parsed metadata.
    """
    json_key = naming.metadata_key_from_uuid(uuid)
    store = ensure_store(store)
    try:
        metadata = load_json(store.get(json_key))
    except KeyError:
        # JSON form absent — fall back to the msgpack representation
        msgpack_key = naming.metadata_key_from_uuid(uuid, format="msgpack")
        try:
            metadata = unpackb(store.get(msgpack_key))
        except KeyError:
            raise KeyError(
                "Dataset does not exist. Tried {} and {}".format(
                    json_key, msgpack_key
                )
            )
    ds = DatasetMetadata.load_from_dict(metadata, store, load_schema=load_schema)
    return ds.load_all_indices(store) if load_all_indices else ds
def exists(uuid: str, store: StoreInput) -> bool:
    """
    Check if a dataset exists in a storage.

    Looks for the metadata key in JSON form first, then in msgpack form.

    Parameters
    ----------
    uuid
        UUID of the dataset.
    store
        Object that implements the .get method for file/object loading.
    """
    store = ensure_store(store)
    # probe the default (JSON) key first; only build the msgpack key if needed
    for key_kwargs in ({}, {"format": "msgpack"}):
        if naming.metadata_key_from_uuid(uuid, **key_kwargs) in store:
            return True
    return False
def raise_if_dataset_exists(dataset_uuid, store):
    """
    Raise an error if a dataset with the given UUID already exists in the store.

    Parameters
    ----------
    dataset_uuid
        UUID of the dataset to check for.
    store
        Store (or store factory) to probe for existing metadata keys.

    Raises
    ------
    RuntimeError
        If a metadata key for the dataset exists in either msgpack or json
        format.
    """
    try:
        store_instance = _instantiate_store(store)
        for form in ["msgpack", "json"]:
            key = naming.metadata_key_from_uuid(uuid=dataset_uuid, format=form)
            if key in store_instance:
                # Interpolate explicitly: the original passed dataset_uuid as a
                # second positional exception argument (logging-style), so the
                # "%s" placeholder in the message was never filled in.
                raise RuntimeError(
                    "Dataset `%s` already exists and overwrite is not permitted!"
                    % dataset_uuid
                )
    except KeyError:
        # best-effort probe: a KeyError from the store lookup means the
        # dataset could not be found, which is the desired outcome here
        pass
def exists(uuid, store):
    """
    Check if a dataset exists in a storage.

    Parameters
    ----------
    uuid: str or unicode
        UUID of the dataset.
    store: Object
        Object that implements the .get method for file/object loading.

    Returns
    -------
    exists: bool
        Whether a metadata file could be found.
    """
    json_key = naming.metadata_key_from_uuid(uuid)
    if json_key in store:
        return True
    # fall back to the msgpack representation of the metadata key
    msgpack_key = naming.metadata_key_from_uuid(uuid, format="msgpack")
    return msgpack_key in store
def to_msgpack(self) -> Tuple[str, bytes]:
    """
    Render the dataset to msgpack.

    Returns
    -------
    storage_key: str
        The path where this metadata should be placed in the storage.
    dataset_json: str
        The rendered JSON for this dataset.
    """
    storage_key = naming.metadata_key_from_uuid(self.uuid, format="msgpack")
    payload = packb(self.to_dict())
    return storage_key, payload
def to_json(self):
    """
    Render the dataset to JSON.

    Returns
    -------
    storage_key: str
        The path where this metadata should be placed in the storage.
    dataset_json: str
        The rendered JSON for this dataset.
    """
    storage_key = naming.metadata_key_from_uuid(self.uuid)
    payload = simplejson.dumps(self.to_dict()).encode("utf-8")
    return storage_key, payload
def delete_top_level_metadata(dataset_factory, *args):
    """
    Delete the top-level metadata file of a dataset from its store.

    The additional arguments allow to schedule this function with delayed
    objects.
    """
    key = metadata_key_from_uuid(dataset_factory.dataset_uuid)
    dataset_factory.store.delete(key)