def assert_target_keys(src_store, src_uuid, tgt_store, tgt_uuid):
    """
    Check that the expected keys exist in the target dataset, and that the
    corresponding values are equal to the source dataset (or modified as
    expected).
    """
    df_source = DatasetFactory(
        dataset_uuid=src_uuid,
        store_factory=lazy_store(src_store),
    )
    src_keys = get_dataset_keys(df_source.dataset_metadata)
    df_target = DatasetFactory(
        dataset_uuid=tgt_uuid,
        store_factory=lazy_store(tgt_store),
    )
    tgt_keys = get_dataset_keys(df_target.dataset_metadata)

    for src_key in src_keys:
        # check for each source key if the corresponding target key exists
        tgt_key = src_key.replace(src_uuid, tgt_uuid)
        assert tgt_key in tgt_keys

        # check if the files for source and target key are equal (exception:
        # metadata => here the target must contain the modified metadata)
        b1 = src_store.get(src_key)
        b2 = tgt_store.get(tgt_key)

        if tgt_key.endswith("by-dataset-metadata.json"):
            b1_mod = b1.decode("utf-8").replace(src_uuid, tgt_uuid).encode("utf-8")
            assert b1_mod == b2
        else:
            assert b1 == b2
def test_ensure_store_fact(store_input_types):
    store_fact = lazy_store(store_input_types)
    assert callable(store_fact)

    store = store_fact()
    assert isinstance(store, KeyValueStore)

    value = b"value"
    key = "key"
    store.put(key, value)
    assert value == store.get(key)

    assert store_fact is lazy_store(store_fact)
def store_delayed_as_dataset(
    delayed_tasks: List[Delayed],
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    table_name: str = SINGLE_TABLE,
    secondary_indices=None,
) -> Delayed:
    """
    Transform and store a list of dictionaries containing dataframes to a
    kartothek dataset in store.

    Parameters
    ----------
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(
        parse_input_to_metapartition,
        metadata_version=metadata_version,
        table_name=table_name,
    )
    mps = map_delayed(input_to_mps, delayed_tasks)

    if partition_on:
        mps = map_delayed(MetaPartition.partition_on, mps, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(MetaPartition.build_indices, mps, columns=secondary_indices)

    mps = map_delayed(
        MetaPartition.store_dataframes,
        mps,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
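# A minimal usage sketch for store_delayed_as_dataset (an assumption for
# illustration, not library code). The store URL follows the `hfs://my_store`
# convention used in the docstrings here; `build_df` is a hypothetical helper.
def _example_store_delayed():
    import pandas as pd
    from dask import delayed

    @delayed
    def build_df(i):
        # each delayed task yields the dataframe for one partition
        return pd.DataFrame({"x": [i]})

    graph = store_delayed_as_dataset(
        delayed_tasks=[build_df(i) for i in range(3)],
        store="hfs://my_store",  # any input accepted by lazy_store
        dataset_uuid="delayed_demo",
    )
    # nothing is written until the dask graph is computed
    return graph.compute()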
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions.

    This may be used in combination with
    :func:`~kartothek.io.eager.write_single_partition` to create implicitly
    partitioned datasets.

    .. note::

        The created dataset will **always** have ``explicit_partitions==False``

    .. warning::

        This function should only be used on very rare occasions. Usually
        you're better off using full end-to-end pipelines.

    Parameters
    ----------
    """
    store = lazy_store(store)()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema, origin=table, partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )

    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)

    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )
    return dataset_builder.to_dataset()
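# A usage sketch (an assumption for illustration): create the header up front
# so partitions can later be written concurrently via write_single_partition
# and committed with commit_dataset. The table name, column, and store URL are
# illustrative; table_meta maps each table to a schema source accepted by
# make_meta (here an empty, typed dataframe).
def _example_create_header():
    import pandas as pd

    create_empty_dataset_header(
        store="hfs://my_store",
        dataset_uuid="header_demo",
        table_meta={"table": pd.DataFrame({"x": pd.Series([], dtype="int64")})},
    )
    # the dataset now exists with schema information but no partitions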
def __init__(
    self,
    dataset_uuid: str,
    store_factory: StoreInput,
    load_schema: bool = True,
    load_all_indices: bool = False,
    load_dataset_metadata: bool = True,
) -> None:
    """
    A dataset factory object which can be used to cache dataset load operations.

    This class should be the primary user entry point when reading datasets.

    Example using the eager backend:

    .. code::

        from functools import partial
        from storefact import get_store_from_url
        from kartothek.io.eager import read_table

        ds_factory = DatasetFactory(
            dataset_uuid="my_test_dataset",
            store_factory=partial(get_store_from_url, store_url),
        )

        df = read_table(factory=ds_factory)

    Parameters
    ----------
    dataset_uuid
        The unique identifier for the dataset.
    store_factory
        A callable which creates a KeyValueStore object
    load_schema
        Load the schema information immediately.
    load_all_indices
        Load all indices immediately.
    load_dataset_metadata
        Keep the user metadata in memory
    """
    self._cache_metadata: Optional[DatasetMetadata] = None
    self._cache_store = None

    self.store_factory = lazy_store(store_factory)
    self.dataset_uuid = dataset_uuid
    self.load_schema = load_schema
    self._ds_callable = None
    self.is_loaded = False
    self.load_dataset_metadata = load_dataset_metadata
    self.load_all_indices_flag = load_all_indices
def normalize_arg(arg_name, old_value):
    """
    Normalizes an argument according to pre-defined types.

    Type A:

    * "partition_on"
    * "delete_scope"
    * "secondary_indices"
    * "dispatch_by"

    will be converted to a list. If it is None, an empty list will be created.

    Type B:

    * "store"

    will be converted to a callable returning a KeyValueStore.
    """

    def _make_list(_args):
        if isinstance(_args, (str, bytes, int, float)):
            return [_args]
        if _args is None:
            return []
        if isinstance(_args, (set, frozenset, dict)):
            raise ValueError(
                "{} is incompatible for normalisation.".format(type(_args))
            )
        return list(_args)

    if arg_name in _NORMALIZE_ARGS_LIST:
        if old_value is None:
            return []
        elif isinstance(old_value, list):
            return old_value
        else:
            return _make_list(old_value)
    elif arg_name == "dispatch_by":
        if old_value is None:
            return old_value
        elif isinstance(old_value, list):
            return old_value
        else:
            return _make_list(old_value)
    elif arg_name == "store" and old_value is not None:
        return lazy_store(old_value)

    return old_value
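# A hedged behavioral sketch in the style of the tests above (not an existing
# test): the two normalization types described in the docstring. The store URL
# is illustrative.
def test_normalize_arg_examples():
    # Type A arguments become lists; None becomes an empty list.
    assert normalize_arg("partition_on", "col") == ["col"]
    assert normalize_arg("partition_on", None) == []
    # Type B: any store input becomes a lazy factory (a callable).
    store_fact = normalize_arg("store", "hfs://my_store")
    assert callable(store_fact)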
def _ensure_factory(
    dataset_uuid: Optional[str],
    store: Optional[StoreInput],
    factory: Optional[DatasetFactory],
    load_schema: bool = True,
) -> DatasetFactory:
    if store is None and dataset_uuid is None and factory is not None:
        return factory
    elif store is not None and dataset_uuid is not None and factory is None:
        return DatasetFactory(
            dataset_uuid=dataset_uuid,
            store_factory=lazy_store(store),
            load_schema=load_schema,
        )
    else:
        raise ValueError(
            "Need to supply either a `factory` or `dataset_uuid` and `store`"
        )
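# A short sketch (assumption, not library code) of the two valid call
# patterns for _ensure_factory; supplying both a factory and a
# dataset_uuid/store pair, or neither, raises ValueError:
#
#   factory = _ensure_factory(dataset_uuid=None, store=None, factory=ds_factory)
#   factory = _ensure_factory(
#       dataset_uuid="my_dataset", store="hfs://my_store", factory=None
#   )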
def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a dask.bag of dictionaries containing dataframes to
    a kartothek dataset in store.

    This is the dask.bag equivalent of
    :func:`~kartothek.io.dask.delayed.store_delayed_as_dataset`. See there
    for more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag.Bag
        A dask bag containing dictionaries of dataframes or dataframes.
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(
        parse_input_to_metapartition, metadata_version=metadata_version
    )
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

    return mps.reduction(perpartition=list, aggregate=aggregate, split_every=False)
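# A usage sketch (an assumption for illustration, names and store URL are
# illustrative): building a dask.bag of dataframes and storing it. As with
# the delayed variant, nothing is written until the result is computed.
def _example_store_bag():
    import dask.bag as db
    import pandas as pd

    bag = db.from_sequence(
        [pd.DataFrame({"x": [i]}) for i in range(3)], npartitions=3
    )
    graph = store_bag_as_dataset(
        bag=bag,
        store="hfs://my_store",
        dataset_uuid="bag_demo",
    )
    return graph.compute()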
def merge_datasets_as_delayed(
    left_dataset_uuid,
    right_dataset_uuid,
    store,
    merge_tasks,
    match_how="exact",
    label_merger=None,
    metadata_merger=None,
):
    """
    A dask.delayed graph to perform the merge of two full kartothek datasets.

    Parameters
    ----------
    left_dataset_uuid : str
        UUID for the left dataset (order does not matter in all merge schemas)
    right_dataset_uuid : str
        UUID for the right dataset (order does not matter in all merge schemas)
    match_how : Union[str, Callable]
        Define the partition label matching scheme. Available implementations
        are:

        * left (right) : The left (right) partitions are considered to be
          the base partitions and **all** partitions of the right (left)
          dataset are joined to the left partition. This should only be used
          if one of the datasets contains very few partitions.
        * prefix : The labels of the partitions of the dataset with fewer
          partitions are considered to be prefixes of the labels in the
          other dataset.
        * exact : All partition labels of the left dataset need to have an
          exact match in the right dataset.
        * callable : A callable with signature ``func(left, right)`` which
          returns a boolean to determine if the partitions match.
    merge_tasks : List[Dict]
        A list of merge tasks. Each item in this list is a dictionary giving
        explicit instructions for a specific merge. Each dict should contain
        the key/values:

        * ``left``: The table for the left dataframe
        * ``right``: The table for the right dataframe
        * ``output_label``: The table for the merged dataframe
        * ``merge_func``: A callable with signature
          ``merge_func(left_df, right_df, merge_kwargs)`` to handle the data
          preprocessing and merging. Defaults to ``pandas.merge``.
        * ``merge_kwargs``: The kwargs to be passed to the ``merge_func``

        Example:

        .. code::

            >>> merge_tasks = [
            ...     {
            ...         "left": "left_dict",
            ...         "right": "right_dict",
            ...         "merge_kwargs": {"kwargs of merge_func": ''},
            ...         "output_label": 'merged_core_data'
            ...     },
            ... ]

    """
    store = lazy_store(store)

    mps = align_datasets(
        left_dataset_uuid=left_dataset_uuid,
        right_dataset_uuid=right_dataset_uuid,
        store=store,
        match_how=match_how,
    )
    mps = map_delayed(
        _load_and_merge_mps,
        mps,
        store=store,
        label_merger=label_merger,
        metadata_merger=metadata_merger,
        merge_tasks=merge_tasks,
    )
    return list(mps)
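# A hedged call sketch (dataset names, store URL, and the join column are
# illustrative): merge the "table" tables of two datasets whose partition
# labels match exactly. The result is a list of delayed objects, one per
# matched partition pair; compute them with dask.compute(*delayed_mps).
def _example_merge_datasets():
    delayed_mps = merge_datasets_as_delayed(
        left_dataset_uuid="dataset_left",
        right_dataset_uuid="dataset_right",
        store="hfs://my_store",
        match_how="exact",
        merge_tasks=[
            {
                "left": "table",
                "right": "table",
                "output_label": "merged_table",
                "merge_kwargs": {"on": "key_column"},
            }
        ],
    )
    return delayed_mps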
def write_single_partition(
    store: Optional[KeyValueStore] = None,
    dataset_uuid: Optional[str] = None,
    data=None,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
    overwrite: bool = False,
    metadata_merger=None,
    metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    factory=None,
    secondary_indices=None,
):
    """
    Write the parquet file(s) for a single partition. This will **not**
    update the dataset header and can therefore be used for highly concurrent
    dataset writes.

    For datasets with explicit partitions, the dataset header can be updated
    by calling :func:`kartothek.io.eager.commit_dataset` with the output of
    this function.

    .. note::

        It is highly recommended to use the full pipelines whenever possible.
        This functionality should be used with caution and should only be
        necessary in cases where traditional pipeline scheduling is not an
        option.

    .. note::

        This function requires an existing dataset metadata file and the
        schemas for the tables to be present. Either ensure that the dataset
        always exists through some other means or use
        :func:`create_empty_dataset_header` at the start of your computation
        to ensure the basic dataset metadata is there.

    Parameters
    ----------
    data: Dict
        The input is defined according to
        :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`

    Returns
    -------
    An empty :class:`~kartothek.io_components.metapartition.MetaPartition`
    referencing the new files
    """
    if metadata is not None:
        warnings.warn(
            "The keyword `metadata` has no use and will be removed in the next major release.",
            DeprecationWarning,
        )

    if overwrite is not False:
        warnings.warn(
            "The keyword `overwrite` has no use and will be removed in the next major release.",
            DeprecationWarning,
        )

    if metadata_merger is not None:
        warnings.warn(
            "The keyword `metadata_merger` has no use and will be removed in the next major release.",
            DeprecationWarning,
        )

    if data is None:
        raise TypeError("The parameter `data` is not optional")

    _, ds_metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=lazy_store(store),
        ds_factory=factory,
        default_metadata_version=metadata_version,
        partition_on=partition_on,
    )

    mp = parse_input_to_metapartition(obj=data, metadata_version=ds_metadata_version)

    if partition_on:
        mp = mp.partition_on(partition_on)

    if secondary_indices:
        mp = mp.build_indices(columns=secondary_indices)

    mp = mp.validate_schema_compatible(dataset_uuid=dataset_uuid, store=store)

    mp = mp.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )
    return mp
def commit_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    new_partitions: Optional[Iterable[MetaPartition]] = None,
    output_dataset_uuid: Optional[str] = None,
    delete_scope: Optional[Iterable[Dict[str, Any]]] = None,
    metadata: Dict = None,
    df_serializer: DataFrameSerializer = None,
    metadata_merger: Callable[[List[Dict]], Dict] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[Iterable[str]] = None,
    factory: Optional[DatasetFactory] = None,
    secondary_indices: Optional[Iterable[str]] = None,
):
    """
    Commit new state to an existing dataset. This can be used for three
    distinct operations:

    1. Add previously written partitions to this dataset

       If for some reason the existing pipelines are not sufficient and you
       need more control, you can write the files outside of a kartothek
       pipeline and commit them whenever you choose to. This should be used
       in combination with
       :func:`~kartothek.io.eager.write_single_partition` and
       :func:`~kartothek.io.eager.create_empty_dataset_header`.

       .. code::

           import pandas as pd
           from kartothek.io.eager import write_single_partition, commit_dataset

           store = "hfs://my_store"

           # The partition writing can be done concurrently and distributed if wanted.
           # Only the information about what partitions have been written is
           # required for the commit.
           new_partitions = [
               write_single_partition(
                   store=store,
                   dataset_uuid='dataset_uuid',
                   data=pd.DataFrame({'column': [1, 2]}),
               )
           ]

           new_dataset = commit_dataset(
               store=store,
               dataset_uuid='dataset_uuid',
               new_partitions=new_partitions,
           )

    2. Simple delete of partitions

       If you want to remove some partitions, this is one of the simplest
       ways of doing so. Simply providing a delete_scope removes the
       references to these files in an atomic commit.

       .. code::

           commit_dataset(
               store=store,
               dataset_uuid='dataset_uuid',
               delete_scope=[
                   {
                       "partition_column": "part_value_to_be_removed"
                   }
               ],
           )

    3. Add additional metadata

       To add new metadata to an existing dataset:

       .. code::

           commit_dataset(
               store=store,
               dataset_uuid='dataset_uuid',
               metadata={"new": "user_metadata"},
           )

       .. note::

           If you do not want the new metadata to be merged with the
           existing one, provide a custom ``metadata_merger``.

    Parameters
    ----------
    new_partitions:
        Input partition to be committed.
    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release.",
            DeprecationWarning,
        )

    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not new_partitions and not metadata and not delete_scope:
        raise ValueError(
            "Need to provide either new data, new metadata or a delete scope. None of it was provided."
        )

    store = lazy_store(store)

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(new_partitions, metadata_version=metadata_version)

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps_list = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps_list,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
def test_lazy_store_accepts_decorated_store():
    store = get_store_from_url("memory://")
    pstore = PrefixDecorator("pre", store)
    assert lazy_store(pstore)() is pstore
def test_lazy_store_returns_same_store():
    store = get_store_from_url("memory://")
    assert lazy_store(lambda: store)() is store
def store_factory2(tmpdir):
    path = tmpdir.join("store2").strpath
    url = "hfs://{}".format(path)
    return lazy_store(url)
def store_session_factory(tmpdir_factory):
    path = tmpdir_factory.mktemp("fsstore_test")
    path = path.realpath()
    url = "hfs://{}".format(path)
    return lazy_store(url)