Code example #1
def assert_target_keys(src_store, src_uuid, tgt_store, tgt_uuid):
    """
    Check that the expected keys exist in the target data set, and the corresponding
    values are equal to the source data set (or modified as expected)
    """
    df_source = DatasetFactory(
        dataset_uuid=src_uuid, store_factory=lazy_store(src_store),
    )
    src_keys = get_dataset_keys(df_source.dataset_metadata)
    df_target = DatasetFactory(
        dataset_uuid=tgt_uuid, store_factory=lazy_store(tgt_store),
    )
    tgt_keys = get_dataset_keys(df_target.dataset_metadata)

    for src_key in src_keys:
        # check for each source key if the corresponding target key exists
        tgt_key = src_key.replace(src_uuid, tgt_uuid)
        assert tgt_key in tgt_keys

        # check if the files for source and target key are equal (exception:
        # metadata => here the target must contain the modified metadata)
        b1 = src_store.get(src_key)
        b2 = tgt_store.get(tgt_key)

        if tgt_key.endswith("by-dataset-metadata.json"):
            b1_mod = b1.decode("utf-8").replace(src_uuid, tgt_uuid).encode("utf-8")
            assert b1_mod == b2
        else:
            assert b1 == b2
Code example #2
File: test_utils.py  Project: xhochy/kartothek
def test_ensure_store_fact(store_input_types):
    store_fact = lazy_store(store_input_types)
    assert callable(store_fact)
    store = store_fact()
    assert isinstance(store, KeyValueStore)
    value = b"value"
    key = "key"
    store.put(key, value)
    assert value == store.get(key)

    assert store_fact is lazy_store(store_fact)
Code example #3
File: delayed.py  Project: mattsu2020/kartothek
def store_delayed_as_dataset(
    delayed_tasks: List[Delayed],
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    table_name: str = SINGLE_TABLE,
    secondary_indices=None,
) -> Delayed:
    """
    Transform and store a list of dictionaries containing
    dataframes to a kartothek dataset in store.

    Parameters
    ----------
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(
        parse_input_to_metapartition,
        metadata_version=metadata_version,
        table_name=table_name,
    )
    mps = map_delayed(input_to_mps, delayed_tasks)

    if partition_on:
        mps = map_delayed(MetaPartition.partition_on, mps, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(MetaPartition.build_indices, mps, columns=secondary_indices)

    mps = map_delayed(
        MetaPartition.store_dataframes,
        mps,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
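The function returns a single Delayed object; computing it writes the partitions and the dataset header. Below is a minimal usage sketch, not the project's canonical example: it assumes (as the table_name parameter suggests) that each delayed task may produce a plain dataframe, and uses a hypothetical local store URL and dataset UUID.

import dask
import pandas as pd
from kartothek.io.dask.delayed import store_delayed_as_dataset

# Hypothetical input: one delayed task per partition, each producing a dataframe.
delayed_tasks = [
    dask.delayed(pd.DataFrame)({"column": [1, 2]}),
    dask.delayed(pd.DataFrame)({"column": [3, 4]}),
]

graph = store_delayed_as_dataset(
    delayed_tasks,
    store="hfs:///tmp/kartothek_store",  # any input accepted by lazy_store
    dataset_uuid="delayed_example",
)
dataset_metadata = graph.compute()  # executes the graph and stores the dataset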
Code example #4
File: eager.py  Project: nefta-kanilmaz-by/kartothek
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions. This may be used in combination
    with :func:`~kartothek.io.eager.write_single_partition` to create implicitly partitioned datasets.

    .. note::

        The created dataset will **always** have explicit_partitions==False

    .. warning::

        This function should only be used in very rare occasions. Usually you're better off using
        full end-to-end pipelines.

    Parameters
    ----------
    """
    store = lazy_store(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema,
                                      origin=table,
                                      partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    return dataset_builder.to_dataset()
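A minimal usage sketch follows; the store URL, dataset UUID, and schema dataframe are placeholders. Since the function applies make_meta to each table_meta entry itself, an example dataframe with the intended column types can serve as the schema.

import pandas as pd
from kartothek.io.eager import create_empty_dataset_header

# Hypothetical schema source: an empty dataframe with the intended column types.
schema_df = pd.DataFrame({"column": pd.Series([], dtype="int64")})

dataset = create_empty_dataset_header(
    store="hfs:///tmp/kartothek_store",  # any input accepted by lazy_store
    dataset_uuid="empty_header_example",
    table_meta={"table": schema_df},
)
# Partitions can now be written concurrently with write_single_partition and
# registered later via commit_dataset (see the examples further below).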
Code example #5
File: factory.py  Project: JDASoftwareGroup/kartothek
    def __init__(
        self,
        dataset_uuid: str,
        store_factory: StoreInput,
        load_schema: bool = True,
        load_all_indices: bool = False,
        load_dataset_metadata: bool = True,
    ) -> None:
        """
        A dataset factory object which can be used to cache dataset load operations. This class should be the primary user entry point when
        reading datasets.

        Example using the eager backend:

        .. code::

            from functools import partial
            from storefact import get_store_from_url
            from kartothek.io.eager import read_table

            ds_factory = DatasetFactory(
                dataset_uuid="my_test_dataset",
                store_factory=partial(get_store_from_url, store_url)
            )

            df = read_table(factory=ds_factory)

        Parameters
        ----------
        dataset_uuid
            The unique identifier for the dataset.
        store_factory
            A callable which creates a KeyValueStore object
        load_schema
            Load the schema information immediately.
        load_all_indices
            Load all indices immediately.
        load_dataset_metadata
            Keep the user metadata in memory
        """
        self._cache_metadata: Optional[DatasetMetadata] = None
        self._cache_store = None

        self.store_factory = lazy_store(store_factory)
        self.dataset_uuid = dataset_uuid
        self.load_schema = load_schema
        self._ds_callable = None
        self.is_loaded = False
        self.load_dataset_metadata = load_dataset_metadata
        self.load_all_indices_flag = load_all_indices
Code example #6
File: utils.py  Project: xhochy/kartothek
def normalize_arg(arg_name, old_value):
    """
    Normalizes an argument according to pre-defined types

    Type A:

    * "partition_on"
    * "delete_scope"
    * "secondary_indices"
    * "dispatch_by"

    will be converted to a list. If it is None, an empty list will be created

    Type B:
    * "store"

    Will be converted to a store factory, i.e. a callable returning a KeyValueStore
    """

    def _make_list(_args):
        if isinstance(_args, (str, bytes, int, float)):
            return [_args]
        if _args is None:
            return []
        if isinstance(_args, (set, frozenset, dict)):
            raise ValueError(
                "{} is incompatible for normalisation.".format(type(_args))
            )
        return list(_args)

    if arg_name in _NORMALIZE_ARGS_LIST:
        if old_value is None:
            return []
        elif isinstance(old_value, list):
            return old_value
        else:
            return _make_list(old_value)
    elif arg_name == "dispatch_by":
        if old_value is None:
            return old_value
        elif isinstance(old_value, list):
            return old_value
        else:
            return _make_list(old_value)
    elif arg_name == "store" and old_value is not None:
        return lazy_store(old_value)

    return old_value
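For reference, the following sketch illustrates the branches above. The import location and the assumption that "partition_on" is a member of _NORMALIZE_ARGS_LIST (as the Type A list suggests) are assumptions, not taken from this listing.

# Illustrative behaviour of normalize_arg, following the branches above
# (import location assumed; "partition_on" assumed to be in _NORMALIZE_ARGS_LIST):
from kartothek.io_components.utils import normalize_arg

assert normalize_arg("partition_on", None) == []
assert normalize_arg("partition_on", "colA") == ["colA"]
assert normalize_arg("dispatch_by", None) is None           # kept as-is
store_factory = normalize_arg("store", "hfs:///tmp/store")  # becomes a store factory
assert callable(store_factory)
assert normalize_arg("other_keyword", 42) == 42             # unknown names pass through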
Code example #7
def _ensure_factory(
    dataset_uuid: Optional[str],
    store: Optional[StoreInput],
    factory: Optional[DatasetFactory],
    load_schema: bool = True,
) -> DatasetFactory:

    if store is None and dataset_uuid is None and factory is not None:
        return factory
    elif store is not None and dataset_uuid is not None and factory is None:
        return DatasetFactory(
            dataset_uuid=dataset_uuid,
            store_factory=lazy_store(store),
            load_schema=load_schema,
        )

    else:
        raise ValueError(
            "Need to supply either a `factory` or `dataset_uuid` and `store`")
Code example #8
def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a dask.bag of dictionaries containing
    dataframes to a kartothek dataset in store.

    This is the dask.bag-equivalent of
    :func:`~kartothek.io.dask.delayed.store_delayed_as_dataset`. See there
    for more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag.Bag
        A dask bag containing dictionaries of dataframes or dataframes.

    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(parse_input_to_metapartition,
                           metadata_version=metadata_version)
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

    return mps.reduction(perpartition=list,
                         aggregate=aggregate,
                         split_every=False)
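A minimal usage sketch, assuming the bag yields plain dataframes accepted by parse_input_to_metapartition (otherwise the items would take the dictionary form described for store_delayed_as_dataset); the store URL and dataset UUID are hypothetical.

import dask.bag as db
import pandas as pd
from kartothek.io.dask.bag import store_bag_as_dataset

# Hypothetical input: one dataframe per bag partition.
bag = db.from_sequence(
    [pd.DataFrame({"column": [1, 2]}), pd.DataFrame({"column": [3, 4]})],
    npartitions=2,
)

graph = store_bag_as_dataset(
    bag,
    store="hfs:///tmp/kartothek_store",  # any input accepted by lazy_store
    dataset_uuid="bag_example",
)
dataset_metadata = graph.compute()  # a dask.bag.Item; compute() stores the dataset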
Code example #9
def merge_datasets_as_delayed(
    left_dataset_uuid,
    right_dataset_uuid,
    store,
    merge_tasks,
    match_how="exact",
    label_merger=None,
    metadata_merger=None,
):
    """
    A dask.delayed graph to perform the merge of two full kartothek datasets.

    Parameters
    ----------
    left_dataset_uuid : str
        UUID for left dataset (order does not matter in all merge schemas)
    right_dataset_uuid : str
        UUID for right dataset (order does not matter in all merge schemas)
    match_how : Union[str, Callable]
        Define the partition label matching scheme.
        Available implementations are:

        * left (right) : The left (right) partitions are considered to be the
          base partitions and **all** partitions of the right (left) dataset
          are joined to the left partition. This should only be used if one
          of the datasets contains very few partitions.
        * prefix : The labels of the partitions of the dataset with fewer
          partitions are considered to be the prefixes to the right dataset.
        * exact : All partition labels of the left dataset need to have an
          exact match in the right dataset.
        * callable : A callable with signature func(left, right) which
          returns a boolean to determine if the partitions match.

        With ``exact``, an exact match of partition labels between the
        to-be-merged datasets is required in order to merge. With ``prefix``,
        the partition labels of the dataset with fewer partitions are
        interpreted as prefixes.
    merge_tasks : List[Dict]
        A list of merge tasks. Each item in this list is a dictionary giving
        explicit instructions for a specific merge.
        Each dict should contain the following key/value pairs:

        * ``left``: The table for the left dataframe
        * ``right``: The table for the right dataframe
        * ``output_label``: The table for the merged dataframe
        * ``merge_func``: A callable with signature
          ``merge_func(left_df, right_df, merge_kwargs)`` to handle the data
          preprocessing and merging. Defaults to ``pandas.merge``.
        * ``merge_kwargs``: The kwargs to be passed to the ``merge_func``

        Example:

        .. code::

            >>> merge_tasks = [
            ...     {
            ...         "left": "left_dict",
            ...         "right": "right_dict",
            ...         "merge_kwargs": {"kwargs of merge_func": ''},
            ...         "output_label": 'merged_core_data'
            ...     },
            ... ]

    """
    store = lazy_store(store)

    mps = align_datasets(
        left_dataset_uuid=left_dataset_uuid,
        right_dataset_uuid=right_dataset_uuid,
        store=store,
        match_how=match_how,
    )
    mps = map_delayed(
        _load_and_merge_mps,
        mps,
        store=store,
        label_merger=label_merger,
        metadata_merger=metadata_merger,
        merge_tasks=merge_tasks,
    )

    return list(mps)
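Building on the merge_tasks example in the docstring, here is a call sketch with hypothetical dataset UUIDs, column name, and store URL; the import path is assumed from the file name delayed.py shown above.

import dask
from kartothek.io.dask.delayed import merge_datasets_as_delayed

# Hypothetical call: join the "left_dict" and "right_dict" tables of two
# existing datasets on "key_column" using the default merge_func (pandas.merge).
tasks = merge_datasets_as_delayed(
    left_dataset_uuid="dataset_left",
    right_dataset_uuid="dataset_right",
    store="hfs:///tmp/kartothek_store",  # any input accepted by lazy_store
    merge_tasks=[
        {
            "left": "left_dict",
            "right": "right_dict",
            "merge_kwargs": {"on": "key_column"},
            "output_label": "merged_core_data",
        }
    ],
    match_how="exact",
)
merged_mps = dask.compute(*tasks)  # tuple of merged MetaPartitions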
Code example #10
File: eager.py  Project: nefta-kanilmaz-by/kartothek
def write_single_partition(
    store: Optional[KeyValueStore] = None,
    dataset_uuid: Optional[str] = None,
    data=None,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
    overwrite: bool = False,
    metadata_merger=None,
    metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    factory=None,
    secondary_indices=None,
):
    """
    Write the parquet file(s) for a single partition. This will **not** update the dataset header and can therefore
    be used for highly concurrent dataset writes.

    For datasets with explicit partitions, the dataset header can be updated by calling
    :func:`kartothek.io.eager.commit_dataset` with the output of this function.

    .. note::

        It is highly recommended to use the full pipelines whenever possible. This functionality should be
        used with caution and should only be necessary in cases where traditional pipeline scheduling is not an
        option.

    .. note::

        This function requires an existing dataset metadata file and the schemas for the tables to be present.
        Either you have ensured that the dataset always exists through some other means or use
        :func:`create_empty_dataset_header` at the start of your computation to ensure the basic dataset
        metadata is there.

    Parameters
    ----------
    data: Dict
        The input is defined according to :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`

    Returns
    -------
    An empty :class:`~kartothek.io_components.metapartition.MetaPartition` referencing the new files
    """
    if metadata is not None:
        warnings.warn(
            "The keyword `metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if overwrite is not False:
        warnings.warn(
            "The keyword `overwrite` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if metadata_merger is not None:
        warnings.warn(
            "The keyword `metadata_merger` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if data is None:
        raise TypeError("The parameter `data` is not optional")
    _, ds_metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=lazy_store(store),
        ds_factory=factory,
        default_metadata_version=metadata_version,
        partition_on=partition_on,
    )

    mp = parse_input_to_metapartition(obj=data,
                                      metadata_version=ds_metadata_version)
    if partition_on:
        mp = mp.partition_on(partition_on)

    if secondary_indices:
        mp = mp.build_indices(columns=secondary_indices)

    mp = mp.validate_schema_compatible(dataset_uuid=dataset_uuid, store=store)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)
    return mp
Code example #11
File: eager.py  Project: nefta-kanilmaz-by/kartothek
def commit_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    new_partitions: Optional[Iterable[MetaPartition]] = None,
    output_dataset_uuid: Optional[str] = None,
    delete_scope: Optional[Iterable[Dict[str, Any]]] = None,
    metadata: Dict = None,
    df_serializer: DataFrameSerializer = None,
    metadata_merger: Callable[[List[Dict]], Dict] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[Iterable[str]] = None,
    factory: Optional[DatasetFactory] = None,
    secondary_indices: Optional[Iterable[str]] = None,
):
    """
    Commit new state to an existing dataset. This can be used for three distinct operations

    1. Add previously written partitions to this dataset

        If, for some reason, the existing pipelines are not sufficient and you need more control, you can write the files outside of a kartothek pipeline and commit them whenever you choose to.

        This should be used in combination with
        :func:`~kartothek.io.eager.write_single_partition` and :func:`~kartothek.io.eager.create_empty_dataset_header`.

        .. code::

            import pandas as pd
            from kartothek.io.eager import write_single_partition, commit_dataset

            store = "hfs://my_store"

            # The partition writing can be done concurrently and distributed if wanted.
            # Only the information about what partitions have been written is required for the commit.
            new_partitions = [
                write_single_partition(
                    store=store,
                    dataset_uuid='dataset_uuid',
                    data=pd.DataFrame({'column': [1, 2]}),
                )
            ]

            new_dataset = commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                new_partitions=new_partitions,
            )

    2. Simple delete of partitions

        If you want to remove some partitions, this is one of the simplest ways of doing so. Simply providing a ``delete_scope`` removes the references to these files in an atomic commit.

        .. code::

            commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                delete_scope=[
                    {
                        "partition_column": "part_value_to_be_removed"
                    }
                ],
            )

    3. Add additional metadata

        To add new metadata to an existing dataset

        .. code::

            commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                metadata={"new": "user_metadata"},
            )

        .. note::

            If you do not want the new metadata to be merged with the existing one, provide a custom ``metadata_merger``.

    Parameters
    ----------
    new_partitions:
        Input partition to be committed.

    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not new_partitions and not metadata and not delete_scope:
        raise ValueError(
            "Need to provide either new data, new metadata or a delete scope. None of it was provided."
        )
    store = lazy_store(store)
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(new_partitions,
                                       metadata_version=metadata_version)

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps_list = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps_list,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
Code example #12
File: test_utils.py  Project: xhochy/kartothek
def test_lazy_store_accepts_decorated_store():
    store = get_store_from_url("memory://")
    pstore = PrefixDecorator("pre", store)
    assert lazy_store(pstore)() is pstore
Code example #13
File: test_utils.py  Project: xhochy/kartothek
def test_lazy_store_returns_same_store():
    store = get_store_from_url("memory://")
    assert lazy_store(lambda: store)() is store
Code example #14
def store_factory2(tmpdir):
    path = tmpdir.join("store2").strpath
    url = "hfs://{}".format(path)
    return lazy_store(url)
Code example #15
def store_session_factory(tmpdir_factory):
    path = tmpdir_factory.mktemp("fsstore_test")
    path = path.realpath()
    url = "hfs://{}".format(path)
    return lazy_store(url)
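Taken together, the fixtures and tests above cover the three kinds of input that lazy_store turns into a store factory. The recap below is a sketch mirroring those tests; the local path is hypothetical and the import location of lazy_store is assumed to be kartothek.core.utils.

from storefact import get_store_from_url
from kartothek.core.utils import lazy_store  # assumed import location

# 1. A store URL string: lazy_store returns a factory that opens the store on call.
url_factory = lazy_store("hfs:///tmp/kartothek_store")
assert callable(url_factory)

# 2. An already instantiated store object: the returned factory hands back a usable store.
store = get_store_from_url("memory://")
kv = lazy_store(store)()
kv.put("key", b"value")
assert kv.get("key") == b"value"

# 3. A callable returning a store: accepted as a factory; calling it yields that store.
assert lazy_store(lambda: store)() is store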