Example 1
    def __init__(
        self,
        dataset_uuid: str,
        store_factory: Callable[[], "KeyValueStore"],
        load_schema: bool = True,
        load_all_indices: bool = False,
        load_dataset_metadata: bool = True,
    ) -> None:
        """
        A dataset factory object which can be used to cache dataset load
        operations. This class should be the primary entry point for users
        when reading datasets.

        Example using the eager backend:

        .. code::

            from functools import partial
            from storefact import get_store_from_url
            from kartothek.io.eager import read_table

            ds_factory = DatasetFactory(
                dataset_uuid="my_test_dataset",
                store_factory=partial(get_store_from_url, store_url)
            )

            df = read_table(factory=ds_factory)

        Parameters
        ----------
        dataset_uuid: str
            The unique identifier for the dataset.
        store_factory: callable
            A callable which creates a KeyValueStore object.
        load_schema: bool
            Load the schema information immediately.
        load_all_indices: bool
            Load all indices immediately.
        load_dataset_metadata: bool
            Keep the user metadata in memory.
        """
        self._cache_metadata: Optional[DatasetMetadata] = None
        self._cache_store = None

        _check_callable(store_factory)
        self.store_factory = store_factory
        self.dataset_uuid = dataset_uuid
        self.load_schema = load_schema
        self._ds_callable = None
        self.is_loaded = False
        self.load_dataset_metadata = load_dataset_metadata
        self.load_all_indices_flag = load_all_indices
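
A minimal follow-up sketch, not taken from the library documentation: it reuses one factory for two eager reads so that dataset load operations can be cached across calls, as the class docstring describes. The store URL is an assumption, and the import path for DatasetFactory is assumed from the upstream kartothek layout.

from functools import partial

from storefact import get_store_from_url
from kartothek.core.factory import DatasetFactory  # import path assumed
from kartothek.io.eager import read_table

store_url = "hfs:///tmp/ktk_data"  # assumption: a local storefact URL

ds_factory = DatasetFactory(
    dataset_uuid="my_test_dataset",
    store_factory=partial(get_store_from_url, store_url),
)

# Both calls share the same factory, so the second read can reuse the cached
# schema/metadata instead of loading it from the backing store again.
df_first = read_table(factory=ds_factory)
df_second = read_table(factory=ds_factory)
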
Example 2
def store_delayed_as_dataset(
    delayed_tasks,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a list of dask.delayed tasks, each containing
    dataframes or dictionaries of dataframes, as a kartothek dataset in the
    given store.

    Parameters
    ----------
    delayed_tasks: list of dask.delayed
        Every delayed object represents a partition and should be accepted by
        :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`.

    Returns
    -------
    A dask.delayed dataset object.
    """
    _check_callable(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    input_to_mps = partial(parse_input_to_metapartition,
                           metadata_version=metadata_version)
    mps = map_delayed(delayed_tasks, input_to_mps)

    if partition_on:
        mps = map_delayed(mps,
                          MetaPartition.partition_on,
                          partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(mps,
                          MetaPartition.build_indices,
                          columns=secondary_indices)

    mps = map_delayed(
        mps,
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
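
A minimal usage sketch, not from the library docs: the store URL and dataset UUID are assumptions, plain DataFrames are assumed to be one of the input forms accepted by parse_input_to_metapartition, and the import path is assumed from the upstream kartothek layout.

from functools import partial

import dask
import pandas as pd
from storefact import get_store_from_url
from kartothek.io.dask.delayed import store_delayed_as_dataset  # path assumed

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_data")  # assumed URL

# One delayed task per partition; each task yields a plain DataFrame, which is
# assumed to be accepted by parse_input_to_metapartition.
delayed_tasks = [
    dask.delayed(pd.DataFrame)({"part": [i] * 3, "value": list(range(3))})
    for i in range(2)
]

graph = store_delayed_as_dataset(
    delayed_tasks,
    store=store_factory,
    dataset_uuid="delayed_demo",  # assumed UUID
    partition_on=["part"],
)
dataset_metadata = graph.compute()  # nothing is written until this point
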
Example 3
def merge_datasets_as_delayed(
    left_dataset_uuid,
    right_dataset_uuid,
    store,
    merge_tasks,
    match_how="exact",
    label_merger=None,
    metadata_merger=None,
):
    """
    A dask.delayed graph to perform the merge of two full kartothek datasets.

    Parameters
    ----------
    left_dataset_uuid : str
        UUID for left dataset (order does not matter in all merge schemas)
    right_dataset_uuid : str
        UUID for right dataset (order does not matter in all merge schemas)
    match_how : Union[str, Callable]
        Define the partition label matching scheme.
        Available implementations are:

        * left (right) : The left (right) partitions are considered to be
          the base partitions and **all** partitions of the right (left)
          dataset are joined to the left partition. This should only be
          used if one of the datasets contains very few partitions.
        * prefix : The labels of the partitions of the dataset with fewer
          partitions are considered to be the prefixes to the right dataset.
        * exact : All partition labels of the left dataset need to have
          an exact match in the right dataset.
        * callable : A callable with signature ``func(left, right)`` which
          returns a boolean to determine whether the partitions match.

    merge_tasks : List[Dict]
        A list of merge tasks. Each item in this list is a dictionary giving
        explicit instructions for a specific merge.
        Each dict should contain the following key/values:

        * `left`: The table for the left dataframe
        * `right`: The table for the right dataframe
        * `output_label`: The table for the merged dataframe
        * `merge_func`: A callable with signature
          `merge_func(left_df, right_df, merge_kwargs)` to handle the data
          preprocessing and merging. Defaults to `pandas.merge`.
        * `merge_kwargs`: The kwargs to be passed to the `merge_func`

        Example:

        .. code::

            >>> merge_tasks = [
            ...     {
            ...         "left": "left_dict",
            ...         "right": "right_dict",
            ...         "merge_kwargs": {"kwargs of merge_func": ""},
            ...         "output_label": "merged_core_data",
            ...     },
            ... ]

    """
    _check_callable(store)

    mps = align_datasets(
        left_dataset_uuid=left_dataset_uuid,
        right_dataset_uuid=right_dataset_uuid,
        store=store,
        match_how=match_how,
    )
    mps = map_delayed(
        mps,
        _load_and_merge_mps,
        store=store,
        label_merger=label_merger,
        metadata_merger=metadata_merger,
        merge_tasks=merge_tasks,
    )

    return mps
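
A minimal sketch under assumed names: both datasets are assumed to already exist in the store, and the UUIDs, the table name "table", the join column, and the store URL are hypothetical. The import path is assumed from the upstream kartothek layout; merge_func is left at its default (pandas.merge).

from functools import partial

import dask
from storefact import get_store_from_url
from kartothek.io.dask.delayed import merge_datasets_as_delayed  # path assumed

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_data")  # assumed URL

merge_tasks = [
    {
        "left": "table",                        # table in the left dataset (assumed name)
        "right": "table",                       # table in the right dataset (assumed name)
        "output_label": "merged_core_data",
        "merge_kwargs": {"on": "primary_key"},  # assumed join column
    }
]

delayed_mps = merge_datasets_as_delayed(
    left_dataset_uuid="left_dataset",    # assumed UUID
    right_dataset_uuid="right_dataset",  # assumed UUID
    store=store_factory,
    merge_tasks=merge_tasks,
    match_how="exact",
    # a callable matcher is also possible, e.g.
    # match_how=lambda left, right: left.split("/")[0] == right.split("/")[0],
)
merged_partitions = dask.compute(*delayed_mps)  # materializes the merged partitions
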
Example 4
def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a dask.bag of dictionaries containing
    dataframes as a kartothek dataset in the given store.

    This is the dask.bag-equivalent of
    :func:`store_delayed_as_dataset`. See there
    for more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag
        A dask.bag containing dictionaries of dataframes or plain dataframes.

    Returns
    -------
    A dask.bag.Item dataset object.
    """
    _check_callable(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    input_to_mps = partial(
        parse_input_to_metapartition, metadata_version=metadata_version
    )
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

    return mps.reduction(perpartition=list, aggregate=aggregate, split_every=False)
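
A minimal sketch mirroring the delayed example above: the store URL, dataset UUID, and DataFrame contents are assumptions, plain DataFrames are assumed to be an accepted input form of parse_input_to_metapartition, and the import path is assumed from the upstream kartothek layout.

from functools import partial

import dask.bag as db
import pandas as pd
from storefact import get_store_from_url
from kartothek.io.dask.bag import store_bag_as_dataset  # path assumed

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_data")  # assumed URL

# One DataFrame per bag element, distributed over two bag partitions.
bag = db.from_sequence(
    [pd.DataFrame({"part": [i] * 3, "value": list(range(3))}) for i in range(2)],
    npartitions=2,
)

item = store_bag_as_dataset(
    bag,
    store=store_factory,
    dataset_uuid="bag_demo",       # assumed UUID
    partition_on=["part"],
    secondary_indices=["value"],
)
dataset_metadata = item.compute()  # executes the deferred write
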