Example #1
def garbage_collect_dataset(dataset_uuid=None, store=None, factory=None):
    """
    Remove auxiliary files that are no longer tracked by the dataset.

    These files include indices that are no longer referenced by the metadata
    as well as files in the directories of the tables that are no longer
    referenced. The latter is only applied to static datasets.

    Parameters
    ----------
    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    nested_files = dispatch_files_to_gc(dataset_uuid=None,
                                        store_factory=None,
                                        chunk_size=None,
                                        factory=ds_factory)

    # Given that `nested_files` is a generator with a single element, just
    # return the output of `delete_files` on that element.
    return delete_files(next(nested_files),
                        store_factory=ds_factory.store_factory)
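A minimal usage sketch for the eager variant above, assuming the upstream kartothek layout where this function lives in kartothek.io.eager; the store URL and dataset UUID are placeholders and the store is passed as a factory callable:

from functools import partial

import storefact
from kartothek.io.eager import garbage_collect_dataset

# Placeholder store factory; any storefact-compatible URL works here.
store_factory = partial(storefact.get_store_from_url, "s3://bucket_with_dataset")
garbage_collect_dataset(dataset_uuid="dataset_uuid", store=store_factory)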
Example #2
def validate_partition_keys(
    dataset_uuid,
    store,
    ds_factory,
    default_metadata_version,
    partition_on,
    load_dataset_metadata=True,
):
    if ds_factory or DatasetMetadata.exists(dataset_uuid, ensure_store(store)):
        ds_factory = _ensure_factory(
            dataset_uuid=dataset_uuid,
            store=store,
            factory=ds_factory,
            load_dataset_metadata=load_dataset_metadata,
        )

        ds_metadata_version = ds_factory.metadata_version
        if partition_on:
            if not isinstance(partition_on, list):
                partition_on = [partition_on]
            if partition_on != ds_factory.partition_keys:
                raise ValueError(
                    "Incompatible set of partition keys encountered. "
                    "Input partitioning was `{}` while actual dataset was `{}`".format(
                        partition_on, ds_factory.partition_keys
                    )
                )
        else:
            partition_on = ds_factory.partition_keys
    else:
        ds_factory = None
        ds_metadata_version = default_metadata_version
    return ds_factory, ds_metadata_version, partition_on
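To make the compatibility check above concrete: a scalar `partition_on` is wrapped in a list and compared against the keys the stored dataset was written with. A standalone sketch of that branch (plain Python, no kartothek needed; `existing_keys` stands in for `ds_factory.partition_keys`):

partition_on = "date"                # user input, may be a scalar
existing_keys = ["date", "country"]  # stand-in for ds_factory.partition_keys

# Normalize to a list, then require an exact match with the existing keys.
if not isinstance(partition_on, list):
    partition_on = [partition_on]
if partition_on != existing_keys:
    raise ValueError(
        "Incompatible set of partition keys encountered. "
        "Input partitioning was `{}` while actual dataset was `{}`".format(
            partition_on, existing_keys
        )
    )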
Example #3
def delete_dataset(dataset_uuid=None, store=None, factory=None):
    """
    Delete a dataset from the store, including its indices, partition files,
    common metadata and top level metadata.

    Parameters
    ----------
    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        load_schema=False,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )

    # Remove possibly unreferenced files
    garbage_collect_dataset(factory=ds_factory)

    # Delete indices first since they do not affect dataset integrity
    delete_indices(dataset_factory=ds_factory)

    for metapartition in dispatch_metapartitions_from_factory(ds_factory):
        metapartition = cast(MetaPartition, metapartition)
        metapartition.delete_from_store(dataset_uuid=dataset_uuid, store=store)

    # delete common metadata after partitions
    delete_common_metadata(dataset_factory=ds_factory)

    # Delete the top level metadata file
    delete_top_level_metadata(dataset_factory=ds_factory)
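A usage sketch, assuming the upstream layout where this is kartothek.io.eager.delete_dataset; the UUID and store URL are placeholders:

from functools import partial

import storefact
from kartothek.io.eager import delete_dataset

store_factory = partial(storefact.get_store_from_url, "s3://bucket_with_dataset")  # placeholder
delete_dataset(dataset_uuid="dataset_uuid", store=store_factory)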
Example #4
def store_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: StoreInput,
    dataset_uuid: str,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    delete_scope: Optional[Iterable[Mapping[str, str]]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
    overwrite: bool = False,
):
    """
    Store a dataset from a dask.dataframe.
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    sort_partitions_by = normalize_arg("sort_partitions_by", sort_partitions_by)
    bucket_by = normalize_arg("bucket_by", bucket_by)
    store = normalize_arg("store", store)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=None, load_dataset_metadata=True
    )

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)
    mps = _write_dataframe_partitions(
        ddf=ddf,
        store=store,
        dataset_uuid=dataset_uuid,
        table=table,
        secondary_indices=secondary_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=partition_on,
        bucket_by=bucket_by,
    )
    return dask.delayed(store_dataset_from_partitions)(
        mps,
        store=ds_factory.store_factory if ds_factory else store,
        dataset_uuid=ds_factory.dataset_uuid if ds_factory else dataset_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
    )
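A sketch of writing a dask dataframe with the function above, assuming the upstream location kartothek.io.dask.dataframe.store_dataset_from_ddf; the store URL, dataset UUID and column names are placeholders, and the returned delayed object has to be computed to trigger the write:

from functools import partial

import dask.dataframe as dd
import pandas as pd
import storefact
from kartothek.io.dask.dataframe import store_dataset_from_ddf

ddf = dd.from_pandas(pd.DataFrame({"P": [1, 1, 2, 2], "X": range(4)}), npartitions=2)
store_factory = partial(storefact.get_store_from_url, "s3://bucket_with_dataset")  # placeholder
graph = store_dataset_from_ddf(
    ddf, store=store_factory, dataset_uuid="dataset_uuid", partition_on=["P"]
)
graph.compute()  # executes the write and returns the stored dataset metadata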
Example #5
def hash_dataset(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    subset=None,
    group_key=None,
    table: str = SINGLE_TABLE,
    predicates: Optional[PredicatesType] = None,
    factory: Optional[DatasetFactory] = None,
) -> dd.Series:
    """
    Calculate a partition wise, or group wise, hash of the dataset.

    .. note::

        We do not guarantee the hash values to remain constant across versions.


    Example output::

        Assuming a dataset with two unique values in column `P`, this gives

        >>> hash_dataset(factory=dataset_with_index_factory, group_key=["P"]).compute()
        ... P
        ... 1    11462879952839863487
        ... 2    12568779102514529673
        ... dtype: uint64

    Parameters
    ----------
    subset
        If provided, only take these columns into account when hashing the dataset
    group_key
        If provided, calculate hash per group instead of per partition
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    columns = subset
    if subset and group_key:
        columns = sorted(set(subset) | set(group_key))
    ddf = read_dataset_as_ddf(
        table=table,
        predicates=predicates,
        factory=dataset_factory,
        columns=columns,
        dates_as_object=True,
    )
    if not group_key:
        return ddf.map_partitions(_hash_partition, meta="uint64").astype("uint64")
    else:
        ddf2 = pack_payload(ddf, group_key=group_key)
        return (
            ddf2.groupby(group_key)
            .apply(_unpack_hash, unpack_meta=ddf._meta, subset=subset, meta="uint64")
            .astype("uint64")
        )
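A sketch of hashing an existing dataset per partition, assuming the upstream location kartothek.io.dask.dataframe.hash_dataset; store URL and UUID are placeholders:

from functools import partial

import storefact
from kartothek.io.dask.dataframe import hash_dataset

store_factory = partial(storefact.get_store_from_url, "s3://bucket_with_dataset")  # placeholder
hashes = hash_dataset(store=store_factory, dataset_uuid="dataset_uuid").compute()
# One uint64 value per partition; pass group_key=["P"] for the grouped variant
# shown in the docstring example above.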
Example #6
def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions themselves are not mutated.

    Parameters
    ----------
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    new_partitions = []
    for mp in read_dataset_as_metapartitions__iterator(factory=ds_factory):
        mp = mp.build_indices(columns=columns)
        mp = mp.remove_dataframes()  # Remove dataframe from memory
        new_partitions.append(mp)

    return update_indices_from_partitions(new_partitions,
                                          dataset_metadata_factory=ds_factory)
Example #7
def delete_dataset__delayed(dataset_uuid=None, store=None, factory=None):
    """
    Delete a dataset from the store lazily as a dask.delayed task, including its
    indices, partition files and metadata.

    Parameters
    ----------
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_schema=False,
        load_dataset_metadata=False,
    )

    gc = garbage_collect_dataset__delayed(factory=dataset_factory)

    mps = dispatch_metapartitions_from_factory(dataset_factory)

    delayed_dataset_uuid = delayed(_delete_all_additional_metadata)(
        dataset_factory=dataset_factory)

    mps = map_delayed(
        mps,
        MetaPartition.delete_from_store,
        store=store,
        dataset_uuid=delayed_dataset_uuid,
    )

    return delayed(_delete_tl_metadata)(dataset_factory, mps, gc)
Example #8
def garbage_collect_dataset__delayed(
    dataset_uuid: Optional[str] = None,
    store: StoreInput = None,
    chunk_size: int = 100,
    factory=None,
) -> List[Delayed]:
    """
    Remove auxiliary files that are no longer tracked by the dataset.

    These files include indices that are no longer referenced by the metadata
    as well as files in the directories of the tables that are no longer
    referenced. The latter is only applied to static datasets.

    Parameters
    ----------
    chunk_size
        Number of files that should be deleted in a single job.

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    nested_files = dispatch_files_to_gc(
        dataset_uuid=None, store_factory=None, chunk_size=chunk_size, factory=ds_factory
    )
    return list(
        map_delayed(delete_files, nested_files, store_factory=ds_factory.store_factory)
    )
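Usage sketch for the delayed variant: nothing is deleted until the returned list of dask.delayed tasks is computed (module path as in upstream kartothek, placeholders otherwise):

from functools import partial

import dask
import storefact
from kartothek.io.dask.delayed import garbage_collect_dataset__delayed

store_factory = partial(storefact.get_store_from_url, "s3://bucket_with_dataset")  # placeholder
tasks = garbage_collect_dataset__delayed(
    dataset_uuid="dataset_uuid", store=store_factory, chunk_size=100
)
dask.compute(*tasks)  # performs the actual deletion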
Example #9
def read_dataset_as_dataframes(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    Read a dataset as a list of dataframes.

    Every element of the list corresponds to a physical partition.

    Parameters
    ----------

    Returns
    -------
    List[pandas.DataFrame]
        Returns a list of pandas.DataFrame. One element per partition

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_dataframes

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> dfs = read_dataset_as_dataframes('dataset_uuid', store, 'core')

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=True,
    )

    mps = read_dataset_as_metapartitions(
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )
    return [mp.data for mp in mps]
Example #10
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals=None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """

    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )

    for mp in mps:
        if dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [
                    mp_inner.load_dataframes(
                        store=store,
                        columns=columns,
                        categoricals=categoricals,
                        predicate_pushdown_to_io=predicate_pushdown_to_io,
                        predicates=predicates,
                    )
                    for mp_inner in mp
                ]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(
                store=store,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
Example #11
def read_dataset_as_ddf(
    dataset_uuid=None,
    store=None,
    table=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    Retrieve a single table from a dataset as partition-individual :class:`~dask.dataframe.DataFrame` instance.

    Please take care when using categoricals with Dask. For index columns, this function will construct dataset
    wide categoricals. For all other columns, Dask will determine the categories on a partition level and will
    need to merge them when shuffling data.

    Parameters
    ----------
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )
    if isinstance(columns, dict):
        columns = columns[table]
    meta = _get_dask_meta_for_dataset(
        ds_factory, table, columns, categoricals, dates_as_object
    )

    if columns is None:
        columns = list(meta.columns)

    # From here on, use the dataset factory instead of dataset_uuid/store
    delayed_partitions = read_table_as_delayed(
        factory=ds_factory,
        table=table,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals={table: categoricals},
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
    )

    return dd.from_delayed(delayed_partitions, meta=meta)
Example #12
def build_dataset_indices__bag(
    store, dataset_uuid, columns, partition_size=None, factory=None
):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions themselves are not mutated.

    Parameters
    ----------
    partition_size: Optional[int]
        Dask bag partition size. Use larger numbers to decrease scheduler load and overhead; use smaller numbers for
        fine-grained scheduling and better resilience against worker errors.

    Returns
    -------
    A dask.delayed computation object.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    cols_to_load = {
        table: set(columns) & set(meta.names)
        for table, meta in ds_factory.table_meta.items()
    }
    cols_to_load = {table: cols for table, cols in cols_to_load.items() if cols}

    mps = dispatch_metapartitions_from_factory(ds_factory)

    return (
        db.from_sequence(seq=mps, partition_size=partition_size)
        .map(
            MetaPartition.load_dataframes,
            store=ds_factory.store_factory,
            tables=list(cols_to_load.keys()),
            columns=cols_to_load,
        )
        .map(MetaPartition.build_indices, columns=columns)
        .map(MetaPartition.remove_dataframes)
        .reduction(list, list, split_every=False, out_type=db.Bag)
        .flatten()
        .map_partitions(list)
        .map_partitions(
            update_indices_from_partitions, dataset_metadata_factory=ds_factory
        )
    )
Example #13
def build_dataset_indices__bag(
    store: Optional[StoreInput],
    dataset_uuid: Optional[str],
    columns: Sequence[str],
    partition_size: Optional[int] = None,
    factory: Optional[DatasetFactory] = None,
) -> Delayed:
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions themselves are not mutated.

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    cols_to_load = {
        table: set(columns) & set(meta.names)
        for table, meta in ds_factory.table_meta.items()
    }
    cols_to_load = {table: cols for table, cols in cols_to_load.items() if cols}

    mps = dispatch_metapartitions_from_factory(ds_factory)

    return (
        db.from_sequence(seq=mps, partition_size=partition_size)
        .map(
            MetaPartition.load_dataframes,
            store=ds_factory.store_factory,
            tables=list(cols_to_load.keys()),
            columns=cols_to_load,
        )
        .map(MetaPartition.build_indices, columns=columns)
        .map(MetaPartition.remove_dataframes)
        .reduction(list, list, split_every=False, out_type=db.Bag)
        .flatten()
        .map_partitions(list)
        .map_partitions(
            update_indices_from_partitions, dataset_metadata_factory=ds_factory
        )
    )
Example #14
def dispatch_files_to_gc(dataset_uuid, store_factory, chunk_size, factory):
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store_factory,
        factory=factory,
        load_dataset_metadata=False,
    )
    dataset_uuid = dataset_uuid or ds_factory.uuid

    index_path = "{dataset_uuid}/indices/".format(dataset_uuid=dataset_uuid)
    remove_index_files = set(ds_factory.store.iter_keys(prefix=index_path))

    for index in ds_factory.indices.values():
        index_keys = set()
        # We only add the indices that are saved as explicit indices
        if index.index_storage_key:
            index_keys.add(index.index_storage_key)
        remove_index_files -= index_keys

    remove_table_files = set()
    if ds_factory.explicit_partitions:
        table_files = set()
        for partition in ds_factory.partitions.values():
            for name in partition.files.values():
                table_files.add(name)

        for table in ds_factory.tables:
            table_path = "{dataset_uuid}/{table}/".format(
                dataset_uuid=dataset_uuid, table=table
            )
            table_files.add(table_path + TABLE_METADATA_FILE)
            for key in ds_factory.store.iter_keys(prefix=table_path):
                remove_table_files.add(key)
        remove_table_files -= table_files

    files_to_remove = list(remove_index_files | remove_table_files)

    if chunk_size is None:
        yield files_to_remove
    else:
        for i in range(0, len(files_to_remove), chunk_size):
            yield files_to_remove[i : i + chunk_size]
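The chunk_size handling at the end of the generator above is plain list slicing; a standalone illustration (no kartothek required):

files_to_remove = [f"key_{i}" for i in range(7)]
chunk_size = 3
chunks = [
    files_to_remove[i : i + chunk_size]
    for i in range(0, len(files_to_remove), chunk_size)
]
# chunks == [["key_0", "key_1", "key_2"], ["key_3", "key_4", "key_5"], ["key_6"]]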
Example #15
def garbage_collect_dataset__delayed(dataset_uuid=None,
                                     store=None,
                                     chunk_size=100,
                                     factory=None):
    """
    Remove auxiliary files that are no longer tracked by the dataset.

    These files include indices that are no longer referenced by the metadata
    as well as files in the directories of the tables that are no longer
    referenced. The latter is only applied to static datasets.

    Parameters
    ----------
    dataset_uuid: str
        The UUID of the dataset to be garbage collected
    store: callable
        A function returning a KeyValueStore.
    chunk_size: int
        Number of files that should be deleted in a single job.

    Returns
    -------
    tasks: list of dask.delayed
    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    nested_files = dispatch_files_to_gc(dataset_uuid=None,
                                        store_factory=None,
                                        chunk_size=chunk_size,
                                        factory=ds_factory)
    return [
        delayed(delete_files)(files, store_factory=ds_factory.store_factory)
        for files in nested_files
    ]
Example #16
def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions themselves are not mutated.

    Parameters
    ----------
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    cols_to_load = {
        table: set(columns) & set(meta.names)
        for table, meta in ds_factory.table_meta.items()
    }
    cols_to_load = {
        table: cols
        for table, cols in cols_to_load.items() if cols
    }

    new_partitions = []
    for mp in dispatch_metapartitions_from_factory(ds_factory):
        mp = mp.load_dataframes(
            store=ds_factory.store,
            tables=list(cols_to_load.keys()),
            columns=cols_to_load,
        )
        mp = mp.build_indices(columns=columns)
        mp = mp.remove_dataframes()  # Remove dataframe from memory
        new_partitions.append(mp)

    return update_indices_from_partitions(new_partitions,
                                          dataset_metadata_factory=ds_factory)
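A usage sketch for the eager index build above, assuming the upstream location kartothek.io.eager.build_dataset_indices; the indexed column `P` is a placeholder:

from functools import partial

import storefact
from kartothek.io.eager import build_dataset_indices

store_factory = partial(storefact.get_store_from_url, "s3://bucket_with_dataset")  # placeholder
build_dataset_indices(store=store_factory, dataset_uuid="dataset_uuid", columns=["P"])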
Example #17
def read_table(
    dataset_uuid=None,
    store=None,
    table=SINGLE_TABLE,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    A utility function to load a single table with multiple partitions as a single dataframe in one go.
    Mostly useful for smaller tables or datasets where all partitions fit into memory.

    The order of partitions is not guaranteed to be stable in the resulting dataframe.

    Parameters
    ----------

    Returns
    -------
    pandas.DataFrame
        Returns a pandas.DataFrame holding the data of the requested columns

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_table

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> df = read_table('dataset_uuid', store, 'core')

    """
    if concat_partitions_on_primary_index is not False:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not isinstance(table, str):
        raise TypeError("Argument `table` needs to be a string")

    columns = _check_compatible_list(table, columns, "columns")
    categoricals = _check_compatible_list(table, categoricals, "categoricals")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )
    partitions = read_dataset_as_dataframes(
        tables=[table],
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )

    empty_df = empty_dataframe_from_schema(
        schema=ds_factory.table_meta[table],
        columns=columns[table] if columns is not None else None,
    )
    dfs = [partition_data[table] for partition_data in partitions] + [empty_df]
    # Append an empty dataframe carrying the schema so dtypes/columns can be constructed even if no partition matched
    if categoricals:
        dfs = align_categories(dfs, categoricals[table])
    df = pd.concat(dfs, ignore_index=True, sort=False)

    # ensure column order
    if len(empty_df.columns) > 0:
        df = df.reindex(empty_df.columns, copy=False, axis=1)

    return df
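Beyond the docstring example, a sketch that restricts columns and pushes a predicate down to the IO layer; table and column names are placeholders and the module path is assumed to be kartothek.io.eager:

from functools import partial

import storefact
from kartothek.io.eager import read_table

store_factory = partial(storefact.get_store_from_url, "s3://bucket_with_dataset")  # placeholder
df = read_table(
    "dataset_uuid",
    store_factory,
    table="core",
    columns={"core": ["P", "X"]},   # hypothetical table/column names
    predicates=[[("P", "==", 1)]],  # outer list: OR, inner list: AND-connected terms
)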
Example #18
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )

    if concat_partitions_on_primary_index or dispatch_by:
        mps = _load_and_concat_metapartitions(
            mps,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = map_delayed(
            mps,
            MetaPartition.load_dataframes,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals)

    if categoricals_from_index:
        func_dict = defaultdict(_identity)
        func_dict.update({
            table: partial(_cast_categorical_to_index_cat, categories=cats)
            for table, cats in categoricals_from_index.items()
        })
        mps = map_delayed(mps, MetaPartition.apply, func_dict, type_safe=True)

    return mps
Example #19
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
    dispatch_metadata=True,
):
    """
    Retrieve dataset as `dask.bag.Bag` of `MetaPartition` objects.

    Parameters
    ----------

    Returns
    -------
    dask.bag.Bag:
        A dask.bag object containing the metapartitions.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )
    mps = db.from_sequence(mps, partition_size=partition_size)

    if concat_partitions_on_primary_index or dispatch_by is not None:
        mps = mps.map(
            _load_and_concat_metapartitions_inner,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = mps.map(
            MetaPartition.load_dataframes,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals)

    if categoricals_from_index:
        func_dict = defaultdict(_identity)
        func_dict.update({
            table: partial(_cast_categorical_to_index_cat, categories=cats)
            for table, cats in categoricals_from_index.items()
        })
        mps = mps.map(MetaPartition.apply, func_dict, type_safe=True)
    return mps
Example #20
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
):
    """
    Retrieve dataset as `dask.bag` of `MetaPartition` objects.

    Parameters
    ----------

    Returns
    -------
    A dask.bag object containing the metapartitions.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )
    mps = db.from_sequence(mps, partition_size=partition_size)

    if concat_partitions_on_primary_index or dispatch_by:
        mps = mps.map(
            _load_and_concat_metapartitions_inner,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = mps.map(
            MetaPartition.load_dataframes,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )

    if categoricals_from_index:
        func_dict = defaultdict(_identity)
        func_dict.update(
            {
                table: partial(_cast_categorical_to_index_cat, categories=cats)
                for table, cats in categoricals_from_index.items()
            }
        )
        mps = mps.map(MetaPartition.apply, func_dict, type_safe=True)
    return mps
Example #21
def read_dataset_as_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    Read a dataset as a list of :class:`kartothek.io_components.metapartition.MetaPartition`.

    Every element of the list corresponds to a physical partition.

    Parameters
    ----------

    Returns
    -------
    List[kartothek.io_components.metapartition.MetaPartition]
        Returns a list of MetaPartition objects, one element per physical partition

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_metapartitions

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> list_mps = read_dataset_as_metapartitions('dataset_uuid', store, 'core')

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    from .iter import read_dataset_as_metapartitions__iterator

    ds_iter = read_dataset_as_metapartitions__iterator(
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )
    return list(ds_iter)
Example #22
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals: Optional[Sequence[str]] = None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )

    if dispatch_by is not None:
        mps = _load_and_concat_metapartitions(
            mps,
            store=store,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = map_delayed(
            MetaPartition.load_dataframes,
            mps,
            store=store,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )

    if categoricals_from_index:

        mps = map_delayed(
            partial(  # type: ignore
                MetaPartition.apply,
                func=partial(  # type: ignore
                    _cast_categorical_to_index_cat, categories=categoricals_from_index
                ),
                type_safe=True,
            ),
            mps,
        )

    return list(mps)
Example #23
def read_dataset_as_ddf(
    dataset_uuid=None,
    store=None,
    table=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dask_index_on=None,
):
    """
    Retrieve a single table from a dataset as partition-individual :class:`~dask.dataframe.DataFrame` instance.

    Please take care when using categoricals with Dask. For index columns, this function will construct dataset
    wide categoricals. For all other columns, Dask will determine the categories on a partition level and will
    need to merge them when shuffling data.

    Parameters
    ----------
    dask_index_on: str
        Reconstruct (and set) a dask index on the provided index column.

        For details on performance, see also `dispatch_by`
    """
    if dask_index_on is not None and not isinstance(dask_index_on, str):
        raise TypeError(
            f"The parameter `dask_index_on` must be a string but got {type(dask_index_on)}"
        )
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )
    if isinstance(columns, dict):
        columns = columns[table]
    meta = _get_dask_meta_for_dataset(ds_factory, table, columns, categoricals,
                                      dates_as_object)

    if columns is None:
        columns = list(meta.columns)

    # From here on, use the dataset factory instead of dataset_uuid/store
    delayed_partitions = read_table_as_delayed(
        factory=ds_factory,
        table=table,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals={table: categoricals},
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        dispatch_by=dask_index_on,
    )
    if dask_index_on:
        divisions = ds_factory.indices[dask_index_on].observed_values()
        divisions.sort()
        divisions = list(divisions)
        divisions.append(divisions[-1])
        return dd.from_delayed(delayed_partitions,
                               meta=meta,
                               divisions=divisions).set_index(
                                   dask_index_on,
                                   divisions=divisions,
                                   sorted=True)
    else:
        return dd.from_delayed(delayed_partitions, meta=meta)
Example #24
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    if concat_partitions_on_primary_index or dispatch_by is not None:
        mps = _load_and_concat_metapartitions(
            mps,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = map_delayed(
            MetaPartition.load_dataframes,
            mps,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals)

    if categoricals_from_index:
        func_dict = defaultdict(_identity)
        func_dict.update({
            table: partial(_cast_categorical_to_index_cat, categories=cats)
            for table, cats in categoricals_from_index.items()
        })
        mps = map_delayed(
            partial(MetaPartition.apply, func=func_dict, type_safe=True), mps)

    return list(mps)
Example #25
def read_dataset_as_ddf(
    dataset_uuid=None,
    store=None,
    table=SINGLE_TABLE,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dask_index_on=None,
    dispatch_by=None,
):
    """
    Retrieve a single table from a dataset as partition-individual :class:`~dask.dataframe.DataFrame` instance.

    Please take care when using categoricals with Dask. For index columns, this function will construct dataset
    wide categoricals. For all other columns, Dask will determine the categories on a partition level and will
    need to merge them when shuffling data.

    Parameters
    ----------
    dask_index_on: str
        Reconstruct (and set) a dask index on the provided index column. Cannot be used
        in conjunction with `dispatch_by`.

        For details on performance, see also `dispatch_by`
    """
    if dask_index_on is not None and not isinstance(dask_index_on, str):
        raise TypeError(
            f"The parameter `dask_index_on` must be a string but got {type(dask_index_on)}"
        )

    if dask_index_on is not None and dispatch_by is not None and len(dispatch_by) > 0:
        raise ValueError(
            "`read_dataset_as_ddf` got parameters `dask_index_on` and `dispatch_by`. "
            "Note that `dispatch_by` can only be used if `dask_index_on` is None."
        )

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    if isinstance(columns, dict):
        columns = columns[table]
    meta = _get_dask_meta_for_dataset(
        ds_factory, table, columns, categoricals, dates_as_object
    )

    if columns is None:
        columns = list(meta.columns)

    # From here on, use the dataset factory instead of dataset_uuid/store
    delayed_partitions = read_table_as_delayed(
        factory=ds_factory,
        table=table,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals={table: categoricals},
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        dispatch_by=dask_index_on if dask_index_on else dispatch_by,
    )
    if dask_index_on:
        divisions = ds_factory.indices[dask_index_on].observed_values()
        divisions.sort()
        divisions = list(divisions)
        divisions.append(divisions[-1])
        return dd.from_delayed(
            delayed_partitions, meta=meta, divisions=divisions
        ).set_index(dask_index_on, divisions=divisions, sorted=True)
    else:
        return dd.from_delayed(delayed_partitions, meta=meta)
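A sketch of reading a dataset back as a dask dataframe with a reconstructed index, assuming the upstream location kartothek.io.dask.dataframe.read_dataset_as_ddf and an existing secondary index on the placeholder column `P`:

from functools import partial

import storefact
from kartothek.io.dask.dataframe import read_dataset_as_ddf

store_factory = partial(storefact.get_store_from_url, "s3://bucket_with_dataset")  # placeholder
ddf = read_dataset_as_ddf(
    dataset_uuid="dataset_uuid",
    store=store_factory,
    dask_index_on="P",  # requires an existing index on the hypothetical column `P`
)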
Example #26
def collect_dataset_metadata(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    table_name: str = SINGLE_TABLE,
    predicates: Optional[PredicatesType] = None,
    frac: float = 1.0,
    factory: Optional[DatasetFactory] = None,
) -> dd.DataFrame:
    """
    Collect parquet metadata of the dataset. The `frac` parameter can be used to select a subset of the data.

    .. warning::
      If the size of the partitions is not evenly distributed, e.g. some partitions might be larger than others,
      the metadata returned is not a good approximation for the whole dataset metadata.
    .. warning::
      Using the `frac` parameter is not encouraged for a small number of total partitions.


    Parameters
    ----------
    predicates
      Kartothek predicates to apply filters on the data for which to gather statistics

      .. warning::
          Filtering will only be applied for predicates on indices.
          The evaluation of the predicates will therefore only return an approximate result.

    frac
      Fraction of the total number of partitions to use for gathering statistics. `frac == 1.0` will use all partitions.

    Returns
    -------
    dask.dataframe.DataFrame:
        A dask.DataFrame containing the following information about dataset statistics:
        * `partition_label`: File name of the parquet file, unique to each physical partition.
        * `row_group_id`: Index of the row groups within one parquet file.
        * `row_group_compressed_size`: Byte size of the data within one row group.
        * `row_group_uncompressed_size`: Byte size (uncompressed) of the data within one row group.
        * `number_rows_total`: Total number of rows in one parquet file.
        * `number_row_groups`: Number of row groups in one parquet file.
        * `serialized_size`: Serialized size of the parquet file.
        * `number_rows_per_row_group`: Number of rows per row group.

    Raises
    ------
    ValueError
      If no metadata could be retrieved, raise an error.

    """
    if not 0.0 < frac <= 1.0:
        raise ValueError(
            f"Invalid value for parameter `frac`: {frac}. "
            "Please make sure to provide a value larger than 0.0 and smaller than or equal to 1.0."
        )
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    mps = list(
        dispatch_metapartitions_from_factory(dataset_factory, predicates=predicates)
    )
    if mps:
        random.shuffle(mps)
        # ensure that even with sampling at least one metapartition is returned
        cutoff_index = max(1, int(len(mps) * frac))
        mps = mps[:cutoff_index]
        ddf = dd.from_delayed(
            [
                dask.delayed(MetaPartition.get_parquet_metadata)(
                    mp, store=dataset_factory.store_factory, table_name=table_name
                )
                for mp in mps
            ],
            meta=_METADATA_SCHEMA,
        )
    else:
        df = pd.DataFrame(columns=_METADATA_SCHEMA.keys())
        df = df.astype(_METADATA_SCHEMA)
        ddf = dd.from_pandas(df, npartitions=1)

    return ddf
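A sketch of gathering parquet statistics for a sample of the partitions, assuming the upstream location kartothek.io.dask.dataframe.collect_dataset_metadata; store URL and UUID are placeholders:

from functools import partial

import storefact
from kartothek.io.dask.dataframe import collect_dataset_metadata

store_factory = partial(storefact.get_store_from_url, "s3://bucket_with_dataset")  # placeholder
stats = collect_dataset_metadata(
    store=store_factory, dataset_uuid="dataset_uuid", frac=0.2
).compute()
# One row per row group of the sampled parquet files, with the columns listed
# in the docstring above (row_group_compressed_size, number_rows_total, ...).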
Example #27
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
):
    """

    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`
    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
    )

    for mp in mps:
        if concat_partitions_on_primary_index:
            mp = MetaPartition.concat_metapartitions([
                mp_inner.load_dataframes(
                    store=store,
                    tables=tables,
                    columns=columns,
                    categoricals=categoricals,
                    predicate_pushdown_to_io=predicate_pushdown_to_io,
                    predicates=predicates,
                ) for mp_inner in mp
            ])
        else:
            mp = mp.load_dataframes(
                store=store,
                tables=tables,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
Example #28
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """

    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    for mp in mps:
        if concat_partitions_on_primary_index or dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [
                    mp_inner.load_dataframes(
                        store=store,
                        tables=tables,
                        columns=columns,
                        categoricals=categoricals,
                        predicate_pushdown_to_io=predicate_pushdown_to_io,
                        predicates=predicates,
                    )
                    for mp_inner in mp
                ]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(
                store=store,
                tables=tables,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
Example #29
def read_dataset_as_dataframes(
    dataset_uuid: Optional[str] = None,
    store=None,
    tables: Optional[List[str]] = None,
    columns: Dict[str, List[str]] = None,
    concat_partitions_on_primary_index: bool = False,
    predicate_pushdown_to_io: bool = True,
    categoricals: Dict[str, List[str]] = None,
    label_filter: Callable = None,
    dates_as_object: bool = False,
    predicates: Optional[List[List[Tuple[str, str, Any]]]] = None,
    factory: Optional[DatasetFactory] = None,
    dispatch_by: Optional[List[str]] = None,
) -> List[pd.DataFrame]:
    """
    Read a dataset as a list of dataframes.

    Every element of the list corresponds to a physical partition.

    Parameters
    ----------

    Returns
    -------
    List[pandas.DataFrame]
        Returns a list of pandas.DataFrame. One element per partition

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_dataframes

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> dfs = read_dataset_as_dataframes('dataset_uuid', store, 'core')

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=True,
    )

    mps = read_dataset_as_metapartitions(
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
        dispatch_by=dispatch_by,
        dispatch_metadata=False,
    )
    return [mp.data for mp in mps]
Example #30
def read_dataset_as_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    Read a dataset as a list of :class:`kartothek.io_components.metapartition.MetaPartition`.

    Every element of the list corresponds to a physical partition.

    Parameters
    ----------

    Returns
    -------
    List[kartothek.io_components.metapartition.MetaPartition]
        Returns a list of MetaPartition objects, one element per physical partition

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_metapartitions

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> list_mps = read_dataset_as_metapartitions('dataset_uuid', store, 'core')

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )
    from .iter import read_dataset_as_metapartitions__iterator

    ds_iter = read_dataset_as_metapartitions__iterator(
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
        dispatch_by=dispatch_by,
    )
    return list(ds_iter)