Code example #1
# Module paths for the helpers below are assumed from kartothek's layout.
from kartothek.io.iter import read_dataset_as_metapartitions__iterator
from kartothek.io_components.index import update_indices_from_partitions
from kartothek.io_components.utils import _ensure_factory


def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions itself are not mutated.

    Parameters
    ----------
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    new_partitions = []
    for mp in read_dataset_as_metapartitions__iterator(factory=ds_factory):
        mp = mp.build_indices(columns=columns)
        mp = mp.remove_dataframes()  # Remove dataframe from memory
        new_partitions.append(mp)

    return update_indices_from_partitions(new_partitions,
                                          dataset_metadata_factory=ds_factory)
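
Below is a minimal usage sketch for the function above, assuming it is exposed as kartothek.io.eager.build_dataset_indices; the store URL, dataset UUID and column name are placeholders.

from functools import partial

import storefact

from kartothek.io.eager import build_dataset_indices

# Placeholder store and dataset; replace with a real store URL and UUID.
store_factory = partial(storefact.get_store_from_url, "hfs:///tmp/kartothek_data")

# Build a secondary index on the "country" column. Only index structures are
# written; the partition files themselves are left untouched.
build_dataset_indices(
    store=store_factory,
    dataset_uuid="my_dataset_uuid",
    columns=["country"],
)
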
Code example #2
def _load_metapartitions(*args, **kwargs):
    # Eagerly materialize the iterator-based reader into a list of MetaPartitions.
    return list(read_dataset_as_metapartitions__iterator(*args, **kwargs))
Code example #3
# Assumed module path for the factory helper used below.
from kartothek.io_components.utils import _ensure_factory


def read_dataset_as_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    Read a dataset as a list of :class:`kartothek.io_components.metapartition.MetaPartition`.

    Every element of the list corresponds to a physical partition.

    Parameters
    ----------
    dataset_uuid: str
        Unique identifier of the dataset to read.
    store: callable
        Factory function returning the store holding the dataset.
    tables: list of str, optional
        Tables to load; by default all tables of the dataset are loaded.
    columns: dict of list of str, optional
        Columns to load per table; by default all columns are loaded.
    concat_partitions_on_primary_index: bool
        Concatenate partitions that share the same primary index value into a
        single MetaPartition.
    predicate_pushdown_to_io: bool
        Push predicates down to the IO layer so that data can be skipped while
        reading.
    categoricals: dict of list of str, optional
        Columns to load as pandas categoricals, per table.
    label_filter: callable, optional
        Callable taking a partition label and returning a boolean; only
        partitions evaluating to True are loaded.
    dates_as_object: bool
        Load date columns as ``datetime.date`` objects instead of timestamps.
    predicates: list of list of tuple, optional
        Filter predicates in disjunctive normal form, applied while loading.
    factory: kartothek.core.factory.DatasetFactory, optional
        Preconfigured dataset factory; may be passed instead of ``dataset_uuid``
        and ``store``.
    dispatch_by: list of str, optional
        Group physical partitions by the values of these (indexed) columns;
        each returned MetaPartition contains the data for one value combination.

    Returns
    -------
    List[kartothek.io_components.metapartition.MetaPartition]
        One MetaPartition per physical partition of the dataset.

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_metapartitions

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> list_mps = read_dataset_as_metapartitions('dataset_uuid', store, 'core')

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )
    from .iter import read_dataset_as_metapartitions__iterator

    ds_iter = read_dataset_as_metapartitions__iterator(
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
        dispatch_by=dispatch_by,
    )
    return list(ds_iter)
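
A short usage sketch complementing the doctest above; the store URL, dataset UUID, table and column names are placeholders, and access to mp.data assumes the multi-table MetaPartition layout used by this version of the API.

from functools import partial

import storefact

from kartothek.io.eager import read_dataset_as_metapartitions

# Placeholder store and dataset; replace with a real store URL and UUID.
store_factory = partial(storefact.get_store_from_url, "hfs:///tmp/kartothek_data")

# Load two columns of the "core" table, filtered by a predicate in
# disjunctive normal form.
mps = read_dataset_as_metapartitions(
    dataset_uuid="my_dataset_uuid",
    store=store_factory,
    tables=["core"],
    columns={"core": ["country", "value"]},
    predicates=[[("country", "==", "DE")]],
)

# Each element is a MetaPartition holding the loaded dataframes per table.
for mp in mps:
    df = mp.data["core"]
    print(mp.label, len(df))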