Example No. 1
def ensure_valid_cube_indices(existing_datasets: Mapping[str,
                                                         DatasetMetadataBase],
                              cube: Cube) -> Cube:
    """
    Parse all existing datasets and infer the required set of indices. We
    currently do not allow indices to be removed or added in update steps, so
    we need to make sure that existing ones are updated properly.
    The returned `Cube` instance will be a copy of the input with
    `index_columns` and `suppress_index_on` fields adjusted to reflect the
    existing datasets.
    """
    dataset_indices = []
    for ds in existing_datasets.values():
        for internal_table in ds.table_meta:
            dataset_columns = set(ds.table_meta[internal_table].names)
            table_indices = cube.index_columns & dataset_columns
            compatible_indices = _ensure_compatible_indices(ds, table_indices)
            if compatible_indices:
                dataset_indices.append(set(compatible_indices))
    required_indices = cube.index_columns.union(*dataset_indices)
    suppress_index_on = cube.suppress_index_on.difference(*dataset_indices)
    # Dimension columns need to be removed since they *are* technically indices,
    # but the cube interface class declares them as not indexed and only adds
    # them later on, provided they are not suppressed.
    return cube.copy(
        index_columns=required_indices - set(cube.dimension_columns),
        suppress_index_on=suppress_index_on,
    )
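
A minimal usage sketch follows. It is hypothetical: the ``Cube`` constructor arguments are assumed from kartothek's cube API and may differ between versions, and ``existing_datasets`` would normally be discovered from the store via the cube discovery helpers.

# Hypothetical usage sketch -- column names and the dataset mapping are illustrative.
from kartothek.core.cube.cube import Cube

requested_cube = Cube(
    dimension_columns=["product", "location"],  # assumed dimension columns
    partition_columns=["day"],
    uuid_prefix="my_cube",
    index_columns={"channel"},                  # indices the caller requests
)

# existing_datasets: Mapping[str, DatasetMetadataBase], e.g. from cube discovery
patched_cube = ensure_valid_cube_indices(existing_datasets, requested_cube)
# patched_cube.index_columns now also contains every index column already present
# in the existing datasets (minus the declared dimension columns), and
# patched_cube.suppress_index_on no longer lists columns that are indexed there.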
Example No. 2
def update_dataset_from_delayed(
    delayed_tasks,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    A dask.delayed graph to add and store a list of dictionaries containing
    dataframes to a kartothek dataset in store. The input should be a list
    (or splitter pipeline) containing
    :class:`~kartothek.io.metapartition.MetaPartition`. If you want to use this
    pipeline step just for deleting partitions without adding new ones, you
    have to give an empty meta partition as input (``[MetaPartition(None)]``).

    Parameters
    ----------
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory,
                                                   secondary_indices)
    mps = _update_dask_partitions_one_to_one(
        delayed_tasks=delayed_tasks,
        secondary_indices=secondary_indices,
        metadata_version=metadata_version,
        partition_on=partition_on,
        store_factory=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
        sort_partitions_by=sort_partitions_by,
    )

    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
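
A hedged usage sketch for the delayed pipeline: each task lazily produces a dataframe, and computing the returned delayed object stores the partitions and commits the update. The store URL, dataset UUID, and column names are illustrative assumptions; plain dataframes are assumed to be accepted because the pipeline routes its inputs through ``parse_input_to_metapartition`` (otherwise wrap each input as described in the docstring).

# Illustrative sketch, not the only valid input shape.
import dask
import pandas as pd
from functools import partial
from storefact import get_store_from_url

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_store")  # assumed store URL

def _make_df(i):
    return pd.DataFrame({"partition_key": [i], "value": [i * 10]})

delayed_tasks = [dask.delayed(_make_df)(i) for i in range(3)]

graph = update_dataset_from_delayed(
    delayed_tasks,
    store=store_factory,
    dataset_uuid="my_dataset",
    partition_on=["partition_key"],
)
graph.compute()  # executes the stores and commits the new partitions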
Example No. 3
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """
    if load_dynamic_metadata is not True:
        warnings.warn(
            "The keyword `load_dynamic_metadata` has no use and will be removed soon",
            DeprecationWarning,
        )

    if central_partition_metadata is not True:
        warnings.warn(
            "The keyword `central_partition_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    if sort_partitions_by:  # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(
            sort_values_categorical, column=sort_partitions_by
        )

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df,
            metadata_version=metadata_version,
            expected_secondary_indices=secondary_indices,
        )

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store the dataframe, thereby removing it from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, df_serializer=df_serializer, dataset_uuid=dataset_uuid
        )

        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
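
A hedged usage sketch for the iterator variant: dataframe chunks come from a generator so only one chunk needs to be in memory at a time. The store URL and column names are illustrative assumptions.

import pandas as pd
from functools import partial
from storefact import get_store_from_url

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_store")  # assumed store URL

def df_chunks():
    # Yield one dataframe per chunk; each chunk is stored before the next is produced.
    for day in ("2021-01-01", "2021-01-02"):
        yield pd.DataFrame({"date": [day] * 3, "value": [1, 2, 3]})

dataset_metadata = update_dataset_from_dataframes__iter(
    df_chunks(),
    store=store_factory,
    dataset_uuid="my_dataset",
    partition_on=["date"],
    secondary_indices=["value"],
)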
Example No. 4
def update_dataset_from_ddf(
    ddf,
    store=None,
    dataset_uuid=None,
    table=None,
    secondary_indices=None,
    shuffle=False,
    repartition_ratio=None,
    num_buckets=1,
    sort_partitions_by=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
    bucket_by=None,
):
    """
    Update a dataset from a dask.dataframe.


    .. admonition:: Behavior with ``shuffle==False``

        In the case without ``partition_on`` every dask partition is mapped to a single kartothek partition

        In the case with ``partition_on`` every dask partition is mapped to N kartothek partitions, where N
        depends on the content of the respective partition, such that every resulting kartothek partition has
        only a single value in the respective ``partition_on`` columns.

    .. admonition:: Behavior with ``shuffle==True``

        ``partition_on`` is mandatory

        Perform a data shuffle to ensure that every primary key partition consists of at most ``num_buckets`` files.

        .. note::
            The number of allowed buckets will have an impact on the required resources and runtime.
            Using a larger number of allowed buckets will usually reduce resource consumption and in some
            cases also improves runtime performance.

        :Example:

            >>> partition_on="primary_key"
            >>> num_buckets=2  # doctest: +SKIP
            primary_key=1/bucket1.parquet
            primary_key=1/bucket2.parquet

    .. note:: This can only be used for datasets with a single table!

    See also, :ref:`partitioning_dask`.

    Parameters
    ----------
    ddf: Union[dask.dataframe.DataFrame, None]
        The dask.DataFrame from which the new partitions are calculated. If this parameter is `None`, the update pipeline
        will only delete partitions without creating new ones.
    shuffle: bool
        If `True` and `partition_on` is requested, shuffle the data to reduce the number of output partitions.

        See also, :ref:`shuffling`.

        .. warning::

            Dask uses a heuristic to determine how data is shuffled. There are two options: `partd` for local disk shuffling and `tasks` for distributed shuffling using a task graph. If there is no :class:`distributed.Client` in the context and the option is not set explicitly, dask will choose `partd`, which may cause data loss when the graph is executed on a distributed cluster.

            Therefore, we recommend to specify the dask shuffle method explicitly, e.g. by using a context manager.

            .. code::

                with dask.config.set(shuffle='tasks'):
                    graph = update_dataset_from_ddf(...)
                graph.compute()

    repartition_ratio: Optional[Union[int, float]]
        If provided, repartition the dataframe to ``ceil(ddf.npartitions / repartition_ratio)`` partitions before the calculation starts.
    num_buckets: int
        If provided, the output partitioning will have ``num_buckets`` files per primary key partition.
        This effectively splits up the execution ``num_buckets`` times. Setting this parameter may be helpful when
        scaling.
        This only has an effect if ``shuffle==True``
    bucket_by:
        The subset of columns which should be considered for bucketing.

        This parameter ensures that groups of the given subset are never split
        across buckets within a given partition.

        Without specifying this, the buckets will be created randomly.

        This only has an effect if ``shuffle==True``

        .. admonition:: Secondary indices

            This parameter has a strong effect on the performance of secondary
            indices. Since it guarantees that a given tuple of the subset will
            be entirely put into the same file you can build efficient indices
            with this approach.

        .. note::

            Only columns with hashable data types may be used here.
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    if shuffle and not partition_on:
        raise ValueError(
            "If ``shuffle`` is requested, at least one ``partition_on`` column needs to be provided."
        )
    if ds_factory is not None:
        check_single_table_dataset(ds_factory, table)

    if repartition_ratio and ddf is not None:
        ddf = ddf.repartition(
            npartitions=int(np.ceil(ddf.npartitions / repartition_ratio)))

    if ddf is None:
        mps = [
            parse_input_to_metapartition(
                None, metadata_version=default_metadata_version)
        ]
    else:
        secondary_indices = _ensure_compatible_indices(ds_factory,
                                                       secondary_indices)

        if shuffle and partition_on:
            mps = _update_dask_partitions_shuffle(
                ddf=ddf,
                table=table,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                num_buckets=num_buckets,
                sort_partitions_by=sort_partitions_by,
                bucket_by=bucket_by,
            )
        else:
            delayed_tasks = ddf.to_delayed()
            delayed_tasks = [{"data": {table: task}} for task in delayed_tasks]
            mps = _update_dask_partitions_one_to_one(
                delayed_tasks=delayed_tasks,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                sort_partitions_by=sort_partitions_by,
            )
    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
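
A hedged sketch of the shuffle path described in the docstring above: pin the dask shuffle method explicitly (as the warning recommends) and request bucketed output per primary key. The store URL, table name, and columns are illustrative assumptions.

import dask
import dask.dataframe as dd
import pandas as pd
from functools import partial
from storefact import get_store_from_url

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_store")  # assumed store URL

ddf = dd.from_pandas(
    pd.DataFrame({"primary_key": [1, 1, 2, 2], "value": [10, 20, 30, 40]}),
    npartitions=2,
)

with dask.config.set(shuffle="tasks"):  # avoid the `partd` pitfall noted above
    graph = update_dataset_from_ddf(
        ddf,
        store=store_factory,
        dataset_uuid="my_dataset",
        table="table",          # assumed single-table name
        shuffle=True,
        partition_on=["primary_key"],
        num_buckets=2,
        secondary_indices=["value"],
    )
graph.compute()  # writes at most two bucket files per primary_key value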
Example No. 5
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_generator: Iterable[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        load_dynamic_metadata=load_dynamic_metadata,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory,
                                                   secondary_indices)

    if sort_partitions_by:  # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(sort_values_categorical,
                                        column=sort_partitions_by)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df, metadata_version=default_metadata_version)

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store the dataframe, thereby removing it from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store,
            df_serializer=df_serializer,
            dataset_uuid=dataset_uuid,
            store_metadata=not central_partition_metadata,
        )

        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example No. 6
def update_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    delete_scope: Optional[Iterable[Mapping[str, str]]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
):
    """
    Update a dataset from a dask.dataframe.

    See Also
    --------
    :ref:`mutating_datasets`
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    sort_partitions_by = normalize_arg("sort_partitions_by", sort_partitions_by)
    bucket_by = normalize_arg("bucket_by", bucket_by)
    store = normalize_arg("store", store)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory, secondary_indices)
    del secondary_indices

    if ds_factory is not None:
        check_single_table_dataset(ds_factory, table)

    mps = _write_dataframe_partitions(
        ddf=ddf,
        store=store,
        dataset_uuid=dataset_uuid or ds_factory.dataset_uuid,
        table=table,
        secondary_indices=inferred_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=cast(List[str], partition_on),
        bucket_by=bucket_by,
    )
    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example No. 7
def update_dataset_from_dataframes__iter(
    df_generator,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
    table_name: str = SINGLE_TABLE,
):
    """
    Update a kartothek dataset in store iteratively, using a generator of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)

    if sort_partitions_by:  # Define function which sorts each partition by column
        sort_partitions_by_fn = partial(
            sort_values_categorical, columns=sort_partitions_by
        )

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(
            df, metadata_version=metadata_version, table_name=table_name,
        )

        if sort_partitions_by:
            mp = mp.apply(sort_partitions_by_fn)

        if partition_on:
            mp = mp.partition_on(partition_on=partition_on)

        if secondary_indices:
            mp = mp.build_indices(columns=secondary_indices)

        # Store the dataframe, thereby removing it from the `mp` metapartition
        mp = mp.store_dataframes(
            store=store, df_serializer=df_serializer, dataset_uuid=dataset_uuid
        )

        new_partitions.append(mp)

    return update_dataset_from_partitions(
        new_partitions,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example No. 8
def update_dataset_from_dataframes(
    df_list,
    store=None,
    dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    central_partition_metadata=True,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    load_dynamic_metadata=True,
    sort_partitions_by=None,
    secondary_indices=None,
    factory=None,
):
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).
    """
    if load_dynamic_metadata is not True:
        warnings.warn(
            "The keyword `load_dynamic_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if central_partition_metadata is not True:
        warnings.warn(
            "The keyword `central_partition_metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    secondary_indices = _ensure_compatible_indices(ds_factory,
                                                   secondary_indices)

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=secondary_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(
            partial(sort_values_categorical, column=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if secondary_indices:
        mp = mp.build_indices(secondary_indices)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
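
A hedged eager usage sketch: update the dataset in one shot from an in-memory list of dataframes. The store URL and column names are illustrative assumptions.

import pandas as pd
from functools import partial
from storefact import get_store_from_url

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_store")  # assumed store URL

new_data = [pd.DataFrame({"date": ["2021-01-03"] * 2, "value": [7, 8]})]

dataset_metadata = update_dataset_from_dataframes(
    new_data,
    store=store_factory,
    dataset_uuid="my_dataset",
    partition_on=["date"],
    secondary_indices=["value"],
)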
Example No. 9
def update_dataset_from_ddf(
    ddf,
    store=None,
    dataset_uuid=None,
    table=None,
    secondary_indices=None,
    shuffle=False,
    repartition_ratio=None,
    num_buckets=1,
    sort_partitions_by=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
):
    """
    Update a dataset from a dask.dataframe.


    .. admonition:: Behavior with ``shuffle==False``

        In the case without ``partition_on`` every dask partition is mapped to a single kartothek partition

        In the case with ``partition_on`` every dask partition is mapped to N kartothek partitions, where N
        depends on the content of the respective partition, such that every resulting kartothek partition has
        only a single value in the respective ``partition_on`` columns.

    .. admonition:: Behavior with ``shuffle==True``

        ``partition_on`` is mandatory

        Perform a data shuffle to ensure that every primary key partition consists of at most ``num_buckets`` files.

        .. note::
            The number of allowed buckets will have an impact on the required resources and runtime.
            Using a larger number of allowed buckets will usually reduce resource consumption and in some
            cases also improves runtime performance.

        :Example:

            >>> partition_on="primary_key"
            >>> num_buckets=2  # doctest: +SKIP
            primary_key=1/bucket1.parquet
            primary_key=1/bucket2.parquet

    .. note:: This can only be used for datasets with a single table!

    Parameters
    ----------
    ddf: Union[dask.dataframe.DataFrame, None]
        The dask.DataFrame from which the new partitions are calculated. If this parameter is `None`, the update pipeline
        will only delete partitions without creating new ones.
    shuffle: bool
        If `True` and `partition_on` is requested, shuffle the data to reduce the number of output partitions.
    repartition_ratio: Optional[Union[int, float]]
        If provided, repartition the dataframe to ``ceil(ddf.npartitions / repartition_ratio)`` partitions before the calculation starts.
    num_buckets: int
        If provided, the output partitioning will have ``num_buckets`` files per primary key partition.
        This effectively splits up the execution ``num_buckets`` times. Setting this parameter may be helpful when
        scaling.
        This only has an effect if ``shuffle==True``
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    if shuffle and not partition_on:
        raise ValueError(
            "If ``shuffle`` is requested, at least one ``partition_on`` column needs to be provided."
        )
    if ds_factory is not None:
        check_single_table_dataset(ds_factory, table)

    if repartition_ratio and ddf is not None:
        ddf = ddf.repartition(
            npartitions=int(np.ceil(ddf.npartitions / repartition_ratio)))

    if ddf is None:
        mps = [
            parse_input_to_metapartition(
                None, metadata_version=default_metadata_version)
        ]
    else:
        secondary_indices = _ensure_compatible_indices(ds_factory,
                                                       secondary_indices)

        if shuffle and partition_on:
            mps = _update_dask_partitions_shuffle(
                ddf=ddf,
                table=table,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                num_buckets=num_buckets,
                sort_partitions_by=sort_partitions_by,
            )
        else:
            delayed_tasks = ddf.to_delayed()
            delayed_tasks = [{"data": {table: task}} for task in delayed_tasks]
            mps = _update_dask_partitions_one_to_one(
                delayed_tasks=delayed_tasks,
                secondary_indices=secondary_indices,
                metadata_version=metadata_version,
                partition_on=partition_on,
                store_factory=store,
                df_serializer=df_serializer,
                dataset_uuid=dataset_uuid,
                sort_partitions_by=sort_partitions_by,
            )
    return dask.delayed(update_dataset_from_partitions)(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
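
A hedged sketch of the delete-only path mentioned in the ``ddf`` parameter description: pass ``ddf=None`` together with a ``delete_scope`` so that matching partitions are dropped without writing new data. The scope format (a list of dicts on partition columns) and all names are assumptions for illustration.

from functools import partial
from storefact import get_store_from_url

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_store")  # assumed store URL

graph = update_dataset_from_ddf(
    None,                                # no new data, delete only
    store=store_factory,
    dataset_uuid="my_dataset",
    table="table",                       # assumed single-table name
    delete_scope=[{"primary_key": 1}],   # drop partitions where primary_key == 1
)
graph.compute()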
Example No. 10
def update_dataset_from_dataframes(
    df_list: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]],
    store: Optional[KeyValueStore] = None,
    dataset_uuid: Optional[str] = None,
    delete_scope=None,
    metadata=None,
    df_serializer: Optional[ParquetSerializer] = None,
    metadata_merger: Callable = None,
    central_partition_metadata: bool = True,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    load_dynamic_metadata: bool = True,
    sort_partitions_by: Optional[str] = None,
    secondary_indices: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
) -> DatasetMetadata:
    """
    Update a kartothek dataset in store at once, using a list of dataframes.

    Useful for datasets which do not fit into memory.

    Parameters
    ----------
    df_list:
        The dataframe(s) to be stored.

    Returns
    -------
    The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

    See Also
    --------
    :ref:`mutating_datasets`
    """
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory,
                                                  secondary_indices)
    del secondary_indices

    mp = parse_input_to_metapartition(
        df_list,
        metadata_version=metadata_version,
        expected_secondary_indices=inferred_indices,
    )

    if sort_partitions_by:
        mp = mp.apply(
            partial(sort_values_categorical, columns=sort_partitions_by))

    if partition_on:
        mp = mp.partition_on(partition_on)

    if inferred_indices:
        mp = mp.build_indices(inferred_indices)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)

    return update_dataset_from_partitions(
        mp,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
Example No. 11
def update_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    delete_scope: Optional[Iterable[Mapping[str, str]]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    default_metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    factory: Optional[DatasetFactory] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
):
    """
    Update a dataset from a dask.dataframe.

    See Also
    --------
    :ref:`mutating_datasets`
    """
    if table is None:
        raise TypeError("The parameter `table` is not optional.")

    # normalization done by normalize_args but mypy doesn't recognize this
    sort_partitions_by = cast(List[str], sort_partitions_by)
    secondary_indices = cast(List[str], secondary_indices)
    bucket_by = cast(List[str], bucket_by)
    partition_on = cast(List[str], partition_on)

    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
        ds_factory=factory,
    )

    inferred_indices = _ensure_compatible_indices(ds_factory,
                                                  secondary_indices)
    del secondary_indices

    mp_ser = _write_dataframe_partitions(
        ddf=ddf,
        store=ds_factory.store_factory if ds_factory else store,
        dataset_uuid=dataset_uuid or ds_factory.dataset_uuid,
        table=table,
        secondary_indices=inferred_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=cast(List[str], partition_on),
        bucket_by=bucket_by,
    )

    return mp_ser.reduction(
        chunk=_id,
        aggregate=_commit_update_from_reduction,
        split_every=False,
        token="commit-dataset",
        meta=object,
        aggregate_kwargs={
            "store_factory": store,
            "dataset_uuid": dataset_uuid,
            "ds_factory": ds_factory,
            "delete_scope": delete_scope,
            "metadata": metadata,
            "metadata_merger": metadata_merger,
        },
    )