Example 1
 def test_raises_other_index_missing(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data={
                 SINGLE_TABLE: pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]})
             },
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ).build_indices(["x", "y"]),
         name=cube.seed_dataset,
     )
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data={
                 SINGLE_TABLE: pd.DataFrame(
                     {"x": [0], "y": [0], "p": [0], "q": [0], "i1": [1337]}
                 )
             },
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ),
         name="enrich",
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (
         str(exc.value)
         == 'ExplicitSecondaryIndex or PartitionIndex "i1" is missing in dataset "enrich".'
     )
Example 2
 def test_accepts_projected_datasets(self, cube, function_store):
     expected = {
         cube.seed_dataset:
         store_data(
             cube=cube,
             function_store=function_store,
             df=MetaPartition(
                 label=gen_uuid(),
                 data=pd.DataFrame({
                     "x": [0],
                     "y": [0],
                     "p": [0],
                     "q": [0]
                 }),
                 metadata_version=KTK_CUBE_METADATA_VERSION,
             ).build_indices(["x", "y"]),
             name=cube.seed_dataset,
         ),
         "x":
         store_data(
             cube=cube,
             function_store=function_store,
             df=MetaPartition(
                 label=gen_uuid(),
                 data=pd.DataFrame({
                     "x": [0],
                     "p": [0],
                     "q": [0],
                     "v1": [42]
                 }),
                 metadata_version=KTK_CUBE_METADATA_VERSION,
             ),
             name="x",
         ),
         "y":
         store_data(
             cube=cube,
             function_store=function_store,
             df=MetaPartition(
                 label=gen_uuid(),
                 data=pd.DataFrame({
                     "y": [0],
                     "p": [0],
                     "q": [0],
                     "v2": [42]
                 }),
                 metadata_version=KTK_CUBE_METADATA_VERSION,
             ),
             name="y",
         ),
     }
     actual = discover_datasets(cube, function_store)
     assert_datasets_equal(actual, expected)
Example 3
 def test_accepts_partition_index_for_index(self, cube, function_store):
     expected = {
         cube.seed_dataset: store_data(
             cube=cube,
             function_store=function_store,
             df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v1": [0]}),
             name=cube.seed_dataset,
         ),
         "enrich": store_data(
             cube=cube,
             function_store=function_store,
             df=MetaPartition(
                 label=gen_uuid(),
                 data={
                     SINGLE_TABLE: pd.DataFrame(
                         {"x": [0], "y": [0], "i1": [1337], "v2": [42]}
                     )
                 },
                 metadata_version=KTK_CUBE_METADATA_VERSION,
             ),
             name="enrich",
             partition_on=["i1"],
         ),
     }
     actual = discover_datasets(cube, function_store)
     assert_datasets_equal(actual, expected)
Example 4
def test_store_dataframes_as_dataset_mp(metadata_version, store):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })

    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={
            "core": df,
            "helper": df2
        },
        metadata_version=metadata_version,
    )

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="dataset_uuid",
        dfs=mp,
        metadata_version=metadata_version,
    )

    assert isinstance(dataset, DatasetMetadata)
    assert len(dataset.partitions) == 1
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset == stored_dataset
Example 5
def test_store_dataframes_as_dataset_mp_partition_on_none(
    metadata_version, store, store_factory, bound_store_dataframes
):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )

    df2 = pd.DataFrame({"P": np.arange(0, 10), "info": np.arange(100, 110)})

    mp = MetaPartition(
        label=gen_uuid(),
        data={"core": df, "helper": df2},
        metadata_version=metadata_version,
    )

    df_list = [None, mp]
    dataset = bound_store_dataframes(
        df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
        partition_on=["P"],
    )

    assert isinstance(dataset, DatasetMetadata)
    assert dataset.partition_keys == ["P"]
    assert len(dataset.partitions) == 10
    assert dataset.metadata_version == metadata_version

    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)

    assert dataset == stored_dataset
Example 6
def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store `pd.DataFrame` s iteratively as a partitioned dataset with multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.

    """

    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df,
                                          metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store dataframe, thereby clearing up the dataframe from the `mp` metapartition
        mp = mp.store_dataframes(store=store,
                                 dataset_uuid=dataset_uuid,
                                 df_serializer=df_serializer)

        # Add `kartothek.io_components.metapartition.MetaPartition` object to list to track partitions
        new_partitions.append(mp)

    # Store metadata and return `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
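
A minimal usage sketch for the iterator variant above (not from the source; store is assumed to be a simplekv-style store object, as in the other examples): the generator yields one DataFrame at a time, so only a single partition has to fit into memory.

def df_generator():
    # each yielded DataFrame is parsed into its own MetaPartition / physical file
    for i in range(3):
        yield pd.DataFrame({"P": [i], "TARGET": [i * 10]})

dataset = store_dataframes_as_dataset__iter(
    df_generator(),
    store=store,
    dataset_uuid="dataset_uuid_iter",
    partition_on=["P"],
)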
Example 7
def store_delayed_as_dataset(
    delayed_tasks: List[Delayed],
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    table_name: str = SINGLE_TABLE,
    secondary_indices=None,
) -> Delayed:
    """
    Transform and store a list of dictionaries containing
    dataframes as a kartothek dataset in the given store.

    Parameters
    ----------
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(
        parse_input_to_metapartition,
        metadata_version=metadata_version,
        table_name=table_name,
    )
    mps = map_delayed(input_to_mps, delayed_tasks)

    if partition_on:
        mps = map_delayed(MetaPartition.partition_on, mps, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(MetaPartition.build_indices, mps, columns=secondary_indices)

    mps = map_delayed(
        MetaPartition.store_dataframes,
        mps,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
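
A hedged sketch of calling the delayed variant above (assumptions: store_factory is a callable returning a simplekv-style store, which lazy_store accepts, and a dask scheduler is available to compute the result):

from dask import delayed

# every delayed task produces one DataFrame, i.e. one input partition
tasks = [delayed(pd.DataFrame)({"x": [i], "v1": [i * 2]}) for i in range(3)]

graph = store_delayed_as_dataset(
    delayed_tasks=tasks,
    store=store_factory,
    dataset_uuid="delayed_dataset_uuid",
    partition_on=["x"],
    secondary_indices=["v1"],
)
dataset = graph.compute()  # executes the store operations and returns the DatasetMetadata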
Example 8
def store_data(
    cube,
    function_store,
    df,
    name,
    partition_on="default",
    metadata_version=KTK_CUBE_METADATA_VERSION,
    metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
    metadata=None,
    overwrite=False,
    new_ktk_cube_metadata=True,
    write_suppress_index_on=True,
):
    if partition_on == "default":
        partition_on = cube.partition_columns

    if isinstance(df, pd.DataFrame):
        mp = MetaPartition(
            label=gen_uuid(), data={SINGLE_TABLE: df}, metadata_version=metadata_version
        )

        indices_to_build = set(cube.index_columns) & set(df.columns)
        if name == cube.seed_dataset:
            indices_to_build |= set(cube.dimension_columns) - set(
                cube.suppress_index_on
            )
        mp = mp.build_indices(indices_to_build)
        dfs = mp
    else:
        assert isinstance(df, MetaPartition)
        assert df.metadata_version == metadata_version
        dfs = df

    if metadata is None:
        metadata = {
            KTK_CUBE_METADATA_DIMENSION_COLUMNS: cube.dimension_columns,
            KTK_CUBE_METADATA_KEY_IS_SEED: (name == cube.seed_dataset),
        }
        if new_ktk_cube_metadata:
            metadata.update(
                {KTK_CUBE_METADATA_PARTITION_COLUMNS: cube.partition_columns}
            )
        if write_suppress_index_on:
            metadata.update(
                {KTK_CUBE_METADATA_SUPPRESS_INDEX_ON: list(cube.suppress_index_on)}
            )

    return store_dataframes_as_dataset(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid(name),
        dfs=dfs,
        partition_on=list(partition_on) if partition_on else None,
        metadata_storage_format=metadata_storage_format,
        metadata_version=metadata_version,
        df_serializer=KTK_CUBE_DF_SERIALIZER,
        metadata=metadata,
        overwrite=overwrite,
    )
Example 9
def parse_input_to_metapartition(
    obj: MetaPartitionInput,
    table_name: str = SINGLE_TABLE,
    metadata_version: Optional[int] = None,
) -> MetaPartition:
    """
    Parse the given user input and return a MetaPartition.

    The expected input is a :class:`pandas.DataFrame` or a list of
    :class:`pandas.DataFrame`.

    Every element of the list will be treated as a dedicated user input and will
    result in a physical file, if not specified otherwise.

    Parameters
    ----------
    obj
    table_name
        The table name assigned to the partitions
    metadata_version
        The kartothek dataset specification version
    """

    if obj is None:
        obj = []
    if isinstance(obj, list):
        if len(obj) == 0:
            return MetaPartition(label=None, metadata_version=metadata_version)
        first_element = obj[0]
        mp = parse_input_to_metapartition(
            obj=first_element,
            metadata_version=metadata_version,
            table_name=table_name,
        )
        for mp_in in obj[1:]:
            mp = mp.add_metapartition(
                parse_input_to_metapartition(
                    obj=mp_in,
                    metadata_version=metadata_version,
                    table_name=table_name,
                ))
    elif isinstance(obj, pd.DataFrame):
        mp = MetaPartition(
            label=gen_uuid(),
            data=obj,
            metadata_version=metadata_version,
            table_name=table_name,
        )
    elif isinstance(obj, MetaPartition):
        return obj
    else:
        raise ValueError(
            f"Unexpected type during parsing encountered: ({type(obj)}, {obj})"
        )

    return mp
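
For illustration (not from the source): the parser can be exercised directly. A single DataFrame becomes a one-partition MetaPartition with a generated label, while each element of a list is parsed and merged via add_metapartition; metadata version 4 is assumed here.

# single DataFrame -> one MetaPartition
single_mp = parse_input_to_metapartition(
    pd.DataFrame({"x": [1], "v1": [2]}),
    metadata_version=4,
)

# list of DataFrames -> partitions folded together via add_metapartition
combined_mp = parse_input_to_metapartition(
    [pd.DataFrame({"x": [1]}), pd.DataFrame({"x": [2]})],
    metadata_version=4,
)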
Example 10
def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed, it will be stored as the `core` table.

    Returns
    -------
    The stored dataset

    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version)

    if partition_on:
        mp = MetaPartition.partition_on(mp, partition_on)

    mps = mp.store_dataframes(store=store,
                              dataset_uuid=dataset_uuid,
                              df_serializer=df_serializer)

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
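
A small sketch of the eager helper above (assuming store is a simplekv-style store): passing a dict stores each entry as a separate table of the same partition, and partition_on splits both tables by the shared column.

core_df = pd.DataFrame({"P": [1, 2], "TARGET": [10, 20]})
helper_df = pd.DataFrame({"P": [1, 2], "info": ["a", "b"]})

dataset = store_dataframes_as_dataset(
    store=store,
    dataset_uuid="small_dataset_uuid",
    dfs={"core": core_df, "helper": helper_df},
    partition_on=["P"],
)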
Example 11
 def test_raises_missing_dimension_columns(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data={SINGLE_TABLE: pd.DataFrame({"x": [0], "p": [0], "q": [0]})},
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ).build_indices(["x"]),
         name=cube.seed_dataset,
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (
         str(exc.value) == 'Seed dataset "myseed" has missing dimension columns: y'
     )
Example 12
 def test_raises_wrong_table(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data={"foo": pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]})},
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ),
         name=cube.seed_dataset,
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (
         str(exc.value)
         == "Invalid datasets because table is wrong. Expected table: myseed (foo)"
     )
Example 13
 def test_raises_dimension_index_missing(self, cube, function_store):
     store_data(
         cube=cube,
         function_store=function_store,
         df=MetaPartition(
             label=gen_uuid(),
             data=pd.DataFrame({
                 "x": [0],
                 "y": [0],
                 "p": [0],
                 "q": [0]
             }),
             metadata_version=KTK_CUBE_METADATA_VERSION,
         ),
         name=cube.seed_dataset,
     )
     with pytest.raises(ValueError) as exc:
         discover_datasets(cube, function_store)
     assert (str(exc.value) ==
             'ExplicitSecondaryIndex "x" is missing in dataset "myseed".')
Example 14
def prepare_data_for_ktk(df,
                         ktk_cube_dataset_id,
                         cube,
                         existing_payload,
                         partition_on,
                         consume_df=False):
    """
    Prepare data so it can be handed over to Kartothek.

    Some checks will be applied to the data to ensure it is sane.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame to be passed to Kartothek.
    ktk_cube_dataset_id: str
        Ktk_cube dataset UUID (w/o cube prefix).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    existing_payload: Set[str]
        Existing payload columns.
    partition_on: Iterable[str]
        Partition-on attribute for given dataset.
    consume_df: bool
        Whether the incoming DataFrame can be destroyed while processing it.

    Returns
    -------
    mp: kartothek.io_components.metapartition.MetaPartition
        Kartothek-ready MetaPartition, may be sentinel (aka empty and w/o label).

    Raises
    ------
    ValueError
        In case anything is fishy.
    """
    check_user_df(ktk_cube_dataset_id, df, cube, existing_payload,
                  partition_on)

    if (df is None) or df.empty:
        # fast-path for empty DF
        return MetaPartition(
            label=None,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            partition_keys=list(partition_on),
        )

    # TODO: find a more elegant solution that works w/o copy
    df_orig = df
    df = df.copy()
    if consume_df:
        # the original df is still referenced in the parent scope, so drop it
        df_orig.drop(columns=df_orig.columns,
                     index=df_orig.index,
                     inplace=True)
    df_columns = list(df.columns)
    df_columns_set = set(df_columns)

    # normalize value order and reset index
    sort_keys = [
        col for col in itertools.chain(cube.partition_columns,
                                       cube.dimension_columns)
        if col in df_columns_set
    ]
    df = sort_dataframe(df=df, columns=sort_keys)

    # check duplicate cells
    _check_duplicates(ktk_cube_dataset_id, df, sort_keys, cube)

    # check+convert column names to unicode strings
    df.rename(columns={c: converter_str(c) for c in df_columns}, inplace=True)

    # create MetaPartition object for easier handling
    mp = MetaPartition(
        label=gen_uuid(),
        data=df,
        metadata_version=KTK_CUBE_METADATA_VERSION,
    )
    del df

    # partition data
    mp = mp.partition_on(list(partition_on))

    # reset indices again (because partition_on breaks it)
    for mp2 in mp:
        mp2.data.reset_index(drop=True, inplace=True)
        del mp2

    # calculate indices
    indices_to_build = set(cube.index_columns) & df_columns_set
    if ktk_cube_dataset_id == cube.seed_dataset:
        indices_to_build |= set(cube.dimension_columns) - set(
            cube.suppress_index_on)
    indices_to_build -= set(partition_on)

    mp = mp.build_indices(indices_to_build)

    return mp
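
An illustrative call (not from the source), assuming a cube object like the `cube` fixture used in the tests above (dimension columns x/y, partition columns p/q) and no pre-existing payload:

df = pd.DataFrame(
    {"x": [0, 1], "y": [0, 1], "p": [0, 0], "q": [0, 0], "v1": [10, 20]}
)
mp = prepare_data_for_ktk(
    df=df,
    ktk_cube_dataset_id=cube.seed_dataset,
    cube=cube,
    existing_payload=set(),
    partition_on=cube.partition_columns,
)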
Example 15
def store_delayed_as_dataset(
    delayed_tasks,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a list of dictionaries containing
    dataframes as a kartothek dataset in the given store.

    Parameters
    ----------
    delayed_tasks: list of dask.delayed
        Every delayed object represents a partition and should be accepted by
        :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`


    Returns
    -------
    A dask.delayed dataset object.
    """
    _check_callable(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    input_to_mps = partial(parse_input_to_metapartition,
                           metadata_version=metadata_version)
    mps = map_delayed(delayed_tasks, input_to_mps)

    if partition_on:
        mps = map_delayed(mps,
                          MetaPartition.partition_on,
                          partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(mps,
                          MetaPartition.build_indices,
                          columns=secondary_indices)

    mps = map_delayed(
        mps,
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
Example 16
def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a dask.bag of dictionaries containing
    dataframes as a kartothek dataset in the given store.

    This is the dask.bag-equivalent of
    :func:`~kartothek.io.dask.delayed.store_delayed_as_dataset`. See there
    for more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag.Bag
        A dask bag containing dictionaries of dataframes or dataframes.

    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(parse_input_to_metapartition,
                           metadata_version=metadata_version)
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

    return mps.reduction(perpartition=list,
                         aggregate=aggregate,
                         split_every=False)
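
A sketch for the bag variant (same assumptions as for the delayed variant: store_factory is accepted by lazy_store and a dask scheduler is available):

import dask.bag as db

bag = db.from_sequence(
    [pd.DataFrame({"x": [i], "v1": [i * 10]}) for i in range(4)],
    npartitions=2,
)

graph = store_bag_as_dataset(
    bag=bag,
    store=store_factory,
    dataset_uuid="bag_dataset_uuid",
    secondary_indices=["x"],
)
dataset = graph.compute()  # a single DatasetMetadata object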
Example 17
 def _f(b_c):
     b, c = b_c
     df = pd.DataFrame({"a": [1, 1], "b": [b, b], "c": c, "d": [b, b + 1]})
     return {"label": gen_uuid(), "data": [(table_name, df)]}