Example #1
def test_concat_metapartition_wrong_types(df_all_types):
    mp1 = MetaPartition(label="first", data=df_all_types, metadata_version=4)
    df_corrupt = df_all_types.copy()
    df_corrupt["int8"] = "NoInteger"
    mp2 = MetaPartition(label="second", data=df_corrupt, metadata_version=4)

    with pytest.raises(ValueError, match="Schema violation"):
        MetaPartition.concat_metapartitions([mp1, mp2])
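
For reference, the test snippets in this listing share a preamble that the page omits. A minimal sketch, assuming the import path referenced in the docstrings below and a df_all_types pytest fixture that supplies a DataFrame with one column per supported dtype (e.g. int8, float8):

import pandas as pd
import pandas.testing as pdt
import pytest

from kartothek.io_components.metapartition import MetaPartition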
Example #2
def test_concat_metapartition_different_partitioning(df_all_types):
    mp1 = MetaPartition(
        label="int8=1/1234",
        data=df_all_types,
        metadata_version=4,
        partition_keys=["int8"],
    )
    mp2 = MetaPartition(
        label="float8=1.0/4321",
        data=df_all_types,
        metadata_version=4,
        partition_keys=["float8"],
    )

    with pytest.raises(ValueError, match="Schema violation"):
        MetaPartition.concat_metapartitions([mp1, mp2])
Example #3
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals=None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """

    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )

    for mp in mps:
        if dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [
                    mp_inner.load_dataframes(
                        store=store,
                        columns=columns,
                        categoricals=categoricals,
                        predicate_pushdown_to_io=predicate_pushdown_to_io,
                        predicates=predicates,
                    )
                    for mp_inner in mp
                ]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(
                store=store,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
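
A hypothetical way to consume this iterator; store_factory (a zero-argument callable returning a simplekv-style store) and the dataset UUID below are assumptions, not part of the listing:

# each yielded mp is a MetaPartition with its DataFrame already loaded
for mp in read_dataset_as_metapartitions__iterator(
    dataset_uuid="example_uuid", store=store_factory
):
    print(mp.label, len(mp.data))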
Example #4
def test_concat_metapartition(df_all_types):
    mp1 = MetaPartition(label="first", data=df_all_types, metadata_version=4)
    mp2 = MetaPartition(label="second", data=df_all_types, metadata_version=4)

    new_mp = MetaPartition.concat_metapartitions([mp1, mp2])

    # the exact label doesn't matter, only that one is set
    assert new_mp.label is not None
    df_expected = pd.concat([df_all_types, df_all_types])
    df_actual = new_mp.data
    pdt.assert_frame_equal(df_actual, df_expected)
Example #5
def test_concat_metapartition_categoricals(df_all_types):
    mp1 = MetaPartition(
        label="first",
        data=pd.DataFrame({"a": [0, 0], "b": ["a", "a"]}, dtype="category"),
        metadata_version=4,
        partition_keys=["a"],
    )
    mp2 = MetaPartition(
        label="second",
        data=pd.DataFrame({"a": [1, 1], "b": ["a", "b"]}, dtype="category"),
        metadata_version=4,
        partition_keys=["a"],
    )

    new_mp = MetaPartition.concat_metapartitions([mp1, mp2])

    assert new_mp.table_name == "table"
    assert pd.api.types.is_categorical_dtype(new_mp.data["b"].dtype)
Example #6
def test_concat_metapartition_partitioned(df_all_types):
    mp1 = MetaPartition(
        label="int8=1/1234",
        data=df_all_types,
        metadata_version=4,
        partition_keys=["int8"],
    )
    mp2 = MetaPartition(
        label="int8=1/4321",
        data=df_all_types,
        metadata_version=4,
        partition_keys=["int8"],
    )

    new_mp = MetaPartition.concat_metapartitions([mp1, mp2])

    df_expected = pd.concat([df_all_types, df_all_types])
    df_actual = new_mp.data
    pdt.assert_frame_equal(df_actual, df_expected)

    assert new_mp.partition_keys == ["int8"]
Example #7
def _load_and_concat_metapartitions_inner(mps, args, kwargs):
    return MetaPartition.concat_metapartitions(
        [mp.load_dataframes(*args, **kwargs) for mp in mps])
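
A hypothetical call site for this helper; store, mp1, and mp2 are assumptions. A group of metapartitions (for example, one group yielded by a dispatch_by grouping) is loaded and concatenated in a single step:

merged = _load_and_concat_metapartitions_inner(
    [mp1, mp2],        # metapartitions belonging to the same group
    (),                # positional args forwarded to load_dataframes
    {"store": store},  # keyword args forwarded to load_dataframes
)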
Example #8
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
):
    """

    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`
    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
    )

    for mp in mps:
        if concat_partitions_on_primary_index:
            mp = MetaPartition.concat_metapartitions([
                mp_inner.load_dataframes(
                    store=store,
                    tables=tables,
                    columns=columns,
                    categoricals=categoricals,
                    predicate_pushdown_to_io=predicate_pushdown_to_io,
                    predicates=predicates,
                ) for mp_inner in mp
            ])
        else:
            mp = mp.load_dataframes(
                store=store,
                tables=tables,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
Example #9
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """

    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso:

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    for mp in mps:
        if concat_partitions_on_primary_index or dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [
                    mp_inner.load_dataframes(
                        store=store,
                        tables=tables,
                        columns=columns,
                        categoricals=categoricals,
                        predicate_pushdown_to_io=predicate_pushdown_to_io,
                        predicates=predicates,
                    )
                    for mp_inner in mp
                ]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(
                store=store,
                tables=tables,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp