Example 1
# Imports assumed from the kartothek code base these examples come from
# (module paths may differ between versions).
import pandas as pd

from kartothek.io.dask.bag import (
    read_dataset_as_dataframe_bag,
    read_dataset_as_metapartitions_bag,
)
from kartothek.io.iter import store_dataframes_as_dataset__iter


def test_read_dataset_as_dataframes_partition_size(store_factory, metadata_version):
    cluster1 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [1, 2], "Content": ["cluster1", "cluster1"]}
    )
    cluster2 = pd.DataFrame(
        {"A": [1, 1], "B": [10, 10], "C": [2, 3], "Content": ["cluster2", "cluster2"]}
    )
    cluster3 = pd.DataFrame({"A": [1], "B": [20], "C": [1], "Content": ["cluster3"]})
    cluster4 = pd.DataFrame(
        {"A": [2, 2], "B": [10, 10], "C": [1, 2], "Content": ["cluster4", "cluster4"]}
    )
    clusters = [cluster1, cluster2, cluster3, cluster4]
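    # Wrap each dataframe as a single-table ("data") partition for the iter API.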
    partitions = [{"data": [("data", c)]} for c in clusters]

    store_dataframes_as_dataset__iter(
        df_generator=partitions,
        store=store_factory,
        dataset_uuid="partitioned_uuid",
        metadata_version=metadata_version,
    )
    for func in [read_dataset_as_dataframe_bag, read_dataset_as_metapartitions_bag]:
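        # partition_size=None yields one dask bag partition per stored partition.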
        bag = func(
            dataset_uuid="partitioned_uuid", store=store_factory, partition_size=None
        )
        assert bag.npartitions == 4
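        # partition_size=2 packs the four stored partitions into two bag partitions.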
        bag = func(
            dataset_uuid="partitioned_uuid", store=store_factory, partition_size=2
        )
        assert bag.npartitions == 2
Example 2
import pandas as pd

from kartothek.io.iter import store_dataframes_as_dataset__iter


def dataset_dispatch_by(
    metadata_version, store_session_factory, dataset_dispatch_by_uuid
):
    cluster1 = pd.DataFrame({
        "A": [1, 1],
        "B": [10, 10],
        "C": [1, 2],
        "Content": ["cluster1", "cluster1"]
    })
    cluster2 = pd.DataFrame({
        "A": [1, 1],
        "B": [10, 10],
        "C": [2, 3],
        "Content": ["cluster2", "cluster2"]
    })
    cluster3 = pd.DataFrame({
        "A": [1],
        "B": [20],
        "C": [1],
        "Content": ["cluster3"]
    })
    cluster4 = pd.DataFrame({
        "A": [2, 2],
        "B": [10, 10],
        "C": [1, 2],
        "Content": ["cluster4", "cluster4"]
    })
    clusters = [cluster1, cluster2, cluster3, cluster4]

    partitions = [{"data": [("data", c)]} for c in clusters]

    store_dataframes_as_dataset__iter(
        df_generator=partitions,
        store=store_session_factory,
        dataset_uuid=dataset_dispatch_by_uuid,
        metadata_version=metadata_version,
        partition_on=["A", "B"],
        secondary_indices=["C"],
    )
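    # Return the expected dataset contents, sorted, for assertions in tests.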
    return pd.concat(clusters).sort_values(["A", "B", "C"]).reset_index(drop=True)
Example 3
import warnings

import pandas as pd

# Constant locations assumed from the kartothek code base.
from kartothek.core.naming import (
    DEFAULT_METADATA_STORAGE_FORMAT,
    DEFAULT_METADATA_VERSION,
)
from kartothek.io.iter import store_dataframes_as_dataset__iter


def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    secondary_indices=None,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs: List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]
        The dataframe(s) to be stored.

    Returns
    -------
    kartothek.core.dataset.DatasetMetadata
        The metadata object of the stored dataset.

    """
    if isinstance(dfs, (pd.DataFrame, dict)):
        dfs = [dfs]
        warnings.warn(
            "Passing a single dataframe instead of an iterable is deprecated and may "
            "be removed in the next major release.",
            DeprecationWarning,
        )

    return store_dataframes_as_dataset__iter(
        dfs,
        store=store,
        dataset_uuid=dataset_uuid,
        metadata=metadata,
        partition_on=partition_on,
        df_serializer=df_serializer,
        overwrite=overwrite,
        secondary_indices=secondary_indices,
        metadata_storage_format=metadata_storage_format,
        metadata_version=metadata_version,
    )
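
A minimal usage sketch of the utility above. The store factory, dataset UUID, and
dataframe here are hypothetical stand-ins for illustration, not part of the original
example; storefact's "hmemory://" URL gives an in-memory store as used in the
kartothek documentation.

import pandas as pd
import storefact

def store_factory():
    # Hypothetical in-memory store for illustration only.
    return storefact.get_store_from_url("hmemory://")

df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})

dataset = store_dataframes_as_dataset(
    store=store_factory,
    dataset_uuid="example_uuid",  # hypothetical identifier
    dfs=[df],  # note: a list, not a bare dataframe (see deprecation above)
    partition_on=["A"],
)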
Example 4
from itertools import permutations

import pandas as pd
import pytest

from kartothek.io.iter import store_dataframes_as_dataset__iter
from kartothek.io_components.metapartition import MetaPartition


def test_read_dataset_as_dataframes_dispatch_by_multi_col(
    store_factory,
    bound_load_dataframes,
    backend_identifier,
    output_type,
    metadata_version,
):
    if output_type == "table":
        pytest.skip()
    cluster1 = pd.DataFrame({
        "A": [1, 1],
        "B": [10, 10],
        "C": [1, 2],
        "Content": ["cluster1", "cluster1"]
    })
    cluster2 = pd.DataFrame({
        "A": [1, 1],
        "B": [10, 10],
        "C": [2, 3],
        "Content": ["cluster2", "cluster2"]
    })
    cluster3 = pd.DataFrame({
        "A": [1],
        "B": [20],
        "C": [1],
        "Content": ["cluster3"]
    })
    cluster4 = pd.DataFrame({
        "A": [2, 2],
        "B": [10, 10],
        "C": [1, 2],
        "Content": ["cluster4", "cluster4"]
    })
    clusters = [cluster1, cluster2, cluster3, cluster4]
    partitions = [{"data": [("data", c)]} for c in clusters]

    store_dataframes_as_dataset__iter(
        df_generator=partitions,
        store=store_factory,
        dataset_uuid="partitioned_uuid",
        metadata_version=metadata_version,
        partition_on=["A", "B"],
        secondary_indices=["C"],
    )
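    # Check every ordered pair of indexed columns as a composite dispatch key.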
    for dispatch_by in permutations(("A", "B", "C"), 2):
        dispatched = bound_load_dataframes(
            dataset_uuid="partitioned_uuid",
            store=store_factory,
            dispatch_by=dispatch_by,
        )
        uniques = []
        for part in dispatched:
            if isinstance(part, MetaPartition):
                data = part.data["data"]
            else:
                data = part["data"]
            unique_dispatch = data[list(dispatch_by)].drop_duplicates()
            assert len(unique_dispatch) == 1
            uniques.append(unique_dispatch)
        # The original called DataFrame.append() without keeping the result, so
        # the duplicate check never saw any rows; collect and concatenate instead.
        uniques = pd.concat(uniques, ignore_index=True)
        assert not uniques.duplicated().any()
Example 5
import pandas as pd
import pytest

from kartothek.io.iter import store_dataframes_as_dataset__iter
from kartothek.io_components.metapartition import MetaPartition


def test_read_dataset_as_dataframes_dispatch_by_single_col(
    store_factory,
    bound_load_dataframes,
    backend_identifier,
    dispatch_by,
    output_type,
    metadata_version,
):
    if output_type == "table":
        pytest.skip()
    cluster1 = pd.DataFrame({
        "A": [1, 1],
        "B": [10, 10],
        "C": [1, 2],
        "Content": ["cluster1", "cluster1"]
    })
    cluster2 = pd.DataFrame({
        "A": [1, 1],
        "B": [10, 10],
        "C": [2, 3],
        "Content": ["cluster2", "cluster2"]
    })
    cluster3 = pd.DataFrame({
        "A": [1],
        "B": [20],
        "C": [1],
        "Content": ["cluster3"]
    })
    cluster4 = pd.DataFrame({
        "A": [2, 2],
        "B": [10, 10],
        "C": [1, 2],
        "Content": ["cluster4", "cluster4"]
    })
    clusters = [cluster1, cluster2, cluster3, cluster4]
    partitions = [{"data": [("data", c)]} for c in clusters]

    store_dataframes_as_dataset__iter(
        df_generator=partitions,
        store=store_factory,
        dataset_uuid="partitioned_uuid",
        metadata_version=metadata_version,
        partition_on=["A", "B"],
        secondary_indices=["C"],
    )

    # Dispatch by the parametrized column (a primary or secondary index)
    dispatched = bound_load_dataframes(
        dataset_uuid="partitioned_uuid", store=store_factory, dispatch_by=[dispatch_by]
    )

    seen_values = set()
    for part in dispatched:
        if isinstance(part, MetaPartition):
            data = part.data["data"]
        else:
            data = part["data"]
        unique_dispatch = data[dispatch_by].unique()
        assert len(unique_dispatch) == 1
        # The original dropped the assert here, turning the check into a no-op.
        assert unique_dispatch[0] not in seen_values
        seen_values.add(unique_dispatch[0])