Example 1
def test_raises_on_invalid_input(store_factory, bound_update_dataset):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1, 2]}))]
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2, 3]}))]
        },
    ]

    dataset = store_dataframes_as_dataset(dfs=partitions,
                                          store=store_factory,
                                          dataset_uuid=dataset_uuid)

    with pytest.raises(Exception):
        new_partitions = [({"stuff"}, [("something", {1, 2,
                                                      3})])]  # invalid format
        bound_update_dataset(new_partitions,
                             store=store_factory,
                             dataset_uuid=dataset_uuid)

    # Check no new partitions have been written to storage
    mps = read_dataset_as_metapartitions(store=store_factory,
                                         dataset_uuid=dataset_uuid)
    assert len(mps) == len(dataset.partitions)
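
The three excerpts rely on names that are imported elsewhere in the test modules, and on pytest fixtures (store_factory, bound_update_dataset, bound_build_dataset_indices, metadata_version, mock_default_metadata_version, backend_identifier) defined in the suite's conftest. A plausible import block for the non-fixture names, assuming kartothek's eager IO and core modules (exact paths may vary between kartothek versions):

import pandas as pd
import pytest

# Assumed kartothek import paths; verify against the installed version.
from kartothek.core.factory import DatasetFactory
from kartothek.core.index import ExplicitSecondaryIndex
from kartothek.core.naming import DEFAULT_METADATA_VERSION
from kartothek.io.eager import (
    read_dataset_as_metapartitions,
    store_dataframes_as_dataset,
)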
Example 2
def test_add_column_to_existing_index(store_factory, metadata_version,
                                      bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({
                "p": [1, 2],
                "x": [100, 4500]
            }))],
            "indices": {
                "p":
                ExplicitSecondaryIndex("p",
                                       index_dct={
                                           1: ["cluster_1"],
                                           2: ["cluster_1"]
                                       })
            },
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({
                "p": [4, 3],
                "x": [500, 10]
            }))],
            "indices": {
                "p":
                ExplicitSecondaryIndex("p",
                                       index_dct={
                                           4: ["cluster_2"],
                                           3: ["cluster_2"]
                                       })
            },
        },
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    assert dataset.load_all_indices(store=store_factory()).indices.keys() == {"p"}

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["x"])

    # Assert indices are properly created
    mps = read_dataset_as_metapartitions(store=store_factory,
                                         dataset_uuid=dataset_uuid)
    for column_name in ["p", "x"]:
        assert all([mp.indices[column_name] for mp in mps])

    dataset_factory = DatasetFactory(dataset_uuid,
                                     store_factory,
                                     load_all_indices=True)
    assert dataset_factory.indices.keys() == {"p", "x"}
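
As a hedged follow-up sketch (not part of the original test): once the secondary index on "x" has been built, eager reads can filter on that column via predicate pushdown. The read_table helper, its table argument, and the predicate syntax below are assumptions based on kartothek's 3.x eager IO API.

from kartothek.io.eager import read_table

# Predicates are a list of AND-blocks that are OR-ed together; here a single
# equality filter that can be served from the freshly built "x" index.
df = read_table(
    dataset_uuid="dataset_uuid",
    store=store_factory,
    table="core",
    predicates=[[("x", "==", 100)]],
)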
Example 3
def test_metadata_version(
    store_factory,
    bound_update_dataset,
    mock_default_metadata_version,
    backend_identifier,
):
    if backend_identifier in ("dask.dataframe", "dask.delayed"):
        # TODO: fix `io.dask.*.test_update._update_dataset`
        pytest.skip("TODO: fix io.dask.*.test_update._update_dataset")

    dataset_uuid = "dataset_uuid"
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1, 2]}))]
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2, 3]}))]
        },
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=DEFAULT_METADATA_VERSION,
    )

    with pytest.raises(AssertionError, match="Traversed through mock"):
        # Try to commit data to the dataset using a different metadata version
        # and a different data format (the format is mocked).
        # This would not raise if `parse_input_to_metapartition` were passed
        # `default_metadata_version` instead of `metadata_version`.
        new_partitions = ("core", pd.DataFrame({"p": [2, 3]}))
        bound_update_dataset(
            new_partitions,
            store=store_factory,
            dataset_uuid=dataset_uuid,
            default_metadata_version=mock_default_metadata_version,
        )

    mps = read_dataset_as_metapartitions(store=store_factory,
                                         dataset_uuid=dataset_uuid)
    assert len(mps) == len(dataset.partitions)
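
The mock_default_metadata_version fixture comes from the suite's conftest and is not shown here. Below is a purely hypothetical sketch of such a fixture, assuming it yields a sentinel metadata version and patches kartothek's parse_input_to_metapartition so that forwarding the sentinel fails with the message the test matches on; the patched target and the mechanism are assumptions, not the project's actual conftest.

import pytest


@pytest.fixture
def mock_default_metadata_version(monkeypatch):
    # Hypothetical sentinel version that no real dataset would use.
    mocked_version = 10**6

    import kartothek.io_components.metapartition as metapartition_module

    original = metapartition_module.parse_input_to_metapartition

    def _guarded(*args, **kwargs):
        # Fail loudly if the sentinel version reaches the parser.
        assert kwargs.get("metadata_version") != mocked_version, "Traversed through mock"
        return original(*args, **kwargs)

    monkeypatch.setattr(
        metapartition_module, "parse_input_to_metapartition", _guarded
    )
    return mocked_version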