Example #1
def test_write_single_partition(store_factory, mock_uuid, metadata_version):
    create_empty_dataset_header(
        store=store_factory(),
        table_meta={
            "table1": pd.DataFrame({"col": [1]}),
            "table2": pd.DataFrame({"other_col": ["a"]}),
        },
        dataset_uuid="some_dataset",
        metadata_version=metadata_version,
    )

    new_data = {
        "data": {
            "table1": pd.DataFrame({"col": [1, 2]}),
            "table2": pd.DataFrame({"other_col": ["a", "b"]}),
        }
    }
    keys_in_store = set(store_factory().keys())
    new_mp = write_single_partition(store=store_factory,
                                    dataset_uuid="some_dataset",
                                    data=new_data)

    keys_in_store.add("some_dataset/table1/auto_dataset_uuid.parquet")
    keys_in_store.add("some_dataset/table2/auto_dataset_uuid.parquet")
    assert set(store_factory().keys()) == keys_in_store
    expected_mp = MetaPartition(
        label="auto_dataset_uuid",  # this will be a hash of the input
        files={
            "table1": "some_dataset/table1/auto_dataset_uuid.parquet",
            "table2": "some_dataset/table2/auto_dataset_uuid.parquet",
        },
        metadata_version=4,
        table_meta={
            "table1":
            make_meta(pd.DataFrame({"col": [1, 2]}), origin="table1"),
            "table2":
            make_meta(pd.DataFrame({"other_col": ["a", "b"]}),
                      origin="table2"),
        },
    )

    assert new_mp == expected_mp

    with pytest.raises(ValueError):
        # col is an integer column so this is incompatible.
        new_data["data"]["table1"] = pd.DataFrame(
            {"col": [datetime.date(2010, 1, 1)]})
        write_single_partition(store=store_factory,
                               dataset_uuid="some_dataset",
                               data=new_data)
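
The snippets on this page are test functions taken from kartothek's eager IO test suite; they rely on pytest fixtures such as store, store_factory, mock_uuid, metadata_version and dataset_function. As a rough sketch, the imports they assume would look roughly like the block below (module paths follow kartothek's public API as far as I know and should be treated as an assumption, not a verified listing):

import datetime
from collections import OrderedDict

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from pandas.testing import assert_frame_equal

from kartothek.core.common_metadata import make_meta
from kartothek.core.dataset import DatasetMetadata
from kartothek.io.eager import (
    commit_dataset,
    create_empty_dataset_header,
    read_table,
    store_dataframes_as_dataset,
    write_single_partition,
)
from kartothek.io_components.metapartition import MetaPartition
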
Example #2
def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        table_meta={"core": make_meta(df, origin="1")},
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False
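    # write_single_partition stores the Parquet files but does not register the
    # partition in the dataset header; commit_dataset below takes care of that.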
    new_data = {"data": {"core": df}}
    new_metapartition = write_single_partition(store=store,
                                               dataset_uuid=dataset.uuid,
                                               data=new_data)

    new_partition = [{
        "label": new_metapartition.label,
        "data": [("core", None)]
    }]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True
    actual = read_table(store=store,
                        table="core",
                        dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))

    assert_frame_equal(df_expected, actual)
Example #3
def test_commit_dataset_from_nested_metapartition(store):
    """
    Check that it is possible to use `commit_dataset` with nested metapartitions as input.
    Original issue: https://github.com/JDASoftwareGroup/kartothek/issues/40
    """

    df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})

    create_empty_dataset_header(
        store=store,
        dataset_uuid="uuid",
        schema=make_meta(df, "table", ["a"]),
        partition_on=["a"],
    )

    partitions = []
    for x in range(2):
        partitions.append(
            write_single_partition(
                store=store,
                dataset_uuid="uuid",
                data=df,
                partition_on=["a"],
            ))

    partition_labels = {mp_.label for mp in partitions for mp_ in mp}
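    # commit_dataset accepts the nested MetaPartitions directly; each
    # sub-partition (one per unique value of "a") ends up as its own partition
    # label in the committed dataset.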
    dm = commit_dataset(store=store,
                        dataset_uuid="uuid",
                        new_partitions=partitions,
                        partition_on=["a"])
    assert dm.partitions.keys() == partition_labels
Example #4
def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        schema=make_meta(df, origin="1"),
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False
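    # Same flow as Example #2, but the header is created from a single schema
    # and the data is passed as a plain DataFrame rather than a table dict.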
    new_metapartition = write_single_partition(store=store,
                                               dataset_uuid=dataset.uuid,
                                               data=df)

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        # FIXME: is this breaking and if so, is it expected?
        new_partitions=[new_metapartition],
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True
    actual = read_table(store=store, dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))

    assert_frame_equal(df_expected, actual)
Example #5
def test_create_empty_header_from_pyarrow_schema(store_factory):
    # GH228
    df = pd.DataFrame(
        [{"part": 1, "id": 1, "col1": "abc"}, {"part": 2, "id": 2, "col1": np.nan}]
    )
    dataset_uuid = "sample_ds"
    schema = pa.Schema.from_pandas(df)

    dm = create_empty_dataset_header(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        table_meta={"table": schema},
        partition_on=["part"],
    )
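    # The header is built directly from the pyarrow schema; the dataset has no
    # partitions until the written partition is committed below.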

    new_partitions = [
        write_single_partition(
            store=store_factory,
            dataset_uuid=dataset_uuid,
            data=[{"table": df.loc[df["part"] == 1]}],
            partition_on=["part"],
        )
    ]
    assert len(dm.partitions) == 0
    dm = commit_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        new_partitions=new_partitions,
        partition_on=["part"],
    )

    assert len(dm.partitions) == 1
Example #6
def test_commit_dataset_from_metapartition(dataset_function, store):
    new_data = [
        pd.DataFrame(
            OrderedDict([
                ("P", [5]),
                ("L", [5]),
                ("TARGET", [5]),
                ("DATE", [datetime.date(2016, 3, 23)]),
            ]))
    ]
    new_partition = write_single_partition(store=store,
                                           dataset_uuid=dataset_function.uuid,
                                           data=new_data)
    pre_commit_dataset = DatasetMetadata.load_from_store(
        uuid=dataset_function.uuid, store=store)
    # The raw metadata may be serialized in a different order, but the parsed
    # DatasetMetadata objects still compare equal.
    assert pre_commit_dataset == dataset_function

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset_function.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset != dataset_function

    assert updated_dataset.uuid == dataset_function.uuid
    assert len(updated_dataset.partitions) == len(dataset_function.partitions) + 1

    # ensure that the new dataset is actually the one on disc
    loaded_dataset = DatasetMetadata.load_from_store(uuid=updated_dataset.uuid,
                                                     store=store)
    assert loaded_dataset == updated_dataset

    # Read the data and check whether the rows above are included.
    # This checks that all the necessary information was updated in the header
    # (e.g. the files attributes of the partitions).
    actual = read_table(store=store, dataset_uuid=dataset_function.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([
            (
                "DATE",
                [
                    datetime.date(2016, 3, 23),
                    datetime.date(2010, 1, 1),
                    datetime.date(2009, 12, 31),
                ],
            ),
            ("L", [5, 1, 2]),
            ("P", [5, 1, 2]),
            ("TARGET", [5, 1, 2]),
        ]))
    actual = actual.sort_values("DATE", ascending=False).reset_index(drop=True)

    assert_frame_equal(df_expected, actual)
Example #7
def test_write_single_partition_different_partitioning(store_factory):
    df = pd.DataFrame(
        OrderedDict([("location", ["0", "1", "2"]), ("other", ["a", "a", "a"])]))

    input_ = [{
        "label": "label",
        "data": [("order_proposals", df)],
        "indices": {
            "location": {k: ["label"]
                         for k in df["location"].unique()}
        },
    }]
    dataset = store_dataframes_as_dataset(
        dfs=input_,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=4,
        partition_on=["other"],
    )

    new_data = {
        "data": {
            "order_proposals":
            pd.DataFrame(
                OrderedDict([("other", ["b", "b", "b"]),
                             ("location", ["0", "1", "2"])]))
        }
    }
    initial_keys = len(list(store_factory().keys()))
    with pytest.raises(ValueError):
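        # partition_on="location" conflicts with the existing partitioning on
        # "other", so the write must be rejected.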
        write_single_partition(
            store=store_factory,
            dataset_uuid=dataset.uuid,
            data=new_data,
            partition_on="location",
        )
    assert initial_keys == len(list(store_factory().keys()))
    write_single_partition(
        store=store_factory,
        dataset_uuid=dataset.uuid,
        data=new_data,
        partition_on=["other"],
    )
    assert initial_keys + 1 == len(list(store_factory().keys()))

    new_data["label"] = "some_other_label"
    # If no partitioning is given, it will be determined based on the existing dataset
    write_single_partition(store=store_factory,
                           dataset_uuid=dataset.uuid,
                           data=new_data)
    assert initial_keys + 2 == len(list(store_factory().keys()))