Code example #1
File: index.py  Project: x-malet/kartothek
def test_build_indices(store_factory, metadata_version, bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]},
        {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]},
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    expected = {2: ["cluster_1", "cluster_2"], 3: ["cluster_2"], 1: ["cluster_1"]}
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)
Code example #2
def test_update_dataset_with_partitions_no_index_input_info(
    store_factory, metadata_version, bound_update_dataset, store
):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})},
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})},
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    # The input partitions do not explicitly provide index information.
    # Since the dataset already has an index on "p", it must be updated anyway.
    part3 = {"label": "cluster_3", "data": [("core", pd.DataFrame({"p": [3]}))]}
    dataset_updated = bound_update_dataset(
        [part3],
        store=store_factory,
        dataset_uuid=dataset.uuid,
        delete_scope=[{"p": 1}],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_all_indices(store)
    assert 3 in dataset_updated.indices["p"].to_dict()
Code example #3
def test_metadata_version(
    store_factory,
    bound_update_dataset,
    mock_default_metadata_version,
    backend_identifier,
):
    if backend_identifier in ("dask.dataframe", "dask.delayed"):
        pytest.skip()  # TODO: fix `io.dask.*.test_update._update_dataset`

    dataset_uuid = "dataset_uuid"
    partitions = [
        {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]},
        {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]},
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=DEFAULT_METADATA_VERSION,
    )

    with pytest.raises(AssertionError, match="Traversed through mock"):
        # Try to commit data to dataset using a different metadata version
        # and different data format (format is mocked)
        # This does not raise when the `parse_input_to_metapartition`
        # argument is `default_metadata_version` instead of `metadata_version`
        new_partitions = ("core", pd.DataFrame({"p": [2, 3]}))
        bound_update_dataset(
            new_partitions,
            store=store_factory,
            dataset_uuid=dataset_uuid,
            default_metadata_version=mock_default_metadata_version,
        )

    mps = read_dataset_as_metapartitions(store=store_factory, dataset_uuid=dataset_uuid)
    assert len(mps) == len(dataset.partitions)
Code example #4
def test_dispatch_metapartitions_sorted_dispatch_by(store):
    df = pd.DataFrame(
        {"p": np.random.randint(high=100000, low=-100000, size=(100,)), "x": 0}
    )
    # Small integer values may come out of the set already sorted (possibly
    # connected to CPython's small-integer singleton implementation).
    # Verify this is not happening here; otherwise we would immediately get a
    # sorted index, which is nice in this case but not generally true.
    arr = set(df["p"].unique())
    assert list(arr) != sorted(arr)

    dataset = store_dataframes_as_dataset(
        dfs=[df], dataset_uuid="test", store=store, secondary_indices=["p", "x"]
    )

    wout_preds = list(dispatch_metapartitions(dataset.uuid, store, dispatch_by="p"))
    last = -math.inf
    for mps in wout_preds:
        for mp in mps:
            current = mp.logical_conjunction
            assert len(current) == 1
            current = current[0][2]
            assert current > last
            last = current
Code example #5
def test_commit_dataset_delete_all(store, metadata_version):
    partitions = [pd.DataFrame({"p": [1]})]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=lambda: store,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        secondary_indices="p",
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)
    assert len(dataset.partitions) == 1

    delete_scope = [{"p": 1}]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=None,
        delete_scope=delete_scope,
        partition_on=None,
    )
    assert len(updated_dataset.partitions) == 0
    assert updated_dataset.explicit_partitions is True
Code example #6
def build_cube(data,
               cube,
               store,
               metadata=None,
               overwrite=False,
               partition_on=None):
    """
    Store given dataframes as Ktk_cube cube.

    ``data`` can be formatted in multiple ways:

    - single DataFrame::

          pd.DataFrame({
              'x': [0, 1, 2, 3],
              'p': [0, 0, 1, 1],
              'v': [42, 45, 20, 10],
          })

      In that case, the seed dataset will be written.

    - dictionary of DataFrames::

          {
              'seed': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              'enrich': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v2': [False, False, True, False],
              }),
          }

      In that case, multiple datasets can be written at the same time. Note that the seed dataset MUST be included.

    - list of anything above::

          [
              # seed data only
              pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              # seed data only, explicit way
              {
                  'seed': pd.DataFrame({
                      'x': [4, 5, 6, 7],
                      'p': [0, 0, 1, 1],
                      'v1': [12, 32, 22, 9],
                  }),
              },
              # multiple datasets
              {
                  'seed': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v1': [9, 2, 4, 11],
                  }),
                  'enrich': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v2': [True, True, False, False],
                  }),
              },
              # non-seed data only
              {
                  'enrich': pd.DataFrame({
                      'x': [1, 2, 3, 4],
                      'p': [0, 0, 1, 1],
                      'v2': [False, True, False, False],
                  }),
              },
          ]

      In that case, multiple datasets may be written. Note that at least a single list element must contain seed data.

    Extra metadata may be preserved w/ every dataset, e.g.::

        {
            'seed': {
                'source': 'db',
                'host': 'db1.cluster20.company.net',
                'last_event': '230c6edb-b69a-4d30-b56d-28f5dfe20948',
            },
            'enrich': {
                'source': 'python',
                'commit_hash': '8b5d717518439921e6d17c7495956bdad687bc54',
            },
        }

    Note that the given metadata must be JSON-serializable.

    If the cube already exists, the ``overwrite`` flag must be given. In that case, all datasets that are part of the
    existing cube must be overwritten. Partial overwrites are not allowed.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)
    ktk_cube_dataset_ids = set(data.keys())
    partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                            partition_on)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(data, cube, existing_datasets)

    # do all data preparation before writing anything
    data = _prepare_data_for_ktk_all(data=data,
                                     cube=cube,
                                     existing_payload=set(),
                                     partition_on=partition_on)

    datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        datasets[ktk_cube_dataset_id] = store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )

    return apply_postwrite_checks(datasets=datasets,
                                  cube=cube,
                                  store=store,
                                  existing_datasets=existing_datasets)
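
A minimal usage sketch for the build_cube function shown above, assuming an in-memory simplekv store and the import path kartothek.io.eager_cube for the eager backend; the cube name, column values, and metadata are illustrative only:

import pandas as pd
from simplekv.memory import DictStore

from kartothek.core.cube.cube import Cube       # Cube spec class referenced in the docstring
from kartothek.io.eager_cube import build_cube  # assumed import path for the eager backend

cube = Cube(
    dimension_columns=["x"],
    partition_columns=["p"],
    uuid_prefix="example_cube",
    seed_dataset="seed",
)
store = DictStore()  # any simplekv.KeyValueStore, per the docstring

# Dictionary form from the docstring: the seed dataset plus one "enrich" dataset.
data = {
    "seed": pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [42, 45, 20, 10]}
    ),
    "enrich": pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [False, False, True, False]}
    ),
}

datasets = build_cube(
    data=data,
    cube=cube,
    store=store,
    metadata={"seed": {"source": "example"}},  # optional, must be JSON-serializable
)
# datasets maps dataset id -> DatasetMetadata, here with the keys "seed" and "enrich".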
Code example #7
def extend_cube(data,
                cube,
                store,
                metadata=None,
                overwrite=False,
                partition_on=None):
    """
    Store given dataframes into an existing Kartothek cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)
    ktk_cube_dataset_ids = set(data.keys())
    partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                            partition_on)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    check_datasets_preextend(data, cube)

    existing_datasets = discover_datasets(cube, store)
    if overwrite:
        existing_datasets_cut = {
            ktk_cube_dataset_id: ds
            for ktk_cube_dataset_id, ds in existing_datasets.items()
            if ktk_cube_dataset_id not in data
        }
    else:
        existing_datasets_cut = existing_datasets
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    # do all data preparation before writing anything
    data = _prepare_data_for_ktk_all(
        data=data,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=partition_on,
    )

    datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        datasets[ktk_cube_dataset_id] = store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )

    return apply_postwrite_checks(datasets=datasets,
                                  cube=cube,
                                  store=store,
                                  existing_datasets=existing_datasets)
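
Continuing the build_cube sketch above (same assumed import path), a hedged example of extend_cube: the cube and its seed dataset must already exist in the store, and the new dataset's payload columns must not collide with columns already stored in the cube.

from kartothek.io.eager_cube import extend_cube  # assumed import path, as above

extra = {
    "enrich2": pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v3": [1.5, 2.5, 3.5, 4.5]}
    ),
}

new_datasets = extend_cube(
    data=extra,
    cube=cube,    # same Cube spec used for build_cube
    store=store,  # same store that already holds the seed dataset
)
# new_datasets contains the DatasetMetadata for "enrich2" only.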
Code example #8
def test_update_dataset_with_partitions__reducer_partitions(
    store_factory, frozen_time_em, bound_update_dataset
):

    assert set(store_factory().keys()) == set()

    df1 = pd.DataFrame(
        {"P": [1, 2, 3, 1, 2, 3], "L": [1, 1, 1, 1, 1, 1], "TARGET": np.arange(10, 16)}
    )
    df2 = df1.copy(deep=True)
    df2.L = 2
    df2.TARGET += 2
    df_list = [
        {
            "label": "cluster_1",
            "data": [("core", df1)],
            "indices": {"L": {k: ["cluster_1"] for k in df1["L"].unique()}},
        },
        {
            "label": "cluster_2",
            "data": [("core", df2)],
            "indices": {"L": {k: ["cluster_2"] for k in df2["L"].unique()}},
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=df_list,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        partition_on=["P"],
        metadata_version=4,
    )
    dataset_loadedidx = dataset.load_all_indices(store=store_factory())
    cluster_1_label = (
        dataset_loadedidx.indices["L"].eval_operator(op="==", value=1).pop()
    )
    cluster_1_label = cluster_1_label.split("/")[-1]
    cluster_2_label = (
        dataset_loadedidx.indices["L"].eval_operator(op="==", value=2).pop()
    )
    cluster_2_label = cluster_2_label.split("/")[-1]

    df3 = df2.copy(deep=True)
    df3.TARGET -= 5

    part3 = {
        "label": "cluster_3",
        "data": {"core": df3},
        "indices": {"L": {k: ["cluster_3"] for k in df3["L"].unique()}},
    }

    dataset_updated = bound_update_dataset(
        [part3],
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{"L": 2}],
        metadata={"extra": "metadata"},
        partition_on=["P"],
        secondary_indices=["L"],
    )
    dataset_updated_loadedidx = dataset_updated.load_all_indices(store=store_factory())
    cluster_3_labels = dataset_updated_loadedidx.indices["L"].eval_operator(
        op="==", value=2
    )

    cluster_3_label = {c3_label.split("/")[-1] for c3_label in cluster_3_labels}
    assert len(cluster_3_label) == 1
    cluster_3_label = cluster_3_label.pop()
    exp_partitions = [
        "P=1/{}".format(cluster_1_label),
        "P=1/{}".format(cluster_3_label),
        "P=2/{}".format(cluster_1_label),
        "P=2/{}".format(cluster_3_label),
        "P=3/{}".format(cluster_1_label),
        "P=3/{}".format(cluster_3_label),
    ]
    assert sorted(exp_partitions) == sorted(dataset_updated.partitions.keys())
    updated_idx_keys = sorted(dataset_updated.indices.keys())
    assert sorted(dataset.indices.keys()) == updated_idx_keys

    expected_new_idx = {}
    for k, v in dataset_loadedidx.indices["P"].index_dct.items():
        val = [pl.replace(cluster_2_label, cluster_3_label) for pl in v]
        expected_new_idx[k] = val

    updated_P_idx_dct = dataset_updated_loadedidx.indices["P"].index_dct

    assert sorted(expected_new_idx.keys()) == sorted(updated_P_idx_dct.keys())

    for k, v in updated_P_idx_dct.items():
        assert sorted(expected_new_idx[k]) == sorted(v)
Code example #9
File: update.py  Project: hoffmann/kartothek
def test_update_dataset_with_partitions__reducer_delete_only(
        store, metadata_version, frozen_time_em, bound_update_dataset):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {
                "p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})
            },
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {
                "p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})
            },
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=lambda: store,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)

    empty_part = []
    dataset_updated = bound_update_dataset(
        [empty_part],
        store=lambda: store,
        dataset_uuid="dataset_uuid",
        delete_scope=[{
            "p": 1
        }],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)

    assert sorted(dataset.partitions) == ["cluster_1", "cluster_2"]
    assert list(dataset_updated.partitions) == ["cluster_2"]

    store_files = list(store.keys())
    # 1 dataset metadata file and 1 index file and 2 partition files
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 4
    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    assert dataset.indices["p"].index_dct == {
        1: ["cluster_1"],
        2: ["cluster_2"]
    }
    assert dataset_updated.indices["p"].index_dct == {2: ["cluster_2"]}

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset
Code example #10
def test_overwrite_rollback_ktk(driver, function_store):
    """
    Checks that require a rollback (like overlapping columns) should recover the former state correctly.
    """
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3", "i4"],
    )

    df_source1 = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
            "i1": [10, 11, 12, 13],
        }
    )
    df_enrich1 = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "i2": [20, 21, 22, 23],
            "v1": [20, 21, 22, 23],
        }
    )
    store_dataframes_as_dataset(
        dfs=[{"ktk_source": df_source1, "ktk_enrich": df_enrich1}],
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid(cube.seed_dataset),
        metadata_version=KTK_CUBE_METADATA_VERSION,
        secondary_indices=["i1", "i2"],
    )

    df_source2 = pd.DataFrame(
        {
            "x": [10, 11],
            "p": [10, 10],
            "v1": [10.0, 11.0],  # also use another dtype here (was int)
            "i3": [10, 11],
        }
    )
    df_enrich2 = pd.DataFrame(
        {
            "x": [10, 11],
            "p": [10, 10],
            "v1": [20.0, 21.0],  # also use another dtype here (was int)
            "i4": [20, 21],
        }
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source2, "enrich": df_enrich2},
            cube=cube,
            store=function_store,
            overwrite=True,
        )
    cause = exc_info.value.__cause__
    assert str(cause).startswith("Found columns present in multiple datasets:")

    ds_source = DatasetMetadata.load_from_store(
        uuid=cube.ktk_dataset_uuid(cube.seed_dataset), store=function_store()
    ).load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)

    assert len(ds_source.partitions) == 1

    assert ds_source.table_meta["ktk_source"].field("v1").type == pa.int64()
    assert ds_source.table_meta["ktk_enrich"].field("v1").type == pa.int64()
Code example #11
def test_dispatch_metapartitions_complex_or_predicates(store_factory):
    dataset_uuid = "test"
    df = pd.DataFrame({"A": range(10), "B": ["A", "B"] * 5, "C": range(-10, 0)})

    store_dataframes_as_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        dfs=[df],
        partition_on=["A", "B"],
    )
    predicates = [[("A", "<", 3)], [("B", "==", "B")]]
    mps = [
        mp.load_dataframes(store_factory)
        for mp in dispatch_metapartitions(
            dataset_uuid, store_factory, predicates=predicates
        )
    ]
    actual = pd.concat([mp.data["table"] for mp in mps])
    actual = actual.sort_values(by="A", ignore_index=True)
    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 3, 5, 7, 9],
            "B": ["A", "B", "A", "B", "B", "B", "B"],
            "C": [-10, -9, -8, -7, -5, -3, -1],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)

    predicates = [[("A", "<", 3)], [("B", "==", "notthere")]]
    mps = [
        mp.load_dataframes(store_factory)
        for mp in dispatch_metapartitions(
            dataset_uuid, store_factory, predicates=predicates
        )
    ]
    actual = pd.concat([mp.data["table"] for mp in mps])
    actual = actual.sort_values(by="A", ignore_index=True)
    expected = pd.DataFrame(
        data={"A": [0, 1, 2], "B": ["A", "B", "A"], "C": [-10, -9, -8]}
    )
    pd.testing.assert_frame_equal(actual, expected)

    predicates = [[("A", "<", 3), ("B", "==", "A")], [("B", "==", "B"), ("A", ">", 2)]]
    mps = [
        mp.load_dataframes(store_factory)
        for mp in dispatch_metapartitions(
            dataset_uuid, store_factory, predicates=predicates
        )
    ]
    actual = pd.concat([mp.data["table"] for mp in mps])
    actual = actual.sort_values(by="A", ignore_index=True)
    expected = pd.DataFrame(
        data={
            "A": [0, 2, 3, 5, 7, 9],
            "B": ["A", "A", "B", "B", "B", "B"],
            "C": [-10, -8, -7, -5, -3, -1],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)

    predicates = [[("A", "<", 3)], [("B", "==", "B"), ("A", ">", 2)]]
    mps = [
        mp.load_dataframes(store_factory)
        for mp in dispatch_metapartitions(
            dataset_uuid, store_factory, predicates=predicates
        )
    ]
    actual = pd.concat([mp.data["table"] for mp in mps])
    actual = actual.sort_values(by="A", ignore_index=True)
    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 3, 5, 7, 9],
            "B": ["A", "B", "A", "B", "B", "B", "B"],
            "C": [-10, -9, -8, -7, -5, -3, -1],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)
Code example #12
def assert_hive_compat(df, store_factory, uuid, **kwargs):
    TABLE_NAME = uuid  # Hive table name

    dm = store_dataframes_as_dataset(store=store_factory,
                                     dataset_uuid=uuid,
                                     dfs=[df],
                                     **kwargs)

    store = store_factory()

    print(f"Dataset location: {VOLUME_LOCATION}")

    # Use Pyhive to query hive
    conn = hive.Connection(host="hive-server", port=10000)
    cursor = conn.cursor()

    # TODO: test partitioned dataset
    for filepath in store.iter_keys():
        if filepath.endswith(".parquet"):
            parquet_file_parentdir = f"{VOLUME_LOCATION}/{os.path.dirname(filepath)}"
            break

    if kwargs.get("partition_on"):
        # Walk up one parent directory of the parquet file for each column the
        # dataset is partitioned on.
        # Note: a parquet filepath looks like
        # `/tmp/uuid/table/partition_col1=x/partition_col2=y/1300dadda3.parquet`
        for _ in kwargs["partition_on"]:
            parquet_file_parentdir = os.path.dirname(parquet_file_parentdir)

    # Create Hive table
    ## Non-nested columns not included: `np.uint64` (max value is too large for `BIGINT`)
    ## The `null` column can be specified as multiple types (at least `STRING` and `FLOAT`)
    # TODO: have a mapping from kartothek/arrow dtypes to Hive dtypes
    selected_columns_and_dtypes = """\
                bool BOOLEAN,
                bytes BINARY,
                date_ DATE,
                datetime64 BIGINT,
                float32 FLOAT,
                float64 DOUBLE,
                int8 TINYINT,
                int16 SMALLINT,
                int32 INT,
                int64 BIGINT,
                uint8 SMALLINT,
                uint16 INT,
                uint32 BIGINT,
                unicode STRING,
                null_ FLOAT"""
    # Hive allows us to only select a subset of columns to be loaded from the Parquet file
    hive_query = f"""
      CREATE external table {TABLE_NAME} (
        {selected_columns_and_dtypes}
        )
      STORED AS PARQUET
      LOCATION "{parquet_file_parentdir}"
    """

    print(f"Hive query: {hive_query}")
    cursor.execute(hive_query)

    # Get column names from query substring
    selected_columns = [
        l.strip().split(" ")[0]
        for l in selected_columns_and_dtypes.splitlines()
    ]
    # Read hive table into pandas
    hive_df = pd.read_sql(f"SELECT * FROM {TABLE_NAME}", conn)
    hive_df.columns = selected_columns
    # Pyarrow stores timestamp as microseconds from epoch, convert to date
    hive_df["datetime64"] = pd.to_datetime(hive_df.loc[:, "datetime64"] * 1000,
                                           unit="ns")
    # Output from hive is a string, parse this to date
    hive_df["date_"] = pd.to_datetime(
        hive_df.loc[:, "date_"], format="%Y-%m-%d").apply(lambda x: x.date())

    # Ignore dtype for numeric comparisons (e.g. int32 with int64)
    pdt.assert_frame_equal(df[selected_columns], hive_df, check_dtype=False)
    print(f"Test completed for the following data types: {selected_columns}")
Code example #13
def test_indices_uints(store_factory, metadata_version,
                       bound_build_dataset_indices):
    dataset_uuid = "dataset_uuid"

    # min uint64
    p1 = 0

    # max uint64 => cannot even be cast to float32
    p2 = int(~np.uint64(0))

    # number that would be cut if converted to float64 and back
    p3 = 17128351978467489013

    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": pd.Series([p1], dtype=np.uint64)}))],
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": pd.Series([p2], dtype=np.uint64)}))],
        },
        {
            "label": "cluster_3",
            "data": [("core", pd.DataFrame({"p": pd.Series([p3], dtype=np.uint64)}))],
        },
    ]
    expected = {p1: ["cluster_1"], p2: ["cluster_2"], p3: ["cluster_3"]}

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    dataset = dataset.load_all_indices(store=store_factory)
    assert not dataset.indices

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid,
                                     store_factory,
                                     load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)

    # Re-create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["p"])

    # Assert indices are properly created
    dataset_factory = DatasetFactory(dataset_uuid,
                                     store_factory,
                                     load_all_indices=True)
    assert_index_dct_equal(expected, dataset_factory.indices["p"].index_dct)