Example #1
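All snippets in this listing omit their imports. A preamble along these lines should make them runnable; the kartothek paths below follow the library's usual module layout but may shift between versions, and pdt is spelled pandas.util.testing in older code:

import string

import numpy as np
import pandas as pd
import pandas.testing as pdt

from kartothek.core.common_metadata import make_meta
from kartothek.io_components.metapartition import MetaPartition
from kartothek.serialization import DataFrameSerializer, ParquetSerializer

The cube-related snippets (Examples #9 and #11) additionally depend on kartothek's cube machinery (Cube, StoreFactory, KTK_CUBE_DF_SERIALIZER) and dask.bag.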
def test_table_meta(store):
    mp = MetaPartition(
        label="label_1",
        data=pd.DataFrame(
            {
                "i32": np.array([1, 2, 3, 1, 2, 3], dtype="int32"),
                "float": np.array([1, 1, 1, 2, 2, 2], dtype="float64"),
            }
        ),
        metadata_version=4,
    )

    assert mp.schema is not None
    expected_meta = make_meta(
        pd.DataFrame(
            {"i32": np.array([], dtype="int32"), "float": np.array([], dtype="float64")}
        ),
        origin="1",
    )
    actual_meta = mp.schema
    assert actual_meta == expected_meta

    mp = mp.store_dataframes(store, "dataset_uuid")

    actual_meta = mp.schema
    assert actual_meta == expected_meta
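Every test below receives store as a pytest fixture, and several also take metadata_version, metadata_storage_format, or expected_key. A minimal stand-in for the first two, assuming storefact is installed (kartothek's real conftest parametrizes these more heavily):

import pytest
import storefact

@pytest.fixture
def store():
    # In-memory simplekv store, the same kind kartothek's test suite uses.
    return storefact.get_store_from_url("hmemory://")

@pytest.fixture
def metadata_version():
    # kartothek only supports dataset metadata version 4.
    return 4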
Example #2
def test_store_single_dataframe_as_partition(store, metadata_version):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(label="test_label", data=df, metadata_version=metadata_version)

    meta_partition = mp.store_dataframes(
        store=store, df_serializer=ParquetSerializer(), dataset_uuid="dataset_uuid",
    )

    assert meta_partition.data is None

    expected_key = "dataset_uuid/table/test_label.parquet"

    assert meta_partition.file == expected_key
    assert meta_partition.label == "test_label"

    files_in_store = list(store.keys())

    expected_num_files = 1
    assert len(files_in_store) == expected_num_files
    stored_df = DataFrameSerializer.restore_dataframe(store=store, key=expected_key)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_key)
    assert len(files_in_store) == expected_num_files - 1
Example #3
def test_load_dataframe_logical_conjunction(store, metadata_version):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(
        label="cluster_1",
        data=df,
        metadata_version=metadata_version,
        logical_conjunction=[("P", ">", 4)],
    )
    meta_partition = mp.store_dataframes(
        store=store, df_serializer=None, dataset_uuid="dataset_uuid",
    )
    predicates = None
    loaded_mp = meta_partition.load_dataframes(store=store, predicates=predicates)
    data = pd.DataFrame(
        {"P": [5, 6, 7, 8, 9], "L": [5, 6, 7, 8, 9], "TARGET": [15, 16, 17, 18, 19]}
    ).set_index(np.arange(5, 10))
    pdt.assert_frame_equal(loaded_mp.data, data)

    predicates = [[("L", ">", 6), ("TARGET", "<", 18)]]
    loaded_mp = meta_partition.load_dataframes(store=store, predicates=predicates)
    data = pd.DataFrame({"P": [7], "L": [7], "TARGET": [17]}).set_index(np.array([7]))
    pdt.assert_frame_equal(loaded_mp.data, data)

    predicates = [[("L", ">", 2), ("TARGET", "<", 17)], [("TARGET", "==", 19)]]
    loaded_mp = meta_partition.load_dataframes(store=store, predicates=predicates)
    data = pd.DataFrame(
        {"P": [5, 6, 9], "L": [5, 6, 9], "TARGET": [15, 16, 19]}
    ).set_index(np.array([5, 6, 9]))
    pdt.assert_frame_equal(loaded_mp.data, data)
Example #4
def test_get_parquet_metadata_empty_df(store):
    df = pd.DataFrame()
    mp = MetaPartition(label="test_label", data=df)
    meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid")

    actual = meta_partition.get_parquet_metadata(store=store)
    actual.drop(
        columns=[
            "serialized_size",
            "row_group_compressed_size",
            "row_group_uncompressed_size",
        ],
        inplace=True,
    )

    expected = pd.DataFrame(
        {
            "partition_label": ["test_label"],
            "row_group_id": 0,
            "number_rows_total": 0,
            "number_row_groups": 1,
            "number_rows_per_row_group": 0,
        }
    )

    pd.testing.assert_frame_equal(actual, expected)
Example #5
def test_get_parquet_metadata_row_group_size(store):
    df = pd.DataFrame({"P": np.arange(0, 10), "L": np.arange(0, 10)})
    mp = MetaPartition(label="test_label", data=df)
    ps = ParquetSerializer(chunk_size=5)

    meta_partition = mp.store_dataframes(
        store=store, dataset_uuid="dataset_uuid", df_serializer=ps
    )
    actual = meta_partition.get_parquet_metadata(store=store)
    actual.drop(
        columns=[
            "serialized_size",
            "row_group_compressed_size",
            "row_group_uncompressed_size",
        ],
        inplace=True,
    )

    expected = pd.DataFrame(
        {
            "partition_label": ["test_label", "test_label"],
            "row_group_id": [0, 1],
            "number_rows_total": [10, 10],
            "number_row_groups": [2, 2],
            "number_rows_per_row_group": [5, 5],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)
Example #6
def test_store_single_dataframe_as_partition_no_metadata(
        store, metadata_version):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })
    mp = MetaPartition(label="test_label",
                       data={"core": df},
                       metadata_version=metadata_version)
    partition = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=False,
    )

    assert len(partition.data) == 0

    expected_file = "dataset_uuid/core/test_label.parquet"

    assert partition.files == {"core": expected_file}
    assert partition.label == "test_label"

    # Only the Parquet file itself should have been written
    files_in_store = list(store.keys())
    assert len(files_in_store) == 1

    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_file)
    pdt.assert_frame_equal(df, stored_df)
Example #7
def test_store_single_dataframe_as_partition(store, metadata_storage_format,
                                             metadata_version, expected_key):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })
    mp = MetaPartition(label="test_label",
                       data={"core": df},
                       metadata_version=metadata_version)

    meta_partition = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=True,
        metadata_storage_format=metadata_storage_format,
    )

    assert len(meta_partition.data) == 0

    assert meta_partition.files == {"core": expected_key}
    assert meta_partition.label == "test_label"

    files_in_store = list(store.keys())

    expected_num_files = 1
    assert len(files_in_store) == expected_num_files
    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_key)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_key)
    assert len(files_in_store) == expected_num_files - 1
Example #8
def test_table_meta(store):
    mp = MetaPartition(
        label="label_1",
        data={
            "core": pd.DataFrame({
                "i32": np.array([1, 2, 3, 1, 2, 3], dtype="int32"),
                "float": np.array([1, 1, 1, 2, 2, 2], dtype="float64"),
            })
        },
        metadata_version=4,
    )

    assert len(mp.table_meta) == 1
    assert "core" in mp.table_meta
    expected_meta = make_meta(
        pd.DataFrame({
            "i32": np.array([], dtype="int32"),
            "float": np.array([], dtype="float64")
        }),
        origin="1",
    )
    actual_meta = mp.table_meta["core"]
    assert actual_meta == expected_meta

    mp = mp.store_dataframes(store, "dataset_uuid")

    actual_meta = mp.table_meta["core"]
    assert actual_meta == expected_meta
Example #9
def _multiplex_store(data, cube, store):
    # Store each dataset's MetaPartition under its cube dataset UUID,
    # popping entries from `data` as they are written to free memory early.
    result = {}
    for k in sorted(data.keys()):
        v = data.pop(k)
        result[k] = MetaPartition.store_dataframes(
            v,
            dataset_uuid=cube.ktk_dataset_uuid(k),
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            store=store,
        )
        del v
    return result
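For orientation, a hypothetical call to _multiplex_store. The Cube arguments and seed_mp are illustrative, not from the source; cube.ktk_dataset_uuid("seed") resolves to "my_cube++seed" under kartothek's cube naming scheme:

from kartothek.core.cube.cube import Cube

cube = Cube(
    dimension_columns=["x"],
    partition_columns=["p"],
    uuid_prefix="my_cube",
)
# seed_mp would be a MetaPartition carrying the seed dataset's dataframe:
# result = _multiplex_store({"seed": seed_mp}, cube=cube, store=store)
# result["seed"] is then the stored MetaPartition for "my_cube++seed".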
Example #10
def test_store_multiple_dataframes_as_partition(store, metadata_storage_format,
                                                metadata_version):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })
    df_2 = pd.DataFrame({
        "P": np.arange(0, 10),
        "info": string.ascii_lowercase[:10]
    })
    mp = MetaPartition(
        label="cluster_1",
        data={
            "core": df,
            "helper": df_2
        },
        metadata_version=metadata_version,
    )
    meta_partition = mp.store_dataframes(
        store=store,
        df_serializer=None,
        dataset_uuid="dataset_uuid",
        store_metadata=True,
        metadata_storage_format=metadata_storage_format,
    )

    expected_file = "dataset_uuid/core/cluster_1.parquet"
    expected_file_helper = "dataset_uuid/helper/cluster_1.parquet"

    assert meta_partition.files == {
        "core": expected_file,
        "helper": expected_file_helper,
    }
    assert meta_partition.label == "cluster_1"

    files_in_store = list(store.keys())
    assert len(files_in_store) == 2

    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_file)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_file)

    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_file_helper)
    pdt.assert_frame_equal(df_2, stored_df)
    files_in_store.remove(expected_file_helper)
Example #11
def _multiplex_store(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    # Same logic as Example #9, but with an optional df_serializer that
    # overrides the cube default (KTK_CUBE_DF_SERIALIZER) when given.
    result = {}
    for k in sorted(data.keys()):
        v = data.pop(k)
        result[k] = MetaPartition.store_dataframes(
            v,
            dataset_uuid=cube.ktk_dataset_uuid(k),
            df_serializer=df_serializer or KTK_CUBE_DF_SERIALIZER,
            store=store,
        )
        del v
    return result
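Relative to Example #9, the only change is the optional df_serializer argument, which lets callers swap in their own serializer, e.g. (illustrative):

_multiplex_store(data, cube, store, df_serializer=ParquetSerializer(chunk_size=100))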