Example #1
def _get_meta_partitions_with_dataframe(metadata_version):
    df = pd.DataFrame(
        OrderedDict(
            [
                ("P", [1]),
                ("L", [1]),
                ("TARGET", [1]),
                ("DATE", [datetime.date(2010, 1, 1)]),
            ]
        )
    )
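    # A second, narrower table ("helper") that shares the partition column P.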
    df_2 = pd.DataFrame(OrderedDict([("P", [1]), ("info", ["a"])]))
    mp = MetaPartition(
        label="cluster_1",
        data={SINGLE_TABLE: df, "helper": df_2},
        metadata_version=metadata_version,
    )
    df_3 = pd.DataFrame(
        OrderedDict(
            [
                ("P", [2]),
                ("L", [2]),
                ("TARGET", [2]),
                ("DATE", [datetime.date(2009, 12, 31)]),
            ]
        )
    )
    df_4 = pd.DataFrame(OrderedDict([("P", [2]), ("info", ["b"])]))
    mp2 = MetaPartition(
        label="cluster_2",
        data={SINGLE_TABLE: df_3, "helper": df_4},
        metadata_version=metadata_version,
    )
    return [mp, mp2]
Example #2
def test_nested_copy():
    mp = MetaPartition(
        label="label_1",
        file="file",
        data=pd.DataFrame({"test": [1, 2, 3]}),
        indices={"test": {
            1: "label_1",
            2: "label_2",
            3: "label_3"
        }},
    )

    mp_2 = MetaPartition(
        label="label_2",
        data=pd.DataFrame({"test": [4, 5, 6]}),
        indices={"test": [4, 5, 6]},
    )
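    # add_metapartition nests mp_2 inside mp; both stay accessible via mp.metapartitions.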
    mp = mp.add_metapartition(mp_2)
    assert len(mp.metapartitions) == 2
    new_mp = mp.copy()

    # Check if the copy is identical
    assert len(new_mp.metapartitions) == len(mp.metapartitions)
    assert new_mp == mp
    # ... but not the same object
    assert id(new_mp) != id(mp)
Example #3
def test_partition_on_one_level_ts():
    original_df = pd.DataFrame({
        "test": [
            pd.Timestamp("2001-01-01"),
            pd.Timestamp("2001-01-02"),
            pd.Timestamp("2001-01-03"),
        ],
        "some_values": [1, 2, 3],
    })
    mp = MetaPartition(label="label_1",
                       file="file",
                       data=original_df,
                       metadata_version=4)

    new_mp = mp.partition_on(["test"])
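    # partition_on splits the rows into one nested MetaPartition per unique value of "test".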

    assert len(new_mp.metapartitions) == 3

    labels = set()
    for mp in new_mp:
        labels.add(mp.label)
        assert len(mp.data) == 1
        assert mp.data is not None
        df = mp.data
        assert df._is_view

        # try to be agnostic about the order
        assert len(df) == 1
        assert "test" not in df
    expected_labels = set([
        "test=2001-01-01%2000%3A00%3A00/label_1",
        "test=2001-01-02%2000%3A00%3A00/label_1",
        "test=2001-01-03%2000%3A00%3A00/label_1",
    ])
    assert labels == expected_labels
Example #4
def test_partition_on_one_level():
    original_df = pd.DataFrame({"test": [1, 2, 3], "some_values": [1, 2, 3]})
    mp = MetaPartition(
        label="label_1",
        files={"core": "file"},
        data={"core": original_df},
        dataset_metadata={"dataset": "metadata"},
        metadata_version=4,
    )

    new_mp = mp.partition_on(["test"])

    assert len(new_mp.metapartitions) == 3

    labels = set()
    for mp in new_mp:
        labels.add(mp.label)
        assert len(mp.data) == 1
        assert "core" in mp.data
        df = mp.data["core"]
        assert df._is_view

        # try to be agnostic about the order
        assert len(df) == 1
        assert "test" not in df
    expected_labels = set(
        ["test=1/label_1", "test=2/label_1", "test=3/label_1"])
    assert labels == expected_labels
Example #5
def test_to_dict(metadata_version):
    df = pd.DataFrame({"A": [1]})
    schema = make_meta(df, origin="test")
    mp = MetaPartition(
        label="label_1",
        file="file",
        data=df,
        indices={"test": [1, 2, 3]},
        metadata_version=metadata_version,
        schema=schema,
    )
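    # to_dict exposes every constructor field, including defaults that were not passed in above.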
    mp_dct = mp.to_dict()
    assert mp_dct == {
        "label": "label_1",
        "data": df,
        "file": "file",
        "indices": {
            "test": [1, 2, 3]
        },
        "metadata_version": metadata_version,
        "schema": schema,
        "partition_keys": [],
        "logical_conjunction": None,
        "table_name": SINGLE_TABLE,
    }
Example #6
def test_store_single_dataframe_as_partition_no_metadata(
        store, metadata_version):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })
    mp = MetaPartition(label="test_label",
                       data={"core": df},
                       metadata_version=metadata_version)
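    # store_metadata=False: persist only the dataframes, no partition metadata.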
    partition = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=False,
    )

    assert len(partition.data) == 0

    expected_file = "dataset_uuid/core/test_label.parquet"

    assert partition.files == {"core": expected_file}
    assert partition.label == "test_label"

    # Only one file in the store: the serialized Parquet file
    files_in_store = list(store.keys())
    assert len(files_in_store) == 1

    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_file)
    pdt.assert_frame_equal(df, stored_df)
Example #7
def test_to_dict(metadata_version):
    mp = MetaPartition(
        label="label_1",
        files={"core": "file"},
        data={"core": "placeholder"},
        indices={"test": [1, 2, 3]},
        metadata_version=metadata_version,
        table_meta={"core": {
            "test": "int8"
        }},
    )
    mp_dct = mp.to_dict()

    assert mp_dct == {
        "label": "label_1",
        "data": {
            "core": "placeholder"
        },
        "files": {
            "core": "file"
        },
        "indices": {
            "test": [1, 2, 3]
        },
        "dataset_metadata": {},
        "metadata_version": metadata_version,
        "table_meta": {
            "core": {
                "test": "int8"
            }
        },
        "partition_keys": [],
    }
Example #8
def test_store_single_dataframe_as_partition(store, metadata_storage_format,
                                             metadata_version, expected_key):
    df = pd.DataFrame({
        "P": np.arange(0, 10),
        "L": np.arange(0, 10),
        "TARGET": np.arange(10, 20)
    })
    mp = MetaPartition(label="test_label",
                       data={"core": df},
                       metadata_version=metadata_version)

    meta_partition = mp.store_dataframes(
        store=store,
        df_serializer=ParquetSerializer(),
        dataset_uuid="dataset_uuid",
        store_metadata=True,
        metadata_storage_format=metadata_storage_format,
    )

    assert len(meta_partition.data) == 0

    assert meta_partition.files == {"core": expected_key}
    assert meta_partition.label == "test_label"

    files_in_store = list(store.keys())

    expected_num_files = 1
    assert len(files_in_store) == expected_num_files
    stored_df = DataFrameSerializer.restore_dataframe(store=store,
                                                      key=expected_key)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_key)
    assert len(files_in_store) == expected_num_files - 1
Example #9
def test_load_dataframes(meta_partitions_files_only, store_session,
                         predicate_pushdown_to_io):
    expected_df = pd.DataFrame(
        OrderedDict([
            ("P", [1]),
            ("L", [1]),
            ("TARGET", [1]),
            ("DATE", pd.to_datetime([date(2010, 1, 1)])),
        ]))
    expected_df_2 = pd.DataFrame(OrderedDict([("P", [1]), ("info", ["a"])]))
    mp = meta_partitions_files_only[0]
    assert len(mp.files) > 0
    assert len(mp.data) == 0
    mp = meta_partitions_files_only[0].load_dataframes(
        store=store_session, predicate_pushdown_to_io=predicate_pushdown_to_io)
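    # Loading materializes both referenced tables, "core" and "helper", into mp.data.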
    assert len(mp.data) == 2
    data = mp.data

    pdt.assert_frame_equal(data["core"], expected_df, check_dtype=False)
    pdt.assert_frame_equal(data["helper"], expected_df_2, check_dtype=False)

    empty_mp = MetaPartition("empty_mp", metadata_version=mp.metadata_version)
    empty_mp.load_dataframes(store_session,
                             predicate_pushdown_to_io=predicate_pushdown_to_io)
    assert empty_mp.data == {}
Example #10
def test_reconstruct_index_categories(store):
    ser = ParquetSerializer()
    df = pd.DataFrame({
        "index_col": [1, 1],
        "second_index_col": [2, 2],
        "column": list("ab")
    })

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/second_index_col=2/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store, "table")

    mp = MetaPartition(
        label="index_col=2/dontcare",
        files={"table": key},
        metadata_version=4,
        table_meta={"table": schema},
        partition_keys=["index_col", "second_index_col"],
    )
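    # Partition-key columns are not stored in the Parquet file; load_dataframes reconstructs them from the key path.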
    categories = ["second_index_col", "index_col"]
    mp = mp.load_dataframes(store, categoricals={"table": categories})
    df_actual = mp.data["table"]
    df_expected = pd.DataFrame(
        OrderedDict([
            ("index_col", [2, 2]),
            ("second_index_col", [2, 2]),
            ("column", list("ab")),
        ]))
    df_expected = df_expected.astype({col: "category" for col in categories})
    pdt.assert_frame_equal(df_actual, df_expected)
Example #11
def test_table_meta(store):
    mp = MetaPartition(
        label="label_1",
        data=pd.DataFrame(
            {
                "i32": np.array([1, 2, 3, 1, 2, 3], dtype="int32"),
                "float": np.array([1, 1, 1, 2, 2, 2], dtype="float64"),
            }
        ),
        metadata_version=4,
    )

    assert mp.schema is not None
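    # Schema comparison depends only on dtypes, so the expected meta is built from an empty frame with matching dtypes.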
    expected_meta = make_meta(
        pd.DataFrame(
            {"i32": np.array([], dtype="int32"), "float": np.array([], dtype="float64")}
        ),
        origin="1",
    )
    actual_meta = mp.schema
    assert actual_meta == expected_meta

    mp = mp.store_dataframes(store, "dataset_uuid")

    actual_meta = mp.schema
    assert actual_meta == expected_meta
Example #12
class BuildIndex(AsvBenchmarkConfig):
    params = ([-1, 1], [10**3, 10**4], [10, 100])
    param_names = ["cardinality", "num_values", "partitions_to_merge"]

    def setup(self, cardinality, num_values, partitions_to_merge):
        self.column = "column"
        self.table = "table"
        self.merge_indices = []
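        # Build one indexed MetaPartition per partition to merge; a negative cardinality yields (almost surely) unique random values.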
        for n in range(partitions_to_merge):
            if cardinality < 0:
                array = [
                    "{:010f}".format(x) for x in np.random.randn(num_values)
                ]
            else:
                unique_vals = ["{:010d}".format(n) for n in range(cardinality)]
                array = [
                    unique_vals[x % len(unique_vals)]
                    for x in range(num_values)
                ]
            self.df = pd.DataFrame({self.column: array})
            self.mp = MetaPartition(label=self.table,
                                    data={"core": self.df},
                                    metadata_version=4)
            self.mp_indices = self.mp.build_indices([self.column])
            self.merge_indices.append(self.mp_indices)

    def time_metapartition_build_index(self, cardinality, num_values,
                                       partitions_to_merge):
        self.mp.build_indices([self.column])

    def time_merge_indices(self, cardinality, num_values, partitions_to_merge):
        MetaPartition.merge_indices(self.merge_indices)
Example #13
def test_partition_urlencode():
    original_df = pd.DataFrame({"ÖŒå": [1, 2, 3], "some_values": [1, 2, 3]})
    mp = MetaPartition(label="label_1", data=original_df, metadata_version=4)

    new_mp = mp.partition_on(["ÖŒå"])

    assert len(new_mp.metapartitions) == 3

    labels = set()
    for mp in new_mp:
        labels.add(mp.label)
        assert len(mp.data) == 1
        assert mp.data is not None
        df = mp.data
        assert df._is_view

        # try to be agnostic about the order
        assert len(df) == 1
        assert "ÖŒå" not in df
    expected_labels = set(
        [
            "%C3%96%C5%92%C3%A5=1/label_1",
            "%C3%96%C5%92%C3%A5=2/label_1",
            "%C3%96%C5%92%C3%A5=3/label_1",
        ]
    )
    assert labels == expected_labels
Example #14
def test_reconstruct_index_empty_df(store, categoricals):
    ser = ParquetSerializer()
    df = pd.DataFrame({"index_col": [1, 1], "column": list("ab")})
    df = df[0:0]

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label="index_col=2/dontcare",
        file=key,
        metadata_version=4,
        schema=schema,
        partition_keys=["index_col"],
    )
    categoricals = ["index_col"] if categoricals else None
    mp = mp.load_dataframes(store, categoricals=categoricals)
    df_actual = mp.data
    df_expected = pd.DataFrame(
        OrderedDict([("index_col", [2, 2]), ("column", list("ab"))])
    )
    df_expected = df_expected[0:0]
    if categoricals:
        df_expected = df_expected.astype({"index_col": "category"})
    pdt.assert_frame_equal(df_actual, df_expected)
Example #15
def meta_partitions_evaluation_dataframe(metadata_version):
    """
    Create a list of MetaPartitions for testing. The partitions
    include in-memory pd.DataFrames without external references, i.e. files
     are empty

    """
    df = pd.DataFrame(
        OrderedDict([("P", [1]), ("L", [1]), ("HORIZON", [1]), ("PRED", [10])])
    )
    mp = MetaPartition(
        label="cluster_1_1", data={"PRED": df}, metadata_version=metadata_version
    )
    df_2 = pd.DataFrame(
        OrderedDict([("P", [1]), ("L", [1]), ("HORIZON", [2]), ("PRED", [20])])
    )
    mp2 = MetaPartition(
        label="cluster_1_2", data={"PRED": df_2}, metadata_version=metadata_version
    )
    df_3 = pd.DataFrame(
        OrderedDict([("P", [2]), ("L", [2]), ("HORIZON", [1]), ("PRED", [10])])
    )
    mp3 = MetaPartition(
        label="cluster_2_1", data={"PRED": df_3}, metadata_version=metadata_version
    )
    df_4 = pd.DataFrame(
        OrderedDict([("P", [2]), ("L", [2]), ("HORIZON", [2]), ("PRED", [20])])
    )
    mp4 = MetaPartition(
        label="cluster_2_2", data={"PRED": df_4}, metadata_version=metadata_version
    )
    return [mp, mp2, mp3, mp4]
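All four fixture frames share the same columns, so they can be stacked. A minimal sketch of combining them (hypothetical usage, not part of the original tests; it uses the concat_metapartitions API shown in later examples and assumes the multi-table layout where mp.data is a dict keyed by table name):
def combine_evaluation_partitions():
    # Hypothetical helper: stack the four single-row "PRED" frames row-wise.
    mps = meta_partitions_evaluation_dataframe(metadata_version=4)
    merged = MetaPartition.concat_metapartitions(mps)
    # Under these assumptions, merged.data["PRED"] holds four rows.
    return merged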
Example #16
def test_load_dataframe_logical_conjunction(store, metadata_version):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(
        label="cluster_1",
        data=df,
        metadata_version=metadata_version,
        logical_conjunction=[("P", ">", 4)],
    )
    meta_partition = mp.store_dataframes(
        store=store, df_serializer=None, dataset_uuid="dataset_uuid",
    )
    predicates = None
    loaded_mp = meta_partition.load_dataframes(store=store, predicates=predicates)
    data = pd.DataFrame(
        {"P": [5, 6, 7, 8, 9], "L": [5, 6, 7, 8, 9], "TARGET": [15, 16, 17, 18, 19]}
    ).set_index(np.arange(5, 10))
    pdt.assert_frame_equal(loaded_mp.data, data)

    predicates = [[("L", ">", 6), ("TARGET", "<", 18)]]
    loaded_mp = meta_partition.load_dataframes(store=store, predicates=predicates)
    data = pd.DataFrame({"P": [7], "L": [7], "TARGET": [17]}).set_index(np.array([7]))
    pdt.assert_frame_equal(loaded_mp.data, data)

    predicates = [[("L", ">", 2), ("TARGET", "<", 17)], [("TARGET", "==", 19)]]
    loaded_mp = meta_partition.load_dataframes(store=store, predicates=predicates)
    data = pd.DataFrame(
        {"P": [5, 6, 9], "L": [5, 6, 9], "TARGET": [15, 16, 19]}
    ).set_index(np.array([5, 6, 9]))
    pdt.assert_frame_equal(loaded_mp.data, data)
Example #17
def test_raises_other_index_missing(self, cube, function_store):
    store_data(
        cube=cube,
        function_store=function_store,
        df=MetaPartition(
            label=gen_uuid(),
            data={
                SINGLE_TABLE: pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]})
            },
            metadata_version=KTK_CUBE_METADATA_VERSION,
        ).build_indices(["x", "y"]),
        name=cube.seed_dataset,
    )
    store_data(
        cube=cube,
        function_store=function_store,
        df=MetaPartition(
            label=gen_uuid(),
            data={
                SINGLE_TABLE: pd.DataFrame(
                    {"x": [0], "y": [0], "p": [0], "q": [0], "i1": [1337]}
                )
            },
            metadata_version=KTK_CUBE_METADATA_VERSION,
        ),
        name="enrich",
    )
    with pytest.raises(ValueError) as exc:
        discover_datasets(cube, function_store)
    assert (
        str(exc.value)
        == 'ExplicitSecondaryIndex or PartitionIndex "i1" is missing in dataset "enrich".'
    )
Example #18
def test_store_single_dataframe_as_partition(store, metadata_version):
    df = pd.DataFrame(
        {"P": np.arange(0, 10), "L": np.arange(0, 10), "TARGET": np.arange(10, 20)}
    )
    mp = MetaPartition(label="test_label", data=df, metadata_version=metadata_version)

    meta_partition = mp.store_dataframes(
        store=store, df_serializer=ParquetSerializer(), dataset_uuid="dataset_uuid",
    )

    assert meta_partition.data is None

    expected_key = "dataset_uuid/table/test_label.parquet"

    assert meta_partition.file == expected_key
    assert meta_partition.label == "test_label"

    files_in_store = list(store.keys())

    expected_num_files = 1
    assert len(files_in_store) == expected_num_files
    stored_df = DataFrameSerializer.restore_dataframe(store=store, key=expected_key)
    pdt.assert_frame_equal(df, stored_df)
    files_in_store.remove(expected_key)
    assert len(files_in_store) == expected_num_files - 1
Example #19
def test_merge_indices():
    indices = [
        MetaPartition(
            label="label1",
            indices={"location": {"Loc1": ["label1"], "Loc2": ["label1"]}},
        ),
        MetaPartition(
            label="label2",
            indices={
                "location": {"Loc3": ["label2"], "Loc2": ["label2"]},
                "product": {"Product1": ["label2"], "Product2": ["label2"]},
            },
        ),
    ]
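    # Merging unions the per-partition index entries by column and value.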
    result = MetaPartition.merge_indices(indices)
    expected = {
        "location": ExplicitSecondaryIndex(
            "location",
            {"Loc1": ["label1"], "Loc2": ["label1", "label2"], "Loc3": ["label2"]},
        ),
        "product": ExplicitSecondaryIndex(
            "product", {"Product1": ["label2"], "Product2": ["label2"]}
        ),
    }
    assert result == expected
Example #20
def test_get_parquet_metadata_row_group_size(store):
    df = pd.DataFrame({"P": np.arange(0, 10), "L": np.arange(0, 10)})
    mp = MetaPartition(label="test_label", data=df)
    ps = ParquetSerializer(chunk_size=5)
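    # chunk_size=5 splits the ten-row frame into two row groups of five rows each.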

    meta_partition = mp.store_dataframes(
        store=store, dataset_uuid="dataset_uuid", df_serializer=ps
    )
    actual = meta_partition.get_parquet_metadata(store=store)
    actual.drop(
        columns=[
            "serialized_size",
            "row_group_compressed_size",
            "row_group_uncompressed_size",
        ],
        axis=1,
        inplace=True,
    )

    expected = pd.DataFrame(
        {
            "partition_label": ["test_label", "test_label"],
            "row_group_id": [0, 1],
            "number_rows_total": [10, 10],
            "number_row_groups": [2, 2],
            "number_rows_per_row_group": [5, 5],
        }
    )
    pd.testing.assert_frame_equal(actual, expected)
Example #21
def test_get_parquet_metadata_empty_df(store):
    df = pd.DataFrame()
    mp = MetaPartition(label="test_label", data=df)
    meta_partition = mp.store_dataframes(store=store, dataset_uuid="dataset_uuid")

    actual = meta_partition.get_parquet_metadata(store=store)
    actual.drop(
        columns=[
            "serialized_size",
            "row_group_compressed_size",
            "row_group_uncompressed_size",
        ],
        axis=1,
        inplace=True,
    )

    expected = pd.DataFrame(
        {
            "partition_label": ["test_label"],
            "row_group_id": 0,
            "number_rows_total": 0,
            "number_row_groups": 1,
            "number_rows_per_row_group": 0,
        }
    )

    pd.testing.assert_frame_equal(actual, expected)
Example #22
def test_table_meta(store):
    mp = MetaPartition(
        label="label_1",
        data={
            "core": pd.DataFrame(
                {
                    "i32": np.array([1, 2, 3, 1, 2, 3], dtype="int32"),
                    "float": np.array([1, 1, 1, 2, 2, 2], dtype="float64"),
                }
            )
        },
        metadata_version=4,
    )

    assert len(mp.table_meta) == 1
    assert "core" in mp.table_meta
    expected_meta = make_meta(
        pd.DataFrame({
            "i32": np.array([], dtype="int32"),
            "float": np.array([], dtype="float64")
        }),
        origin="1",
    )
    actual_meta = mp.table_meta["core"]
    assert actual_meta == expected_meta

    mp = mp.store_dataframes(store, "dataset_uuid")

    actual_meta = mp.table_meta["core"]
    assert actual_meta == expected_meta
Example #23
def test_partition_on_scalar_intermediate(df_not_nested, col):
    """
    Test against a bug where grouping leaves a scalar value
    """
    assert len(df_not_nested) == 1
    mp = MetaPartition(label="somelabel", data=df_not_nested, metadata_version=4)
    new_mp = mp.partition_on(col)
    assert len(new_mp) == 1
Example #24
def test_concat_metapartition_wrong_types(df_all_types):
    mp1 = MetaPartition(label="first", data=df_all_types, metadata_version=4)
    df_corrupt = df_all_types.copy()
    df_corrupt["int8"] = "NoInteger"
    mp2 = MetaPartition(label="second", data=df_corrupt, metadata_version=4)

    with pytest.raises(ValueError, match="Schema violation"):
        MetaPartition.concat_metapartitions([mp1, mp2])
Example #25
def store_data(
    cube,
    function_store,
    df,
    name,
    partition_on="default",
    metadata_version=KTK_CUBE_METADATA_VERSION,
    metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
    metadata=None,
    overwrite=False,
    new_ktk_cube_metadata=True,
    write_suppress_index_on=True,
):
    if partition_on == "default":
        partition_on = cube.partition_columns

    if isinstance(df, pd.DataFrame):
        mp = MetaPartition(
            label=gen_uuid(), data={SINGLE_TABLE: df}, metadata_version=metadata_version
        )

        indices_to_build = set(cube.index_columns) & set(df.columns)
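        # The seed dataset additionally indexes all non-suppressed dimension columns.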
        if name == cube.seed_dataset:
            indices_to_build |= set(cube.dimension_columns) - set(
                cube.suppress_index_on
            )
        mp = mp.build_indices(indices_to_build)
        dfs = mp
    else:
        assert isinstance(df, MetaPartition)
        assert df.metadata_version == metadata_version
        dfs = df

    if metadata is None:
        metadata = {
            KTK_CUBE_METADATA_DIMENSION_COLUMNS: cube.dimension_columns,
            KTK_CUBE_METADATA_KEY_IS_SEED: (name == cube.seed_dataset),
        }
        if new_ktk_cube_metadata:
            metadata.update(
                {KTK_CUBE_METADATA_PARTITION_COLUMNS: cube.partition_columns}
            )
        if write_suppress_index_on:
            metadata.update(
                {KTK_CUBE_METADATA_SUPPRESS_INDEX_ON: list(cube.suppress_index_on)}
            )

    return store_dataframes_as_dataset(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid(name),
        dfs=dfs,
        partition_on=list(partition_on) if partition_on else None,
        metadata_storage_format=metadata_storage_format,
        metadata_version=metadata_version,
        df_serializer=KTK_CUBE_DF_SERIALIZER,
        metadata=metadata,
        overwrite=overwrite,
    )
Example #26
def test_partition_on_explicit_index():
    original_df = pd.DataFrame({
        "level1": [1, 2, 1, 2, 1, 2],
        "level2": [1, 1, 1, 2, 2, 2],
        "explicit_index_col": np.arange(0, 6),
    })
    mp = MetaPartition(
        label="label_1",
        file="file",
        data=original_df,
        indices={
            "explicit_index_col": {value: ["label_1"] for value in np.arange(0, 6)}
        },
        metadata_version=4,
    )
    new_mp = mp.partition_on(["level1", "level2"])
    assert len(new_mp) == 4
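    # The explicit index is split along with the data: each nested partition keeps only the entries for its own rows, repointed at the new label.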

    expected_indices = {
        "explicit_index_col":
        ExplicitSecondaryIndex(
            "explicit_index_col",
            {
                0: ["level1=1/level2=1/label_1"],
                2: ["level1=1/level2=1/label_1"]
            },
        )
    }
    assert expected_indices == new_mp["level1=1/level2=1/label_1"].indices

    expected_indices = {
        "explicit_index_col":
        ExplicitSecondaryIndex("explicit_index_col",
                               {4: ["level1=1/level2=2/label_1"]})
    }
    assert expected_indices == new_mp["level1=1/level2=2/label_1"].indices

    expected_indices = {
        "explicit_index_col":
        ExplicitSecondaryIndex("explicit_index_col",
                               {1: ["level1=2/level2=1/label_1"]})
    }
    assert expected_indices == new_mp["level1=2/level2=1/label_1"].indices

    expected_indices = {
        "explicit_index_col":
        ExplicitSecondaryIndex(
            "explicit_index_col",
            {
                3: ["level1=2/level2=2/label_1"],
                5: ["level1=2/level2=2/label_1"]
            },
        )
    }
    assert expected_indices == new_mp["level1=2/level2=2/label_1"].indices
Example #27
def test_partition_on_raises_no_cols_left(empty):
    original_df = pd.DataFrame({"test": [1, 2, 3]})
    if empty:
        original_df = original_df.loc[[]]
    mp = MetaPartition(
        label="label_1", file="file", data=original_df, metadata_version=4
    )
    with pytest.raises(ValueError) as e:
        mp.partition_on(["test"])
    assert str(e.value) == "No data left to save outside partition columns"
Example #28
def test_partition_on_raises_pocols_missing(empty):
    original_df = pd.DataFrame({"test": [1, 2, 3]})
    if empty:
        original_df = original_df.loc[[]]
    mp = MetaPartition(
        label="label_1", file="file", data=original_df, metadata_version=4
    )
    with pytest.raises(ValueError) as e:
        mp.partition_on(["test", "foo", "bar"])
    assert str(e.value) == "Partition column(s) missing: bar, foo"
Example #29
def test_partition_on_valid_schemas():
    """
    Ensure that partitioning is possible even if the output schemas of the
    sub partitions may be different
    """
    df = pd.DataFrame({"partition_col": [0, 1], "values": [None, "str"]})
    mp = MetaPartition(label="base_label", data=df, metadata_version=4)
    mp = mp.partition_on(["partition_col"])
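    # One sub-partition holds the None row, the other the "str" row; both normalize to a common schema.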
    assert len(mp) == 2
    expected_meta = make_meta(df, origin="1", partition_keys="partition_col")
    assert mp.schema == expected_meta
Example #30
def test_concat_metapartition(df_all_types):
    mp1 = MetaPartition(label="first", data=df_all_types, metadata_version=4)
    mp2 = MetaPartition(label="second", data=df_all_types, metadata_version=4)

    new_mp = MetaPartition.concat_metapartitions([mp1, mp2])

    # what the label actually is, doesn't matter so much
    assert new_mp.label is not None
    df_expected = pd.concat([df_all_types, df_all_types])
    df_actual = new_mp.data
    pdt.assert_frame_equal(df_actual, df_expected)