# Shared imports for the examples below; the kartothek module paths are the
# usual ones but may differ between versions.
from collections import OrderedDict
from datetime import date, datetime

import pandas as pd
import pandas.testing as pdt

from kartothek.core.common_metadata import make_meta, store_schema_metadata
from kartothek.io_components.metapartition import MetaPartition
from kartothek.serialization import ParquetSerializer


def test_load_dataframes(meta_partitions_files_only, store_session,
                         predicate_pushdown_to_io):
    expected_df = pd.DataFrame(
        OrderedDict([
            ("P", [1]),
            ("L", [1]),
            ("TARGET", [1]),
            ("DATE", pd.to_datetime([date(2010, 1, 1)])),
        ]))
    expected_df_2 = pd.DataFrame(OrderedDict([("P", [1]), ("info", ["a"])]))
    mp = meta_partitions_files_only[0]
    assert len(mp.files) > 0
    assert len(mp.data) == 0
    mp = meta_partitions_files_only[0].load_dataframes(
        store=store_session, predicate_pushdown_to_io=predicate_pushdown_to_io)
    assert len(mp.data) == 2
    data = mp.data

    pdt.assert_frame_equal(data["core"], expected_df, check_dtype=False)
    pdt.assert_frame_equal(data["helper"], expected_df_2, check_dtype=False)

    empty_mp = MetaPartition("empty_mp", metadata_version=mp.metadata_version)
    empty_mp.load_dataframes(store_session,
                             predicate_pushdown_to_io=predicate_pushdown_to_io)
    assert empty_mp.data == {}
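
# Loading an empty DataFrame should still reconstruct the partition-key
# column from the file path (optionally as a categorical).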
def test_reconstruct_index_empty_df(store, categoricals):
    ser = ParquetSerializer()
    df = pd.DataFrame({"index_col": [1, 1], "column": list("ab")})
    df = df[0:0]

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label="index_col=2/dontcare",
        file=key,
        metadata_version=4,
        schema=schema,
        partition_keys=["index_col"],
    )
    # Map the boolean fixture value to the actual column list.
    if categoricals:
        categoricals = ["index_col"]
    mp = mp.load_dataframes(store, categoricals=categoricals)
    df_actual = mp.data
    df_expected = pd.DataFrame(
        OrderedDict([("index_col", [2, 2]), ("column", list("ab"))])
    )
    df_expected = df_expected[0:0]
    if categoricals:
        df_expected = df_expected.astype({"index_col": "category"})
    pdt.assert_frame_equal(df_actual, df_expected)
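
# Round trip: partition a DataFrame on a column, store it, and reload it both
# through the combined MetaPartition and through fresh per-partition ones.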
def test_partition_on_roundtrip(store):
    original_df = pd.DataFrame(
        OrderedDict([("test", [1, 2, 3]), ("some_values", [1, 2, 3])])
    )
    mp = MetaPartition(label="label_1", data=original_df, metadata_version=4)

    new_mp = mp.partition_on(["test"])
    new_mp = new_mp.store_dataframes(store=store, dataset_uuid="some_uuid")
    store_schema_metadata(new_mp.schema, "some_uuid", store)
    # Load once right after storing (the in-memory data was dropped by
    # store_dataframes) and once more below with fresh MetaPartitions, to
    # check that the table schema is reloaded from the store.
    new_mp = new_mp.load_dataframes(store=store)
    assert len(new_mp.metapartitions) == 3
    dfs = []
    for internal_mp in new_mp:
        dfs.append(internal_mp.data)
    actual_df = pd.concat(dfs).sort_values(by="test").reset_index(drop=True)
    pdt.assert_frame_equal(original_df, actual_df)

    for i in range(1, 4):
        # Check with fresh metapartitions
        new_mp = MetaPartition(
            label=f"test={i}/label_1",
            file=f"some_uuid/table/test={i}/label_1.parquet",
            metadata_version=4,
        )
        new_mp = new_mp.load_dataframes(store=store)

        actual_df = new_mp.data

        expected_df = pd.DataFrame(OrderedDict([("test", [i]), ("some_values", [i])]))
        pdt.assert_frame_equal(expected_df, actual_df)
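
# Older multi-table API (files=/table_meta=): both partition-key columns are
# reconstructed from the path and loaded as categoricals when requested.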
def test_reconstruct_index_categories(store):
    ser = ParquetSerializer()
    df = pd.DataFrame({
        "index_col": [1, 1],
        "second_index_col": [2, 2],
        "column": list("ab")
    })

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/second_index_col=2/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store, "table")

    mp = MetaPartition(
        label="index_col=2/dontcare",
        files={"table": key},
        metadata_version=4,
        table_meta={"table": schema},
        partition_keys=["index_col", "second_index_col"],
    )
    categories = ["second_index_col", "index_col"]
    mp = mp.load_dataframes(store, categoricals={"table": categories})
    df_actual = mp.data["table"]
    df_expected = pd.DataFrame(
        OrderedDict([
            ("index_col", [2, 2]),
            ("second_index_col", [2, 2]),
            ("column", list("ab")),
        ]))
    df_expected = df_expected.astype({col: "category" for col in categories})
    pdt.assert_frame_equal(df_actual, df_expected)
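
# A single-table variant of test_load_dataframes above: here mp.data is one
# DataFrame (or None before loading) rather than a dict of tables.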
def test_load_dataframes(meta_partitions_files_only, store_session,
                         predicate_pushdown_to_io):
    expected_df = pd.DataFrame(
        OrderedDict([("P", [1]), ("L", [1]), ("TARGET", [1]),
                     ("DATE", [date(2010, 1, 1)])]))
    mp = meta_partitions_files_only[0]
    assert mp.file
    assert mp.data is None
    mp = meta_partitions_files_only[0].load_dataframes(
        store=store_session, predicate_pushdown_to_io=predicate_pushdown_to_io)
    assert mp.data is not None
    data = mp.data

    pdt.assert_frame_equal(data, expected_df, check_dtype=False)

    empty_mp = MetaPartition("empty_mp", metadata_version=mp.metadata_version)
    empty_mp.load_dataframes(store_session,
                             predicate_pushdown_to_io=predicate_pushdown_to_io)
    assert empty_mp.data is None
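
# Column names stored as bytes should be cast back to strings on load.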
def test_column_string_cast(df_all_types, store, metadata_version):
    original_columns = df_all_types.columns.copy()
    df_all_types.columns = df_all_types.columns.str.encode("utf-8")
    ser = ParquetSerializer()
    key = ser.store(store, "uuid/table/something", df_all_types)
    mp = MetaPartition(
        label="something",
        file=key,
        schema=make_meta(df_all_types, origin="table"),
        metadata_version=metadata_version,
    )
    mp = mp.load_dataframes(store)
    df = mp.data
    assert all(original_columns == df.columns)
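
# Reconstructing a date-typed partition key: depending on dates_as_object,
# the column comes back as datetime.date or datetime.datetime values.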
def test_reconstruct_date_index(store, metadata_version, dates_as_object):
    ser = ParquetSerializer()
    # Even if the parquet file itself contains the primary index column, the
    # index reconstructed from the partition key takes precedence and the
    # file's content for that column is ignored.
    df = pd.DataFrame(
        {"index_col": [date(2018, 6, 1), date(2018, 6, 1)], "column": list("ab")}
    )

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2018-06-02/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label="dontcare",
        file=key,
        metadata_version=metadata_version,
        schema=schema,
        partition_keys=["index_col"],
    )

    mp = mp.load_dataframes(store, dates_as_object=dates_as_object)
    df_actual = mp.data
    if dates_as_object:
        dt_constructor = date
    else:
        dt_constructor = datetime
    df_expected = pd.DataFrame(
        OrderedDict(
            [
                ("index_col", [dt_constructor(2018, 6, 2), dt_constructor(2018, 6, 2)]),
                ("column", list("ab")),
            ]
        )
    )
    pdt.assert_frame_equal(df_actual, df_expected)