Example #1
def test_read_categorical(store):
    df = pd.DataFrame({"col": ["a"]}).astype({"col": "category"})

    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", df)

    df = serialiser.restore_dataframe(store, key)
    assert df.dtypes["col"] == "O"

    df = serialiser.restore_dataframe(store, key, categories=["col"])
    assert df.dtypes["col"] == pd.CategoricalDtype(["a"], ordered=False)
Example #2
def test_pushdown_null_intermediate(store):
    binary = b"\x8f\xb6\xe5@\x90\xdc\x11\xe8\x00\xae\x02B\xac\x12\x01\x06"
    df = pd.DataFrame({"byte_with_null": [binary]})
    serialiser = ParquetSerializer(chunk_size=1)
    key = serialiser.store(store, "key", df)
    predicate = [[("byte_with_null", "==", binary)]]
    restored = serialiser.restore_dataframe(store, key, predicates=predicate)
    pdt.assert_frame_equal(restored, df)
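The predicates argument used throughout these examples is a list of condition groups. My reading (an assumption, not stated in this listing) is that the outer list is OR-ed, each inner list is AND-ed, and every condition is a (column, operator, value) tuple. A hedged sketch with purely illustrative column names:

# Hedged sketch of the predicate structure these examples assume:
# outer list = OR over groups, inner list = AND over conditions.
predicates = [
    [("byte_with_null", "==", b"\x01\x02"), ("col", ">", 0)],  # AND within a group
    [("byte_with_null", "==", b"")],                           # OR-ed with the group above
]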
Example #3
def test_read_empty_file_with_predicates(store):
    ser = ParquetSerializer()
    df = pd.DataFrame(dict(col=pd.Series([], dtype=str)))
    key = ser.store(store, "key", df)
    restored_df = ser.restore_dataframe(store,
                                        key,
                                        columns=["col"],
                                        predicates=[[("col", "==", "1")]])
    pdt.assert_frame_equal(restored_df, df)
Example #4
def test_date_as_object(store, chunk_size):
    ser = ParquetSerializer(chunk_size=chunk_size)
    df = pd.DataFrame({"date": [date(2000, 1, 1), date(2000, 1, 2)]})
    key = ser.store(store, "key", df)
    restored_df = ser.restore_dataframe(store,
                                        key,
                                        categories=["date"],
                                        date_as_object=True)
    categories = pd.Series([date(2000, 1, 1), date(2000, 1, 2)])
    expected_df = pd.DataFrame({"date": pd.Categorical(categories)})
    # expected_df.date = expected_df.date.cat.rename_categories([date(2000, 1, 1)])
    pdt.assert_frame_equal(restored_df, expected_df)

    restored_df = ser.restore_dataframe(store,
                                        key,
                                        date_as_object=True,
                                        predicates=[[("date", "==",
                                                      "2000-01-01")]])
    expected_df = pd.DataFrame({"date": [date(2000, 1, 1)]})
    pdt.assert_frame_equal(restored_df, expected_df)
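For contrast, the comment in Example #8 suggests that without date_as_object the date column comes back as datetime rather than as Python date objects. The test below is a hedged sketch of that expectation, not part of the original listing.

def test_date_default_dtype(store):
    # Hedged sketch: without date_as_object, pyarrow is expected to hand the
    # date column back as datetime64[ns] (compare the comment in Example #8).
    ser = ParquetSerializer()
    df = pd.DataFrame({"date": [date(2000, 1, 1), date(2000, 1, 2)]})
    key = ser.store(store, "key", df)
    restored_df = ser.restore_dataframe(store, key)
    assert restored_df["date"].dtype == "datetime64[ns]"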
Example #5
class TimeRestore(object):
    """
    An example benchmark that times the performance of various kinds
    of iterating over dictionaries in Python.
    """

    params = [(10 ** 3, 10 ** 4), (10, 10 ** 2, 10 ** 3)]
    param_names = ["num_rows", "chunk_size"]

    def setup(self, num_rows, chunk_size):
        self.df = get_dataframe_not_nested(num_rows)
        self.serialiser = ParquetSerializer(chunk_size=chunk_size)
        self.store = get_store_from_url("memory://")
        self.key = self.serialiser.store(self.store, "key_prefix", self.df)
        self.predicates = [[("int16", "==", 123)]]

    def time_predicate_pushdown(self, num_rows, chunk_size):
        self.serialiser.restore_dataframe(
            self.store,
            self.key,
            predicate_pushdown_to_io=True,
            predicates=self.predicates,
        )
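The class above follows the asv (airspeed velocity) setup/time_* convention. As a quick sanity check it can also be driven by hand, roughly as in the hedged sketch below.

# Hedged sketch: exercising the benchmark outside of asv to check it runs.
bench = TimeRestore()
bench.setup(num_rows=10 ** 3, chunk_size=10 ** 2)
bench.time_predicate_pushdown(10 ** 3, 10 ** 2)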
Example #6
def test_int64_statistics_overflow(reference_store, predicate_pushdown_to_io):
    # Test case for ARROW-5166
    ser = ParquetSerializer()

    v = 705449463447499237
    predicates = [[("x", "==", v)]]
    result = ser.restore_dataframe(
        reference_store,
        "int64_statistics_overflow.parquet",
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        predicates=predicates,
    )
    assert not result.empty
    assert (result["x"] == v).all()
Example #7
def test_pushdown_binaries(store, dataframe_not_nested, binary_value,
                           chunk_size):
    if _check_contains_null(binary_value):
        pytest.xfail("Null-terminated binary strings are not supported")
    serialiser = ParquetSerializer(chunk_size=chunk_size)
    key = serialiser.store(store, "prefix", dataframe_not_nested)

    predicates = [[("bytes", "==", binary_value)]]

    df_restored = serialiser.restore_dataframe(store,
                                               key,
                                               predicates=predicates)
    assert len(df_restored) == 1
    assert df_restored.iloc[0].bytes == binary_value
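_check_contains_null is not shown in this listing. A minimal sketch of what such a helper presumably does, given the xfail message about null-terminated binary strings, could look like the following; this is an assumption, not the project's actual implementation.

# Hedged sketch of a helper like _check_contains_null: report whether a bytes
# (or str) value contains an embedded NUL byte, which these tests xfail on.
def _check_contains_null(value):
    if isinstance(value, bytes):
        return b"\x00" in value
    return "\x00" in value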
Example #8
def _validate_predicate_pushdown(df, column, value, store, chunk_size):

    serialiser = ParquetSerializer(chunk_size=chunk_size)
    key = serialiser.store(store, "prefix", df)

    predicates = [[(column, "==", value)]]

    df_restored = serialiser.restore_dataframe(store,
                                               key,
                                               predicates=predicates)
    # date objects are converted to datetime in pyarrow
    df_restored["date"] = df_restored["date"].dt.date

    expected = df.iloc[[3]]
    # ARROW-5138 index isn't preserved when doing predicate pushdown
    pdt.assert_frame_equal(df_restored.reset_index(drop=True),
                           expected.reset_index(drop=True))
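The helper above expects the predicate to match exactly the row at position 3 and the frame to carry a "date" column that pyarrow round-trips as datetime. The parametrised tests that call it are not part of this listing; a hedged sketch of one possible caller:

# Hedged sketch: one way a test could drive _validate_predicate_pushdown.
# The dataframe layout is illustrative; the helper expects the match at iloc[3].
def test_predicate_pushdown_int(store, chunk_size):
    df = pd.DataFrame({
        "col": [0, 1, 2, 3, 4],
        "date": [date(2000, 1, day) for day in range(1, 6)],
    })
    _validate_predicate_pushdown(df, "col", 3, store, chunk_size)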
Example #9
def test_predicate_not_in_columns(store, chunk_size):
    ser = ParquetSerializer(chunk_size=chunk_size)
    df = pd.DataFrame({
        "date": [date(2000, 1, 1),
                 date(2000, 1, 2),
                 date(2000, 1, 2)],
        "col": [1, 2, 1],
    })
    key = ser.store(store, "key", df)
    restored_df = ser.restore_dataframe(store,
                                        key,
                                        columns=[],
                                        predicates=[[("col", "==", 1)]])
    if chunk_size:
        expected_df = pd.DataFrame(index=[0, 1])
    else:
        expected_df = pd.DataFrame(index=[0, 2])

    pdt.assert_frame_equal(restored_df, expected_df)
Example #10
def test_compat_old_rw_path(df_all_types, store):
    # strip down DF since some column types weren't supported before anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (alway null)
                and "8" not in c  # 8 bit types are casted to 64 bit
                and "16" not in c  # 16 bit types are casted to 64 bit
                and "32" not in c  # 32 bit types are casted to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)