Ejemplo n.º 1
0
def test_parquet_stats(io_test_dir: str) -> None:
    """Check comparison filters on a parquet column that contains nulls.

    A pandas frame with a nullable column ``a`` is written via pyarrow and
    then scanned lazily; every filter is collected and compared against the
    expected values, with the column on either side of the comparison.
    """
    file = path.join(io_test_dir, "binary_stats.parquet")
    source = pd.DataFrame({"a": [None, 1, None, 2, 3, 3, 4, 4, 5, 5]})
    source.to_parquet(file, engine="pyarrow")

    # Explicit null check combined with the comparison.
    non_null_above_four = pl.scan_parquet(file).filter(
        pl.col("a").is_not_null() & (pl.col("a") > 4)
    )
    df = non_null_above_four.collect()
    assert df["a"].to_list() == [5.0, 5.0]

    # Column on the left-hand side of the comparison.
    above_four = pl.scan_parquet(file).filter(pl.col("a") > 4)
    assert above_four.select(pl.col("a").sum()).collect()[0, "a"] == 10.0

    below_four = pl.scan_parquet(file).filter(pl.col("a") < 4)
    assert below_four.select(pl.col("a").sum()).collect()[0, "a"] == 9.0

    # Column on the right-hand side of the comparison.
    below_four_flipped = pl.scan_parquet(file).filter(4 > pl.col("a"))
    assert below_four_flipped.select(pl.col("a").sum()).collect()[0, "a"] == 9.0

    above_four_flipped = pl.scan_parquet(file).filter(4 < pl.col("a"))
    assert above_four_flipped.select(pl.col("a").sum()).collect()[0, "a"] == 10.0
Ejemplo n.º 2
0
def test_row_count(foods_parquet: str) -> None:
    """Check row-count columns added while reading and scanning parquet.

    Covers the eager reader, the lazy scanner combined with a filter, and a
    second row-count column named ``foo`` that starts at offset 10.
    """
    eager = pl.read_parquet(foods_parquet, row_count_name="row_count")
    assert eager["row_count"].to_list() == list(range(27))

    # The same predicate is applied to both lazy queries below.
    is_vegetable = pl.col("category") == pl.lit("vegetables")

    vegetables = (
        pl.scan_parquet(foods_parquet, row_count_name="row_count")
        .filter(is_vegetable)
        .collect()
    )
    assert vegetables["row_count"].to_list() == [0, 6, 11, 13, 14, 20, 25]

    vegetables_with_offset = (
        pl.scan_parquet(foods_parquet, row_count_name="row_count")
        .with_row_count("foo", 10)
        .filter(is_vegetable)
        .collect()
    )
    assert vegetables_with_offset["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]
Ejemplo n.º 3
0
def test_lazy_self_join_file_cache_prop_3979(io_test_dir: str) -> None:
    """Cross-join a scanned parquet file with an in-memory frame, both ways."""
    parquet_file = os.path.join(io_test_dir, "small.parquet")
    scanned = pl.scan_parquet(parquet_file)
    single_row = pl.DataFrame({"a": [1]}).lazy()

    # Scanned file on the left-hand side of the join.
    scanned_left = scanned.join(single_row, how="cross").collect()
    assert scanned_left.shape == (3, 17)

    # Scanned file on the right-hand side of the join.
    scanned_right = single_row.join(scanned, how="cross").collect()
    assert scanned_right.shape == (3, 17)
Ejemplo n.º 4
0
def test_categorical_parquet_statistics(io_test_dir: str) -> None:
    """Filter on a categorical column read back from a parquet file.

    A small frame whose ``book`` column is cast to ``pl.Categorical`` is
    written with ``statistics=True`` and then scanned with each ``parallel``
    setting; the ``book == "bookA"`` filter must keep the four matching rows
    and all three columns in every case.
    """
    file = path.join(io_test_dir, "books.parquet")
    (
        pl.DataFrame(
            {
                "book": [
                    "bookA",
                    "bookA",
                    "bookB",
                    "bookA",
                    "bookA",
                    "bookC",
                    "bookC",
                    "bookC",
                ],
                "transaction_id": [1, 2, 3, 4, 5, 6, 7, 8],
                "user": ["bob", "bob", "bob", "tim", "lucy", "lucy", "lucy", "lucy"],
            }
        )
        .with_column(pl.col("book").cast(pl.Categorical))
        .write_parquet(file, statistics=True)
    )

    for par in [True, False]:
        df = (
            pl.scan_parquet(file, parallel=par)
            .filter(pl.col("book") == "bookA")
            .collect()
        )
        # Assert inside the loop so every ``parallel`` setting is verified,
        # not just the result of the final iteration.
        assert df.shape == (4, 3)
Ejemplo n.º 5
0
def test_scan_parquet() -> None:
    """Lazily scan ``files/small.parquet`` next to this module and check its shape."""
    parquet_file = Path(__file__).parent / "files" / "small.parquet"
    lazy_frame = pl.scan_parquet(parquet_file)
    assert lazy_frame.collect().shape == (4, 3)
Ejemplo n.º 6
0
def test_glob_parquet(io_test_dir: str) -> None:
    """Read a ``small*.parquet`` glob pattern both eagerly and lazily."""
    pattern = os.path.join(io_test_dir, "small*.parquet")

    eager = pl.read_parquet(pattern)
    assert eager.shape == (3, 16)

    collected = pl.scan_parquet(pattern).collect()
    assert collected.shape == (3, 16)