Esempio n. 1
0
def test_arrow():
    """Convert between polars Series and pyarrow arrays in both directions."""
    # polars -> arrow: the converted array must equal a pyarrow array
    # built from the same values (including the null).
    arrow_out = Series("a", [1, 2, 3, None]).to_arrow()
    expected = pa.array([1, 2, 3, None])
    assert arrow_out == expected

    # arrow -> polars: a dictionary-typed arrow array (int32 indices, utf8
    # values) is expected to come out with dtype Utf8.
    dict_type = pa.dictionary(pa.int32(), pa.utf8())
    dict_array = pa.array(["foo", "bar"], dict_type)
    from_arrow = pl.Series("a", dict_array)
    assert from_arrow.dtype == pl.Utf8
Esempio n. 2
0
def test_rechunk():
    """Check chunk counts after an append and after rechunking."""
    left = Series("a", [1, 2, 3])
    right = Series("b", [4, 5, 6])

    # Appending is expected to leave two chunks behind.
    left.append(right)
    assert left.n_chunks() == 2

    # Out-of-place rechunk: the returned series has a single chunk.
    merged = left.rechunk(in_place=False)
    assert merged.n_chunks() == 1

    # In-place rechunk: the series itself ends up with a single chunk.
    left.rechunk(in_place=True)
    assert left.n_chunks() == 1
Esempio n. 3
0
def test_ufunc():
    """Apply numpy ufuncs to a Series and check result type, values and nulls."""
    a = Series("a", [1.0, 2.0, 3.0, 4.0])
    b = np.multiply(a, 4)
    assert isinstance(b, Series)
    # Compare the materialized values. Asserting on `b == [...]` directly
    # would only test the truthiness of the comparison result (presumably an
    # element-wise boolean Series), not that every element matched.
    assert b.to_list() == [4, 8, 12, 16]

    # test if null bitmask is preserved
    a = Series("a", [1.0, None, 3.0], nullable=True)
    b = np.exp(a)
    assert b.null_count() == 1
Esempio n. 4
0
def test_strategy_shape(df1: pl.DataFrame, df2: pl.DataFrame, s1: pl.Series,
                        s2: pl.Series) -> None:
    """Check the dimensions and names of the supplied frames and series.

    ``df1`` and ``s1`` must have a fixed size; ``df2`` and ``s2`` must fall
    inside a size range.
    """
    # Fixed-size frame: 5 rows x 5 columns named col0..col4.
    expected_columns = ["col0", "col1", "col2", "col3", "col4"]
    assert df1.shape == (5, 5)
    assert df1.columns == expected_columns

    # Variable-size frame: 2-5 columns and 3-8 rows.
    n_columns = len(df2.columns)
    n_rows = len(df2)
    assert 2 <= n_columns <= 5
    assert 3 <= n_rows <= 8

    # Series lengths follow the same fixed / ranged split.
    assert s1.len() == 5
    s2_length = s2.len()
    assert 3 <= s2_length <= 8

    # Series names: empty for s1, "col" for s2.
    assert s1.name == ""
    assert s2.name == "col"
Esempio n. 5
0
def test_cast():
    """Each dedicated ``cast_*`` method must report the matching dtype string."""
    a = Series("a", range(20))

    # (bound cast method, dtype string the result must compare equal to)
    cases = (
        (a.cast_f32, "f32"),
        (a.cast_f64, "f64"),
        (a.cast_i32, "i32"),
        (a.cast_u32, "u32"),
        (a.cast_date64, "date64"),
        (a.cast_time64ns, "time64(ns)"),
        (a.cast_date32, "date32"),
    )
    for cast, dtype_name in cases:
        assert cast().dtype == dtype_name
Esempio n. 6
0
def test_df_fold():
    """Fold the columns of a DataFrame into a single Series."""

    def add(acc, nxt):
        # Accumulate by adding the next column onto the running result.
        return acc + nxt

    def pick_smaller(acc, nxt):
        # zip_with on the mask `acc < nxt`; the expected values below are the
        # per-row minimum across the columns.
        return acc.zip_with(acc < nxt, nxt)

    numeric = DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})

    summed = numeric.fold(add)
    assert summed.series_equal(Series("a", [4.0, 5.0, 9.0]))

    smallest = numeric.fold(pick_smaller)
    assert smallest.series_equal(Series("a", [1.0, 1.0, 3.0]))

    # Mixed string / numeric columns folded with the same addition.
    mixed = DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0]
    })
    out = mixed.fold(add)
    # NOTE(review): the result of this comparison is discarded (there is no
    # `assert`), so this line only checks that the call runs. Confirm the
    # expected values before turning it into an assertion.
    out.series_equal(Series("", ["foo11", "bar22", "233"]))
Esempio n. 7
0
def test_filter():
    """Filter a 0..19 Series with each comparison mask and check the lengths."""
    a = Series("a", range(20))

    # (boolean mask, number of elements expected to survive the filter)
    cases = [
        (a > 1, 18),
        (a < 1, 1),
        (a <= 1, 2),
        (a >= 1, 19),
        (a == 1, 1),
        (a != 1, 19),
    ]
    for mask, expected_len in cases:
        assert a[mask].len() == expected_len
Esempio n. 8
0
def test_replace():
    """Replace column "a" with a boolean Series named "c" and compare frames."""
    frame = DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
    replacement = Series("c", [True, False, True])

    # The return value is ignored; the assertion below inspects `frame`
    # itself, so the replacement is expected to happen in place.
    frame.replace("a", replacement)

    expected = DataFrame({"c": [True, False, True], "b": [1, 2, 3]})
    assert frame.frame_equal(expected)
Esempio n. 9
0
def test_join():
    """Join two frames on column "a" with the default, left and outer strategies."""
    df_left = DataFrame({
        "a": ["a", "b", "a", "z"],
        "b": [1, 2, 3, 4],
        "c": [6, 5, 4, 3],
    })
    df_right = DataFrame({
        "a": ["b", "c", "b", "a"],
        "k": [0, 3, 9, 6],
        "c": [1, 0, 2, 1],
    })

    def join_sorted(**how):
        # Join on "a" from both sides and sort by "a" so that row order is
        # stable for the value comparisons below.
        return df_left.join(df_right, left_on="a", right_on="a", **how).sort("a")

    # Default join strategy.
    default_join = join_sorted()
    assert default_join["b"].series_equal(Series("", [1, 3, 2, 2]))

    # Left join: one row has no match, so one null in the right-hand "c".
    left_join = join_sorted(how="left")
    assert left_join["c_right"].is_null().sum() == 1
    assert left_join["b"].series_equal(Series("", [1, 3, 2, 2, 4]))

    # Outer join: null counts per column.
    outer_join = join_sorted(how="outer")
    assert outer_join["c_right"].null_count() == 1
    assert outer_join["c"].null_count() == 2
    assert outer_join["b"].null_count() == 2
Esempio n. 10
0
def test_to_pandas():
    """Smoke-test arrow and pandas conversion, including shifted frames.

    Nothing is asserted; the test passes when no conversion raises.
    """
    complete = get_complete_df()
    complete.to_arrow()
    complete.to_pandas()

    # Same conversion on a frame shifted by two rows.
    shifted = complete.shift(2)
    shifted.to_pandas()

    # And on a shifted frame holding a single boolean column.
    bool_frame = DataFrame({"col": Series([True, False, True])})
    bool_frame.shift(2).to_pandas()
Esempio n. 11
0
def test_df_fold():
    """Fold DataFrame columns into one Series and check axis=1 reductions."""

    def add(acc, nxt):
        # Accumulate by adding the next column onto the running result.
        return acc + nxt

    def pick_smaller(acc, nxt):
        # zip_with on the mask `acc < nxt`; the expected values below are the
        # per-row minimum across the columns.
        return acc.zip_with(acc < nxt, nxt)

    numeric = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})

    summed = numeric.fold(add)
    assert summed.series_equal(Series("a", [4.0, 5.0, 9.0]))

    smallest = numeric.fold(pick_smaller)
    assert smallest.series_equal(Series("a", [1.0, 1.0, 3.0]))

    mixed = pl.DataFrame(
        {"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}
    )
    out = mixed.fold(add)
    # NOTE(review): the result of this comparison is discarded (there is no
    # `assert`), so this line only checks that the call runs. Confirm the
    # expected values before turning it into an assertion.
    out.series_equal(Series("", ["foo11", "bar22", "233"]))

    reducible = pl.DataFrame({"a": [3, 2, 1], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    # just check dispatch. values are tested on rust side.
    for reduce in (reducible.sum, reducible.mean, reducible.min, reducible.max):
        assert reduce(axis=1).shape == (3, 1)
Esempio n. 12
0
def test_to_python():
    """Convert Series to Python lists, with and without a null value."""
    as_list = Series("a", range(20)).to_list()
    assert isinstance(as_list, list)
    assert len(as_list) == 20

    # A null in the data is counted and comes back as None.
    with_null = Series("a", [1, None, 2], nullable=True)
    assert with_null.null_count() == 1
    assert with_null.to_list() == [1, None, 2]
Esempio n. 13
0
def test_downsample():
    """Downsample a 20-value Date64 column into 5-minute buckets."""
    # 20 consecutive values, 60000 apart (one-minute steps if the integers
    # are read as epoch milliseconds).
    first_stamp = 946684800000
    step = 60000
    stamps = [first_stamp + step * i for i in range(20)]

    s = Series("datetime", stamps).cast(Date64)
    s2 = s.clone()
    df = DataFrame({"a": s, "b": s2})

    # First row of each 5-minute bucket: 4 buckets, 2 columns.
    firsts = df.downsample("a", rule="minute", n=5).first()
    assert firsts.shape == (4, 2)

    # OLHC: four aggregations on "b" plus the key column.
    aggregations = {"b": ["first", "min", "max", "last"]}
    olhc = df.downsample("a", rule="minute", n=5).agg(aggregations)
    assert olhc.shape == (4, 5)

    # test to_pandas as well.
    as_pandas = df.to_pandas()
    assert as_pandas["a"].dtype == "datetime64[ns]"
Esempio n. 14
0
def test_apply():
    """Apply Python callables element-wise to numeric and string Series.

    Results are compared through ``to_list()``. Asserting on
    ``series == [...]`` directly would only test the truthiness of the
    comparison result (presumably an element-wise boolean Series), not that
    every element matched.
    """
    a = Series("a", [1, 2, None], nullable=True)
    b = a.apply(lambda x: x**2)
    assert b.to_list() == [1, 4, None]

    a = Series("a", ["foo", "bar", None], nullable=True)
    b = a.apply(lambda x: x + "py")
    assert b.to_list() == ["foopy", "barpy", None]

    # Explicit output dtype.
    b = a.apply(lambda x: len(x), dtype_out=Int32)
    assert b.to_list() == [3, 3, None]

    # Output dtype left for the library to determine.
    b = a.apply(lambda x: len(x))
    assert b.to_list() == [3, 3, None]
Esempio n. 15
0
def test_join():
    """Join frames eagerly and lazily, on one key and on several keys."""
    df_left = DataFrame(
        {"a": ["a", "b", "a", "z"], "b": [1, 2, 3, 4], "c": [6, 5, 4, 3]}
    )
    df_right = DataFrame(
        {"a": ["b", "c", "b", "a"], "k": [0, 3, 9, 6], "c": [1, 0, 2, 1]}
    )
    # Every join in this first part uses column "a" on both sides.
    on_a = {"left_on": "a", "right_on": "a"}

    # Default join strategy.
    default_join = df_left.join(df_right, **on_a).sort("a")
    assert default_join["b"].series_equal(Series("", [1, 3, 2, 2]))

    # Left join: one row has no match, so one null in the right-hand "c".
    left_join = df_left.join(df_right, how="left", **on_a).sort("a")
    assert left_join["c_right"].is_null().sum() == 1
    assert left_join["b"].series_equal(Series("", [1, 3, 2, 2, 4]))

    # Outer join: null counts per column.
    outer_join = df_left.join(df_right, how="outer", **on_a).sort("a")
    assert outer_join["c_right"].null_count() == 1
    assert outer_join["c"].null_count() == 2
    assert outer_join["b"].null_count() == 2

    df_a = DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})
    df_b = DataFrame({
        "foo": [1, 1, 1],
        "bar": ["a", "c", "c"],
        "ham": ["let", "var", "const"]
    })

    # just check if join on multiple columns runs
    df_a.join(df_b, left_on=["a", "b"], right_on=["foo", "bar"])

    # The eager and lazy code paths must produce the same shape.
    eager_join = df_a.join(df_b, left_on="a", right_on="foo")
    lazy_query = df_a.lazy().join(df_b.lazy(), left_on="a", right_on="foo")
    lazy_join = lazy_query.collect()
    assert lazy_join.shape == eager_join.shape
Esempio n. 16
0
def test_custom_groupby():
    """Run user-defined functions per group in the eager and lazy APIs."""
    df = DataFrame({"A": ["a", "a", "c", "c"], "B": [1, 3, 5, 2]})

    # Custom function returning the group's sum.
    summed = df.groupby("A").select("B").apply(lambda x: x.sum())
    assert summed.shape == (2, 2)

    # Custom function returning a new Series built from a numpy array.
    rebuilt = df.groupby("A").select("B").apply(
        lambda x: Series("", np.array(x)))
    assert rebuilt.shape == (2, 2)

    df = DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})

    # Lazy API: apply a Python function inside the aggregation.
    custom_sum = col("a").apply(lambda x: x.sum(), dtype_out=int)
    out = df.lazy().groupby("b").agg([custom_sum]).collect()
    assert out.shape == (3, 2)
Esempio n. 17
0
def test_series_slice(
    srs: pl.Series,
    start: int | None,
    stop: int | None,
    step: int | None,
) -> None:
    """Slicing a Series must match slicing the equivalent Python list.

    ``start``, ``stop`` and ``step`` are passed straight to ``slice``.
    """
    window = slice(start, stop, step)

    # Reference result: plain Python list slicing.
    expected = srs.to_list()[window]
    # Result under test: slice the Series, then convert to a list.
    actual = srs[window].to_list()

    assert expected == actual, f"slice [{start}:{stop}:{step}] failed"
    # NOTE(review): this compares `srs` with itself; presumably it is meant
    # to show that slicing left the original series unchanged - confirm.
    assert_series_equal(srs, srs, check_exact=True)
Esempio n. 18
0
def test_cast():
    """Casting to a dtype must yield a Series that reports that dtype."""
    a = Series("a", range(20))

    for target in (Float32, Float64, Int32, UInt32, Date64, Date32):
        assert a.cast(target).dtype == target
Esempio n. 19
0
def verify_series_and_expr_api(
    input: pl.Series, expected: pl.Series, op: str, *args: Any, **kwargs: Any
) -> None:
    """
    Check one operation through both the Series API and the expression API.

    The attribute named by ``op`` is looked up with ``_getattr_multi`` on
    ``pl.col("*")`` and on ``input``. Each is called with ``*args`` and
    ``**kwargs``, and both results are compared against ``expected``.

    Examples
    --------
    >>> s = pl.Series([1, 3, 2])
    >>> expected = pl.Series([1, 2, 3])
    >>> verify_series_and_expr_api(s, expected, "sort")
    """
    # Expression route: select the expression on a frame built from the
    # series, then take the first column of the result.
    expr_op = _getattr_multi(pl.col("*"), op)
    selected = input.to_frame().select(expr_op(*args, **kwargs))
    via_expr: pl.Series = selected[:, 0]  # type: ignore

    # Series route: call the operation on the series directly.
    series_op = _getattr_multi(input, op)
    via_series = series_op(*args, **kwargs)

    for outcome in (via_expr, via_series):
        testing.assert_series_equal(outcome, expected)
Esempio n. 20
0
def test_equality():
    """Compare a Series with itself and with a scalar using every operator."""
    a = create_series()
    b = a  # same object, so every element compares equal

    # Comparison yields a Series; summing it counts the matching elements.
    cmp = a == b
    assert isinstance(cmp, Series)
    assert cmp.sum() == 2

    # (comparison result, expected number of true elements)
    counts = [
        (a != b, 0),
        (a >= b, 2),
        (a <= b, 2),
        (a > b, 0),
        (a < b, 0),
    ]
    for mask, expected_true in counts:
        assert mask.sum() == expected_true

    assert a.sum() == 3
    assert a.series_equal(b)

    # Comparing a string Series against a scalar string.
    names = Series("name", ["ham", "foo", "bar"])
    is_ham = names == "ham"
    assert is_ham.to_list() == [True, False, False]
Esempio n. 21
0
def test_strategy_null_probability(
    s: pl.Series,
    df1: pl.DataFrame,
    df2: pl.DataFrame,
    df3: pl.DataFrame,
) -> None:
    """Check that null counts increase from ``s`` through ``df1``, ``df2``, ``df3``.

    Also checks per-column null counts inside ``df2`` and ``df3``.
    """

    def total_nulls(frame: pl.DataFrame):
        # Collapse the per-column null counts of a frame into one number.
        return frame.null_count().fold(sum).sum()

    # Every object under test holds exactly 50 rows / elements.
    for obj in (s, df1, df2, df3):
        assert len(obj) == 50  # type: ignore[arg-type]

    # Strictly increasing total null counts.
    assert s.null_count() < total_nulls(df1)
    assert total_nulls(df1) < total_nulls(df2)
    assert total_nulls(df2) < total_nulls(df3)

    # df2: first column has more nulls than the second, but is not all null.
    df2_first, df2_second = df2.null_count().rows()[0]
    assert df2_first > df2_second
    assert df2_first < 50

    # df3: first column is entirely null.
    df3_first, df3_second = df3.null_count().rows()[0]
    assert df3_first > df3_second
    assert df3_first == 50
Esempio n. 22
0
def test_hstack():
    """Stack an extra column onto a frame in place and check shape and names."""
    frame = DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
    extra_column = Series("stacked", [-1, -1, -1])

    frame.hstack([extra_column], in_place=True)

    assert frame.shape == (3, 3)
    assert frame.columns == ["a", "b", "stacked"]
Esempio n. 23
0
def test_groupby():
    """Exercise eager groupby: aggregations, apply, iteration and group lookup."""
    df = DataFrame({
        "a": ["a", "b", "a", "b", "b", "c"],
        "b": [1, 2, 3, 4, 5, 6],
        "c": [6, 5, 4, 3, 2, 1],
    })

    def grouped(column):
        # Fresh groupby on "a" with a single column selected.
        return df.groupby("a").select(column)

    def expected(values):
        # Expected aggregation frame: the group keys plus an unnamed column.
        return DataFrame({"a": ["a", "b", "c"], "": values})

    # use __getitem__ to map to select
    via_getitem = df.groupby("a")["b"].sum().sort(by="a")
    assert via_getitem.frame_equal(expected([4, 11, 6]))

    # Single aggregations, sorted by key so the row order is fixed.
    assert grouped("b").sum().sort(by="a").frame_equal(expected([4, 11, 6]))
    assert grouped("c").sum().sort(by="a").frame_equal(expected([10, 10, 1]))
    assert grouped("b").min().sort(by="a").frame_equal(expected([1, 2, 6]))
    assert grouped("b").max().sort(by="a").frame_equal(expected([3, 5, 6]))
    assert grouped("b").mean().sort(by="a").frame_equal(
        expected([2.0, (2 + 4 + 5) / 3, 6.0]))
    assert grouped("b").last().sort(by="a").frame_equal(expected([3, 5, 6]))

    # check if it runs
    grouped("b").n_unique()
    grouped("b").quantile(0.3)
    grouped("b").agg_list()

    # Dict-style aggregation names output columns "<column>_<aggregation>".
    gb_df = df.groupby("a").agg({"b": ["sum", "min"], "c": "count"})
    assert "b_sum" in gb_df.columns
    assert "b_min" in gb_df.columns

    # TODO: a frame_equal check of the "count" aggregation against
    # DataFrame({"a": ["a", "b", "c"], "": [2, 3, 1]}) is left out; per the
    # original note it is false because count is u32.

    # Custom function per group; the smallest per-group sum of "c" is 1.
    per_group_sum = df.groupby("a").apply(lambda group: group[["c"]].sum())
    assert per_group_sum.sort("c")["c"][0] == 1

    group_keys = df.groupby("a").groups().sort("a")["a"]
    assert group_keys.series_equal(Series(["a", "b", "c"]))

    # Iterating a groupby yields one sub-frame per group.
    for subdf in df.groupby("a"):
        if subdf["a"][0] != "b":
            continue
        assert subdf.shape == (3, 3)

    # Direct lookup of a single group by key.
    for key, shape in (("c", (1, 3)), ("b", (3, 3)), ("a", (2, 3))):
        assert df.groupby("a").get_group(key).shape == shape

    # Use lazy API in eager groupby
    lazy_style = df.groupby("a").agg([pl.sum("b")])
    assert lazy_style.shape == (3, 2)
Esempio n. 24
0
def create_series() -> "Series":
    """Build the two-element integer Series named "a" shared by several tests."""
    name = "a"
    values = [1, 2]
    return Series(name, values)
Esempio n. 25
0
def test_various():
    """Exercise assorted Series methods.

    Covers null checks, renaming, length, slicing, appending, head/tail,
    in-place sorting, arg_unique, take and is_numeric.
    """
    a = create_series()

    assert a.is_null().sum() == 0
    assert a.name == "a"
    # The return value is ignored; the name is expected to change in place.
    a.rename("b")
    assert a.name == "b"
    assert a.len() == 2
    assert len(a) == 2
    b = a.slice(1, 1)
    assert b.len() == 1
    assert b.series_equal(Series("", [2]))
    a.append(b)
    assert a.series_equal(Series("", [1, 2, 2]))

    a = Series("a", range(20))
    assert a.head(5).len() == 5
    assert a.tail(5).len() == 5
    # Compare the materialized values. Asserting on `head != tail` directly
    # would only test the truthiness of the comparison result (presumably an
    # element-wise boolean Series), not that the two ends actually differ.
    assert a.head(5).to_list() != a.tail(5).to_list()

    a = Series("a", [2, 1, 4])
    a.sort(in_place=True)
    assert a.series_equal(Series("", [1, 2, 4]))
    a = Series("a", [2, 1, 1, 4, 4, 4])
    # Index of the first occurrence of each distinct value.
    assert a.arg_unique().to_list() == [0, 1, 3]

    assert a.take([2, 3]).series_equal(Series("", [1, 4]))
    assert a.is_numeric()
    a = Series("bool", [True, False])
    assert not a.is_numeric()
Esempio n. 26
0
def test_shape():
    """A three-element Series reports a one-dimensional shape of ``(3,)``."""
    expected_shape = (3, )
    actual_shape = Series([1, 2, 3]).shape
    assert actual_shape == expected_shape
Esempio n. 27
0
def test_quantile():
    """The 0.5 quantile of ``[1, 2, 3]`` must equal 2."""
    values = Series([1, 2, 3])
    half_quantile = values.quantile(0.5)
    assert half_quantile == 2
Esempio n. 28
0
def test_median():
    """The median of ``[1, 2, 3]`` must equal 2."""
    values = Series([1, 2, 3])
    middle = values.median()
    assert middle == 2
Esempio n. 29
0
def test_object():
    """Mixed Python values are stored with dtype Object and come back unchanged."""
    # A list, a string and an int: no common primitive type.
    vals = [[12], "foo", 9]
    mixed = Series("a", vals)

    assert mixed.dtype == Object
    assert mixed.to_list() == vals
    # Indexing returns the original Python object.
    second = mixed[1]
    assert second == "foo"
Esempio n. 30
0
def test_rolling():
    """Rolling min, max and sum over a window of two elements.

    Results are compared through ``to_list()``. Asserting on
    ``series == [...]`` directly would only test the truthiness of the
    comparison result (presumably an element-wise boolean Series), not that
    every element matched.
    """
    a = Series("a", [1, 2, 3, 2, 1])
    # The first position has no complete window, so it is expected to be null.
    assert a.rolling_min(2).to_list() == [None, 1, 2, 2, 1]
    assert a.rolling_max(2).to_list() == [None, 2, 3, 3, 2]
    assert a.rolling_sum(2).to_list() == [None, 3, 5, 5, 3]