Example #1
0
def test_cast_inner() -> None:
    a = pl.Series([[1, 2]])
    for t in [bool, pl.Boolean]:
        b = a.cast(pl.List(t))
        assert b.dtype == pl.List(pl.Boolean)
        assert b.to_list() == [[True, True]]

    # this creates an inner null type
    df = pl.from_pandas(pd.DataFrame(data=[[[]], [[]]], columns=["A"]))
    assert df["A"].cast(pl.List(int)).dtype.inner == pl.Int64  # type: ignore[arg-type, attr-defined]
Example #2
0
def test_init_only_columns() -> None:
    df = pl.DataFrame(columns=["a", "b", "c"])
    truth = pl.DataFrame({"a": [], "b": [], "c": []})
    assert df.shape == (0, 3)
    assert df.frame_equal(truth, null_equal=True)
    assert df.dtypes == [pl.Float32, pl.Float32, pl.Float32]

    # Validate construction with various flavours of no/empty data
    no_data: Any
    for no_data in (None, {}, []):
        df = pl.DataFrame(
            data=no_data,
            columns=[  # type: ignore[arg-type]
                ("a", pl.Date),
                ("b", pl.UInt64),
                ("c", pl.datatypes.Int8),
                ("d", pl.List(pl.UInt8)),
            ],
        )
        truth = pl.DataFrame({
            "a": [],
            "b": [],
            "c": []
        }).with_columns([
            pl.col("a").cast(pl.Date),
            pl.col("b").cast(pl.UInt64),
            pl.col("c").cast(pl.Int8),
        ])
        truth.insert_at_idx(3, pl.Series("d", [], pl.List(pl.UInt8)))

        assert df.shape == (0, 4)
        assert df.frame_equal(truth, null_equal=True)
        assert df.dtypes == [pl.Date, pl.UInt64, pl.Int8, pl.List]
        assert df.schema["d"].inner == pl.UInt8  # type: ignore[attr-defined]

        dfe = df.cleared()
        assert (df.schema == dfe.schema) and (dfe.shape == df.shape)
Example #3
0
def recursive_logical_type() -> None:
    df = pl.DataFrame({
        "str": ["A", "B", "A", "B", "C"],
        "group": [1, 1, 2, 1, 2]
    })
    df = df.with_column(pl.col("str").cast(pl.Categorical))

    df_groups = df.groupby("group").agg(
        [pl.col("str").list().alias("cat_list")])
    f = io.BytesIO()
    df_groups.write_parquet(f, use_pyarrow=True)
    f.seek(0)
    read = pl.read_parquet(f, use_pyarrow=True)
    assert read.dtypes == [pl.Int64, pl.List(pl.Categorical)]
    assert read.shape == (2, 2)
Example #4
0
def test_dtype() -> None:
    # inferred
    a = pl.Series("a", [[1, 2, 3], [2, 5], [6, 7, 8, 9]])
    assert a.dtype == pl.List
    assert a.inner_dtype == pl.Int64
    assert a.dtype.inner == pl.Int64  # type: ignore[attr-defined]

    # explicit
    df = pl.DataFrame(
        data={
            "i": [[1, 2, 3]],
            "tm": [[time(10, 30, 45)]],
            "dt": [[date(2022, 12, 31)]],
            "dtm": [[datetime(2022, 12, 31, 1, 2, 3)]],
        },
        columns=[
            ("i", pl.List(pl.Int8)),
            ("tm", pl.List(pl.Time)),
            ("dt", pl.List(pl.Date)),
            ("dtm", pl.List(pl.Datetime)),
        ],
    )
    assert df.schema == {
        "i": pl.List(pl.Int8),
        "tm": pl.List(pl.Time),
        "dt": pl.List(pl.Date),
        "dtm": pl.List(pl.Datetime),
    }
    assert df.schema["i"].inner == pl.Int8  # type: ignore[attr-defined]
    assert df.rows() == [
        (
            [1, 2, 3],
            [time(10, 30, 45)],
            [date(2022, 12, 31)],
            [datetime(2022, 12, 31, 1, 2, 3)],
        )
    ]
Example #5
0
def test_list_hash() -> None:
    out = pl.DataFrame({"a": [[1, 2, 3], [3, 4], [1, 2, 3]]}).with_column(
        pl.col("a").hash().alias("b")
    )
    assert out.dtypes == [pl.List(pl.Int64), pl.UInt64]
    assert out[0, "b"] == out[2, "b"]
Example #6
0
def test_init_dict() -> None:
    # Empty dictionary
    df = pl.DataFrame({})
    assert df.shape == (0, 0)

    # Empty dictionary/values
    df = pl.DataFrame({"a": [], "b": []})
    assert df.shape == (0, 2)
    assert df.schema == {"a": pl.Float32, "b": pl.Float32}

    for df in (
            pl.DataFrame({}, columns={
                "a": pl.Date,
                "b": pl.Utf8
            }),
            pl.DataFrame({
                "a": [],
                "b": []
            },
                         columns={
                             "a": pl.Date,
                             "b": pl.Utf8
                         }),
    ):
        assert df.shape == (0, 2)
        assert df.schema == {"a": pl.Date, "b": pl.Utf8}

    # List of empty list/tuple
    df = pl.DataFrame({"a": [[]], "b": [()]})
    assert df.schema == {"a": pl.List(pl.Float64), "b": pl.List(pl.Float64)}
    assert df.rows() == [([], [])]

    # Mixed dtypes
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    assert df.shape == (3, 2)
    assert df.columns == ["a", "b"]
    assert df.dtypes == [pl.Int64, pl.Float64]

    df = pl.DataFrame(
        data={
            "a": [1, 2, 3],
            "b": [1.0, 2.0, 3.0]
        },
        columns=[("a", pl.Int8), ("b", pl.Float32)],
    )
    assert df.schema == {"a": pl.Int8, "b": pl.Float32}

    # Values contained in tuples
    df = pl.DataFrame({"a": (1, 2, 3), "b": [1.0, 2.0, 3.0]})
    assert df.shape == (3, 2)

    # Datetime/Date types (from both python and integer values)
    py_datetimes = (
        datetime(2022, 12, 31, 23, 59, 59),
        datetime(2022, 12, 31, 23, 59, 59),
    )
    py_dates = (date(2022, 12, 31), date(2022, 12, 31))
    int_datetimes = [1672531199000000, 1672531199000000]
    int_dates = [19357, 19357]

    for dates, datetimes, coldefs in (
            # test inferred and explicit (given both py/polars dtypes)
        (py_dates, py_datetimes, None),
        (py_dates, py_datetimes, [("dt", date), ("dtm", datetime)]),
        (py_dates, py_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
        (int_dates, int_datetimes, [("dt", date), ("dtm", datetime)]),
        (int_dates, int_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
    ):
        df = pl.DataFrame(
            data={
                "dt": dates,
                "dtm": datetimes
            },
            columns=coldefs,
        )
        assert df.schema == {"dt": pl.Date, "dtm": pl.Datetime}
        assert df.rows() == list(zip(py_dates, py_datetimes))

    # Overriding dict column names/types
    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["c", "d"])
    assert df.columns == ["c", "d"]

    df = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": [4, 5, 6]
        },
        columns=["c", ("d", pl.Int8)]  # type: ignore[arg-type]
    )  # partial type info (allowed, but mypy doesn't like it ;p)
    assert df.schema == {"c": pl.Int64, "d": pl.Int8}

    df = pl.DataFrame({
        "a": [1, 2, 3],
        "b": [4, 5, 6]
    },
                      columns=[("c", pl.Int8), ("d", pl.Int16)])
    assert df.schema == {"c": pl.Int8, "d": pl.Int16}

    dfe = df.cleared()
    assert (df.schema == dfe.schema) and (len(dfe) == 0)