Example #1
import io

import polars as pl


def test_nested_dictionary() -> None:
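    # Round-trip a list-of-Categorical column through Parquet; building and
    # comparing the frames under one StringCache keeps the encodings consistent.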
    with pl.StringCache():
        df = (pl.DataFrame({
            "str": ["A", "B", "A", "B", "C"],
            "group": [1, 1, 2, 1, 2]
        }).with_column(pl.col("str").cast(
            pl.Categorical)).groupby("group").agg(
                [pl.col("str").list().alias("cat_list")]))
        f = io.BytesIO()
        df.write_parquet(f)
        f.seek(0)

        read_df = pl.read_parquet(f)
        assert df.frame_equal(read_df)
Example #2
import polars as pl


def test_categorical_outer_join() -> None:
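    # Categorical join keys must be created under the same StringCache so that
    # their physical category indices line up across both frames.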
    with pl.StringCache():
        df1 = pl.DataFrame([
            pl.Series("key1", [42]),
            pl.Series("key2", ["bar"], dtype=pl.Categorical),
            pl.Series("val1", [1]),
        ]).lazy()

        df2 = pl.DataFrame([
            pl.Series("key1", [42]),
            pl.Series("key2", ["bar"], dtype=pl.Categorical),
            pl.Series("val2", [2]),
        ]).lazy()

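    # The join itself may run outside the StringCache block: both frames were
    # built under the same cache, so their categorical encodings already agree.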
    out = df1.join(df2, on=["key1", "key2"], how="outer").collect()
    expected = pl.DataFrame({
        "key1": [42],
        "key2": ["bar"],
        "val1": [1],
        "val2": [2]
    })

    assert out.frame_equal(expected)
    with pl.StringCache():
        dfa = pl.DataFrame([
            pl.Series("key", ["foo", "bar"], dtype=pl.Categorical),
            pl.Series("val1", [3, 1]),
        ])
        dfb = pl.DataFrame([
            pl.Series("key", ["bar", "baz"], dtype=pl.Categorical),
            pl.Series("val2", [6, 8]),
        ])

    df = dfa.join(dfb, on="key", how="outer")
    # the cast to Utf8 is important: it exercises the categorical's reverse
    # mapping (rev map) from indices back to strings
    assert df["key"].cast(pl.Utf8).to_list() == ["bar", "baz", "foo"]
Example #3
import polars as pl


def test_categorical_is_in_list() -> None:
    # `is_in` against a list of strings requires type coercion to cast the list.
    # We should not cast inside the function itself, as that would be expensive
    # in a groupby context: it would mean one cast per group.
    with pl.StringCache():
        df = pl.DataFrame({
            "a": [1, 2, 3, 1, 2],
            "b": ["a", "b", "c", "d", "e"]
        }).with_column(pl.col("b").cast(pl.Categorical))

        cat_list = ["a", "b", "c"]
        assert df.filter(pl.col("b").is_in(cat_list)).to_dict(False) == {
            "a": [1, 2, 3],
            "b": ["a", "b", "c"],
        }
Example #4
import polars as pl


def test_categorical_lexical_ordering_after_concat() -> None:
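    # Concatenating frames whose categoricals use "lexical" ordering should
    # preserve that ordering; the sort at the end only needs to run without
    # error, as its result is discarded.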
    with pl.StringCache():
        ldf1 = (pl.DataFrame([
            pl.Series("key1", [8, 5]),
            pl.Series("key2", ["fox", "baz"])
        ]).lazy().with_column(
            pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")))
        ldf2 = (pl.DataFrame([
            pl.Series("key1", [6, 8, 6]),
            pl.Series("key2", ["fox", "foo", "bar"])
        ]).lazy().with_column(
            pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")))
        df = (pl.concat([ldf1, ldf2]).with_column(
            pl.col("key2").cat.set_ordering("lexical")).collect())

        df.sort(["key1", "key2"])
Example #5
import pandas as pd
import polars as pl
import pyarrow as pa


def test_cat_int_types_3500() -> None:
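    # Regression test for issue 3500: dictionary-encoded Arrow arrays should
    # convert to a polars Categorical regardless of the index integer type.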
    with pl.StringCache():
        # Create an enum / categorical / dictionary typed pyarrow array
        # Most simply done by creating a pandas categorical series first
        categorical_df = pd.Series(["a", "a", "b"], dtype="category")
        pyarrow_array = pa.Array.from_pandas(categorical_df)

        # The in-memory representation of the category indices can be either a
        # signed or an unsigned 8-bit integer
        # Pandas uses Int8...
        int_dict_type = pa.dictionary(index_type=pa.int8(),
                                      value_type=pa.utf8())
        # ... while DuckDB uses UInt8
        uint_dict_type = pa.dictionary(index_type=pa.uint8(),
                                       value_type=pa.utf8())

        for t in [int_dict_type, uint_dict_type]:
            s = pl.from_arrow(pyarrow_array.cast(t))
            assert s.series_equal(
                pl.Series(["a", "a", "b"]).cast(pl.Categorical))
Example #6
import polars as pl


def test_string_cache_eager_lazy() -> None:
    # Tests that the global string cache is truly global and is not interfered
    # with by lazy execution. The global setting was initially thread-local,
    # which broke under the parallel execution of lazy queries.
    with pl.StringCache():
        df1 = pl.DataFrame({
            "region_ids": ["reg1", "reg2", "reg3", "reg4", "reg5"]
        }).select([pl.col("region_ids").cast(pl.Categorical)])
        df2 = pl.DataFrame({
            "seq_name": ["reg4", "reg2", "reg1"],
            "score": [3.0, 1.0, 2.0]
        }).select([pl.col("seq_name").cast(pl.Categorical),
                   pl.col("score")])

    expected = pl.DataFrame({
        "region_ids": ["reg1", "reg2", "reg3", "reg4", "reg5"],
        "score": [2.0, 1.0, None, 3.0, None],
    })
    assert df1.join(df2, left_on="region_ids", right_on="seq_name",
                    how="left").frame_equal(expected, null_equal=True)
Example #7
import os

import polars as pl

data_name = os.environ["SRC_DATANAME"]
src_jn_x = os.path.join("data", data_name + ".csv")
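# join_to_tbls is assumed to be a helper defined elsewhere in the benchmark
# harness; it derives the three right-hand-side table names from data_name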
y_data_name = join_to_tbls(data_name)
src_jn_y = [
    os.path.join("data", y_data_name[0] + ".csv"),
    os.path.join("data", y_data_name[1] + ".csv"),
    os.path.join("data", y_data_name[2] + ".csv")
]
if len(src_jn_y) != 3:
    raise Exception("Something went wrong in preparing files used for join")

print("loading datasets " + data_name + ", " + y_data_name[0] + ", " +
      y_data_name[2] + ", " + y_data_name[2],
      flush=True)

with pl.StringCache():
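    # Read each table and cast its string join keys to Categorical inside a
    # single StringCache, so the keys share one global encoding across frames.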
    x = pl.read_csv(src_jn_x,
                    dtype={
                        "id1": pl.Int32,
                        "id2": pl.Int32,
                        "id3": pl.Int32,
                        "v1": pl.Float64
                    })
    x["id4"] = x["id4"].cast(pl.Categorical)
    x["id5"] = x["id5"].cast(pl.Categorical)
    x["id6"] = x["id6"].cast(pl.Categorical)
    small = pl.read_csv(src_jn_y[0], dtype={"id1": pl.Int32, "v2": pl.Float64})
    small["id4"] = small["id4"].cast(pl.Categorical)
    medium = pl.read_csv(src_jn_y[1],
                         dtype={
                             "id1": pl.Int32,