import io
import os

import pandas as pd
import pyarrow as pa
import polars as pl


def test_nested_dictionary() -> None:
    with pl.StringCache():
        df = (
            pl.DataFrame({
                "str": ["A", "B", "A", "B", "C"],
                "group": [1, 1, 2, 1, 2],
            })
            .with_column(pl.col("str").cast(pl.Categorical))
            .groupby("group")
            .agg([pl.col("str").list().alias("cat_list")])
        )
        f = io.BytesIO()
        df.write_parquet(f)
        f.seek(0)

        read_df = pl.read_parquet(f)
        assert df.frame_equal(read_df)
def test_categorical_outer_join() -> None:
    with pl.StringCache():
        df1 = pl.DataFrame(
            [
                pl.Series("key1", [42]),
                pl.Series("key2", ["bar"], dtype=pl.Categorical),
                pl.Series("val1", [1]),
            ]
        ).lazy()
        df2 = pl.DataFrame(
            [
                pl.Series("key1", [42]),
                pl.Series("key2", ["bar"], dtype=pl.Categorical),
                pl.Series("val2", [2]),
            ]
        ).lazy()

        out = df1.join(df2, on=["key1", "key2"], how="outer").collect()
        expected = pl.DataFrame(
            {"key1": [42], "key2": ["bar"], "val1": [1], "val2": [2]}
        )
        assert out.frame_equal(expected)

    with pl.StringCache():
        dfa = pl.DataFrame(
            [
                pl.Series("key", ["foo", "bar"], dtype=pl.Categorical),
                pl.Series("val1", [3, 1]),
            ]
        )
        dfb = pl.DataFrame(
            [
                pl.Series("key", ["bar", "baz"], dtype=pl.Categorical),
                pl.Series("val2", [6, 8]),
            ]
        )

        df = dfa.join(dfb, on="key", how="outer")
        # The cast is important: it exercises the rev map (the mapping from
        # physical integer codes back to the original strings).
        assert df["key"].cast(pl.Utf8).to_list() == ["bar", "baz", "foo"]
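# A minimal illustration (not part of the original suite) of the "rev map"
# mentioned above: a Categorical column stores integer codes plus a reverse
# mapping back to the strings. Casting to pl.UInt32 to expose the physical
# codes is an assumption about this polars version's physical representation.
def _rev_map_sketch() -> None:
    with pl.StringCache():
        s = pl.Series("key", ["foo", "bar", "foo"], dtype=pl.Categorical)
        # Physical codes: repeated strings share one interned id.
        codes = s.cast(pl.UInt32).to_list()
        assert codes[0] == codes[2]
        # The rev map turns the codes back into the original strings.
        assert s.cast(pl.Utf8).to_list() == ["foo", "bar", "foo"]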
def test_categorical_is_in_list() -> None:
    # This requires type coercion: the Utf8 list must be cast to Categorical.
    # We should not cast inside the function, as that would be expensive in a
    # groupby context (one cast per group).
    with pl.StringCache():
        df = pl.DataFrame(
            {"a": [1, 2, 3, 1, 2], "b": ["a", "b", "c", "d", "e"]}
        ).with_column(pl.col("b").cast(pl.Categorical))

        cat_list = ["a", "b", "c"]
        assert df.filter(pl.col("b").is_in(cat_list)).to_dict(False) == {
            "a": [1, 2, 3],
            "b": ["a", "b", "c"],
        }
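# Illustrative companion sketch: because the Utf8 literal list is coerced to
# Categorical once, the same is_in expression can be used unchanged inside a
# groupby aggregation. Uses the same old-style .list() aggregation API as the
# tests above; the names here are hypothetical.
def _is_in_groupby_sketch() -> None:
    with pl.StringCache():
        df = pl.DataFrame({
            "g": [1, 1, 2],
            "b": ["a", "b", "c"],
        }).with_column(pl.col("b").cast(pl.Categorical))
        out = df.groupby("g").agg(
            [pl.col("b").filter(pl.col("b").is_in(["a", "c"])).list()]
        )
        # One row per group, with the filtered values collected into a list.
        assert out.shape == (2, 2)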
def test_categorical_lexical_ordering_after_concat() -> None:
    with pl.StringCache():
        ldf1 = (
            pl.DataFrame(
                [pl.Series("key1", [8, 5]), pl.Series("key2", ["fox", "baz"])]
            )
            .lazy()
            .with_column(
                pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")
            )
        )
        ldf2 = (
            pl.DataFrame(
                [pl.Series("key1", [6, 8, 6]), pl.Series("key2", ["fox", "foo", "bar"])]
            )
            .lazy()
            .with_column(
                pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")
            )
        )
        df = (
            pl.concat([ldf1, ldf2])
            .with_column(pl.col("key2").cat.set_ordering("lexical"))
            .collect()
        )

        # Sorting must not panic: the lexical ordering has to survive the concat.
        df.sort(["key1", "key2"])
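# A small sketch (illustrative, not from the original suite) of the two
# orderings exercised above. Assumes a fresh string cache, where the default
# "physical" ordering sorts by the integer codes (order of first appearance),
# while "lexical" sorts by the string values.
def _ordering_sketch() -> None:
    with pl.StringCache():
        s = pl.Series(["b", "a", "c"]).cast(pl.Categorical)
        # Physical ordering: codes were assigned in appearance order b, a, c.
        assert s.sort().cast(pl.Utf8).to_list() == ["b", "a", "c"]
        # Lexical ordering: sorts on the strings themselves.
        lex = s.cat.set_ordering("lexical")
        assert lex.sort().cast(pl.Utf8).to_list() == ["a", "b", "c"]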
def test_cat_int_types_3500() -> None:
    with pl.StringCache():
        # Create an enum / categorical / dictionary typed pyarrow array.
        # Most simply done by creating a pandas categorical series first.
        categorical_s = pd.Series(["a", "a", "b"], dtype="category")
        pyarrow_array = pa.Array.from_pandas(categorical_s)

        # The in-memory representation of each category can be either a signed
        # or an unsigned 8-bit integer. Pandas uses Int8...
        int_dict_type = pa.dictionary(index_type=pa.int8(), value_type=pa.utf8())
        # ... while DuckDB uses UInt8.
        uint_dict_type = pa.dictionary(index_type=pa.uint8(), value_type=pa.utf8())

        for t in [int_dict_type, uint_dict_type]:
            s = pl.from_arrow(pyarrow_array.cast(t))
            assert s.series_equal(pl.Series(["a", "a", "b"]).cast(pl.Categorical))
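# Companion sketch showing the same round trip without pandas, building the
# pyarrow dictionary array directly. pa.DictionaryArray.from_arrays is a
# stable pyarrow API; the uint8 index type mirrors the DuckDB case above.
def _dictionary_array_sketch() -> None:
    with pl.StringCache():
        arr = pa.DictionaryArray.from_arrays(
            indices=pa.array([0, 0, 1], type=pa.uint8()),
            dictionary=pa.array(["a", "b"]),
        )
        s = pl.from_arrow(arr)
        assert s.series_equal(pl.Series(["a", "a", "b"]).cast(pl.Categorical))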
def test_string_cache_eager_lazy() -> None:
    # Tests that the global string cache is truly global and not disturbed by
    # lazy execution: the setting was originally thread-local, which broke
    # under the parallel execution of the lazy engine.
    with pl.StringCache():
        df1 = pl.DataFrame({
            "region_ids": ["reg1", "reg2", "reg3", "reg4", "reg5"]
        }).select([pl.col("region_ids").cast(pl.Categorical)])
        df2 = pl.DataFrame({
            "seq_name": ["reg4", "reg2", "reg1"],
            "score": [3.0, 1.0, 2.0],
        }).select([pl.col("seq_name").cast(pl.Categorical), pl.col("score")])

        expected = pl.DataFrame({
            "region_ids": ["reg1", "reg2", "reg3", "reg4", "reg5"],
            "score": [2.0, 1.0, None, 3.0, None],
        })
        assert df1.join(
            df2, left_on="region_ids", right_on="seq_name", how="left"
        ).frame_equal(expected, null_equal=True)
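# Negative-space sketch for the comment above: without an active StringCache,
# each Categorical column is encoded against its own local mapping, and in the
# polars versions these tests target, joining such columns raises an error
# rather than comparing mismatched codes. Illustrative only.
def _no_cache_sketch() -> None:
    df1 = pl.DataFrame({"k": ["x", "y"]}).select([pl.col("k").cast(pl.Categorical)])
    df2 = pl.DataFrame({"k": ["y", "z"]}).select([pl.col("k").cast(pl.Categorical)])
    try:
        df1.join(df2, on="k", how="inner")
    except Exception:
        # Expected: the two key columns were created under different caches.
        pass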
data_name = os.environ["SRC_DATANAME"]
src_jn_x = os.path.join("data", data_name + ".csv")
y_data_name = join_to_tbls(data_name)
src_jn_y = [
    os.path.join("data", y_data_name[0] + ".csv"),
    os.path.join("data", y_data_name[1] + ".csv"),
    os.path.join("data", y_data_name[2] + ".csv"),
]
if len(src_jn_y) != 3:
    raise Exception("Something went wrong in preparing files used for join")

print(
    "loading datasets " + data_name + ", " + y_data_name[0] + ", "
    + y_data_name[1] + ", " + y_data_name[2],
    flush=True,
)

with pl.StringCache():
    x = pl.read_csv(
        src_jn_x,
        dtype={"id1": pl.Int32, "id2": pl.Int32, "id3": pl.Int32, "v1": pl.Float64},
    )
    x["id4"] = x["id4"].cast(pl.Categorical)
    x["id5"] = x["id5"].cast(pl.Categorical)
    x["id6"] = x["id6"].cast(pl.Categorical)
    small = pl.read_csv(src_jn_y[0], dtype={"id1": pl.Int32, "v2": pl.Float64})
    small["id4"] = small["id4"].cast(pl.Categorical)
    medium = pl.read_csv(src_jn_y[1], dtype={
        "id1": pl.Int32,