import pypolars as pl
from pypolars.lazy import *

# Aggregations to run over the reddit dataset: total comment karma and
# the minimum link karma.
_aggregations = [pl.sum("comment_karma"), pl.min("link_karma")]

# Lazy plan — nothing is read or computed until fetch()/collect().
reddit = pl.scan_csv("data/reddit.csv").select(_aggregations)

if __name__ == "__main__":
    # fetch() materializes the plan; snapshot the result for the book output.
    result = reddit.fetch()
    out_path = "book/src/outputs/how_can_i_aggregate.txt"
    with open(out_path, "w") as f:
        f.write(str(result))
def test_fold():
    """Folding a list of columns with pl.sum yields a row-wise "sum" column."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    result = frame.lazy().select(pl.sum(["a", "b"])).collect()
    expected = Series("sum", [2, 4, 6])
    assert result["sum"].series_equal(expected)
id037,id097,id0000062401,48,8,53992,5,15,83.565443
id073,id081,id0000017280,54,90,28480,5,4,17.078693
id081,id073,id0000073423,51,22,39788,2,12,45.883758
id062,id080,id0000092749,1,75,67857,3,10,80.418674
id045,id031,id0000076210,2,42,80312,4,5,48.668692
id082,id048,id0000080227,56,62,16760,3,11,34.933239
id035,id032,id0000033279,55,13,80560,5,5,61.372678
id053,id013,id0000073898,61,63,12387,4,7,29.949863"""
# NOTE(review): the opening of this triple-quoted CSV literal (the
# `csv = ...` assignment) lies before this chunk — the rows above are the
# tail of that inline test fixture. Since it is fed to io.BytesIO below,
# it is presumably a bytes literal (b"""...""") — confirm upstream.
f = io.BytesIO(csv)  # wrap the in-memory CSV so read_csv can consume it
# Parse with explicit dtypes for the numeric columns; id1..id3 are read as
# strings and converted to Categorical below.
x = pl.read_csv(
    f,
    dtype={
        "id4": pl.Int32,
        "id5": pl.Int32,
        "id6": pl.Int32,
        "v1": pl.Int32,
        "v2": pl.Int32,
        "v3": pl.Float64,
    },
)
# Categorical-encode the grouping key columns (presumably to speed up the
# groupby — TODO confirm that is the intent).
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)
# Switch to the lazy API for the query below.
x = x.lazy()
question = "sum v1 by id1"  # q1
# Sum v1 per id1 group and materialize the result.
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
# NOTE(review): this chunk starts mid-script — `x`, `timeit`, `gc`,
# `memory_usage`, `write_log`, `make_chk`, and the log metadata variables
# (task, data_name, solution, ver, git, fun, cache, on_disk) are all
# defined before/outside this view.
# Categorical-encode the grouping key columns before benchmarking.
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)
in_rows = x.shape[0]  # input row count, recorded in the benchmark log
x = x.lazy()
# Force a full materialization once up front (presumably to warm caches
# before timing — TODO confirm).
print(len(x.collect()), flush=True)
task_init = timeit.default_timer()
print("grouping...", flush=True)
question = "sum v1 by id1"  # q1
gc.collect()  # clean slate before the timed run
t_start = timeit.default_timer()
# Timed query: sum v1 per id1 group.
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start  # query wall time
m = memory_usage()
t_start = timeit.default_timer()
chk = [ans["v1_sum"].sum()]  # checksum over the aggregated column
chkt = timeit.default_timer() - t_start  # checksum wall time
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
del ans
gc.collect()
# Second timed run of the same query (its write_log call presumably follows
# after this chunk).
t_start = timeit.default_timer()
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
import pypolars as pl

df = pl.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "fruits": ["banana", "banana", "apple", "apple", "banana"],
    "B": [5, 4, 3, 2, 1],
    "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
})

# Window expressions: per-group aggregates broadcast back onto every row
# of the group via `.over(...)`.
window_exprs = [
    pl.sum("A").over("fruits").alias("fruit_sum_A"),
    pl.first("B").over("fruits").alias("fruit_first_B"),
    pl.max("B").over("cars").alias("cars_max_B"),
]
windows = df.lazy().with_columns(window_exprs)

if __name__ == "__main__":
    # Snapshot the input frame and the windowed result for the book outputs.
    outputs = [
        ("book/src/outputs/how_can_i_apply_window_functions_0.txt", df),
        ("book/src/outputs/how_can_i_apply_window_functions_1.txt", windows.collect()),
    ]
    for path, frame in outputs:
        with open(path, "w") as f:
            f.write(str(frame))
# NOTE(review): this chunk starts mid-script — `x`, `timeit`, `gc`,
# `memory_usage`, `write_log` and the log metadata variables are defined
# before/outside this view — and it is cut off mid write_log call at the end.
# Categorical-encode the grouping key columns before benchmarking.
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)
in_rows = x.shape[0]  # input row count, recorded in the benchmark log
x = x.lazy()
# Force a full materialization once up front before timing.
print(len(x.collect()), flush=True)
task_init = timeit.default_timer()
print("grouping...", flush=True)
question = "sum v1 by id1"  # q1
gc.collect()  # clean slate before the timed run
t_start = timeit.default_timer()
# Timed query: sum v1 per id1 group.
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start  # query wall time
m = memory_usage()
t_start = timeit.default_timer()
# Cast to Int64 before summing (presumably to avoid Int32 overflow in the
# checksum — TODO confirm).
chk = [ans["v1_sum"].cast(pl.Int64).sum()]
chkt = timeit.default_timer() - t_start  # checksum wall time
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git,
import pypolars as pl
from pypolars.lazy import *
import time

t0 = time.time()

# Lazy scans — no file is read until collect().
left = pl.scan_csv("data/join_left_80000.csv")
right = pl.scan_csv("data/join_right_80000.csv")
other = pl.scan_csv("data/10000000.csv")

# Step 1: inner-join the two 80k tables, keep rows with value > 0.5, and
# rescale `value` to an integer so it can serve as the next join key.
filtered = (
    left.join(right, on="key", how="inner")
    .filter(col("value") > 0.5)
    .with_column((col("value") * 10).cast(int))
)

# Step 2: aggregate the 10M-row table per group and join it back on the
# derived integer key, keeping only the two output columns.
grouped = other.groupby("groups").agg(pl.sum("values"))
q = filtered.join(
    grouped,
    left_on="value",
    right_on="groups",
    how="inner",
).select(["key", "values_sum"])

# NOTE(review): `_la` looks like a private/abbreviated attribute access left
# over from debugging — confirm it exists on the lazy frame.
print(q._la)

df = q.collect()
t = time.time() - t0
# with open("data/macro_bench_polars.txt", "w") as f:
#     f.write(str(t))
print(df)
print(q.describe_optimized_plan())