Beispiel #1
0
import pypolars as pl
from pypolars.lazy import *

# Lazy query over the reddit CSV: sum of "comment_karma" and minimum of
# "link_karma". Nothing is read until the query is executed below.
reddit_scan = pl.scan_csv("data/reddit.csv")
reddit = reddit_scan.select(
    [
        pl.sum("comment_karma"),
        pl.min("link_karma"),
    ]
)

if __name__ == "__main__":
    # Execute the query with fetch() and store the printed frame for the book.
    # NOTE(review): fetch() may evaluate only a limited number of rows —
    # confirm against the pypolars docs if full-data results are expected.
    df = reddit.fetch()
    output_path = "book/src/outputs/how_can_i_aggregate.txt"
    with open(output_path, "w") as f:
        f.write(str(df))
Beispiel #2
0
def test_fold():
    """Check that summing columns "a" and "b" lazily yields a "sum" column.

    The expected values (2, 4, 6) are the element-wise totals of the two
    input columns.
    """
    frame = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    query = frame.lazy().select(pl.sum(["a", "b"]))
    folded = query.collect()
    expected = Series("sum", [2, 4, 6])
    assert folded["sum"].series_equal(expected)
Beispiel #3
0
id037,id097,id0000062401,48,8,53992,5,15,83.565443
id073,id081,id0000017280,54,90,28480,5,4,17.078693
id081,id073,id0000073423,51,22,39788,2,12,45.883758
id062,id080,id0000092749,1,75,67857,3,10,80.418674
id045,id031,id0000076210,2,42,80312,4,5,48.668692
id082,id048,id0000080227,56,62,16760,3,11,34.933239
id035,id032,id0000033279,55,13,80560,5,5,61.372678
id053,id013,id0000073898,61,63,12387,4,7,29.949863"""

# Expose the in-memory CSV payload as a file-like object for the reader.
f = io.BytesIO(csv)

# Explicit dtypes for the numeric columns; the remaining columns are left to
# the reader's inference.
column_dtypes = {
    "id4": pl.Int32,
    "id5": pl.Int32,
    "id6": pl.Int32,
    "v1": pl.Int32,
    "v2": pl.Int32,
    "v3": pl.Float64,
}
x = pl.read_csv(f, dtype=column_dtypes)

# Re-encode the first three id columns as categoricals, in place.
for id_column in ("id1", "id2", "id3"):
    x[id_column] = x[id_column].cast(pl.Categorical)

# From here on the frame is queried through the lazy API.
x = x.lazy()

question = "sum v1 by id1"  # q1
grouped_by_id1 = x.groupby("id1")
ans = grouped_by_id1.agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
# Benchmark fragment for question q1 ("sum v1 by id1"): the query is run and
# timed, a checksum of the result is computed and timed separately, and the
# measurements are handed to write_log(). A second timed run follows.

# Re-encode the first three id columns as categoricals before any timing.
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)

# Row count is taken from the eager frame, before it is turned lazy.
in_rows = x.shape[0]
x = x.lazy()

print(len(x.collect()), flush=True)

task_init = timeit.default_timer()
print("grouping...", flush=True)

question = "sum v1 by id1" # q1
# Collect garbage before starting the clock so earlier allocations do not
# leak into this measurement.
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
# The shape print sits inside the timed region on purpose; keep it there so
# timings stay comparable between runs.
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
# Widen to Int64 before summing: the aggregated column may carry a 32-bit
# integer dtype, and totalling it without widening can overflow the checksum.
chk = [ans["v1_sum"].cast(pl.Int64).sum()]
chkt = timeit.default_timer() - t_start
write_log(
    task=task,
    data=data_name,
    in_rows=in_rows,
    question=question,
    out_rows=ans.shape[0],
    out_cols=ans.shape[1],
    solution=solution,
    version=ver,
    git=git,
    fun=fun,
    run=1,
    time_sec=t,
    mem_gb=m,
    cache=cache,
    chk=make_chk(chk),
    chk_time_sec=chkt,
    on_disk=on_disk,
)
# Drop the first result before the second run so it does not inflate the
# memory reading.
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
import pypolars as pl

# Small demo frame used to illustrate window functions.
df = pl.DataFrame(
    {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    }
)

# Lazy query adding three columns, each an aggregate computed over the rows
# that share a "fruits" or "cars" value.
sum_a_per_fruit = pl.sum("A").over("fruits").alias("fruit_sum_A")
first_b_per_fruit = pl.first("B").over("fruits").alias("fruit_first_B")
max_b_per_car = pl.max("B").over("cars").alias("cars_max_B")
windows = df.lazy().with_columns(
    [sum_a_per_fruit, first_b_per_fruit, max_b_per_car]
)

if __name__ == "__main__":
    # Each entry pairs an output file with a callable producing the frame to
    # print; the callable runs only after its file has been opened.
    outputs = (
        (
            "book/src/outputs/how_can_i_apply_window_functions_0.txt",
            lambda: df,
        ),
        (
            "book/src/outputs/how_can_i_apply_window_functions_1.txt",
            windows.collect,
        ),
    )
    for output_path, produce_frame in outputs:
        with open(output_path, "w") as f:
            f.write(str(produce_frame()))
Beispiel #6
0
# Benchmark fragment for question q1 ("sum v1 by id1"): the query is run and
# timed, then a checksum of the result is computed and timed separately.

# Re-encode the first three id columns as categoricals before any timing.
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)

# Row count is taken from the eager frame, before it is turned lazy.
in_rows = x.shape[0]
x = x.lazy()

print(len(x.collect()), flush=True)

task_init = timeit.default_timer()
print("grouping...", flush=True)

question = "sum v1 by id1"  # q1
# Collect garbage before starting the clock.
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
# Note: the shape print is inside the timed region.
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# Restart the clock: the checksum is timed separately from the query.
t_start = timeit.default_timer()
# The aggregated column is widened to Int64 before being summed.
chk = [ans["v1_sum"].cast(pl.Int64).sum()]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=in_rows,
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
Beispiel #7
0
import pypolars as pl
from pypolars.lazy import *
import time

# Macro benchmark: join two CSV scans, filter and rescale "value", then join
# against per-group sums from a third scan. The clock starts before the scans
# are declared.
t0 = time.time()

left = pl.scan_csv("data/join_left_80000.csv")
right = pl.scan_csv("data/join_right_80000.csv")
other = pl.scan_csv("data/10000000.csv")

# Build the lazy plan step by step; nothing executes until collect().
joined = left.join(right, on="key", how="inner")
filtered = joined.filter(col("value") > 0.5)
scaled = filtered.with_column((col("value") * 10).cast(int))
group_totals = other.groupby("groups").agg(pl.sum("values"))
q = scaled.join(
    group_totals,
    left_on="value",
    right_on="groups",
    how="inner",
).select(["key", "values_sum"])
print(q._la)
df = q.collect()

# Elapsed wall-clock time, covering plan construction and execution.
t = time.time() - t0
# with open("data/macro_bench_polars.txt", "w") as f:
#     f.write(str(t))
print(df)
print(q.describe_optimized_plan())