import pypolars as pl
from pypolars.lazy import *

# A scan is a lazy read. This means nothing happens until the query is executed.
reddit = pl.scan_csv("data/reddit.csv")

reddit = (
    reddit.filter(col("comment_karma") > 0)  # only positive comment karma
    .filter(col("link_karma") > 0)  # only positive link karma
    .filter(col("name").str_contains(r"^a"))  # only names that start with an "a"
)

if __name__ == "__main__":
    df = reddit.fetch(int(1e7))
    with open("book/src/outputs/predicate_pushdown_0.txt", "w") as f:
        f.write(str(df))
    reddit.show_graph(
        optimized=False,
        show=False,
        output_path="book/src/img/predicate_pushdown_0.png",
    )
    reddit.show_graph(
        optimized=True,
        show=False,
        output_path="book/src/img/predicate_pushdown_0_optimized.png",
    )

import pypolars as pl from pypolars.lazy import * reddit = pl.scan_csv("data/reddit.csv").select( [pl.sum("comment_karma"), pl.min("link_karma")] ) if __name__ == "__main__": df = reddit.fetch() with open("book/src/outputs/how_can_i_aggregate.txt", "w") as f: f.write(str(df))
import pypolars as pl from pypolars.lazy import * reddit = ( pl.scan_csv("data/reddit.csv") .groupby("comment_karma") .agg([col("name").n_unique().alias("unique_names"), col("link_karma").max()]) .sort(by_column="unique_names", reverse=True) ) if __name__ == "__main__": df = reddit.fetch() with open("book/src/outputs/how_can_i_groupby.txt", "w") as f: f.write(str(df))
import pypolars as pl from pypolars.lazy import * reddit = pl.scan_csv("data/reddit.csv") runescape = pl.scan_csv("data/runescape.csv", has_headers=False).select( col("column_1").alias("name")) reddit = (reddit.filter(col("comment_karma") > 0).filter( col("link_karma") > 0).filter(col("name").str_contains(r"^a"))) joined = reddit.join(runescape, on="name", how="inner").select( ["name", "comment_karma", "link_karma"]) if __name__ == "__main__": joined.show_graph( optimized=False, show=False, output_path="book/src/img/projection_pushdown_0.png", ) joined.show_graph( optimized=True, show=False, output_path="book/src/img/projection_pushdown_0_optimized.png", ) df = joined.fetch(int(1e7)) with open("book/src/outputs/projection_pushdown_0.txt", "w") as f: f.write(str(df))
import pypolars as pl
from pypolars.lazy import *
import time

reddit = pl.scan_csv("data/reddit.csv")
runestar = pl.scan_csv("data/runescape.csv", has_headers=False).with_column(
    col("column_1").alias("name")
)

reddit = (
    reddit.filter(col("comment_karma") > 0)
    .filter(col("link_karma") > 0)
    .filter(col("name").str_contains(r"^a"))  # only names that start with an "a"
)

joined = reddit.join(runestar, on="name", how="inner").select(
    ["name", "comment_karma", "link_karma"]
)

t0 = time.time()
joined.show_graph(True)
df = joined.fetch(int(1e7))
print(time.time() - t0)
print(df)

import pypolars as pl
from pypolars.lazy import *
import time

t0 = time.time()

left = pl.scan_csv("data/join_left_80000.csv")
right = pl.scan_csv("data/join_right_80000.csv")
other = pl.scan_csv("data/10000000.csv")

q = (
    left.join(right, on="key", how="inner")
    .filter(col("value") > 0.5)
    .with_column((col("value") * 10).cast(int))
    .join(
        other.groupby("groups").agg(pl.sum("values")),
        left_on="value",
        right_on="groups",
        how="inner",
    )
    .select(["key", "values_sum"])
)

print(q._la)
df = q.collect()
t = time.time() - t0

# with open("data/macro_bench_polars.txt", "w") as f:
#     f.write(str(t))

print(df)
print(q.describe_optimized_plan())