os.path.join("data", y_data_name[0] + ".csv"), os.path.join("data", y_data_name[1] + ".csv"), os.path.join("data", y_data_name[2] + ".csv") ] if len(src_jn_y) != 3: raise Exception("Something went wrong in preparing files used for join") print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[2] + ", " + y_data_name[2], flush=True) with pl.StringCache(): x = pl.read_csv(src_jn_x, dtype={ "id1": pl.Int32, "id2": pl.Int32, "id3": pl.Int32, "v1": pl.Float64 }) x["id4"] = x["id4"].cast(pl.Categorical) x["id5"] = x["id5"].cast(pl.Categorical) x["id6"] = x["id6"].cast(pl.Categorical) small = pl.read_csv(src_jn_y[0], dtype={"id1": pl.Int32, "v2": pl.Float64}) small["id4"] = small["id4"].cast(pl.Categorical) medium = pl.read_csv(src_jn_y[1], dtype={ "id1": pl.Int32, "id2": pl.Int32, "v2": pl.Float64 }) medium["id4"] = medium["id4"].cast(pl.Categorical)
import pypolars as pl

# Preview the Reddit dataset and persist the preview text for the book.
df = pl.read_csv("./data/reddit.csv", stop_after_n_rows=10)
df.head()
with open("book/src/outputs/head_reddit.txt", "w") as out_file:
    out_file.write(str(df.head()))

# Same for the RuneScape dataset, which ships without a header row.
df = pl.read_csv("./data/runescape.csv", has_headers=False, stop_after_n_rows=10)
df.head()
with open("book/src/outputs/head_runescape.txt", "w") as out_file:
    out_file.write(str(df.head()))
from utils import peak_memory, gb_data_files, simple_bench
import pypolars as pl

files = gb_data_files()


def groupby():
    """Benchmarked operation: first `values` per `str` group."""
    # `df` is the module-level frame rebound by the loop below before each run.
    df.groupby("str").select("values").first()


if __name__ == "__main__":
    with open("data/polars_bench_gb.txt", "w") as fh, open(
        "data/polars_gb_mem.txt", "w"
    ) as fh_mem:
        for path in files:
            print(path)
            df = pl.read_csv(path)
            df["str"] = df["str"].cast(str)
            ms = simple_bench(groupby)
            print(ms)
            fh.write(f"{ms}\n")
            fh_mem.write(str(peak_memory()) + "\n")
id037,id097,id0000062401,48,8,53992,5,15,83.565443
id073,id081,id0000017280,54,90,28480,5,4,17.078693
id081,id073,id0000073423,51,22,39788,2,12,45.883758
id062,id080,id0000092749,1,75,67857,3,10,80.418674
id045,id031,id0000076210,2,42,80312,4,5,48.668692
id082,id048,id0000080227,56,62,16760,3,11,34.933239
id035,id032,id0000033279,55,13,80560,5,5,61.372678
id053,id013,id0000073898,61,63,12387,4,7,29.949863"""
# Wrap the inline sample in a file-like object so read_csv can parse it.
# NOTE(review): io.BytesIO requires bytes — this only works if `csv` (defined
# above this chunk) is a bytes literal; confirm, otherwise io.StringIO is needed.
f = io.BytesIO(csv)
# Parse with explicit dtypes for the numeric columns.
x = pl.read_csv(
    f,
    dtype={
        "id4": pl.Int32,
        "id5": pl.Int32,
        "id6": pl.Int32,
        "v1": pl.Int32,
        "v2": pl.Int32,
        "v3": pl.Float64,
    },
)
# Cast the string id columns to Categorical before grouping.
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)
# Switch to the lazy API; the query below is executed by collect().
x = x.lazy()
question = "sum v1 by id1"  # q1
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
# Load shared helper definitions into this module's globals
# (db-benchmark convention; supplies utilities used further down).
exec(open("./_helpers/helpers.py").read())

# Benchmark metadata recorded alongside the timings.
ver = pl.__version__
git = ""
task = "groupby"
solution = "polars"
fun = ".groupby"
cache = "TRUE"
on_disk = "FALSE"

# The dataset name is injected by the benchmark harness via the environment.
data_name = os.environ["SRC_DATANAME"]
src_grp = os.path.join("data", data_name + ".csv")
print("loading dataset %s" % data_name, flush=True)

# Read with explicit dtypes, then cast the string id columns to Categorical.
x = pl.read_csv(src_grp, dtype={"id4":pl.Int32, "id5":pl.Int32, "id6":pl.Int32, "v1":pl.Int32, "v2":pl.Int32, "v3":pl.Float64})
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)
in_rows = x.shape[0]
x = x.lazy()
# NOTE(review): this collect() materializes the lazy frame once before any
# timing starts — presumably a warm-up/sanity count; confirm intent.
print(len(x.collect()), flush=True)
task_init = timeit.default_timer()
print("grouping...", flush=True)

question = "sum v1 by id1"  # q1
# Collect garbage before starting the per-question timer.
gc.collect()
t_start = timeit.default_timer()
import timeit
import pypolars as pl

# Time how long pypolars needs to parse the CSV.
t0 = timeit.default_timer()
df = pl.read_csv('./data/users.csv')
t1 = timeit.default_timer()
print('Loading csv file took pypolars: ', t1 - t0)

# Time a descending sort on column `n`; the sorted result is discarded.
t0 = timeit.default_timer()
df.sort(by_column='n', reverse=True)
t1 = timeit.default_timer()
print('Sorting the data frame took pypolars: ', t1 - t0)
fw.write(f"{mean}\n")
print("left join {} μs".format(mean))
print("shape:", joined.shape)

# Outer join: time 10 runs and record the mean duration in microseconds.
durations = []
for _ in range(10):
    t0 = datetime.datetime.now()
    joined = left.merge(right, on="key", how="outer")
    elapsed = datetime.datetime.now() - t0
    # BUG FIX: `timedelta.microseconds` is only the sub-second component
    # (0-999999) and silently drops whole seconds, so any run over 1 s was
    # under-reported.  Dividing by a 1 µs timedelta gives the true total.
    durations.append(elapsed / datetime.timedelta(microseconds=1))
mean = np.mean(durations)
fw.write(f"{mean}\n")
print("outer join {} μs".format(mean))
print("shape:", joined.shape)

# Switch to polars frames for the same join benchmarks.
left = pl.read_csv("data/join_left_80000.csv")
right = pl.read_csv("data/join_right_80000.csv")
print("polars")
# NOTE(review): fw is never closed in this chunk — confirm it is closed
# (or replaced by a `with` block) downstream.
fw = open("data/polars_bench_join.txt", "w")

# Inner join: time 10 runs and record the mean duration in microseconds.
durations = []
for _ in range(10):
    t0 = datetime.datetime.now()
    joined = left.join(right, on="key", how="inner")
    elapsed = datetime.datetime.now() - t0
    durations.append(elapsed / datetime.timedelta(microseconds=1))
mean = np.mean(durations)
fw.write(f"{mean}\n")
print("inner join {} μs".format(mean))
print("shape:", joined.shape)
def test_read_web_file():
    """Read a CSV straight from a raw GitHub URL and check its dimensions."""
    url = "https://raw.githubusercontent.com/ritchie46/polars/master/examples/aggregate_multiple_files_in_chunks/datasets/foods1.csv"
    frame = pl.read_csv(url)
    assert frame.shape == (27, 4)
def read_polars():
    """Benchmark helper: parse the CSV at module-level path `fn` with 8 threads,
    skipping the post-read rechunk pass.  The frame is discarded."""
    pl.read_csv(fn, rechunk=False, n_threads=8)