Example #1
    os.path.join("data", y_data_name[0] + ".csv"),
    os.path.join("data", y_data_name[1] + ".csv"),
    os.path.join("data", y_data_name[2] + ".csv")
]
if len(src_jn_y) != 3:
    raise Exception("Something went wrong in preparing files used for join")

print("loading datasets " + data_name + ", " + y_data_name[0] + ", " +
      y_data_name[2] + ", " + y_data_name[2],
      flush=True)

with pl.StringCache():
    x = pl.read_csv(src_jn_x,
                    dtype={
                        "id1": pl.Int32,
                        "id2": pl.Int32,
                        "id3": pl.Int32,
                        "v1": pl.Float64
                    })
    x["id4"] = x["id4"].cast(pl.Categorical)
    x["id5"] = x["id5"].cast(pl.Categorical)
    x["id6"] = x["id6"].cast(pl.Categorical)
    small = pl.read_csv(src_jn_y[0], dtype={"id1": pl.Int32, "v2": pl.Float64})
    small["id4"] = small["id4"].cast(pl.Categorical)
    medium = pl.read_csv(src_jn_y[1],
                         dtype={
                             "id1": pl.Int32,
                             "id2": pl.Int32,
                             "v2": pl.Float64
                         })
    medium["id4"] = medium["id4"].cast(pl.Categorical)
Example #2
import pypolars as pl

df = pl.read_csv("./data/reddit.csv", stop_after_n_rows=10)
df.head()

with open("book/src/outputs/head_reddit.txt", "w") as f:
    f.write(str(df.head()))

df = pl.read_csv(
    "./data/runescape.csv",
    has_headers=False,
    stop_after_n_rows=10,
)
df.head()

with open("book/src/outputs/head_runescape.txt", "w") as f:
    f.write(str(df.head()))
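For reference, the library was later published as polars rather than pypolars, and these read_csv parameters were renamed; a sketch of the same reads against a recent polars API (version assumed):

import polars as pl

# stop_after_n_rows became n_rows; has_headers became has_header
df = pl.read_csv("./data/reddit.csv", n_rows=10)
df = pl.read_csv("./data/runescape.csv", has_header=False, n_rows=10)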
Example #3
from utils import peak_memory, gb_data_files, simple_bench
import pypolars as pl

files = gb_data_files()


def groupby():
    df.groupby("str").select("values").first()


if __name__ == "__main__":
    with open("data/polars_bench_gb.txt", "w") as fh:
        with open("data/polars_gb_mem.txt", "w") as fh_mem:
            for fn in files:
                print(fn)
                df = pl.read_csv(fn)
                df["str"] = df["str"].cast(str)
                ms = simple_bench(groupby)
                print(ms)
                fh.write(f"{ms}\n")

                fh_mem.write(str(peak_memory()) + "\n")
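The utils module is not shown; a hypothetical sketch of what these helpers could look like (the names come from the import, the bodies are assumptions):

import glob
import resource
import time


def gb_data_files():
    # assumed: return the generated groupby CSV files
    return sorted(glob.glob("data/gb_*.csv"))


def simple_bench(f, repeat=3):
    # assumed: best-of-n wall time in milliseconds
    best = float("inf")
    for _ in range(repeat):
        t0 = time.perf_counter()
        f()
        best = min(best, (time.perf_counter() - t0) * 1000)
    return best


def peak_memory():
    # assumed: peak RSS of this process (ru_maxrss is in KiB on Linux)
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss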
Example #4
import io

import polars as pl

# earlier data rows are elided; the header matches the columns used below
csv = b"""id1,id2,id3,id4,id5,id6,v1,v2,v3
id037,id097,id0000062401,48,8,53992,5,15,83.565443
id073,id081,id0000017280,54,90,28480,5,4,17.078693
id081,id073,id0000073423,51,22,39788,2,12,45.883758
id062,id080,id0000092749,1,75,67857,3,10,80.418674
id045,id031,id0000076210,2,42,80312,4,5,48.668692
id082,id048,id0000080227,56,62,16760,3,11,34.933239
id035,id032,id0000033279,55,13,80560,5,5,61.372678
id053,id013,id0000073898,61,63,12387,4,7,29.949863"""

f = io.BytesIO(csv)

x = pl.read_csv(
    f,
    dtype={
        "id4": pl.Int32,
        "id5": pl.Int32,
        "id6": pl.Int32,
        "v1": pl.Int32,
        "v2": pl.Int32,
        "v3": pl.Float64,
    },
)
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)
x = x.lazy()

question = "sum v1 by id1"  # q1
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
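The remaining questions follow the same pattern; a sketch of a two-key grouping in the same style (the question label is an assumption taken from the benchmark's numbering, not from the original snippet):

question = "sum v1 by id1:id2"  # q2 (assumed)
ans = x.groupby(["id1", "id2"]).agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)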
Example #5
import gc
import os
import timeit

import polars as pl

exec(open("./_helpers/helpers.py").read())

ver = pl.__version__
git = ""
task = "groupby"
solution = "polars"
fun = ".groupby"
cache = "TRUE"
on_disk = "FALSE"

data_name = os.environ["SRC_DATANAME"]
src_grp = os.path.join("data", data_name + ".csv")
print("loading dataset %s" % data_name, flush=True)

x = pl.read_csv(src_grp,
                dtype={
                    "id4": pl.Int32,
                    "id5": pl.Int32,
                    "id6": pl.Int32,
                    "v1": pl.Int32,
                    "v2": pl.Int32,
                    "v3": pl.Float64
                })
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)

in_rows = x.shape[0]
x = x.lazy()

print(len(x.collect()), flush=True)

task_init = timeit.default_timer()
print("grouping...", flush=True)

question = "sum v1 by id1" # q1
gc.collect()
t_start = timeit.default_timer()
Example #6
import timeit
import pypolars as pl

start = timeit.default_timer()
df = pl.read_csv('./data/users.csv')
stop = timeit.default_timer()
print('pypolars: loading the CSV file took ', stop - start)

start = timeit.default_timer()
df.sort(by_column='n', reverse=True)
stop = timeit.default_timer()
print('pypolars: sorting the data frame took ', stop - start)
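Single-shot timings like these are noisy; a sketch of averaging over several runs with timeit (an assumed refinement, not in the original):

n = 5
total = timeit.timeit(lambda: df.sort(by_column='n', reverse=True), number=n)
print('pypolars: sorting took on average ', total / n)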
Example #7
fw.write(f"{mean}\n")
print("left join {} μs".format(mean))
print("shape:", joined.shape)

durations = []
for _ in range(10):
    t0 = datetime.datetime.now()
    joined = left.merge(right, on="key", how="outer")
    duration = datetime.datetime.now() - t0
    # use the full elapsed time; .microseconds alone drops whole seconds
    durations.append(duration.total_seconds() * 1e6)
mean = np.mean(durations)
fw.write(f"{mean}\n")
print("outer join {} μs".format(mean))
print("shape:", joined.shape)

left = pl.read_csv("data/join_left_80000.csv")
right = pl.read_csv("data/join_right_80000.csv")

print("polars")
fw = open("data/polars_bench_join.txt", "w")

durations = []
for _ in range(10):
    t0 = datetime.datetime.now()
    joined = left.join(right, on="key", how="inner")
    duration = datetime.datetime.now() - t0
    # as above: take the full elapsed time, not just the microsecond field
    durations.append(duration.total_seconds() * 1e6)
mean = np.mean(durations)
fw.write(f"{mean}\n")
print("inner join {} μs".format(mean))
print("shape:", joined.shape)
Example #8
def test_read_web_file():
    url = "https://raw.githubusercontent.com/ritchie46/polars/master/examples/aggregate_multiple_files_in_chunks/datasets/foods1.csv"
    df = pl.read_csv(url)
    assert df.shape == (27, 4)
Example #9
def read_polars():
    pl.read_csv(fn, n_threads=8, rechunk=False)
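The surrounding harness is elided (fn is defined elsewhere); a hypothetical driver for this benchmark function, with the imports and the path as assumptions:

import timeit

import pypolars as pl

fn = "data/users.csv"  # assumed path; the original leaves fn undefined here

print(timeit.timeit(read_polars, number=3) / 3, "seconds per read")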