from dask.distributed import Client, LocalCluster
from lakota import Repo
from lakota.utils import timeit

# `schema`, `years` and the `insert` worker are defined elsewhere in the
# example (a hedged sketch of `insert` follows the test).


def test_insert(pod):
    # Write with workers
    label = "my_label"
    repo = Repo(pod=pod)
    # Create collection and label
    collection = repo.create_collection(schema, "my_collection")
    token = pod.token
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    with timeit(f"\nWRITE ({pod.protocol})"):
        fut = client.map(insert, args)
        assert sum(client.gather(fut)) == 10_519_200
    client.close()
    cluster.close()

    # Merge everything and read series
    with timeit(f"\nMERGE ({pod.protocol})"):
        collection.merge()

    with timeit(f"\nREAD ({pod.protocol})"):
        series = collection / label
        df = series["2015-01-01":"2015-01-02"].df()
        assert len(df) == 1440
        df = series["2015-12-31":"2016-01-02"].df()
        assert len(df) == 2880
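# Hypothetical sketch of the `insert` worker mapped over the cluster above (the
# real one is defined elsewhere in the example). It assumes lakota exposes a
# POD.from_token helper to rebuild the pod on the worker, a timestamp/value
# schema, and one calendar year of 1-minute data per task; the return value is
# the row count so the assertion in test_insert can sum it.
from numpy.random import random
from pandas import DataFrame, date_range
from lakota.pod import POD  # assumed import path


def insert(args):
    token, label, year = args
    pod = POD.from_token(token)  # assumed API: rebuild the pod from its token
    repo = Repo(pod=pod)
    collection = repo / "my_collection"
    series = collection / label
    ts = date_range(f"{year}-01-01", f"{year + 1}-01-01", freq="1min")[:-1]
    series.write(DataFrame({"timestamp": ts, "value": random(len(ts))}))
    return len(ts)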
from numpy.random import random
from pandas import DataFrame, date_range
from lakota import Repo, Schema
from lakota.utils import hextime, timeit

suffix = hextime()
SIZE = 100_000
values = random(SIZE)
timestamps = date_range("1970-01-01", freq="5min", periods=SIZE)
df = DataFrame({
    "ts": timestamps,
    "value": values,
})

df.to_csv(f"timeseries-{suffix}.csv")
df.to_parquet(f"timeseries-{suffix}.snappy.pqt", compression='snappy')
df.to_parquet(f"timeseries-{suffix}.brotli.pqt", compression='brotli')
with timeit('pqt'):
    df.to_parquet(f"timeseries-{suffix}.gzip.pqt", compression='gzip')

repo = Repo("repo")
schema = Schema(ts="timestamp*", value="float")
clct = repo / "my_collection"
if not clct:
    clct = repo.create_collection(schema, "my_collection")
series = clct / "my_series"
with timeit('lk'):
    series.write(df)

## Results

# $ python examples/data_size.py
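# Hypothetical follow-up (not part of the original script): print the on-disk
# footprint of each file written above next to the lakota repository, to make
# the size comparison explicit.
import os


def du(path):
    # Total size in bytes of a single file or of a whole directory tree
    if os.path.isfile(path):
        return os.path.getsize(path)
    total = 0
    for root, _dirs, files in os.walk(path):
        total += sum(os.path.getsize(os.path.join(root, f)) for f in files)
    return total


for path in (
    f"timeseries-{suffix}.csv",
    f"timeseries-{suffix}.snappy.pqt",
    f"timeseries-{suffix}.brotli.pqt",
    f"timeseries-{suffix}.gzip.pqt",
    "repo",
):
    print(path, du(path) // 1024, "KB")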
import subprocess

from numpy import arange, random
from pandas import DataFrame
from lakota import Repo, Schema
from lakota.utils import timeit

# SIZE (total number of rows per run) is defined elsewhere in the example.

CHUNK_SIZES = (500, 5_000, 50_000, 500_000)


def create_df(start, stop):
    ts = arange(start, stop)
    value = arange(start, stop)
    random.shuffle(value)
    return DataFrame({"timestamp": ts, "value": value})


def call(cmd):
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    return stdout


for chunk_size in CHUNK_SIZES:
    df = create_df(0, SIZE)
    with timeit(f"chunk size {chunk_size}:"):
        schema = Schema(timestamp="timestamp*", value="float")
        repo = Repo("test-db")
        collection = repo.create_collection(schema, "test")
        series = collection / "test"
        for i in range(0, SIZE, chunk_size):
            series.write(df[i : i + chunk_size])
    res = call("du -hs test-db")
    print("Disk use", res.split()[0].decode())
    call("rm -r test-db")
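# Hypothetical check (not in the original script): before `test-db` is removed,
# the series could be read back to confirm that the chunked writes round-trip.
# Assumes lakota's Frame exposes columns as arrays through item access.
def verify(size):
    repo = Repo("test-db")
    collection = repo / "test"
    series = collection / "test"
    frm = series.frame()
    assert len(frm["timestamp"]) == size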
from time import sleep

from numpy import arange, sin
from lakota import Repo, Schema
from lakota.utils import timeit

# `cols`, `SIZE`, `MemPOD` (lakota's in-memory pod) and `settings` (which
# toggles threaded writes) are defined or imported elsewhere in the example.

schema = Schema(key="int*", **{x: "float" for x in cols})
frm = {
    "key": range(SIZE),
}
for x in cols:
    frm[x] = sin(arange(SIZE))


# Simulate network lag
def lag(fn, delay):
    def wrapper(*a, **kw):
        sleep(delay)
        return fn(*a, **kw)
    return wrapper


mempod_write = MemPOD.write
for delay in (0.001, 0.01, 0.1):
    MemPOD.write = lag(mempod_write, delay)
    for threaded in (False, True):
        settings.threaded = threaded
        with timeit(f"{delay}-{threaded}"):
            repo = Repo()
            clc = repo.create_collection(schema, "clc")
            with clc.multi():
                for name in "ABC":
                    series = clc / name
                    series.write(frm)
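# The loop above monkey-patches MemPOD.write for the whole process; restoring
# the original method afterwards keeps later code free of the simulated lag.
MemPOD.write = mempod_write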
import pandas
import psycopg2
from numpy import arange, sin
from lakota import Repo
from lakota.utils import timeit

# The `write_pg` and `write_lk` helpers are defined elsewhere in the example
# (hedged sketches follow at the end of this block).


def read_lk():
    repo = Repo("test-db")
    collection = repo / "test"
    series = collection / "test"
    return series.frame()


def read_pg():
    conn = psycopg2.connect("postgresql:///test")
    cursor = conn.cursor()
    cursor.execute("select * from test")
    return list(cursor)


ts = pandas.date_range("1970-01-01", "2020-01-01", freq="5min")
value = sin(arange(len(ts)))
df = pandas.DataFrame({
    "timestamp": ts,
    "value": value,
})

with timeit("write pg"):
    write_pg(df)
with timeit("write lk"):
    write_lk(df)
with timeit("read pg"):
    read_pg()
with timeit("read lk"):
    read_lk()
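# Hypothetical sketches of the write helpers referenced above (the originals
# are defined elsewhere, before they are used). write_pg bulk-loads through
# COPY; write_lk writes a single lakota series. Table layout, schema and paths
# are assumptions.
import io

from lakota import Schema


def write_pg(df):
    conn = psycopg2.connect("postgresql:///test")
    cursor = conn.cursor()
    cursor.execute("drop table if exists test")
    cursor.execute('create table test ("timestamp" timestamptz, value float8)')
    buf = io.StringIO()
    df.to_csv(buf, index=False, header=False)
    buf.seek(0)
    cursor.copy_expert("COPY test FROM STDIN WITH CSV", buf)
    conn.commit()


def write_lk(df):
    schema = Schema(timestamp="timestamp*", value="float")
    repo = Repo("test-db")
    collection = repo.create_collection(schema, "test")
    series = collection / "test"
    series.write(df)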