def test_series_squash_stability():
    label = "LABEL"
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, "a_collection")
    series = local_coll / label
    months = list(range(1, 12))
    delta = timedelta(days=1)
    for start, stop in zip(months[:-1], months[1:]):
        ts = drange(f"2020-{start:02}-01", f"2020-{stop:02}-01", delta)
        values = [start] * len(ts)
        series.write({"timestamp": ts, "value": values})

    local_coll.push(remote_coll)
    local_coll.squash()
    remote_coll.squash()

    local_files = local_coll.pod.walk()
    remote_files = remote_coll.pod.walk()
    local_digests = set(
        Revision.from_path(local_coll.changelog, f).digests
        for f in local_files
        if "." in f
    )
    remote_digests = set(
        Revision.from_path(remote_coll.changelog, f).digests
        for f in remote_files
        if "." in f
    )
    assert local_digests == remote_digests
def test_refresh():
    pod = MemPOD(".")
    repo = Repo(pod=pod)
    repo.create_collection(SCHEMA, "collection")
    assert repo.ls() == ["collection"]

    repo2 = Repo(pod=pod)
    repo2.delete("collection")
    # repo is out of sync
    assert repo.ls() == ["collection"]
    # refresh solves this
    repo.refresh()
    assert repo.ls() == []
def test_label_delete_push(squash):
    kv_schema = Schema.kv(timestamp="int*", value="float")
    labels = list("abcd")
    local_repo = Repo()
    local_clct = local_repo.create_collection(kv_schema, "a_collection")
    remote_repo = Repo()
    remote_clct = remote_repo.create_collection(kv_schema, "a_collection")

    # Write some data
    frm = {
        "timestamp": [1, 2, 3],
        "value": [1, 2, 3],
    }
    for label in labels:
        series = local_clct / label
        series.write(frm)

    # Create some labels and push them
    local_clct.push(remote_clct)
    if squash:
        remote_clct.squash()
    assert local_clct.ls() == labels
    assert remote_clct.ls() == labels

    # Delete one local label and push again
    local_clct.delete("c")
    local_clct.push(remote_clct)
    if squash:
        remote_clct.merge()
        remote_clct.squash()
    else:
        remote_clct.refresh()
    assert remote_clct.ls() == list("abd")
    assert local_clct.ls() == list("abd")

    # Delete one remote label and pull
    sleep(0.1)  # Needed to avoid concurrent writes
    remote_clct.delete("d")
    local_clct.pull(remote_clct)
    if squash:
        local_clct.squash()
    else:
        local_clct.refresh()
    assert remote_clct.ls() == list("ab")
    assert local_clct.ls() == list("ab")
def test_insert(pod):
    # Write with workers
    label = "my_label"
    repo = Repo(pod=pod)
    # Create collection and label
    collection = repo.create_collection(schema, "my_collection")
    token = pod.token
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    with timeit(f"\nWRITE ({pod.protocol})"):
        fut = client.map(insert, args)
        assert sum(client.gather(fut)) == 10_519_200
    client.close()
    cluster.close()

    # Merge everything and read series
    with timeit(f"\nMERGE ({pod.protocol})"):
        collection.merge()
    with timeit(f"\nREAD ({pod.protocol})"):
        series = collection / label
        df = series["2015-01-01":"2015-01-02"].df()
        assert len(df) == 1440
        df = series["2015-12-31":"2016-01-02"].df()
        assert len(df) == 2880
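
# Note: `insert`, mapped over the cluster above, is defined elsewhere in
# this module. Below is a minimal sketch of its likely shape, assuming each
# call receives a (token, label, year) tuple and writes one year of
# minute-spaced timestamps (the frequency is inferred from the 10_519_200
# row total); `insert_sketch` and the POD.from_token usage are illustrative
# assumptions, not the original helper.
def insert_sketch(args):
    token, label, year = args
    repo = Repo(pod=POD.from_token(token))  # rebuild the pod from its token
    series = repo / "my_collection" / label
    ts = drange(f"{year}-01-01", f"{year + 1}-01-01", timedelta(minutes=1))
    series.write({"timestamp": ts, "value": [1.0] * len(ts)})
    return len(ts)  # row count, summed by the caller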
def test_label_regexp():
    repo = Repo()
    ok = ["abc", "abc-abc-123", "abc_abc-123.45", "abc+abc", "$", "é"]
    for label in ok:
        repo.create_collection(SCHEMA, label)
        repo.create_collection(SCHEMA, label.upper(), raise_if_exists=False)

    not_ok = ["", "\t", "\n"]
    for label in not_ok:
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label)
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label + " ")
def test_series_shallow_pull(size, direction, shallow):
    label = "LABEL"
    local_repo = Repo()
    remote_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    series = local_coll / label
    series.write({"timestamp": arange(size), "value": arange(size)})
    series.write({"timestamp": arange(size), "value": arange(size) * 2})

    if direction == "pull":
        remote_repo.pull(local_repo, shallow=shallow)
    else:
        local_repo.push(remote_repo, shallow=shallow)

    remote_clc = remote_repo / "a_collection"
    assert len(remote_clc.changelog.log()) == (1 if shallow else 2)
    remote_series = remote_clc / label
    expected = series.frame()
    assert remote_series.frame() == expected
def test_gc():
    # Create pod, repo & collection
    pod = POD.from_uri("memory://")
    token = pod.token
    label = "my_label"
    repo = Repo(pod=pod)
    clc = repo.create_collection(schema, "my_collection")

    # Start cluster & schedule concurrent writes & gc
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    insert_fut = client.map(insert, args)
    gc_fut = client.submit(do_squash_and_gc, token)
    assert sum(client.gather(insert_fut)) == 10_519_200
    client.gather(gc_fut)
    client.close()
    cluster.close()

    # Read data back
    clc.merge()
    frm = clc.series("my_label").frame()
    assert len(frm) == 10_519_200
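
# Note: `do_squash_and_gc`, submitted above, runs concurrently with the
# writers. A sketch of what it plausibly does, assuming POD.from_token and
# Repo.gc are available (a hypothetical reconstruction, not the original
# helper): squash the collection changelog, then garbage-collect
# unreferenced segments. The test then asserts no live data was lost.
def do_squash_and_gc_sketch(token):
    repo = Repo(pod=POD.from_token(token))
    clc = repo / "my_collection"
    clc.squash()
    repo.gc()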
df = DataFrame({
    "ts": timestamps,
    "value": values,
})
df.to_csv(f"timeseries-{suffix}.csv")
df.to_parquet(f"timeseries-{suffix}.snappy.pqt", compression='snappy')
df.to_parquet(f"timeseries-{suffix}.brotli.pqt", compression='brotli')
with timeit('pqt'):
    df.to_parquet(f"timeseries-{suffix}.gzip.pqt", compression='gzip')

repo = Repo("repo")
schema = Schema(ts="timestamp*", value="float")
clct = repo / "my_collection"
if not clct:
    clct = repo.create_collection(schema, "my_collection")
series = clct / "my_series"
with timeit('lk'):
    series.write(df)

## Results
# $ python examples/data_size.py
# pqt 198.76ms
# lk 24.24ms
# $ du -hs timeseries-* repo
# 1,4M    timeseries-17a813a84a1.brotli.pqt
# 4,4M    timeseries-17a813a84a1.csv
# 1,5M    timeseries-17a813a84a1.gzip.pqt
from lakota import Repo, Schema

# TODO use a KVSeries instead (it solves the problem explained at the bottom)
ts_schema = Schema(
    timestamp="timestamp*",
    pubtime="timestamp*",
    value="float",
)
repo = Repo()
clc = repo.create_collection(ts_schema, "my-collection")
srs = clc / "my_series"

# First insertion
df = {
    "timestamp": [
        "2020-01-01T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-02T00:00",
        "2020-01-03T00:00",
        "2020-01-03T00:00",
        "2020-01-04T00:00",
        "2020-01-04T00:00",
    ],
    "pubtime": [
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
CHUNK_SIZES = (500, 5_000, 50_000, 500_000)


def create_df(start, stop):
    ts = arange(start, stop)
    value = arange(start, stop)
    random.shuffle(value)
    return DataFrame({"timestamp": ts, "value": value})


def call(cmd):
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    return stdout


for chunk_size in CHUNK_SIZES:
    df = create_df(0, SIZE)
    with timeit(f"chunk size {chunk_size}:"):
        schema = Schema(timestamp="timestamp*", value="float")
        repo = Repo("test-db")
        collection = repo.create_collection(schema, "test")
        series = collection / "test"
        for i in range(0, SIZE, chunk_size):
            series.write(df[i : i + chunk_size])
    res = call("du -hs test-db")
    print("Disk use", res.split()[0].decode())
    call("rm -r test-db")
schema = Schema(key="int*", **{x: "float" for x in cols})
frm = {
    "key": range(SIZE),
}
for x in cols:
    frm[x] = sin(arange(SIZE))


# Simulate network lag
def lag(fn, delay):
    def wrapper(*a, **kw):
        sleep(delay)
        return fn(*a, **kw)

    return wrapper


mempod_write = MemPOD.write
for delay in (0.001, 0.01, 0.1):
    MemPOD.write = lag(mempod_write, delay)
    for threaded in (False, True):
        settings.threaded = threaded
        with timeit(f"{delay}-{threaded}"):
            repo = Repo()
            clc = repo.create_collection(schema, "clc")
            with clc.multi():
                for name in "ABC":
                    series = clc / name
                    series.write(frm)
def test_pull(threaded, large):
    c_label = "a_collection"
    s_label = "a_series"
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, c_label)
    rseries = remote_coll / s_label

    # Test support of both small datasets (where data is embedded in
    # commits) and large ones (arrays are saved on their own)
    N = 100_000 if large else 10
    for i in range(10):
        # Do 10 writes of size N on the same series
        rseries.write({
            "timestamp": range(i, i + N),
            "value": range(i + 100, i + 100 + N),
        })
    nb_items = len(remote_repo.pod.ls())
    if large:
        assert nb_items > 2
    else:
        # For small arrays we have only two folders (one for the repo
        # registry, one for the collection)
        assert nb_items == 2
    expected = rseries.frame()

    # Test pull
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = local_coll / s_label
    assert lseries.frame() == expected

    # Test push
    other_repo = Repo()
    other_coll = other_repo.create_collection(schema, c_label)
    remote_coll.push(other_coll)
    oseries = other_coll / s_label
    assert oseries.frame() == expected

    # Test with existing series
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = (
        other_repo.create_collection(schema, c_label, raise_if_exists=False)
        / s_label
    )
    assert oseries.frame() == expected

    # Test with existing series with existing data
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    lseries = local_coll / s_label
    frm = Frame(
        schema,
        {
            "timestamp": range(0, 20),
            "value": range(0, 20),
        },
    )
    lseries.write(frm)
    local_coll.pull(remote_coll)
    assert lseries.frame() == frm

    # Test with existing series with other schema
    local_repo = Repo()
    other_schema = Schema(timestamp="int*", value="int")
    local_coll = local_repo.create_collection(other_schema, c_label)
    lseries = local_coll / s_label
    with pytest.raises(ValueError):
        local_repo.pull(remote_repo)
def write_lk(df):
    schema = Schema(timestamp="timestamp*", value="float")
    repo = Repo("test-db")
    collection = repo.create_collection(schema, "test")
    series = collection / "test"
    series.write(df)