def test_frame_record():
    """Frame.records() yields per-row dicts; check all three map_dtype modes."""
    schema = Schema(timestamp="timestamp*", date="date", float_val="float", int_val="int")
    values = {
        "timestamp": [1589455901, 1589455902, 1589455903, 1589455904],
        "date": [1, 2, 3, 4],
        "float_val": [1, 2, 3, 4],
        "int_val": [1, 2, 3, 4],
    }
    frm = Frame(schema, values)

    # (map_dtype, expected first record, expected last record)
    cases = [
        (
            "default",  # native Python datetime/date objects
            {
                "timestamp": datetime(2020, 5, 14, 11, 31, 41),
                "date": date(1970, 1, 2),
                "float_val": 1.0,
                "int_val": 1,
            },
            {
                "timestamp": datetime(2020, 5, 14, 11, 31, 44),
                "date": date(1970, 1, 5),
                "float_val": 4.0,
                "int_val": 4,
            },
        ),
        (
            None,  # raw numpy scalars
            {
                "timestamp": datetime64("2020-05-14T11:31:41"),
                "date": datetime64("1970-01-02"),
                "float_val": 1.0,
                "int_val": 1,
            },
            {
                "timestamp": datetime64("2020-05-14T11:31:44"),
                "date": datetime64("1970-01-05"),
                "float_val": 4.0,
                "int_val": 4,
            },
        ),
        (
            "epoch",  # integer seconds since the epoch
            {
                "timestamp": 1589455901,
                "date": 86400,
                "float_val": 1.0,
                "int_val": 1,
            },
            {
                "timestamp": 1589455904,
                "date": 345600,
                "float_val": 4.0,
                "int_val": 4,
            },
        ),
    ]
    for map_dtype, first, last in cases:
        records = list(frm.records(map_dtype=map_dtype))
        assert len(records) == len(frm)
        assert records[0] == first
        assert records[-1] == last
def test_df_conversion():
    """A pandas DataFrame must survive a round-trip through a lakota Frame."""
    df = DataFrame({
        "category": NAMES,
        "value": VALUES,
    })
    # Convert to lakota frame and back to df
    frm = Frame(base_schema, df)
    for column in frm:
        original = df[column]
        converted = frm.df()[column]
        assert all(converted == original)
def test_spill_write(series, how):
    """Write a frame that spills past the seeded data, then read it back
    through every supported combination of slice bounds and closed modes."""
    if how == "left":
        ts = [1589455902, 1589455903, 1589455904, 1589455905]
        vals = [22, 33, 44, 55]
    else:
        ts = [1589455903, 1589455904, 1589455905, 1589455906]
        vals = [33, 44, 55, 66]
    frm = Frame(
        schema,
        {
            "timestamp": ts,
            "value": vals,
        },
    )
    series.write(frm)

    lo, hi = min(ts), max(ts)
    # Every window below must return the full written frame
    full_windows = [
        (None, None, "b"),       # closed is both (default)
        (lo, hi, "b"),
        (None, hi, "b"),
        (lo, None, "b"),
        (lo - 1, hi, "r"),       # open on left
        (lo, hi + 1, "l"),       # open on right
        (lo - 1, hi + 1, "n"),   # full open
    ]
    for start, stop, closed in full_windows:
        assert series.frame(start=start, stop=stop, closed=closed) == frm

    # Every window below must return only the two middle rows
    expected = Frame(
        schema,
        {
            "timestamp": [1589455903, 1589455904],
            "value": [33, 44],
        },
    )
    partial_windows = [
        (1589455903, 1589455904, "b"),   # closed is both (default)
        (1589455902, 1589455904, "r"),   # open on left
        (1589455903, 1589455905, "l"),   # open on right
        (1589455902, 1589455905, "n"),   # open on both
    ]
    for start, stop, closed in partial_windows:
        assert series.frame(start=start, stop=stop, closed=closed) == expected
def test_alias():
    """`(as expr 'name')` evaluates expr and attaches the alias to the result."""
    parsed = AST.parse("(as (asarray (list 1 2 3)) 'new_name')")
    result = parsed.eval()
    assert all(result.value == asarray([1, 2, 3]))
    assert result.name == "new_name"

    # Through Frame.reduce, the alias becomes the output column name
    aliased = Frame(schema, values).reduce("(as self.timestamp 'ts')")
    assert all(aliased["ts"] == asarray(values["timestamp"], "M"))
def test_concat(frm):
    """Concat of a frame with itself doubles every column (kept sorted);
    a single-frame concat is identity and an empty concat yields None."""
    doubled = Frame.concat(frm, frm)
    for name in frm:
        original = list(frm[name])
        assert list(doubled[name]) == sorted(original * 2)
    assert Frame.concat(frm) == frm
    assert Frame.concat() is None
def test_mask():
    """Frame.mask filters rows by a boolean array or by an s-expression."""
    # with an array
    schema = Schema(x="int*")
    frm = Frame(schema, {"x": [1, 2, 3, 4, 5, 5, 5, 6]})
    frm2 = frm.mask(array([True, False] * 4))
    assert all(frm2["x"] == [1, 3, 5, 5])
    # with an expression — keep rows where x is even.
    # Fixed: the expression was missing its closing parenthesis
    # (was "(= (% self.x 2) 0").
    frm2 = frm.mask("(= (% self.x 2) 0)")
    assert all(frm2["x"] == [2, 4, 6])
def test_reduce_agg():
    """Every supported aggregate is checked twice: grouped by category,
    then binned by day (quantile is skipped — not available with binning)."""
    schema = Schema(timestamp="timestamp*", category="str*", value="int")
    values = {
        "timestamp": [1589455901, 1589455901, 1589455902, 1589455902],
        "category": list("abab"),
        "value": [1, 2, 3, 4],
    }
    frm = Frame(schema, values)

    # Expected "value" column when grouping by category (groups a, b)
    by_category = {
        "min": [1, 2],
        "max": [3, 4],
        "sum": [4, 6],
        "mean": [2, 3],
        "average": [2, 3],
        "first": [1, 2],
        "last": [3, 4],
        "count": [2, 2],
        "len": [2, 2],
    }
    for op in AST.aggregates:
        if op == "quantile":
            # quantile not avail with binning
            continue
        new_frm = frm.reduce(category="category", value=f"({op} self.value)")
        expected = by_category.get(op)
        if expected is None:
            raise ValueError(f'op "{op}" not tested')
        assert list(new_frm["value"]) == expected

    # Expected "value" column when binning all rows into a single day
    by_day = {
        "min": [1],
        "max": [4],
        "sum": [10],
        "mean": [2.5],
        "average": [2.5],
        "first": [1],
        "last": [4],
        "count": [4],
        "len": [4],
    }
    for op in AST.aggregates:
        if op == "quantile":
            # quantile not avail with binning
            continue
        new_frm = frm.reduce(timestamp='(floor self.timestamp "D")', value=f"({op} self.value)")
        expected = by_day.get(op)
        if expected is None:
            raise ValueError(f'op "{op}" not tested')
        assert list(new_frm["value"]) == expected
def test_getitem():
    """Frame.__getitem__ supports slicing and boolean-mask selection."""
    schema = Schema(x="int*")
    frm = Frame(schema, {"x": [1, 2, 3, 4, 5, 5, 5, 6]})
    # with a slice
    sliced = frm[5:]
    assert all(sliced["x"] == [5, 5, 5, 6])
    # with a mask
    masked = frm[array([True, False] * 4)]
    assert all(masked["x"] == [1, 3, 5, 5])
def test_with_frame():
    """Evaluate floor over a frame column at year and hour granularity."""
    env = {"frm": Frame(schema, values), "floor": floor}

    yearly = AST.parse("(floor frm.timestamp 'Y')").eval(env)
    assert all(yearly == asarray(["2020", "2020", "2020"], dtype="datetime64[Y]"))

    hourly = AST.parse("(floor frm.timestamp 'h')").eval(env)
    expected = asarray(["2020-01-01T11", "2020-01-02T12", "2020-01-03T13"], dtype="datetime64")
    assert all(hourly == expected)
def test_adjacent_write(series, how):
    """Write rows adjacent to (just before or just after) the seeded data
    and verify full reads and boundary slices merge correctly.

    NOTE(review): assumes the `series` fixture is pre-seeded with timestamps
    1589455903..1589455905 / values 3.3, 4.4, 5.5 — confirm against fixture.
    """
    if how == "left":
        # Two timestamps immediately before the seeded range
        ts = [1589455901, 1589455902]
        vals = [1.1, 2.2]
    else:
        # Two timestamps immediately after the seeded range
        ts = [1589455906, 1589455907]
        vals = [6.6, 7.7]
    # We do two write of one arrays (should trigger more corner cases)
    for pos, stamp in enumerate(ts):
        frm = Frame(
            schema,
            {
                "timestamp": [stamp],
                "value": [vals[pos]],
            },
        )
        series.write(frm)

    # Full read: new rows must be merged in order with the seeded rows
    frm_copy = series.frame()
    if how == "left":
        assert all(
            frm_copy["timestamp"]
            == [1589455901, 1589455902, 1589455903, 1589455904, 1589455905]
        )
        assert all(frm_copy["value"] == [1.1, 2.2, 3.3, 4.4, 5.5])
    else:
        assert all(
            frm_copy["timestamp"]
            == [1589455903, 1589455904, 1589455905, 1589455906, 1589455907]
        )
        assert all(frm_copy["value"] == [3.3, 4.4, 5.5, 6.6, 7.7])

    # Slice read - left slice (closed on both ends)
    frm_copy = series[1589455902:1589455903].frame(closed="b")
    if how == "left":
        assert all(frm_copy["timestamp"] == [1589455902, 1589455903])
        assert all(frm_copy["value"] == [2.2, 3.3])
    else:
        # 1589455902 was never written in this branch, only 1589455903 matches
        assert all(frm_copy["timestamp"] == [1589455903])
        assert all(frm_copy["value"] == [3.3])

    # Slice read - right slice (closed on both ends)
    frm_copy = series[1589455905:1589455906].frame(closed="b")
    if how == "left":
        # 1589455906 was never written in this branch, only 1589455905 matches
        assert all(frm_copy["timestamp"] == [1589455905])
        assert all(frm_copy["value"] == [5.5])
    else:
        assert all(frm_copy["timestamp"] == [1589455905, 1589455906])
        assert all(frm_copy["value"] == [5.5, 6.6])
def test_reduce_without_agg():
    """Without an aggregate, reduce acts as a per-column mapping."""
    schema = Schema(timestamp="timestamp*", category="str*", value="int")
    values = {
        "timestamp": [1589455901, 1589455901, 1589455902, 1589455902],
        "category": list("abab"),
        "value": [1, 2, 3, 4],
    }
    frm = Frame(schema, values)

    # Identity mapping: no changes to any column
    assert frm == frm.reduce(timestamp="timestamp", category="category", value="value")

    # Mapping on one column
    assert list(frm.reduce(value="(% self.value 2)")["value"]) == [1, 0, 1, 0]

    # Mapping over two columns
    combined = frm.reduce(new_col="(+ self.value self.timestamp)")
    assert all(combined["new_col"] == frm["timestamp"] + frm["value"])
def test_sort():
    """is_sorted()/sorted() on a single-index and a multi-index schema."""
    category = ["b", "a", "c"]
    value = [2, 1, 3]

    # One index column: rows get reordered by category
    frm = Frame(
        base_schema,
        {
            "category": category,
            "value": value,
        },
    )
    assert frm.is_sorted() == False
    frm = frm.sorted()
    assert all(frm["category"] == sorted(category))
    assert all(frm["value"] == sorted(value))
    assert frm.is_sorted() == True

    # multi-index: order by timestamp first, then category
    timestamp = ["2020-01-02", "2020-01-03", "2020-01-02"]
    frm = Frame(
        multi_idx_schema,
        {
            "timestamp": timestamp,
            "category": category,
            "value": value,
        },
    )
    assert frm.is_sorted() == False
    # Re-derive the expected lexicographic (timestamp, category) order
    timestamp, category = zip(*sorted(zip(timestamp, category)))
    frm = frm.sorted()
    assert all(frm["timestamp"] == asarray(timestamp, "M"))
    assert all(frm["category"] == category)
    assert all(frm["value"] == [2, 3, 1])
    assert frm.is_sorted() == True
def insert(args):
    """Write one year of per-minute random values into a series.

    `args` is a (pod_token, series_label, year) triple; returns the number
    of rows written.
    """
    token, label, year = args
    repo = Repo(pod=POD.from_token(token))
    series = repo / "my_collection" / label
    # One timestamp per minute over the year, right bound excluded.
    # NOTE(review): `closed=` is deprecated in recent pandas in favour of
    # `inclusive=` — kept as-is to preserve behavior on the pinned version.
    ts = date_range(f"{year}-01-01", f"{year+1}-01-01", freq="1min", closed="left")
    df = DataFrame({
        "timestamp": ts,
        "value": numpy.round(numpy.random.random(len(ts)) * 1000, decimals=0),
    })
    sgm = Frame(schema, df)
    series.write(sgm)
    return len(sgm)
def test_short_cover(series, how):
    """A short write partially covering the seeded data overwrites only
    the overlapping rows."""
    if how == "left":
        ts, vals = [1589455904, 1589455905], [44, 55]
    else:
        ts, vals = [1589455903, 1589455904], [33, 44]
    series.write(Frame(schema, {"timestamp": ts, "value": vals}))

    frm_copy = series.frame()
    assert all(frm_copy["timestamp"] == [1589455903, 1589455904, 1589455905])
    # Rows outside the written window keep their seeded float values
    expected_vals = [3.3, 44, 55] if how == "left" else [33, 44, 5.5]
    assert all(frm_copy["value"] == expected_vals)
def test_fragmented_write(series, direction, sgm_size):
    """Write overlapping segments of size sgm_size in forward, backward
    or random order; the final read must be the full sorted series."""
    ts = [1589455901, 1589455902, 1589455903, 1589455904, 1589455905, 1589455906]
    vals = [11, 22, 33, 44, 55, 66]
    positions = list(range(len(ts)))
    if direction == "bwd":
        positions.reverse()
    elif direction != "fwd":
        shuffle(positions)
    for pos in positions:
        chunk = Frame(
            schema,
            {
                "timestamp": ts[pos : pos + sgm_size],
                "value": vals[pos : pos + sgm_size],
            },
        )
        series.write(chunk)

    frm = series.frame()
    assert all(frm["timestamp"] == ts)
    assert all(frm["value"] == vals)
def frm(frame_values):
    """Fixture: a Frame built on base_schema from frame_values."""
    return Frame(base_schema, frame_values)
def test_pull(threaded, large):
    """Exercise pull/push between repos: fresh target, existing series,
    pre-populated series, and a schema-mismatch error.

    NOTE(review): `threaded` is not used in this body — presumably a
    fixture that parametrizes the execution mode; confirm.
    """
    c_label = "a_collection"
    s_label = "a_series"
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, c_label)
    rseries = remote_coll / s_label
    # Test support of both small dataset (where data is embedded in
    # commits) and large one (arrays are save on their own)
    N = 100_000 if large else 10
    for i in range(10):
        # Create 10 series of size N
        rseries.write({
            "timestamp": range(i, i + N),
            "value": range(i + 100, i + 100 + N),
        })
    nb_items = len(remote_repo.pod.ls())
    if large:
        assert nb_items > 2
    else:
        # for small arrays we have only two folder (one for the repo
        # registry one for the collection)
        assert nb_items == 2
    expected = rseries.frame()

    # Test pull
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = local_coll / s_label
    assert lseries.frame() == expected

    # Test push
    other_repo = Repo()
    other_coll = other_repo.create_collection(schema, c_label)
    remote_coll.push(other_coll)
    oseries = other_coll / s_label
    assert oseries.frame() == expected

    # Test with existing series
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    # NOTE(review): `lseries` is re-bound here but the assertion below
    # re-checks `oseries` — probably meant `lseries.frame()`; confirm intent.
    lseries = (
        other_repo.create_collection(schema, c_label, raise_if_exists=False)
        / s_label)
    assert oseries.frame() == expected

    # Test with existing series with existing data
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    lseries = local_coll / s_label
    frm = Frame(
        schema,
        {
            "timestamp": range(0, 20),
            "value": range(0, 20),
        },
    )
    lseries.write(frm)
    local_coll.pull(remote_coll)
    # Per this assertion, pull does not clobber pre-existing local data
    assert lseries.frame() == frm

    # Test with existing series with other schema
    local_repo = Repo()
    other_schema = Schema(timestamp="int*", value="int")
    local_coll = local_repo.create_collection(other_schema, c_label)
    lseries = local_coll / s_label
    # Pulling between repos whose collections disagree on schema must fail
    with pytest.raises(ValueError):
        local_repo.pull(remote_repo)
def test_index_slice():
    """index_slice + slice honour the `closed` mode on both boundaries."""
    schema = Schema(x="int*")
    frm = Frame(schema, {"x": [1, 2, 3, 4, 5, 5, 5, 6]})

    def window(start, stop, closed):
        # Resolve the index window, then materialize the x column
        return frm.slice(*frm.index_slice([start], [stop], closed=closed))["x"]

    # include both side
    assert all(window(2, 4, "b") == [2, 3, 4])
    # include only left
    assert all(window(2, 4, "l") == [2, 3])
    # include only right
    assert all(window(2, 4, "r") == [3, 4])
    # repeated values and single-value boundaries
    assert all(window(5, 5, "b") == [5, 5, 5])
    assert all(window(1, 1, "b") == [1])
    assert all(window(6, 6, "b") == [6])