def test_groupby_windowing_value(func, value, getter, grouper, indexer):
    """Compare a time-windowed groupby on the stream with plain pandas.

    Hourly data is emitted in chunks of 13 rows followed by one empty frame;
    the first and last collected results are checked against the same
    groupby pipeline applied to the matching pandas slices.

    ``func``, ``value``, ``grouper`` and ``indexer`` are supplied by the test
    harness (presumably parametrization declared outside this block).
    ``getter`` is accepted but not used in the body.
    """
    # ``pd.DatetimeIndex(start=..., end=..., freq=...)`` is rejected by
    # pandas >= 1.0; ``pd.date_range`` builds the same hourly index.
    index = pd.date_range(start="2000-01-01", end="2000-01-03", freq="1h")
    df = pd.DataFrame(
        {
            "x": np.arange(len(index), dtype=float),
            "y": np.arange(len(index), dtype=float) % 2,
        },
        index=index,
    )
    sdf = DataFrame(example=df)

    def f(x):
        # Same pipeline is applied to the streaming and the pandas object.
        return func(indexer(x.groupby(grouper(x))))

    # The sink must be attached before anything is emitted.
    L = f(sdf.window(value)).stream.gather().sink_to_list()

    # From here on ``value`` is used for timestamp arithmetic.
    value = pd.Timedelta(value)

    diff = 13
    for i in range(0, len(index), diff):
        sdf.emit(df.iloc[i : i + diff])
    sdf.emit(df.iloc[:0])

    # Four data chunks plus the trailing empty frame.
    assert len(L) == 5

    # Rows of the first chunk at or after ``min + value`` are counted, and
    # that many leading rows are dropped from the expected first window.
    first = df.iloc[:diff]
    lost = first[first.index.min() + value :]
    first = first.iloc[len(lost) :]

    assert_eq(L[0], f(first))

    # Expected final window: rows from ``max - value + 1s`` onward.
    last = df.loc[index.max() - value + pd.Timedelta("1s") :]

    assert_eq(L[-1], f(last))
def test_exceptions(stream):
    """Emitting unsuitable objects into the streaming frame must raise.

    An integer is expected to raise ``TypeError`` and an empty
    ``pd.DataFrame()`` is expected to raise ``IndexError``.
    """
    example = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=example, stream=stream)

    bad_inputs = [
        (TypeError, 1),
        (IndexError, pd.DataFrame()),
    ]
    for expected_error, payload in bad_inputs:
        with pytest.raises(expected_error):
            sdf.emit(payload)
def test_dataframe_simple(func):
    """Apply ``func`` to a streaming frame and compare with plain pandas.

    ``func`` is supplied by the test harness; it is applied once to the
    pandas frame (expected value) and once to the streaming frame.
    """
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    expected = func(frame)

    sdf = DataFrame(example=frame)
    collected = func(sdf).stream.sink_to_list()

    sdf.emit(frame)

    assert_eq(collected[0], expected)
def test_getitem(stream):
    """Boolean-mask selection on the stream matches pandas selection.

    The frame is emitted in two halves; two results are expected and their
    concatenation must equal ``df[df.x > 4]``.
    """
    frame = pd.DataFrame({"x": list(range(10)), "y": [1] * 10})
    sdf = DataFrame(example=frame.iloc[:0], stream=stream)

    selected = sdf[sdf.x > 4]
    collected = selected.stream.gather().sink_to_list()

    for chunk in (frame.iloc[:5], frame.iloc[5:]):
        sdf.emit(chunk)

    assert len(collected) == 2
    assert_eq(pd.concat(collected, axis=0), frame[frame.x > 4])
def test_pair_arithmetic(stream):
    """Arithmetic between two streaming columns matches pandas.

    The frame is emitted in two halves; the concatenated results must equal
    ``(df.x + df.y) * 2``.
    """
    frame = pd.DataFrame({"x": list(range(10)), "y": [1] * 10})
    sdf = DataFrame(example=frame.iloc[:0], stream=stream)

    doubled_sum = (sdf.x + sdf.y) * 2
    collected = doubled_sum.stream.gather().sink_to_list()

    for chunk in (frame.iloc[:5], frame.iloc[5:]):
        sdf.emit(chunk)

    assert len(collected) == 2
    assert_eq(pd.concat(collected, axis=0), (frame.x + frame.y) * 2)
def test_index(stream):
    """Arithmetic on the streaming index matches the pandas index.

    The same frame is emitted twice and each result must equal
    ``df.index + 5``.
    """
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)

    shifted_index = sdf.index + 5
    collected = shifted_index.stream.gather().sink_to_list()

    for _ in range(2):
        sdf.emit(frame)

    assert_eq(collected[0], frame.index + 5)
    assert_eq(collected[1], frame.index + 5)
def test_binary_stream_operators(stream):
    """Adding two streaming columns yields the same result as pandas."""
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    expected = frame.x + frame.y

    sdf = DataFrame(example=frame, stream=stream)
    column_sum = sdf.x + sdf.y
    collected = column_sum.stream.gather().sink_to_list()

    sdf.emit(frame)

    assert_eq(collected[0], expected)
def test_tail(stream):
    """``tail(2)`` on the stream matches ``df.tail(2)`` for each emit."""
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)

    collected = sdf.tail(2).stream.gather().sink_to_list()

    for _ in range(2):
        sdf.emit(frame)

    assert_eq(collected[0], frame.tail(2))
    assert_eq(collected[1], frame.tail(2))
def test_unary_operators(op, getter):
    """A unary operator gives the same result on the stream as on pandas.

    ``op`` and ``getter`` are supplied by the test harness. If the operator
    raises on the plain pandas object, the case returns early without
    asserting anything.
    """
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})

    try:
        expected = op(getter(frame))
    except Exception:
        # The combination fails on pandas itself, so there is nothing to
        # compare against.
        return

    sdf = DataFrame(example=frame)
    collected = op(getter(sdf)).stream.sink_to_list()

    sdf.emit(frame)

    assert_eq(collected[0], expected)
def test_windowing_n(func, n, getter):
    """Count-based windows on the stream match the equivalent pandas slices.

    Ten rows are emitted in chunks of three followed by one empty frame.
    The first result is compared with the last ``n`` rows of the first chunk
    (at most three), the final result with the last ``n`` rows overall.
    ``func``, ``n`` and ``getter`` are supplied by the test harness.
    """
    frame = pd.DataFrame({"x": list(range(10)), "y": [1, 2] * 5})
    sdf = DataFrame(example=frame)

    windowed = getter(sdf).window(n=n) + 10
    collected = func(windowed).stream.gather().sink_to_list()

    chunk = 3
    for start in range(0, 10, chunk):
        sdf.emit(frame.iloc[start : start + chunk])
    sdf.emit(frame.iloc[:0])

    # Four data chunks plus the trailing empty frame.
    assert len(collected) == 5

    first_window = getter(frame).iloc[max(0, 3 - n) : 3] + 10
    assert_eq(collected[0], func(first_window))

    last_window = getter(frame).iloc[len(frame) - n :] + 10
    assert_eq(collected[-1], func(last_window))
def test_custom_aggregation():
    """A user-defined ``Aggregation`` works with and without a window.

    Without a window, three emits are expected to yield ``[0, 1, 2]``; with
    ``window(n=5)`` they are expected to yield ``[1, -198, -397]``.
    """
    frame = pd.DataFrame(
        {"x": np.arange(10, dtype=float), "y": [1.0, 2.0] * 5}
    )

    class Custom(Aggregation):
        """Counter: +1 per ``on_new`` call, -100 per ``on_old`` call."""

        def initial(self, new):
            return 0

        def on_new(self, state, new):
            return state + 1, state

        def on_old(self, state, new):
            return state - 100, state

    # Plain aggregation over the whole stream.
    sdf = DataFrame(example=frame)
    collected = sdf.aggregate(Custom()).stream.sink_to_list()
    for _ in range(3):
        sdf.emit(frame)
    assert collected == [0, 1, 2]

    # Same aggregation over a five-row window.
    sdf = DataFrame(example=frame)
    collected = sdf.window(n=5).aggregate(Custom()).stream.sink_to_list()
    for _ in range(3):
        sdf.emit(frame)
    assert collected == [1, -198, -397]
def test_reductions(stream, func):
    """Reductions on the stream match pandas over all emitted data.

    The check runs twice, once with the full frame as example and once with
    an empty slice. After emitting the frame twice, the latest frame-level
    and column-level results must equal ``func`` applied to the two frames
    concatenated. ``func`` is supplied by the test harness.
    """
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})

    for example in (frame, frame.iloc[:0]):
        sdf = DataFrame(example=example, stream=stream)

        frame_results = func(sdf).stream.gather().sink_to_list()

        column = sdf.x
        column_results = func(column).stream.gather().sink_to_list()

        for _ in range(2):
            sdf.emit(frame)

        assert_eq(frame_results[-1], func(pd.concat([frame, frame])))
        assert_eq(column_results[-1], func(pd.concat([frame, frame]).x))
def test_cumulative_aggregations(op, getter, stream):
    """Cumulative methods on the stream concatenate to the pandas result.

    ``op`` names the method to call and ``getter`` selects the object it is
    called on; both are supplied by the test harness. Ten rows are emitted
    in chunks of three followed by one empty frame.
    """
    frame = pd.DataFrame({"x": list(range(10)), "y": [1] * 10})
    pandas_method = getattr(getter(frame), op)
    expected = pandas_method()

    sdf = DataFrame(example=frame, stream=stream)
    stream_method = getattr(getter(sdf), op)
    collected = stream_method().stream.gather().sink_to_list()

    chunk = 3
    for start in range(0, 10, chunk):
        sdf.emit(frame.iloc[start : start + chunk])
    sdf.emit(frame.iloc[:0])

    assert len(collected) > 1

    assert_eq(pd.concat(collected), expected)
def test_binary_operators(op, getter, stream):
    """A binary operator with the scalar 2 matches pandas on both sides.

    ``op`` and ``getter`` are supplied by the test harness. If either
    operand order raises on the plain pandas object, the case returns early
    without asserting anything.
    """
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})

    try:
        expected_left = op(getter(frame), 2)
        expected_right = op(2, getter(frame))
    except Exception:
        # The combination fails on pandas itself, so there is nothing to
        # compare against.
        return

    sdf = DataFrame(example=frame, stream=stream)
    left_results = op(getter(sdf), 2).stream.gather().sink_to_list()
    right_results = op(2, getter(sdf)).stream.gather().sink_to_list()

    sdf.emit(frame)

    assert_eq(left_results[0], expected_left)
    assert_eq(right_results[0], expected_right)
def test_identity(stream):
    """The streaming frame passes data through and exposes its columns.

    Checks that the emitted frame arrives unchanged (same object), that the
    example's columns are ``["x", "y"]``, and that attribute access yields a
    ``Series`` whose stream delivers ``pd.Series`` values equal to ``df.x``.
    """
    df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    L = sdf.stream.gather().sink_to_list()

    sdf.emit(df)

    # Identity, not just equality: the very same object must come through.
    assert L[0] is df
    assert list(sdf.example.columns) == ["x", "y"]

    x = sdf.x
    assert isinstance(x, Series)
    L2 = x.stream.gather().sink_to_list()
    # The sink was attached after the first emit, so it starts empty.
    assert not L2

    sdf.emit(df)
    assert isinstance(L2[0], pd.Series)
    # Called bare, like every other ``assert_eq`` in these tests, so the
    # check does not depend on the helper's return value.
    assert_eq(L2[0], df.x)
def test_rolling_count_aggregations(
    op, window, m, pre_get, post_get, kwargs, stream
):
    """Rolling aggregations on the stream concatenate to the pandas result.

    The pipeline ``post_get(pre_get(obj).rolling(window))`` followed by the
    method named ``op`` (called with ``**kwargs``) is built once on a pandas
    frame and once on the streaming frame. The data is emitted in chunks of
    ``m`` rows. All arguments are supplied by the test harness.
    """
    # ``pd.DatetimeIndex(start=..., end=..., freq=...)`` is rejected by
    # pandas >= 1.0; ``pd.date_range`` builds the same hourly index.
    index = pd.date_range(start="2000-01-01", end="2000-01-03", freq="1h")
    df = pd.DataFrame({"x": np.arange(len(index))}, index=index)

    expected = getattr(post_get(pre_get(df).rolling(window)), op)(**kwargs)

    sdf = DataFrame(example=df.iloc[:0], stream=stream)
    roll = getattr(post_get(pre_get(sdf).rolling(window)), op)(**kwargs)
    L = roll.stream.gather().sink_to_list()
    # Nothing has been emitted yet, so the sink must be empty.
    assert len(L) == 0

    for i in range(0, len(df), m):
        sdf.emit(df.iloc[i : i + m])

    assert len(L) > 1

    assert_eq(pd.concat(L), expected)
def test_instantiate_with_dict(stream):
    """A streaming frame can be built from a dict of streaming columns.

    The derived frame must report three columns and, for each emit of the
    source frame, produce a frame equal to the same columns built in pandas.
    """
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)

    derived = DataFrame({"a": sdf.x, "b": sdf.x * 2, "c": sdf.y % 2})
    collected = derived.stream.gather().sink_to_list()
    assert len(derived.columns) == 3

    for _ in range(2):
        sdf.emit(frame)

    assert len(collected) == 2
    for result in collected:
        expected = pd.DataFrame(
            {"a": frame.x, "b": frame.x * 2, "c": frame.y % 2},
            columns=["a", "b", "c"],
        )
        # Select the columns explicitly so column order cannot differ.
        assert_eq(result[["a", "b", "c"]], expected)
def test_groupby_windowing_n(func, n, getter, grouper, indexer):
    """Compare a count-windowed groupby on the stream with plain pandas.

    Ten rows are emitted in chunks of three followed by one empty frame; the
    first and last collected results are checked against the same groupby
    pipeline applied to the matching pandas slices.

    ``func``, ``n``, ``grouper`` and ``indexer`` are supplied by the test
    harness. ``getter`` is accepted but not used in the body.
    """
    frame = pd.DataFrame(
        {"x": np.arange(10, dtype=float), "y": [1.0, 2.0] * 5}
    )
    sdf = DataFrame(example=frame)

    def pipeline(obj):
        # Same pipeline is applied to the streaming and the pandas object.
        grouped = obj.groupby(grouper(obj))
        return func(indexer(grouped))

    collected = pipeline(sdf.window(n=n)).stream.gather().sink_to_list()

    chunk = 3
    for start in range(0, 10, chunk):
        sdf.emit(frame.iloc[start : start + chunk])
    sdf.emit(frame.iloc[:0])

    # Four data chunks plus the trailing empty frame.
    assert len(collected) == 5

    first_window = frame.iloc[max(0, chunk - n) : chunk]
    assert_eq(collected[0], pipeline(first_window))

    last_window = frame.iloc[len(frame) - n :]
    assert_eq(collected[-1], pipeline(last_window))
def test_window_sum(stream):
    """Column sum over a four-row window after each of three emits.

    The collected list is expected to grow as ``[6]``, ``[6, 9]``,
    ``[6, 9, 9]``.
    """
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)
    collected = sdf.window(n=4).x.sum().stream.gather().sink_to_list()

    expected = [6, 9, 9]
    for emitted in range(1, len(expected) + 1):
        sdf.emit(frame)
        assert collected == expected[:emitted]
def test_window_full():
    """An identity ``apply`` over a four-row window exposes its contents.

    After emitting rows 0-2, 3-7 and 8-9, the three results are expected to
    equal ``df.iloc[:3]``, ``df.iloc[4:8]`` and ``df.iloc[-4:]``.
    """
    frame = pd.DataFrame(
        {"x": np.arange(10, dtype=float), "y": [1.0, 2.0] * 5}
    )
    sdf = DataFrame(example=frame)

    windowed = sdf.window(n=4)
    collected = windowed.apply(lambda x: x).stream.sink_to_list()

    for chunk in (frame.iloc[:3], frame.iloc[3:8], frame.iloc[8:]):
        sdf.emit(chunk)

    assert_eq(collected[0], frame.iloc[:3])
    assert_eq(collected[1], frame.iloc[4:8])
    assert_eq(collected[2], frame.iloc[-4:])
def test_window_sum_dataframe(stream):
    """Frame-level sum over a four-row window after each of three emits.

    The first result is expected to be ``x=6, y=15`` and every later result
    ``x=9, y=21``; all results collected so far are re-checked after each
    emit.
    """
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)
    collected = sdf.window(n=4).sum().stream.gather().sink_to_list()

    def totals(x_sum, y_sum):
        # Build a fresh expected Series for every comparison.
        return pd.Series([x_sum, y_sum], index=["x", "y"])

    expected_sums = [(6, 15), (9, 21), (9, 21)]
    for emitted in range(1, len(expected_sums) + 1):
        sdf.emit(frame)
        for position in range(emitted):
            assert_eq(collected[position], totals(*expected_sums[position]))
def test_set_index():
    """``set_index`` on the stream matches pandas for three call forms.

    Covers a column name, a column name with ``drop=True``, and a derived
    column (``y + 1``) with ``drop=True``.
    """
    frame = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=frame)

    # Index by column name.
    collected = sdf.set_index("x").stream.sink_to_list()
    sdf.emit(frame)
    assert_eq(collected[0], frame.set_index("x"))

    # Index by column name, explicitly dropping the column.
    collected = sdf.set_index("x", drop=True).stream.sink_to_list()
    sdf.emit(frame)
    assert_eq(collected[0], frame.set_index("x", drop=True))

    # Index by a derived streaming column.
    collected = sdf.set_index(sdf.y + 1, drop=True).stream.sink_to_list()
    sdf.emit(frame)
    assert_eq(collected[0], frame.set_index(frame.y + 1, drop=True))
def test_groupby_aggregate(agg, grouper, indexer, stream):
    """Groupby aggregations on the stream match pandas on the data so far.

    The frame is emitted in three chunks (rows 0-2, 3-6, 7-9). The first
    result must equal the pipeline applied to the first chunk and the last
    result must equal the pipeline applied to the whole frame.

    ``agg``, ``grouper`` and ``indexer`` are supplied by the test harness.
    """
    df = pd.DataFrame(
        {"x": (np.arange(10) // 2).astype(float), "y": [1.0, 2.0] * 5}
    )

    a = DataFrame(example=df.iloc[:0], stream=stream)

    def f(x):
        # Same pipeline is applied to the streaming and the pandas object.
        return agg(indexer(x.groupby(grouper(x))))

    L = f(a).stream.gather().sink_to_list()

    a.emit(df.iloc[:3])
    a.emit(df.iloc[3:7])
    a.emit(df.iloc[7:])

    first = df.iloc[:3]
    # Called bare, like every other ``assert_eq`` in these tests, so the
    # checks do not depend on the helper's return value.
    assert_eq(L[0], f(first))
    assert_eq(L[-1], f(df))