def test_custom_aggregation():
    """Exercise a user-defined ``Aggregation`` with and without a window.

    The custom aggregation starts from state 0, adds 1 to the state for each
    new batch and subtracts 100 for each batch of expired rows; in both
    callbacks the second element of the returned pair is the state as it was
    before the update.  The same ten-row frame is emitted three times into
    each pipeline and the collected outputs are compared with pinned values.
    """
    frame = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5})

    class Custom(Aggregation):
        def initial(self, new):
            return 0

        def on_new(self, state, new):
            return state + 1, state

        def on_old(self, state, new):
            return state - 100, state

    # Un-windowed aggregation.
    plain = DataFrame(example=frame)
    plain_results = plain.aggregate(Custom()).stream.sink_to_list()
    for _ in range(3):
        plain.emit(frame)
    assert plain_results == [0, 1, 2]

    # Same aggregation applied over a five-row window.
    windowed = DataFrame(example=frame)
    windowed_results = (
        windowed.window(n=5).aggregate(Custom()).stream.sink_to_list()
    )
    for _ in range(3):
        windowed.emit(frame)
    assert windowed_results == [1, -198, -397]
def test_groupby_windowing_value(func, value, getter, grouper, indexer):
    """Compare a time-windowed streaming groupby with the in-memory result.

    ``func``, ``grouper`` and ``indexer`` are callables supplied by the test
    parametrization: ``grouper`` picks the grouping key from a frame,
    ``indexer`` selects from the groupby object and ``func`` reduces it.
    ``value`` is the window length (anything ``pd.Timedelta`` accepts).
    ``getter`` is accepted for signature compatibility and is not used here.
    """
    stamps = pd.date_range(start='2000-01-01', end='2000-01-03', freq='1h')
    data = {
        'x': np.arange(len(stamps), dtype=float),
        'y': np.arange(len(stamps), dtype=float) % 2,
    }
    df = pd.DataFrame(data, index=stamps)
    sdf = DataFrame(example=df)

    def grouped_reduce(frame):
        groups = frame.groupby(grouper(frame))
        return func(indexer(groups))

    results = grouped_reduce(sdf.window(value)).stream.gather().sink_to_list()

    span = pd.Timedelta(value)
    one_ns = pd.Timedelta('1ns')
    step = 13

    # Feed the frame in chunks of ``step`` rows, then one empty frame.
    for begin in range(0, len(stamps), step):
        sdf.emit(df.iloc[begin:begin + step])
    sdf.emit(df.iloc[:0])

    assert len(results) == 5

    # First output: only the tail of the first chunk that lies within ``span``.
    head = df.iloc[:step]
    head = head[head.index.max() - span + one_ns:]
    assert_eq(results[0], grouped_reduce(head))

    # Last output: the tail of the whole frame that lies within ``span``.
    tail = df.loc[stamps.max() - span + one_ns:]
    assert_eq(results[-1], grouped_reduce(tail))
def test_groupby_windowing_n(func, n, getter, grouper, indexer):
    """Compare a row-count-windowed streaming groupby with the direct result.

    ``func``, ``grouper`` and ``indexer`` are callables supplied by the test
    parametrization; ``n`` is the window size in rows.  ``getter`` is accepted
    for signature compatibility and is not used here.
    """
    df = cudf.DataFrame({"x": np.arange(10, dtype=float), "y": [1.0, 2.0] * 5})
    sdf = DataFrame(example=df)

    def grouped_reduce(frame):
        groups = frame.groupby(grouper(frame))
        return func(indexer(groups))

    results = grouped_reduce(sdf.window(n=n)).stream.gather().sink_to_list()

    step = 3
    # Feed the ten rows in chunks of ``step``, then one empty frame.
    for begin in range(0, 10, step):
        sdf.emit(df.iloc[begin:begin + step])
    sdf.emit(df.iloc[:0])

    assert len(results) == 5

    # First output: at most the last ``n`` rows of the first chunk.
    head = df.iloc[max(0, step - n):step]
    assert_eq(results[0], grouped_reduce(head))

    # Last output: the final ``n`` rows of the full frame.
    tail = df.iloc[len(df) - n:]
    assert_eq(results[-1], grouped_reduce(tail))
def test_groupby_windowing_value(func, value, getter, grouper, indexer):
    """Compare a time-windowed streaming groupby on cudf with the direct result.

    ``func``, ``grouper`` and ``indexer`` are callables supplied by the test
    parametrization: ``grouper`` picks the grouping key from a frame,
    ``indexer`` selects from the groupby object and ``func`` reduces it.
    ``value`` is the window length (anything ``pd.Timedelta`` accepts).
    ``getter`` is accepted for signature compatibility and is not used here.
    """
    # ``pd.DatetimeIndex(start=..., end=..., freq=...)`` was removed in
    # pandas 1.0; ``pd.date_range`` is the supported way to build the index.
    index = pd.date_range(start="2000-01-01", end="2000-01-03", freq="1h")
    df = cudf.DataFrame(
        {
            "x": np.arange(len(index), dtype=float),
            "y": np.arange(len(index), dtype=float) % 2,
        },
        index=index,
    )
    value = pd.Timedelta(value)
    sdf = DataFrame(example=df)

    def f(x):
        return func(indexer(x.groupby(grouper(x))))

    L = f(sdf.window(value)).stream.gather().sink_to_list()

    # Feed the frame in chunks of ``diff`` rows.
    diff = 13
    for i in range(0, len(index), diff):
        sdf.emit(df.iloc[i:i + diff])

    assert len(L) == 4

    # Expected first output: the first chunk with ``len(lost)`` leading rows
    # dropped, where ``lost`` is the part of the chunk at or after
    # ``min + value``.
    first = df.iloc[:diff]
    lost = first.loc[first.index.min() + value:]
    first = first.iloc[len(lost):]
    g = f(first)
    assert_eq(L[0], g)

    # Expected last output: the tail of the whole frame within ``value``.
    last = df.loc[index.max() - value + pd.Timedelta("1s"):]
    h = f(last)
    assert_eq(L[-1], h)
def test_window_aggs_with_start_state(stream):
    """Check that a windowed sum can be resumed from a previously saved state.

    With ``with_state=True`` each collected output is indexed as a pair: the
    first element is fed back as ``start`` for a second pipeline, the second
    element is the aggregated value that gets asserted.
    """
    columns = {'name': [], 'amount': []}

    # First pipeline: start from scratch and push two batches.
    first_sdf = DataFrame(stream, example=pd.DataFrame(columns))
    first_window = first_sdf.window(2, with_state=True, start=None)
    output0 = first_window.amount.sum().stream.gather().sink_to_list()

    stream.emit(
        pd.DataFrame({'name': ['Alice', 'Tom', 'Linda'],
                      'amount': [50, 100, 200]})
    )
    stream.emit(pd.DataFrame({'name': ['Bob'], 'amount': [250]}))
    assert output0[-1][1] == 450

    # Second pipeline: a fresh stream seeded with the last saved state.
    resumed_stream = Stream()
    resumed_sdf = DataFrame(resumed_stream, example=pd.DataFrame(columns))
    resumed_window = resumed_sdf.window(
        2, with_state=True, start=output0[-1][0]
    )
    output1 = resumed_window.amount.sum().stream.gather().sink_to_list()

    resumed_stream.emit(pd.DataFrame({'name': ['Alice'], 'amount': [50]}))
    assert output1[-1][1] == 300
def test_windowed_groupby_aggs_with_start_state(stream):
    """Check that a windowed groupby-sum on cudf resumes from a saved state.

    With ``with_state=True`` each collected output is indexed as a pair: the
    first element is fed back as ``start`` for a second pipeline, the second
    element is the aggregated result compared against the expected frame.
    """
    # First pipeline: start from scratch and push two batches.
    first_sdf = DataFrame(
        stream, example=cudf.DataFrame({"name": [], "amount": []})
    )
    first_window = first_sdf.window(5, with_state=True, start=None)
    output0 = (
        first_window.groupby(["name"]).amount.sum()
        .stream.gather().sink_to_list()
    )

    stream.emit(
        cudf.DataFrame(
            {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}
        )
    )
    stream.emit(
        cudf.DataFrame(
            {"name": ["Alice", "Linda", "Bob"], "amount": [250, 300, 350]}
        )
    )

    # Second pipeline: a fresh stream seeded with the last saved state.
    resumed_stream = Stream()
    resumed_sdf = DataFrame(
        resumed_stream, example=cudf.DataFrame({"name": [], "amount": []})
    )
    resumed_window = resumed_sdf.window(
        5, with_state=True, start=output0[-1][0]
    )
    output1 = (
        resumed_window.groupby(["name"]).amount.sum()
        .stream.gather().sink_to_list()
    )

    resumed_stream.emit(
        cudf.DataFrame(
            {
                "name": ["Alice", "Linda", "Tom", "Bob"],
                "amount": [50, 100, 150, 200],
            }
        )
    )

    expected = cudf.DataFrame(
        {
            "name": ["Alice", "Bob", "Linda", "Tom"],
            "amount": [50, 550, 100, 150],
        }
    )
    assert_eq(output1[-1][1].reset_index(), expected)
def test_window_sum(stream):
    """Check the running sum of column ``x`` over a four-row window.

    The same three-row frame is emitted three times; after every emit the
    full list of collected sums is compared with the expected prefix.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)
    sums = sdf.window(n=4).x.sum().stream.gather().sink_to_list()

    expected = [6, 9, 9]
    for emitted in range(1, len(expected) + 1):
        sdf.emit(frame)
        assert sums == expected[:emitted]
def test_windowed_groupby_aggs_with_start_state(stream):
    """Check that a windowed groupby-sum on pandas resumes from a saved state.

    With ``with_state=True`` each collected output is indexed as a pair: the
    first element is fed back as ``start`` for a second pipeline, the second
    element is the aggregated result compared against the expected frame.
    """
    # First pipeline: start from scratch and push two batches.
    first_sdf = DataFrame(
        stream, example=pd.DataFrame({'name': [], 'amount': []})
    )
    first_window = first_sdf.window(5, with_state=True, start=None)
    output0 = (
        first_window.groupby(['name']).amount.sum()
        .stream.gather().sink_to_list()
    )

    stream.emit(
        pd.DataFrame({'name': ['Alice', 'Tom', 'Linda'],
                      'amount': [50, 100, 200]})
    )
    stream.emit(
        pd.DataFrame({'name': ['Alice', 'Linda', 'Bob'],
                      'amount': [250, 300, 350]})
    )

    # Second pipeline: a fresh stream seeded with the last saved state.
    resumed_stream = Stream()
    resumed_sdf = DataFrame(
        resumed_stream, example=pd.DataFrame({'name': [], 'amount': []})
    )
    resumed_window = resumed_sdf.window(
        5, with_state=True, start=output0[-1][0]
    )
    output1 = (
        resumed_window.groupby(['name']).amount.sum()
        .stream.gather().sink_to_list()
    )

    resumed_stream.emit(
        pd.DataFrame({'name': ['Alice', 'Linda', 'Tom', 'Bob'],
                      'amount': [50, 100, 150, 200]})
    )

    expected = pd.DataFrame(
        {'name': ['Alice', 'Bob', 'Linda', 'Tom'],
         'amount': [50.0, 550.0, 100.0, 150.0]}
    )
    assert_eq(output1[-1][1].reset_index(), expected)
def test_windowing_value_empty_intermediate_index(stream):
    """Check a "2h" windowed sum when some filtered batches end up empty.

    Every incoming batch is reduced to the rows whose ``amount`` equals 5
    before it reaches the streaming dataframe, so batches without a 5 arrive
    empty.  Five three-row batches with consecutive hourly timestamps are
    emitted and the collected sums are compared with the expected list.
    """
    def keep_fives(frame):
        return frame.loc[frame["amount"] == 5]

    source = stream.map(keep_fives)
    sdf = DataFrame(stream=source, example=pd.DataFrame({"amount": []}))
    output = sdf.window("2h").amount.sum().stream.gather().sink_to_list()

    # (amounts, hour of the first row); rows within a batch are one hour apart.
    batches = [
        ([1, 2, 3], 0),
        ([5, 5, 5], 3),
        ([4, 5, 6], 6),
        ([1, 2, 3], 9),
        ([5, 5, 5], 12),
    ]
    for amounts, first_hour in batches:
        stamps = [
            pd.Timestamp("2050-01-01 {:02d}:00:00".format(hour))
            for hour in range(first_hour, first_hour + len(amounts))
        ]
        stream.emit(pd.DataFrame({"amount": amounts}, index=stamps))

    assert_eq(output, [0, 10, 5, 5, 10])
def test_window_full():
    """Check that an identity ``apply`` over a four-row window yields its rows.

    The ten-row frame is emitted in three pieces (rows 0-2, 3-7 and 8-9);
    afterwards each collected output is compared with the slice of the source
    frame the window is expected to hold at that point.
    """
    frame = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5})
    sdf = DataFrame(example=frame)
    windows = sdf.window(n=4).apply(lambda x: x).stream.sink_to_list()

    emitted_rows = (slice(None, 3), slice(3, 8), slice(8, None))
    for rows in emitted_rows:
        sdf.emit(frame.iloc[rows])

    expected_rows = (slice(None, 3), slice(4, 8), slice(-4, None))
    for position, rows in enumerate(expected_rows):
        assert_eq(windows[position], frame.iloc[rows])
def test_window_sum_dataframe(stream):
    """Check per-column sums of a whole frame over a four-row window.

    The same three-row frame is emitted three times.  After every emit, all
    outputs collected so far (not just the newest) are compared with their
    expected per-column totals.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)
    sums = sdf.window(n=4).sum().stream.gather().sink_to_list()

    totals = [[6, 15], [9, 21], [9, 21]]
    for emitted in range(1, len(totals) + 1):
        sdf.emit(frame)
        for position in range(emitted):
            expected = pd.Series(totals[position], index=['x', 'y'])
            assert_eq(sums[position], expected)
def test_window_sum_dataframe(stream):
    """Check per-column sums of a whole cudf frame over a four-row window.

    The same three-row frame is emitted three times.  After every emit, all
    outputs collected so far (not just the newest) are compared with their
    expected per-column totals.
    """
    frame = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)
    sums = sdf.window(n=4).sum().stream.gather().sink_to_list()

    totals = [[6, 15], [9, 21], [9, 21]]
    for emitted in range(1, len(totals) + 1):
        sdf.emit(frame)
        for position in range(emitted):
            expected = cudf.Series(totals[position], index=["x", "y"])
            assert_eq(sums[position], expected)
samples = pd.DataFrame({'x': [0], 'y': [0]}) # The streaming dataframe takes the source stream and sample pandas dataframe # The sample defines the dataframe schema, maybe? sdf = DataFrame(source, example=samples) def stest(r): print(datetime.now()) print(r) # I don't recall what this does # I think what I was looking to do was display the last 3 items...? # ...which this doesn't appear to do! df = sdf.window(2).full() # This seems to set a callback on stest when a stream element appears df.stream.sink(stest) for i in range(10): # pull the next item in the streaming dataframe into the stream # We could iloc on an existing dataframe? source.emit(pd.DataFrame({'x': [i, i, i], 'y': [i, i, i]})) # Pause for a short while... sleep(0.2) print() print('--------------------------') print()