def test_exceptions(stream):
    """Check that emitting invalid payloads raises the expected exceptions.

    An integer must raise ``TypeError`` and an empty cudf frame must raise
    ``IndexError``.
    """
    template = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    streaming = DataFrame(example=template, stream=stream)

    # A plain integer is not a dataframe.
    with pytest.raises(TypeError):
        streaming.emit(1)

    # A frame constructed without any columns.
    with pytest.raises(IndexError):
        streaming.emit(cudf.DataFrame())
def test_exceptions(stream):
    """Check that emitting invalid payloads raises the expected exceptions.

    Each ``(exception, payload)`` pair is emitted in turn: an integer must
    raise ``TypeError`` and an empty pandas frame must raise ``IndexError``.
    """
    template = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    streaming = DataFrame(example=template, stream=stream)

    bad_payloads = [
        (TypeError, 1),
        (IndexError, pd.DataFrame()),
    ]
    for expected_error, payload in bad_payloads:
        with pytest.raises(expected_error):
            streaming.emit(payload)
def test_dataframe_simple(func):
    """Apply ``func`` to a streaming frame and compare with plain pandas.

    ``func`` is applied once to a concrete pandas frame to obtain the
    reference, and once to the streaming wrapper; the first emitted result
    must equal the reference.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    reference = func(frame)

    source = DataFrame(example=frame)
    transformed = func(source)
    collected = transformed.stream.sink_to_list()

    source.emit(frame)

    assert_eq(collected[0], reference)
def test_getitem(stream):
    """Boolean-mask selection on a streaming cudf frame.

    The frame is emitted in two halves; two results must be collected and
    their concatenation must equal the same mask applied to the whole frame.
    """
    full = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10})
    source = DataFrame(example=full.iloc[:0], stream=stream)

    mask = source.x > 4
    selected = source[mask]
    collected = selected.stream.gather().sink_to_list()

    for half in (full.iloc[:5], full.iloc[5:]):
        source.emit(half)

    assert len(collected) == 2
    assert_eq(cudf.concat(collected), full[full.x > 4])
def test_ewm_mean():
    """Streaming ``ewm(1).mean()`` must match pandas on the combined rows.

    Three single-row frames are emitted; the collected outputs, concatenated
    with a fresh index, are compared with ``ewm(1).mean()`` of one frame
    holding the same three rows.
    """
    rows = [(1., 2.), (2., 3.), (3., 4.)]

    streaming = DataFrame(example=pd.DataFrame(columns=['x', 'y']))
    collected = streaming.ewm(1).mean().stream.gather().sink_to_list()

    for x_val, y_val in rows:
        streaming.emit(pd.DataFrame({'x': [x_val], 'y': [y_val]}))

    combined = pd.concat(collected, ignore_index=True)

    whole = pd.DataFrame({
        'x': [x_val for x_val, _ in rows],
        'y': [y_val for _, y_val in rows],
    })
    assert_eq(combined, whole.ewm(1).mean())
def test_index(stream):
    """Arithmetic on the streaming index must match the pandas index.

    The same frame is emitted twice and each collected result is compared
    with ``df.index + 5``.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    source = DataFrame(example=frame, stream=stream)

    shifted = source.index + 5
    collected = shifted.stream.gather().sink_to_list()

    emissions = 2
    for _ in range(emissions):
        source.emit(frame)

    # Index explicitly so a missing result still fails with IndexError.
    for position in range(emissions):
        assert_eq(collected[position], frame.index + 5)
def test_pair_arithmetic(stream):
    """Arithmetic between two streaming columns must match pandas.

    The frame is emitted in two halves; two results must be collected and
    their concatenation must equal ``(df.x + df.y) * 2`` on the full frame.
    """
    full = pd.DataFrame({'x': list(range(10)), 'y': [1] * 10})
    source = DataFrame(example=full.iloc[:0], stream=stream)

    column_sum = source.x + source.y
    doubled = column_sum * 2
    collected = doubled.stream.gather().sink_to_list()

    for half in (full.iloc[:5], full.iloc[5:]):
        source.emit(half)

    assert len(collected) == 2
    assert_eq(pd.concat(collected, axis=0), (full.x + full.y) * 2)
def test_getitem(stream):
    """Boolean-mask selection on a streaming pandas frame.

    The frame is emitted in two halves; two results must be collected and
    their concatenation must equal the same mask applied to the whole frame.
    """
    full = pd.DataFrame({'x': list(range(10)), 'y': [1] * 10})
    source = DataFrame(example=full.iloc[:0], stream=stream)

    mask = source.x > 4
    selected = source[mask]
    collected = selected.stream.gather().sink_to_list()

    for half in (full.iloc[:5], full.iloc[5:]):
        source.emit(half)

    assert len(collected) == 2
    assert_eq(pd.concat(collected, axis=0), full[full.x > 4])
def test_tail(stream):
    """``tail(2)`` on a streaming frame must match ``df.tail(2)`` per emit.

    The same frame is emitted twice and both collected results are compared
    with the last two rows of that frame.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    streaming = DataFrame(example=frame, stream=stream)
    collected = streaming.tail(2).stream.gather().sink_to_list()

    emissions = 2
    for _ in range(emissions):
        streaming.emit(frame)

    # Index explicitly so a missing result still fails with IndexError.
    for position in range(emissions):
        assert_eq(collected[position], frame.tail(2))
def test_binary_stream_operators(stream):
    """Adding two streaming columns must match adding the pandas columns."""
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    reference = frame.x + frame.y

    source = DataFrame(example=frame, stream=stream)
    summed = source.x + source.y
    collected = summed.stream.gather().sink_to_list()

    source.emit(frame)

    assert_eq(collected[0], reference)
def test_set_index():
    """``set_index`` on a streaming cudf frame, by name and by expression.

    Each variant is attached to the same source, the frame is emitted once,
    and the first collected result is compared with the equivalent
    ``set_index`` call on the concrete frame.
    """
    frame = cudf.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    source = DataFrame(example=frame)

    # Index by column name.
    by_name = source.set_index('x').stream.sink_to_list()
    source.emit(frame)
    assert_eq(by_name[0], frame.set_index('x'))

    # Index by a derived streaming column.
    derived = source.y + 1
    by_expression = source.set_index(derived).stream.sink_to_list()
    source.emit(frame)
    assert_eq(by_expression[0], frame.set_index(frame.y + 1))
def test_expanding(func):
    """Streaming ``expanding()`` aggregations must match pandas.

    A one-row frame is emitted five times. The collected outputs are joined
    column-wise, transposed and cast to float, then compared with ``func``
    applied to ``expanding()`` over five stacked copies of the row.
    """
    row = pd.DataFrame({'x': [1.], 'y': [2.]})
    repeats = 5

    streaming = DataFrame(example=row)
    aggregated = func(streaming.expanding())
    collected = aggregated.stream.gather().sink_to_list()

    for _ in range(repeats):
        streaming.emit(row)

    observed = pd.concat(collected, axis=1).T.astype(float)

    stacked = pd.concat([row] * repeats, ignore_index=True)
    assert_eq(observed, func(stacked.expanding()))
def test_unary_operators(op, getter):
    """Compare a unary operator on a streaming selection with plain pandas.

    ``getter`` picks what to operate on and ``op`` is the operator under
    test. If pandas itself cannot evaluate the combination, the test returns
    early without checking anything.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})

    try:
        reference = op(getter(frame))
    except Exception:
        # No pandas reference exists for this combination; nothing to compare.
        return

    source = DataFrame(example=frame)
    transformed = op(getter(source))
    collected = transformed.stream.sink_to_list()

    source.emit(frame)

    assert_eq(collected[0], reference)
def test_index(stream):
    """Arithmetic on the streaming index must match the pandas index.

    The same frame is emitted twice; the test then waits (up to two seconds,
    polling every 0.05) until more than one result has been collected before
    comparing both with ``df.index + 5``.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    source = DataFrame(example=frame, stream=stream)

    shifted = source.index + 5
    collected = shifted.stream.gather().sink_to_list()

    emissions = 2
    for _ in range(emissions):
        source.emit(frame)

    def both_arrived():
        return len(collected) > 1

    wait_for(both_arrived, timeout=2, period=0.05)

    # Index explicitly so a missing result still fails with IndexError.
    for position in range(emissions):
        assert_eq(collected[position], frame.index + 5)
def test_windowing_n(func, n, getter):
    """Fixed-size (``n`` rows) windows on a streaming frame.

    The ten-row frame is emitted in chunks of three followed by one empty
    chunk, so five results are expected. The first result is compared with
    ``func`` over the last ``n`` rows of the first chunk (plus 10), the last
    result with ``func`` over the final ``n`` rows of the whole frame
    (plus 10).
    """
    frame = pd.DataFrame({'x': list(range(10)), 'y': [1, 2] * 5})
    chunk = 3

    streaming = DataFrame(example=frame)
    windowed = getter(streaming).window(n=n) + 10
    collected = func(windowed).stream.gather().sink_to_list()

    for start in range(0, 10, chunk):
        streaming.emit(frame.iloc[start: start + chunk])
    streaming.emit(frame.iloc[:0])

    assert len(collected) == 5

    selected = getter(frame)
    first_window = selected.iloc[max(0, chunk - n): chunk]
    assert_eq(collected[0], func(first_window + 10))

    final_window = getter(frame).iloc[len(frame) - n:]
    assert_eq(collected[-1], func(final_window + 10))
def test_custom_aggregation():
    """A user-defined ``Aggregation`` on a plain and a windowed stream.

    The same ten-row frame is emitted three times into each pipeline and the
    collected outputs are compared with fixed expected lists.
    """
    frame = pd.DataFrame({'x': np.arange(10, dtype=float),
                          'y': [1.0, 2.0] * 5})

    class Custom(Aggregation):
        """Counter-style aggregation used only by this test."""

        def initial(self, new):
            # Starting state.
            return 0

        def on_new(self, state, new):
            # Returns (state + 1, state).
            return state + 1, state

        def on_old(self, state, new):
            # Returns (state - 100, state).
            return state - 100, state

    def run(select):
        """Build a fresh pipeline via ``select``, emit thrice, return sink."""
        streaming = DataFrame(example=frame)
        sink = select(streaming).aggregate(Custom()).stream.sink_to_list()
        for _ in range(3):
            streaming.emit(frame)
        return sink

    assert run(lambda s: s) == [0, 1, 2]
    assert run(lambda s: s.window(n=5)) == [1, -198, -397]
def test_reductions(stream, func):
    """Reductions on a streaming frame and on one of its columns.

    After emitting the same frame twice, the latest result of ``func`` on
    the frame and on column ``x`` must equal ``func`` applied to two stacked
    copies of the frame (respectively of its ``x`` column).
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    streaming = DataFrame(example=frame, stream=stream)

    frame_results = func(streaming).stream.gather().sink_to_list()

    column = streaming.x
    column_results = func(column).stream.gather().sink_to_list()

    for _ in range(2):
        streaming.emit(frame)

    assert_eq(frame_results[-1], func(pd.concat([frame, frame])))
    assert_eq(column_results[-1], func(pd.concat([frame, frame]).x))
def test_reductions(stream, func):
    """Reductions on a streaming cudf frame, for full and empty examples.

    For each example (the full frame and a zero-row slice of it), the frame
    is emitted twice; the latest result of ``func`` on the frame and on
    column ``x`` must equal ``func`` applied to two stacked copies.
    """
    frame = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    examples = [frame, frame.iloc[:0]]

    for example in examples:
        streaming = DataFrame(example=example, stream=stream)

        frame_results = func(streaming).stream.gather().sink_to_list()

        column = streaming.x
        column_results = func(column).stream.gather().sink_to_list()

        for _ in range(2):
            streaming.emit(frame)

        assert_eq(frame_results[-1], func(cudf.concat([frame, frame])))
        assert_eq(column_results[-1], func(cudf.concat([frame, frame]).x))
def test_cumulative_aggregations(op, getter, stream):
    """Cumulative operations on a streaming frame must match pandas.

    ``op`` is the name of the method to call on whatever ``getter`` selects.
    The frame is emitted in chunks of three followed by an empty chunk; more
    than one result must be collected and their concatenation must equal the
    method applied to the whole frame.
    """
    frame = pd.DataFrame({'x': list(range(10)), 'y': [1] * 10})
    reference_method = getattr(getter(frame), op)
    reference = reference_method()

    streaming = DataFrame(example=frame, stream=stream)
    streaming_method = getattr(getter(streaming), op)
    collected = streaming_method().stream.gather().sink_to_list()

    chunk = 3
    for start in range(0, 10, chunk):
        streaming.emit(frame.iloc[start: start + chunk])
    streaming.emit(frame.iloc[:0])

    assert len(collected) > 1
    assert_eq(pd.concat(collected), reference)
def test_instantiate_with_dict(stream):
    """Build a streaming frame from a dict of streaming columns.

    The derived frame must report three columns, produce one result per
    emit of the source, and each result (restricted to columns a, b, c) must
    equal the same expressions evaluated on the concrete cudf frame.
    """
    frame = cudf.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    source = DataFrame(example=frame, stream=stream)

    derived = DataFrame({
        'a': source.x,
        'b': source.x * 2,
        'c': source.y % 2,
    })
    collected = derived.stream.gather().sink_to_list()

    assert len(derived.columns) == 3

    for _ in range(2):
        source.emit(frame)

    assert len(collected) == 2

    wanted_columns = ['a', 'b', 'c']
    for result in collected:
        reference = cudf.DataFrame({'a': frame.x,
                                    'b': frame.x * 2,
                                    'c': frame.y % 2})
        assert_eq(result[wanted_columns], reference)
def test_binary_operators(op, getter, stream):
    """Binary operators with a scalar on either side must match pandas.

    ``op`` is evaluated as ``op(selection, 2)`` and ``op(2, selection)``. If
    pandas cannot evaluate either form, the test returns early without
    checking anything.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})

    try:
        scalar_on_right = op(getter(frame), 2)
        scalar_on_left = op(2, getter(frame))
    except Exception:
        # No pandas reference exists for this combination; nothing to compare.
        return

    source = DataFrame(example=frame, stream=stream)
    right_results = op(getter(source), 2).stream.gather().sink_to_list()
    left_results = op(2, getter(source)).stream.gather().sink_to_list()

    source.emit(frame)

    assert_eq(right_results[0], scalar_on_right)
    assert_eq(left_results[0], scalar_on_left)
def test_rolling_count_aggregations(op, window, m, pre_get, post_get, kwargs,
                                    stream):
    """Rolling aggregations on a streaming frame must match pandas.

    Parameters
    ----------
    op : str
        Name of the rolling method to call (looked up with ``getattr``).
    window
        Window argument forwarded to ``rolling``.
    m : int
        Number of rows per emitted chunk.
    pre_get, post_get : callable
        Selection applied before and after ``rolling`` respectively.
    kwargs : dict
        Extra keyword arguments forwarded to the rolling method.
    stream
        Stream fixture the streaming frame is built on.

    The hourly-indexed frame is emitted ``m`` rows at a time; more than one
    result must be collected and their concatenation must equal the pandas
    result computed on the whole frame.
    """
    # ``pd.DatetimeIndex(start=..., end=..., freq=...)`` was removed in
    # pandas 1.0; ``pd.date_range`` builds the same hourly index.
    index = pd.date_range(start='2000-01-01', end='2000-01-03', freq='1h')
    df = pd.DataFrame({'x': np.arange(len(index))}, index=index)

    expected = getattr(post_get(pre_get(df).rolling(window)), op)(**kwargs)

    sdf = DataFrame(example=df.iloc[:0], stream=stream)
    roll = getattr(post_get(pre_get(sdf).rolling(window)), op)(**kwargs)
    L = roll.stream.gather().sink_to_list()

    # Nothing has been emitted yet, so nothing may have been collected.
    assert len(L) == 0

    for i in range(0, len(df), m):
        sdf.emit(df.iloc[i: i + m])

    assert len(L) > 1
    assert_eq(pd.concat(L), expected)
def test_identity(stream):
    """A streaming frame passes emitted frames through unchanged.

    Checks that the very object emitted is what the sink receives, that the
    example keeps its column names, and that attribute access yields a
    streaming ``Series`` whose sink starts empty and then receives a pandas
    Series equal to the frame's ``x`` column.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    streaming = DataFrame(example=frame, stream=stream)

    frames_seen = streaming.stream.gather().sink_to_list()
    streaming.emit(frame)

    # Identity, not just equality.
    assert frames_seen[0] is frame
    assert list(streaming.example.columns) == ['x', 'y']

    column = streaming.x
    assert isinstance(column, Series)

    # The column sink is attached after the first emit, so it starts empty.
    series_seen = column.stream.gather().sink_to_list()
    assert not series_seen

    streaming.emit(frame)
    first_series = series_seen[0]
    assert isinstance(first_series, pd.Series)
    assert assert_eq(first_series, frame.x)
def test_rolling_count_aggregations(op, window, m, pre_get, post_get, kwargs,
                                    stream):
    """Rolling aggregations on a streaming cudf frame must match cudf.

    ``op`` names the rolling method, ``window`` is forwarded to ``rolling``,
    ``pre_get``/``post_get`` select before/after ``rolling`` and ``kwargs``
    are forwarded to the method. The hourly-indexed frame is emitted ``m``
    rows at a time; more than one result must be collected and their
    concatenation must equal the result computed on the whole frame.
    """
    hours = pd.date_range("2000-01-01", "2000-01-03", freq="1h")
    index = pd.DatetimeIndex(hours)
    frame = cudf.DataFrame({"x": np.arange(len(index))}, index=index)

    reference_rolling = post_get(pre_get(frame).rolling(window))
    reference = getattr(reference_rolling, op)(**kwargs)

    streaming = DataFrame(example=frame, stream=stream)
    streaming_rolling = post_get(pre_get(streaming).rolling(window))
    rolled = getattr(streaming_rolling, op)(**kwargs)
    collected = rolled.stream.gather().sink_to_list()

    # Nothing has been emitted yet.
    assert len(collected) == 0

    for start in range(0, len(frame), m):
        streaming.emit(frame.iloc[start:start + m])

    assert len(collected) > 1
    assert_eq(cudf.concat(collected), reference)
def test_groupby_windowing_n(func, n, getter, grouper, indexer):
    """Group-by aggregations over fixed-size (``n`` rows) windows.

    ``grouper`` yields the grouping key, ``indexer`` selects from the
    group-by object and ``func`` aggregates. The ten-row frame is emitted in
    chunks of three plus one empty chunk, so five results are expected; the
    first and last are compared with the same pipeline applied to the
    corresponding rows of the concrete frame.
    """
    frame = pd.DataFrame({'x': np.arange(10, dtype=float),
                          'y': [1.0, 2.0] * 5})
    streaming = DataFrame(example=frame)

    def pipeline(obj):
        grouped = obj.groupby(grouper(obj))
        return func(indexer(grouped))

    collected = pipeline(streaming.window(n=n)).stream.gather().sink_to_list()

    chunk = 3
    for start in range(0, 10, chunk):
        streaming.emit(frame.iloc[start: start + chunk])
    streaming.emit(frame.iloc[:0])

    assert len(collected) == 5

    # Last ``n`` rows of the first chunk.
    head_rows = frame.iloc[max(0, chunk - n): chunk]
    assert_eq(collected[0], pipeline(head_rows))

    # Final ``n`` rows of the whole frame.
    tail_rows = frame.iloc[len(frame) - n:]
    assert_eq(collected[-1], pipeline(tail_rows))
def test_groupby_windowing_value(func, value, getter, grouper, indexer):
    """Group-by aggregations over time-based windows on a cudf frame.

    ``value`` is converted to a ``pd.Timedelta`` and used as the window
    size. The hourly-indexed frame is emitted 13 rows at a time, so four
    results are expected; the first and last are compared with the same
    pipeline applied to row subsets of the concrete frame.
    """
    hours = pd.date_range("2000-01-01", "2000-01-03", freq="1h")
    index = pd.DatetimeIndex(hours)
    size = len(index)
    frame = cudf.DataFrame(
        {
            "x": np.arange(size, dtype=float),
            "y": np.arange(size, dtype=float) % 2,
        },
        index=index,
    )

    span = pd.Timedelta(value)
    streaming = DataFrame(example=frame)

    def pipeline(obj):
        grouped = obj.groupby(grouper(obj))
        return func(indexer(grouped))

    collected = pipeline(streaming.window(span)).stream.gather().sink_to_list()

    chunk = 13
    for start in range(0, size, chunk):
        streaming.emit(frame.iloc[start:start + chunk])

    assert len(collected) == 4

    # Reference rows for the first result: drop as many leading rows of the
    # first chunk as there are rows at or after (earliest timestamp + span).
    head = frame.iloc[:chunk]
    beyond_span = head.loc[head.index.min() + span:]
    head = head.iloc[len(beyond_span):]
    assert_eq(collected[0], pipeline(head))

    # Reference rows for the last result: everything from one second past
    # (latest timestamp - span) onwards.
    cutoff = index.max() - span + pd.Timedelta("1s")
    tail = frame.loc[cutoff:]
    assert_eq(collected[-1], pipeline(tail))
def test_window_sum(stream):
    """Sum of column ``x`` over a four-row window across repeated emits.

    The three-row frame is emitted three times; after each emit the full
    list of collected sums is compared with the expected snapshot.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    streaming = DataFrame(example=frame, stream=stream)
    sums = streaming.window(n=4).x.sum().stream.gather().sink_to_list()

    snapshots = ([6], [6, 9], [6, 9, 9])
    for expected_so_far in snapshots:
        streaming.emit(frame)
        assert sums == expected_so_far
def test_window_full():
    """Contents of a four-row window after uneven emits.

    The ten-row frame is emitted as rows 0-2, 3-7 and 8-9. With an identity
    ``apply``, the three collected windows must equal rows 0-2, rows 4-7 and
    the last four rows of the frame.
    """
    frame = pd.DataFrame({'x': np.arange(10, dtype=float),
                          'y': [1.0, 2.0] * 5})
    streaming = DataFrame(example=frame)
    windows = streaming.window(n=4).apply(lambda x: x).stream.sink_to_list()

    emitted = [slice(None, 3), slice(3, 8), slice(8, None)]
    for rows in emitted:
        streaming.emit(frame.iloc[rows])

    expected = [slice(None, 3), slice(4, 8), slice(-4, None)]
    for position, rows in enumerate(expected):
        assert_eq(windows[position], frame.iloc[rows])
def test_window_sum_dataframe(stream):
    """Column sums over a four-row window across repeated emits (pandas).

    The three-row frame is emitted three times. After every emit, each
    result collected so far is re-checked against its expected Series.
    """
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    streaming = DataFrame(example=frame, stream=stream)
    sums = streaming.window(n=4).sum().stream.gather().sink_to_list()

    # Expected [x, y] totals for the 1st, 2nd and 3rd result.
    totals = [[6, 15], [9, 21], [9, 21]]

    for emitted in range(1, len(totals) + 1):
        streaming.emit(frame)
        for position in range(emitted):
            reference = pd.Series(totals[position], index=['x', 'y'])
            assert_eq(sums[position], reference)
def test_window_sum_dataframe(stream):
    """Column sums over a four-row window across repeated emits (cudf).

    The three-row frame is emitted three times. After every emit, each
    result collected so far is re-checked against its expected Series.
    """
    frame = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    streaming = DataFrame(example=frame, stream=stream)
    sums = streaming.window(n=4).sum().stream.gather().sink_to_list()

    # Expected [x, y] totals for the 1st, 2nd and 3rd result.
    totals = [[6, 15], [9, 21], [9, 21]]

    for emitted in range(1, len(totals) + 1):
        streaming.emit(frame)
        for position in range(emitted):
            reference = cudf.Series(totals[position], index=["x", "y"])
            assert_eq(sums[position], reference)