def test_reductions(stream, func):
    """Compare a streamed reduction with the same reduction done eagerly.

    ``func`` is applied to a streaming frame and to its ``x`` column. After
    the same batch is emitted twice, the most recent streamed value must equal
    ``func`` applied to the two batches concatenated. The scenario is run
    once with a populated example frame and once with an empty one.
    """
    df = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    for example in (df, df.iloc[:0]):
        sdf = DataFrame(example=example, stream=stream)
        frame_results = func(sdf).stream.gather().sink_to_list()

        column = sdf.x
        column_results = func(column).stream.gather().sink_to_list()

        for _ in range(2):
            sdf.emit(df)

        assert_eq(frame_results[-1], func(cudf.concat([df, df])))
        assert_eq(column_results[-1], func(cudf.concat([df, df]).x))
def test_groupby_aggregate_with_start_state(stream):
    """Exercise groupby sum/mean/count with and without a ``start`` state.

    Phase one aggregates a first batch with ``start=None``. Phase two builds
    new aggregations on the same stream, seeded with the phase-one outputs,
    and checks the results after a second batch is emitted.
    """
    # Phase 1: aggregations without an initial state.
    empty = cudf.DataFrame({"name": [], "amount": []})
    grouped = DataFrame(stream, example=empty).groupby(["name"])
    sums = grouped.amount.sum(start=None).stream.gather().sink_to_list()
    means = (
        grouped.amount.mean(with_state=True, start=None)
        .stream.gather()
        .sink_to_list()
    )
    counts = grouped.amount.count(start=None).stream.gather().sink_to_list()

    first_batch = cudf.DataFrame(
        {"name": ["Alice", "Tom"], "amount": [50, 100]}
    )
    stream.emit(first_batch)

    want_sum = cudf.DataFrame({"name": ["Alice", "Tom"], "amount": [50, 100]})
    want_mean = cudf.DataFrame(
        {"name": ["Alice", "Tom"], "amount": [50.0, 100.0]}
    )
    want_count = cudf.DataFrame({"name": ["Alice", "Tom"], "amount": [1, 1]})
    assert assert_eq(sums[0].reset_index(), want_sum)
    # With ``with_state=True`` each element is indexed: [1] is compared here,
    # [0] is passed on as the ``start`` state below.
    assert assert_eq(means[0][1].reset_index(), want_mean)
    assert assert_eq(counts[0].reset_index(), want_count)

    # Phase 2: aggregations seeded with the phase-one outputs.
    empty = cudf.DataFrame({"name": [], "amount": []})
    grouped = DataFrame(stream, example=empty).groupby(["name"])
    seeded_sums = (
        grouped.amount.sum(start=sums[0]).stream.gather().sink_to_list()
    )
    seeded_means = (
        grouped.amount.mean(with_state=True, start=means[0][0])
        .stream.gather()
        .sink_to_list()
    )
    seeded_counts = (
        grouped.amount.count(start=counts[0]).stream.gather().sink_to_list()
    )

    second_batch = cudf.DataFrame(
        {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}
    )
    stream.emit(second_batch)

    names = ["Alice", "Linda", "Tom"]
    want_seeded_sum = cudf.DataFrame(
        {"name": names, "amount": [100, 200, 200]}
    )
    want_seeded_mean = cudf.DataFrame(
        {"name": names, "amount": [50.0, 200.0, 100.0]}
    )
    want_seeded_count = cudf.DataFrame({"name": names, "amount": [2, 1, 2]})
    assert assert_eq(seeded_sums[0].reset_index(), want_seeded_sum)
    assert assert_eq(seeded_means[0][1].reset_index(), want_seeded_mean)
    assert assert_eq(seeded_counts[0].reset_index(), want_seeded_count)
def stream_predict(self, X, *args, **kwargs):
    """Predict on a streaming dataframe.

    Parameters
    ----------
    X : streamz.dataframe.core.DataFrame
        Input data for predictions.
    *args, **kwargs
        Forwarded to ``self.predict`` for every element of ``X.stream``.
        The optional ``y_example`` keyword is consumed here and is not
        forwarded; when it is absent, ``self._y_example`` is used instead.

    Returns
    -------
    streaming Series or DataFrame
        A streaming container of the predictions whose kind matches the
        type of the ``y`` example (``pd.Series`` or ``pd.DataFrame``).

    Raises
    ------
    AttributeError
        If no ``y`` example is available from either source.
    TypeError
        If the ``y`` example is neither a ``pd.Series`` nor a
        ``pd.DataFrame``.
    """
    self._check_stream_inputs(X)

    y_example = kwargs.pop('y_example', None)
    if y_example is None:
        y_example = getattr(self, '_y_example', None)
    if y_example is None:
        raise AttributeError(
            " No example provided for y, make sure you called "
            "stream_partial_fit before, or provide a y_example keyword "
            "argument when calling this function "
        )

    # Pick the output container before touching the stream, so that an
    # unsupported example neither returns None silently nor leaves an
    # orphaned map node attached to X.stream.
    if isinstance(y_example, pd.Series):
        container = Series
    elif isinstance(y_example, pd.DataFrame):
        container = DataFrame
    else:
        raise TypeError(
            "y_example must be a pandas Series or DataFrame, got {}".format(
                type(y_example).__name__
            )
        )

    stream = X.stream.map(self.predict, *args, **kwargs)
    return container(stream, example=y_example)
class DistributedOptimizer(DistributedSwarm):
    """Swarm subclass that pushes its best reward and solution to live plots."""

    def stream_progress(self, state, observation, reward):
        """Emit the current reward and refresh the text labels.

        The reward is emitted on ``self.stream`` as a one-row frame indexed
        by ``n_iters // n_swarms``. The rounded observation and the reward
        are sent to ``self.frame_pipe`` as label rows.
        """
        step = self.n_iters // self.n_swarms
        reward_frame = pd.DataFrame({"reward": [reward]}, index=[step])
        self.stream.emit(reward_frame)

        rounded_solution = np.round(observation, 2).tolist()
        solution_label = "Best solution found:\n {}".format(rounded_solution)
        reward_label = "Best value found: {:.4f}".format(reward)
        label_rows = [[0, 1, reward_label], [0, 2, solution_label]]
        label_frame = pd.DataFrame(label_rows, columns=["x", "y", "label"])
        self.frame_pipe.send(label_frame)

    def init_plot(self):
        """Create the pipe, stream and dynamic maps used for plotting."""
        # Text panel fed by ``frame_pipe``.
        self.frame_pipe = Pipe(data=[])
        self.frame_dmap = hv.DynamicMap(hv.Labels, streams=[self.frame_pipe])
        label_options = dict(
            xlim=(-10, 10),
            ylim=(0.5, 2.5),
            height=200,
            width=500,
            xaxis=None,
            yaxis=None,
            title="Best solution",
        )
        self.frame_dmap = self.frame_dmap.opts(**label_options)

        # Reward curve fed by a streaming dataframe.
        reward_example = pd.DataFrame({"reward": []})
        self.stream = Stream()
        self.buffer_df = DataFrame(stream=self.stream, example=reward_example)
        reward_plot = self.buffer_df.hvplot(y=["reward"])
        self.score_dmap = reward_plot.opts(
            height=200, width=400, title="Best value found"
        )
def test_instantiate_with_dict(stream):
    """Build a streaming frame from a dict of streaming columns."""
    df = cudf.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)

    columns = {'a': sdf.x, 'b': sdf.x * 2, 'c': sdf.y % 2}
    combined = DataFrame(columns)
    collected = combined.stream.gather().sink_to_list()
    assert len(combined.columns) == 3

    for _ in range(2):
        sdf.emit(df)
    assert len(collected) == 2

    expected = cudf.DataFrame({'a': df.x, 'b': df.x * 2, 'c': df.y % 2})
    for batch in collected:
        assert_eq(batch[['a', 'b', 'c']], expected)
def test_stream_predict():
    """Check ``stream_predict`` with both a Series and a DataFrame y-example.

    The same example frame is emitted ``n_fits`` times. The test then waits
    until the estimator's ``predict_ctr`` reaches ``n_fits`` and until the
    flattened, concatenated predictions of each output equal an array of ones.
    """
    n_rows = 100
    X_example = pd.DataFrame({
        'name': [None] * n_rows,
        'amount': [None] * n_rows
    })
    X_stream = Stream()
    X = DataFrame(X_stream, example=X_example)

    model = MyStreamingEstimator()
    # ``np`` is used directly: the ``pd.np`` alias was deprecated in
    # pandas 1.0 and is no longer available in pandas 2.x.
    example_data = pd.Series(np.ones(X_example.shape[0]))
    pred_series = model.stream_predict(X, y_example=pd.Series(example_data))
    pred_df = model.stream_predict(
        X, y_example=pd.DataFrame(data=example_data))

    pred_series_list, pred_df_list = [], []
    pred_series.stream.sink(pred_series_list.append)
    pred_df.stream.sink(pred_df_list.append)

    n_fits = 10
    for _ in range(n_fits):
        X_stream.emit(X_example)

    target_predictions = np.ones((X_example.shape[0], n_fits))

    def ctr_predicate():
        return model.predict_ctr == n_fits

    def pred_series_predicate():
        return np.array_equal(
            np.concatenate(pred_series_list).reshape(-1),
            target_predictions.reshape(-1))

    def pred_df_predicate():
        return np.array_equal(
            np.concatenate(pred_df_list).reshape(-1),
            target_predictions.reshape(-1))

    await_for(ctr_predicate, .1)
    await_for(pred_series_predicate, .1)
    await_for(pred_df_predicate, .1)
def test_score_stream():
    """Check that ``stream_score`` output can be sunk into a list of scores."""

    class MyEstimator(StreamEstimator):
        """Minimal estimator whose score is always 1."""

        def partial_fit(self, X, y):
            return None

        def predict(self, X):
            return None

        def score(self, X, y):
            return 1

    n_rows = 20
    X_example = pd.DataFrame({
        'name': [None] * n_rows,
        'amount': [None] * n_rows
    })
    y_example = pd.Series([])

    X_stream = Stream()
    y_stream = Stream()
    X = DataFrame(X_stream, example=X_example)
    y = Series(y_stream, example=y_example)

    model = MyEstimator()
    score_stream = model.stream_score(X, y)
    score_list = []
    score_stream.stream.sink(score_list.append)

    # NOTE(review): nothing is emitted on X_stream / y_stream before waiting;
    # confirm how this predicate is expected to become true.
    def score_predicate():
        return score_list == [1] * n_rows

    await_for(score_predicate, .1)
def test_dtype(stream):
    """A streaming frame reports the same dtypes as its pandas example."""
    example = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    streaming = DataFrame(example=example, stream=stream)

    assert str(streaming.dtypes) == str(example.dtypes)
    # Column and index dtypes are compared attribute by attribute.
    for attribute in ('x', 'index'):
        streamed = getattr(streaming, attribute)
        eager = getattr(example, attribute)
        assert streamed.dtype == eager.dtype
def test_dtype(stream):
    """A streaming frame reports the same dtypes as its cudf example."""
    example = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    streaming = DataFrame(example=example, stream=stream)

    assert str(streaming.dtypes) == str(example.dtypes)
    # Column and index dtypes are compared attribute by attribute.
    for attribute in ("x", "index"):
        streamed = getattr(streaming, attribute)
        eager = getattr(example, attribute)
        assert streamed.dtype == eager.dtype
def test_binary_operators(op, getter, stream):
    """Binary operators on streaming objects match their pandas results.

    ``op`` is applied with the scalar ``2`` on either side of
    ``getter(frame)``. If pandas itself raises for the combination, the
    test returns without checking anything.
    """
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    try:
        expected_left = op(getter(df), 2)
        expected_right = op(2, getter(df))
    except Exception:
        # The eager computation fails, so there is no reference to compare to.
        return

    sdf = DataFrame(example=df, stream=stream)
    left_results = op(getter(sdf), 2).stream.gather().sink_to_list()
    right_results = op(2, getter(sdf)).stream.gather().sink_to_list()

    sdf.emit(df)

    assert_eq(left_results[0], expected_left)
    assert_eq(right_results[0], expected_right)
def test_repr_html(stream):
    """The HTML repr names the object's type and contains a '1'."""
    x_values = (np.arange(10) // 2).astype(float)
    df = pd.DataFrame({'x': x_values, 'y': [1.0] * 10})
    sdf = DataFrame(example=df, stream=stream)

    # Checked for a frame, a column and an aggregation of that column.
    candidates = [sdf, sdf.y, sdf.y.mean()]
    for obj in candidates:
        rendered = obj._repr_html_()
        assert type(obj).__name__ in rendered
        assert '1' in rendered
def init_plot(self):
    """Create the pipe, stream and dynamic maps used for plotting."""
    # Text panel fed by ``frame_pipe``.
    self.frame_pipe = Pipe(data=[])
    self.frame_dmap = hv.DynamicMap(hv.Labels, streams=[self.frame_pipe])
    label_options = dict(
        xlim=(-10, 10),
        ylim=(0.5, 2.5),
        height=200,
        width=500,
        xaxis=None,
        yaxis=None,
        title="Best solution",
    )
    self.frame_dmap = self.frame_dmap.opts(**label_options)

    # Reward curve fed by a streaming dataframe.
    reward_example = pd.DataFrame({"reward": []})
    self.stream = Stream()
    self.buffer_df = DataFrame(stream=self.stream, example=reward_example)
    reward_plot = self.buffer_df.hvplot(y=["reward"])
    self.score_dmap = reward_plot.opts(
        height=200, width=400, title="Best value found"
    )
def test_groupby_windowing_n(func, n, getter, grouper, indexer):
    """Grouped aggregation over a count-based window of ``n`` rows.

    The frame is emitted in slices of three rows followed by an empty
    slice. The first and last streamed results are compared with the same
    aggregation computed eagerly on the rows those windows should hold.
    """
    df = cudf.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5})
    sdf = DataFrame(example=df)

    def aggregate(frame):
        groups = frame.groupby(grouper(frame))
        return func(indexer(groups))

    results = aggregate(sdf.window(n=n)).stream.gather().sink_to_list()

    step = 3
    for start in range(0, 10, step):
        sdf.emit(df.iloc[start:start + step])
    sdf.emit(df.iloc[:0])

    assert len(results) == 5

    # First window: at most the last ``n`` rows of the first slice.
    head = df.iloc[max(0, step - n):step]
    expected_head = aggregate(head)
    assert_eq(results[0], expected_head)

    # Last window: the final ``n`` rows of the whole frame.
    tail = df.iloc[len(df) - n:]
    expected_tail = aggregate(tail)
    assert_eq(results[-1], expected_tail)
def test_window_sum_dataframe(stream):
    """Column sums over a 4-row window as 3-row batches arrive (cudf).

    After every emit, all results collected so far are re-checked, so the
    earlier entries are verified to stay unchanged.
    """
    df = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    results = sdf.window(n=4).sum().stream.gather().sink_to_list()

    first_sum = cudf.Series([6, 15], index=["x", "y"])
    later_sum = cudf.Series([9, 21], index=["x", "y"])
    expected = [first_sum, later_sum, later_sum]

    for emitted in range(1, len(expected) + 1):
        sdf.emit(df)
        for position in range(emitted):
            assert_eq(results[position], expected[position])
def test_window_full():
    """A 4-row window holds at most the latest four rows after each emit."""
    df = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5})
    sdf = DataFrame(example=df)
    windows = sdf.window(n=4).apply(lambda frame: frame).stream.sink_to_list()

    for rows in (slice(None, 3), slice(3, 8), slice(8, None)):
        sdf.emit(df.iloc[rows])

    # (result position, rows expected in that window)
    expectations = (
        (0, slice(None, 3)),
        (1, slice(4, 8)),
        (2, slice(-4, None)),
    )
    for position, rows in expectations:
        assert_eq(windows[position], df.iloc[rows])
def test_groupby_windowing_value(func, value, getter, grouper, indexer):
    """Grouped aggregation over a time-based window of width ``value``.

    An hourly-indexed frame is emitted in slices of 13 rows followed by an
    empty slice. The first and last streamed results are compared with the
    same aggregation computed eagerly on the rows those windows should hold.
    """
    # pd.date_range replaces pd.DatetimeIndex(start=..., end=..., freq=...),
    # whose range-building keywords were removed in pandas 1.0.
    index = pd.date_range(start='2000-01-01', end='2000-01-03', freq='1h')
    df = pd.DataFrame({'x': np.arange(len(index), dtype=float),
                       'y': np.arange(len(index), dtype=float) % 2},
                      index=index)
    sdf = DataFrame(example=df)

    def f(x):
        return func(indexer(x.groupby(grouper(x))))

    L = f(sdf.window(value)).stream.gather().sink_to_list()

    value = pd.Timedelta(value)

    diff = 13
    for i in range(0, len(index), diff):
        sdf.emit(df.iloc[i: i + diff])
    sdf.emit(df.iloc[:0])

    assert len(L) == 5

    # Drop as many leading rows of the first slice as there are rows at or
    # after ``min + value``; the remainder is the expected first window.
    first = df.iloc[:diff]
    lost = first[first.index.min() + value:]
    first = first.iloc[len(lost):]
    assert_eq(L[0], f(first))

    last = df.loc[index.max() - value + pd.Timedelta('1s'):]
    assert_eq(L[-1], f(last))
def test_window_sum_dataframe(stream):
    """Column sums over a 4-row window as 3-row batches arrive (pandas).

    After every emit, all results collected so far are re-checked, so the
    earlier entries are verified to stay unchanged.
    """
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    results = sdf.window(n=4).sum().stream.gather().sink_to_list()

    first_sum = pd.Series([6, 15], index=['x', 'y'])
    later_sum = pd.Series([9, 21], index=['x', 'y'])
    expected = [first_sum, later_sum, later_sum]

    for emitted in range(1, len(expected) + 1):
        sdf.emit(df)
        for position in range(emitted):
            assert_eq(results[position], expected[position])
def test_rolling_count_aggregations(op, window, m, pre_get, post_get, kwargs,
                                    stream):
    """Streaming rolling aggregation equals the eager pandas one.

    The aggregation ``op`` (called with ``kwargs``) is computed eagerly over
    the whole frame, then again on a streaming frame fed ``m`` rows at a
    time. The concatenated streamed pieces must equal the eager result.
    """
    # pd.date_range replaces pd.DatetimeIndex(start=..., end=..., freq=...),
    # whose range-building keywords were removed in pandas 1.0.
    index = pd.date_range(start='2000-01-01', end='2000-01-03', freq='1h')
    df = pd.DataFrame({'x': np.arange(len(index))}, index=index)

    expected = getattr(post_get(pre_get(df).rolling(window)), op)(**kwargs)

    sdf = DataFrame(example=df.iloc[:0], stream=stream)
    roll = getattr(post_get(pre_get(sdf).rolling(window)), op)(**kwargs)
    L = roll.stream.gather().sink_to_list()
    assert len(L) == 0

    for i in range(0, len(df), m):
        sdf.emit(df.iloc[i: i + m])

    assert len(L) > 1

    assert_eq(pd.concat(L), expected)
def test_groupby_aggregate_with_start_state(stream):
    """Exercise groupby sum/mean/count with and without a ``start`` state.

    Phase one aggregates a first batch with ``start=None``. Phase two builds
    new aggregations on the same stream, seeded with the phase-one outputs,
    and checks the results after a second batch is emitted.
    """
    # Phase 1: aggregations without an initial state.
    empty = pd.DataFrame({'name': [], 'amount': []})
    grouped = DataFrame(stream, example=empty).groupby(['name'])
    sums = grouped.amount.sum(start=None).stream.gather().sink_to_list()
    means = (
        grouped.amount.mean(with_state=True, start=None)
        .stream.gather()
        .sink_to_list()
    )
    counts = grouped.amount.count(start=None).stream.gather().sink_to_list()

    first_batch = pd.DataFrame({'name': ['Alice', 'Tom'], 'amount': [50, 100]})
    stream.emit(first_batch)

    # The same float frame is the expected value for both sum and mean.
    want_amounts = pd.DataFrame(
        {'name': ['Alice', 'Tom'], 'amount': [50.0, 100.0]}
    )
    want_counts = pd.DataFrame({'name': ['Alice', 'Tom'], 'amount': [1, 1]})
    assert assert_eq(sums[0].reset_index(), want_amounts)
    # With ``with_state=True`` each element is indexed: [1] is compared here,
    # [0] is passed on as the ``start`` state below.
    assert assert_eq(means[0][1].reset_index(), want_amounts)
    assert assert_eq(counts[0].reset_index(), want_counts)

    # Phase 2: aggregations seeded with the phase-one outputs.
    empty = pd.DataFrame({'name': [], 'amount': []})
    grouped = DataFrame(stream, example=empty).groupby(['name'])
    seeded_sums = (
        grouped.amount.sum(start=sums[0]).stream.gather().sink_to_list()
    )
    seeded_means = (
        grouped.amount.mean(with_state=True, start=means[0][0])
        .stream.gather()
        .sink_to_list()
    )
    seeded_counts = (
        grouped.amount.count(start=counts[0]).stream.gather().sink_to_list()
    )

    second_batch = pd.DataFrame(
        {'name': ['Alice', 'Tom', 'Linda'], 'amount': [50, 100, 200]}
    )
    stream.emit(second_batch)

    names = ['Alice', 'Linda', 'Tom']
    want_seeded_sum = pd.DataFrame(
        {'name': names, 'amount': [100.0, 200.0, 200.0]}
    )
    want_seeded_mean = pd.DataFrame(
        {'name': names, 'amount': [50.0, 200.0, 100.0]}
    )
    want_seeded_count = pd.DataFrame({'name': names, 'amount': [2, 1, 2]})
    assert assert_eq(seeded_sums[0].reset_index(), want_seeded_sum)
    assert assert_eq(seeded_means[0][1].reset_index(), want_seeded_mean)
    assert assert_eq(seeded_counts[0].reset_index(), want_seeded_count)
def test_gc():
    """Check that replaced streaming frames do not pile up as downstreams.

    The same derived frame is built four times, each time rebinding ``a``.
    The number of downstream nodes on ``sdf.stream`` must be the same after
    the last rebuild as after the first, and must reach zero once ``a`` is
    deleted and a garbage collection has run.
    """
    sdf = sd.Random(freq='5ms', interval='100ms')
    a = DataFrame({'volatility': sdf.x.rolling('100ms').var(),
                   'sub': sdf.x - sdf.x.rolling('100ms').mean()})
    # Baseline: downstream count with exactly one derived frame alive.
    n = len(sdf.stream.downstreams)
    yield gen.sleep(0.1)
    # Each rebuild rebinds ``a``, dropping the reference to the previous frame.
    a = DataFrame({'volatility': sdf.x.rolling('100ms').var(),
                   'sub': sdf.x - sdf.x.rolling('100ms').mean()})
    yield gen.sleep(0.1)
    a = DataFrame({'volatility': sdf.x.rolling('100ms').var(),
                   'sub': sdf.x - sdf.x.rolling('100ms').mean()})
    yield gen.sleep(0.1)
    a = DataFrame({'volatility': sdf.x.rolling('100ms').var(),
                   'sub': sdf.x - sdf.x.rolling('100ms').mean()})

    # Same count as the baseline: earlier frames left nothing behind.
    assert len(sdf.stream.downstreams) == n
    del a
    # Force a collection so the last derived frame is released as well.
    import gc; gc.collect()
    assert len(sdf.stream.downstreams) == 0
def test_display(stream):
    """Smoke-test the IPython display hook of a streaming aggregation (cudf).

    Skipped when ``ipywidgets`` or ``IPython`` cannot be imported.
    """
    for optional_module in ("ipywidgets", "IPython"):
        pytest.importorskip(optional_module)

    example = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=example, stream=stream)

    total = sdf.x.sum()
    total._ipython_display_()
def test_attributes():
    """Existing columns are attributes; unknown names raise AttributeError."""
    example = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=example)

    missing = -1
    assert getattr(sdf, "x", missing) != missing
    assert getattr(sdf, "z", missing) == missing

    # Plain attribute access: fine for a column, an error otherwise.
    sdf.x
    with pytest.raises(AttributeError):
        sdf.z
def test_set_index():
    """``set_index`` accepts a column name or a streaming column expression."""
    df = cudf.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=df)

    # Index given as a column name.
    by_name = sdf.set_index('x').stream.sink_to_list()
    sdf.emit(df)
    assert_eq(by_name[0], df.set_index('x'))

    # Index given as an expression on a streaming column.
    by_expression = sdf.set_index(sdf.y + 1).stream.sink_to_list()
    sdf.emit(df)
    assert_eq(by_expression[0], df.set_index(df.y + 1))
def test_repr_html(stream):
    """The HTML repr names the object's type and contains a '1' (cudf)."""
    x_values = (np.arange(10) // 2).astype(float)
    df = cudf.DataFrame({"x": x_values, "y": [1.0] * 10})
    sdf = DataFrame(example=df, stream=stream)

    # Checked for a frame, a column and an aggregation of that column.
    candidates = [sdf, sdf.y, sdf.y.mean()]
    for obj in candidates:
        rendered = obj._repr_html_()
        assert type(obj).__name__ in rendered
        assert "1" in rendered
def test_display(stream):
    """Smoke-test the IPython display hook of a streaming aggregation.

    Skipped when ``ipywidgets`` or ``IPython`` cannot be imported.
    """
    for optional_module in ('ipywidgets', 'IPython'):
        pytest.importorskip(optional_module)

    example = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=example, stream=stream)

    total = sdf.x.sum()
    total._ipython_display_()
def test_attributes():
    """Existing columns are attributes; unknown names raise AttributeError."""
    example = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=example)

    missing = -1
    assert getattr(sdf, 'x', missing) != missing
    assert getattr(sdf, 'z', missing) == missing

    # Plain attribute access: fine for a column, an error otherwise.
    sdf.x
    with pytest.raises(AttributeError):
        sdf.z
def test_rolling_count_aggregations(op, window, m, pre_get, post_get, kwargs,
                                    stream):
    """Streaming rolling aggregation equals the eager cudf one.

    The aggregation ``op`` (called with ``kwargs``) is computed eagerly over
    the whole frame, then again on a streaming frame fed ``m`` rows at a
    time. The concatenated streamed pieces must equal the eager result.
    """
    hourly = pd.date_range("2000-01-01", "2000-01-03", freq="1h")
    index = pd.DatetimeIndex(hourly)
    df = cudf.DataFrame({"x": np.arange(len(index))}, index=index)

    eager_rolling = post_get(pre_get(df).rolling(window))
    expected = getattr(eager_rolling, op)(**kwargs)

    sdf = DataFrame(example=df, stream=stream)
    streaming_rolling = post_get(pre_get(sdf).rolling(window))
    roll = getattr(streaming_rolling, op)(**kwargs)
    pieces = roll.stream.gather().sink_to_list()
    assert len(pieces) == 0

    for start in range(0, len(df), m):
        sdf.emit(df.iloc[start:start + m])

    assert len(pieces) > 1
    assert_eq(cudf.concat(pieces), expected)
def test_attributes():
    """Columns are listed by ``dir()``; unknown names raise AttributeError."""
    example = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=example)

    assert 'x' in dir(sdf)
    assert 'z' not in dir(sdf)

    # Plain attribute access: fine for a column, an error otherwise.
    sdf.x
    with pytest.raises(AttributeError):
        sdf.z
def test_display(monkeypatch, capsys):
    """Check the notebook display of a streaming frame, with and without widgets.

    First the frame is displayed with ``ipywidgets`` available and the
    captured output must contain ``"Output()"``. Then ``ipywidgets.Output``
    is patched so that constructing it raises ``ImportError``, and the
    captured output of a second display must contain ``"DataFrame"``.
    """
    pytest.importorskip("ipywidgets")
    import ipywidgets

    df = pd.DataFrame({
        'x': (np.arange(10) // 2).astype(float),
        'y': [1.0] * 10
    })
    # This test takes no ``stream`` argument, so the frame is built without
    # an explicit stream instead of referencing a name the test never defines.
    a = DataFrame(example=df)

    # works by side effect of display()
    a._ipython_display_()
    assert "Output()" in capsys.readouterr().out

    def get(*_, **__):
        raise ImportError

    monkeypatch.setattr(ipywidgets.Output, "__init__", get)

    # Display the same frame again; the original called an undefined
    # ``source`` here and kept an unused return value.
    a._ipython_display_()
    assert "DataFrame" in capsys.readouterr().out
def test_setitem(stream):
    """Item assignment on a streaming frame mirrors pandas item assignment.

    New columns are added from a streaming expression, a scalar, and a
    two-column selection. The last streamed mean must equal the mean of the
    pandas frame after the same assignments.
    """
    df = pd.DataFrame({'x': list(range(10)), 'y': [1] * 10})
    sdf = DataFrame(example=df.iloc[:0], stream=stream)
    # Keep the frame's stream as it is before any column is assigned.
    stream = sdf.stream

    sdf['z'] = sdf['x'] * 2
    sdf['a'] = 10
    sdf[['c', 'd']] = sdf[['x', 'y']]

    means = sdf.mean().stream.gather().sink_to_list()

    for rows in (slice(None, 3), slice(3, 7), slice(7, None)):
        stream.emit(df.iloc[rows])

    # Apply the same assignments eagerly to get the reference result.
    df['z'] = df['x'] * 2
    df['a'] = 10
    df[['c', 'd']] = df[['x', 'y']]

    assert_eq(means[-1], df.mean())