Beispiel #1
0
def test_reductions(stream, func):
    """Check ``func`` on a streaming frame and on its ``x`` column.

    The streaming frame is built once from a populated example and once from
    an empty slice of it. In both cases the same batch is emitted twice and
    the last streamed result is compared with ``func`` applied to the
    concatenation of the two batches.
    """
    batch = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    templates = [batch, batch.iloc[:0]]
    for template in templates:
        sdf = DataFrame(example=template, stream=stream)

        frame_results = func(sdf).stream.gather().sink_to_list()

        column = sdf.x
        column_results = func(column).stream.gather().sink_to_list()

        for _ in range(2):
            sdf.emit(batch)

        assert_eq(frame_results[-1], func(cudf.concat([batch, batch])))
        assert_eq(column_results[-1], func(cudf.concat([batch, batch]).x))
Beispiel #2
0
def test_groupby_aggregate_with_start_state(stream):
    """Check grouped sum/mean/count with and without a ``start`` state.

    A first set of aggregations is started with ``start=None`` and fed one
    batch. A second set is then seeded with the first results and fed another
    batch; its first output is compared with the combined totals.
    """
    def sink(node):
        # Collect everything the aggregation emits into a list.
        return node.stream.gather().sink_to_list()

    template = cudf.DataFrame({"name": [], "amount": []})
    grouped = DataFrame(stream, example=template).groupby(["name"])
    sums = sink(grouped.amount.sum(start=None))
    means = sink(grouped.amount.mean(with_state=True, start=None))
    counts = sink(grouped.amount.count(start=None))

    first_batch = cudf.DataFrame({
        "name": ["Alice", "Tom"],
        "amount": [50, 100]
    })
    stream.emit(first_batch)

    expected_sums = cudf.DataFrame({
        "name": ["Alice", "Tom"],
        "amount": [50, 100]
    })
    expected_means = cudf.DataFrame({
        "name": ["Alice", "Tom"],
        "amount": [50.0, 100.0]
    })
    expected_counts = cudf.DataFrame({
        "name": ["Alice", "Tom"],
        "amount": [1, 1]
    })
    assert assert_eq(sums[0].reset_index(), expected_sums)
    # With with_state=True each output is indexed: [0] is reused below as the
    # start state, [1] is the value compared here.
    assert assert_eq(means[0][1].reset_index(), expected_means)
    assert assert_eq(counts[0].reset_index(), expected_counts)

    template = cudf.DataFrame({"name": [], "amount": []})
    grouped = DataFrame(stream, example=template).groupby(["name"])
    seeded_sums = sink(grouped.amount.sum(start=sums[0]))
    seeded_means = sink(
        grouped.amount.mean(with_state=True, start=means[0][0]))
    seeded_counts = sink(grouped.amount.count(start=counts[0]))

    second_batch = cudf.DataFrame({
        "name": ["Alice", "Tom", "Linda"],
        "amount": [50, 100, 200]
    })
    stream.emit(second_batch)

    names = ["Alice", "Linda", "Tom"]
    expected_seeded_sums = cudf.DataFrame({
        "name": names,
        "amount": [100, 200, 200]
    })
    expected_seeded_means = cudf.DataFrame({
        "name": names,
        "amount": [50.0, 200.0, 100.0]
    })
    expected_seeded_counts = cudf.DataFrame({
        "name": names,
        "amount": [2, 1, 2]
    })
    assert assert_eq(seeded_sums[0].reset_index(), expected_seeded_sums)
    assert assert_eq(seeded_means[0][1].reset_index(), expected_seeded_means)
    assert assert_eq(seeded_counts[0].reset_index(), expected_seeded_counts)
Beispiel #3
0
 def stream_predict(self, X, *args, **kwargs):
     """Apply ``self.predict`` to every element of a streaming dataframe.

     Parameters
     ----------
     X : streamz.dataframe.core.DataFrame
         input data for predictions
     *args, **kwargs
         Forwarded to ``self.predict`` for every element of ``X.stream``.
         The keyword ``y_example`` is consumed here and is not forwarded.
     y_example : pandas.Series or pandas.DataFrame, optional (keyword only)
         Example of the prediction output. When omitted, ``self._y_example``
         is used if it is set.

     Returns
     -------
     streamz.dataframe.core.Series or streamz.dataframe.core.DataFrame
         A streaming container with the predictions; the kind matches the
         type of the example.

     Raises
     ------
     AttributeError
         If no example is passed and ``self._y_example`` is missing or None.
     TypeError
         If the example is neither a ``pandas.Series`` nor a
         ``pandas.DataFrame``.
     """
     self._check_stream_inputs(X)
     _y_example = kwargs.pop('y_example', None)
     if _y_example is None:
         _y_example = getattr(self, '_y_example', None)
     if _y_example is None:
         raise AttributeError("""
             No example provided for y, make sure you called stream_partial_fit before,
             or provide a y_example keyword argument when calling this function
         """)
     # Decide on the output container before touching X.stream, so an
     # unsupported example does not leave a dangling map node behind and
     # does not silently return None.
     if isinstance(_y_example, pd.Series):
         container = Series
     elif isinstance(_y_example, pd.DataFrame):
         container = DataFrame
     else:
         raise TypeError(
             "y_example must be a pandas Series or DataFrame, got {}".format(
                 type(_y_example).__name__))
     stream = X.stream.map(self.predict, *args, **kwargs)
     return container(stream, example=_y_example)
Beispiel #4
0
class DistributedOptimizer(DistributedSwarm):
    """Swarm that streams its best reward and solution to holoviews plots."""

    def stream_progress(self, state, observation, reward):
        """Publish the latest reward and the best-solution labels.

        The reward is emitted on ``self.stream`` as a one-row frame indexed
        by ``self.n_iters // self.n_swarms``; two text labels (best value and
        best solution) are sent through ``self.frame_pipe``.
        """
        step = self.n_iters // self.n_swarms
        reward_row = pd.DataFrame({"reward": [reward]}, index=[step])
        self.stream.emit(reward_row)

        rounded_solution = np.round(observation, 2).tolist()
        solution_label = "Best solution found:\n {}".format(rounded_solution)
        value_label = "Best value found: {:.4f}".format(reward)
        labels = pd.DataFrame(
            [[0, 1, value_label], [0, 2, solution_label]],
            columns=["x", "y", "label"],
        )
        self.frame_pipe.send(labels)

    def init_plot(self):
        """Create the label plot, the reward stream and the reward plot."""
        self.frame_pipe = Pipe(data=[])
        label_options = dict(
            xlim=(-10, 10),
            ylim=(0.5, 2.5),
            height=200,
            width=500,
            xaxis=None,
            yaxis=None,
            title="Best solution",
        )
        self.frame_dmap = hv.DynamicMap(hv.Labels, streams=[self.frame_pipe])
        self.frame_dmap = self.frame_dmap.opts(**label_options)

        empty_rewards = pd.DataFrame({"reward": []})
        self.stream = Stream()
        self.buffer_df = DataFrame(stream=self.stream, example=empty_rewards)
        reward_plot = self.buffer_df.hvplot(y=["reward"])
        self.score_dmap = reward_plot.opts(
            height=200, width=400, title="Best value found")
Beispiel #5
0
def test_instantiate_with_dict(stream):
    """Build a streaming frame from a dict of streaming columns.

    The derived frame must expose three columns and, for every emitted batch,
    match the same expressions evaluated directly on the cudf frame.
    """
    source = cudf.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=source, stream=stream)

    derived = DataFrame({
        'a': sdf.x,
        'b': sdf.x * 2,
        'c': sdf.y % 2,
    })
    batches = derived.stream.gather().sink_to_list()
    assert len(derived.columns) == 3

    for _ in range(2):
        sdf.emit(source)

    assert len(batches) == 2
    for batch in batches:
        selected = batch[['a', 'b', 'c']]
        expected = cudf.DataFrame({
            'a': source.x,
            'b': source.x * 2,
            'c': source.y % 2,
        })
        assert_eq(selected, expected)
def test_stream_predict():
    """Check ``stream_predict`` with a Series example and a DataFrame example.

    The same input frame is emitted ``n_fits`` times; the test then waits
    until the model's ``predict_ctr`` equals ``n_fits`` and until both
    collected prediction lists flatten to an all-ones array of the expected
    size.
    """
    n_rows = 100
    X_example = pd.DataFrame({
        'name': [None] * n_rows,
        'amount': [None] * n_rows
    })
    X_stream = Stream()
    X = DataFrame(X_stream, example=X_example)

    model = MyStreamingEstimator()
    # ``pd.np`` was removed from pandas; use numpy directly.
    example_data = pd.Series(np.ones(X_example.shape[0]))
    pred_series = model.stream_predict(X, y_example=example_data)
    pred_df = model.stream_predict(X,
                                   y_example=pd.DataFrame(data=example_data))

    pred_series_list, pred_df_list = [], []

    pred_series.stream.sink(pred_series_list.append)
    pred_df.stream.sink(pred_df_list.append)

    n_fits = 10
    for _ in range(n_fits):
        X_stream.emit(X_example)

    target_predictions = np.ones((X_example.shape[0], n_fits))

    def ctr_predicate():
        return model.predict_ctr == n_fits

    def matches_target(collected):
        # Flatten both sides so only the values and total length matter.
        flat = np.concatenate(collected).reshape(-1)
        return np.array_equal(flat, target_predictions.reshape(-1))

    def pred_series_predicate():
        return matches_target(pred_series_list)

    def pred_df_predicate():
        return matches_target(pred_df_list)

    await_for(ctr_predicate, .1)
    await_for(pred_series_predicate, .1)
    await_for(pred_df_predicate, .1)
def test_score_stream():
    """Wire ``stream_score`` of a trivial estimator to a list and wait on it.

    The estimator's ``score`` always returns 1; the awaited condition is that
    the collected scores equal ``[1] * n_rows``.
    """
    class MyEstimator(StreamEstimator):
        def partial_fit(self, X, y):
            pass

        def predict(self, X):
            pass

        def score(self, X, y):
            return 1

    n_rows = 20
    X_example = pd.DataFrame({
        'name': [None] * n_rows,
        'amount': [None] * n_rows
    })
    y_example = pd.Series([])

    X_stream = Stream()
    y_stream = Stream()
    X = DataFrame(X_stream, example=X_example)
    y = Series(y_stream, example=y_example)

    model = MyEstimator()
    score_stream = model.stream_score(X, y)

    score_list = []
    score_stream.stream.sink(score_list.append)

    # NOTE(review): nothing is emitted on X_stream / y_stream in this test;
    # confirm that await_for can ever observe the expected scores.
    def score_predicate():
        return score_list == [1] * n_rows

    await_for(score_predicate, .1)
Beispiel #8
0
def test_dtype(stream):
    """The streaming frame reports the dtypes of its pandas example."""
    example = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=example, stream=stream)

    streamed_dtypes = str(sdf.dtypes)
    assert streamed_dtypes == str(example.dtypes)

    # Column and index dtypes must agree with the example as well.
    assert sdf.x.dtype == example.x.dtype
    assert sdf.index.dtype == example.index.dtype
Beispiel #9
0
def test_dtype(stream):
    """The streaming frame reports the dtypes of its cudf example."""
    example = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=example, stream=stream)

    streamed_dtypes = str(sdf.dtypes)
    assert streamed_dtypes == str(example.dtypes)

    # Column and index dtypes must agree with the example as well.
    assert sdf.x.dtype == example.x.dtype
    assert sdf.index.dtype == example.index.dtype
Beispiel #10
0
def test_binary_operators(op, getter, stream):
    """Compare ``op`` with the scalar 2 on streaming vs. plain pandas data.

    ``getter`` selects the operand (from the frame) and ``op`` is applied
    with the scalar on either side. If pandas itself raises for the
    combination, the test returns without checking anything.
    """
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    try:
        expected_left = op(getter(df), 2)
        expected_right = op(2, getter(df))
    except Exception:
        # Combination is not supported on the plain frame; nothing to compare.
        return

    sdf = DataFrame(example=df, stream=stream)
    left_results = op(getter(sdf), 2).stream.gather().sink_to_list()
    right_results = op(2, getter(sdf)).stream.gather().sink_to_list()

    sdf.emit(df)

    assert_eq(left_results[0], expected_left)
    assert_eq(right_results[0], expected_right)
Beispiel #11
0
def test_repr_html(stream):
    """HTML reprs mention the object's type name and contain a '1'."""
    x_values = (np.arange(10) // 2).astype(float)
    df = pd.DataFrame({'x': x_values, 'y': [1.0] * 10})
    sdf = DataFrame(example=df, stream=stream)

    subjects = [sdf, sdf.y, sdf.y.mean()]
    for subject in subjects:
        rendered = subject._repr_html_()
        assert type(subject).__name__ in rendered
        assert '1' in rendered
Beispiel #12
0
 def init_plot(self):
     """Create the label plot, the reward stream and the reward plot."""
     self.frame_pipe = Pipe(data=[])
     label_options = dict(
         xlim=(-10, 10),
         ylim=(0.5, 2.5),
         height=200,
         width=500,
         xaxis=None,
         yaxis=None,
         title="Best solution",
     )
     self.frame_dmap = hv.DynamicMap(hv.Labels, streams=[self.frame_pipe])
     self.frame_dmap = self.frame_dmap.opts(**label_options)

     empty_rewards = pd.DataFrame({"reward": []})
     self.stream = Stream()
     self.buffer_df = DataFrame(stream=self.stream, example=empty_rewards)
     reward_plot = self.buffer_df.hvplot(y=["reward"])
     self.score_dmap = reward_plot.opts(
         height=200, width=400, title="Best value found")
Beispiel #13
0
def test_groupby_windowing_n(func, n, getter, grouper, indexer):
    """Check grouped aggregations over a row-count window of size ``n``.

    The frame is emitted in chunks of three rows followed by an empty chunk.
    The first and last streamed results are compared with the same
    aggregation applied to the corresponding slices of the cudf frame.
    """
    df = cudf.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5})

    sdf = DataFrame(example=df)

    def aggregate(frame):
        grouped = frame.groupby(grouper(frame))
        return func(indexer(grouped))

    results = aggregate(sdf.window(n=n)).stream.gather().sink_to_list()

    chunk = 3
    for start in range(0, 10, chunk):
        stop = start + chunk
        sdf.emit(df.iloc[start:stop])
    sdf.emit(df.iloc[:0])

    assert len(results) == 5

    # After the first chunk the window holds at most the last n of its rows.
    head = df.iloc[max(0, chunk - n):chunk]
    assert_eq(results[0], aggregate(head))

    # At the end the window holds the last n rows of the whole frame.
    tail = df.iloc[len(df) - n:]
    assert_eq(results[-1], aggregate(tail))
Beispiel #14
0
def test_window_sum_dataframe(stream):
    """Sum over a 4-row window while the same 3-row batch is emitted thrice.

    After every emit, all results produced so far are re-checked.
    """
    df = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    sums = sdf.window(n=4).sum().stream.gather().sink_to_list()

    # Expected column sums after the first, second and third emit.
    expected_values = [[6, 15], [9, 21], [9, 21]]
    for emitted in range(1, len(expected_values) + 1):
        sdf.emit(df)
        for position in range(emitted):
            expected = cudf.Series(expected_values[position],
                                   index=["x", "y"])
            assert_eq(sums[position], expected)
Beispiel #15
0
def test_window_full():
    """A 4-row window passes through the most recent rows of each emit."""
    df = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5})

    sdf = DataFrame(example=df)

    windows = sdf.window(n=4).apply(lambda x: x).stream.sink_to_list()

    chunks = (slice(None, 3), slice(3, 8), slice(8, None))
    for chunk in chunks:
        sdf.emit(df.iloc[chunk])

    expected_windows = (df.iloc[:3], df.iloc[4:8], df.iloc[-4:])
    for position, expected in enumerate(expected_windows):
        assert_eq(windows[position], expected)
Beispiel #16
0
def test_groupby_windowing_value(func, value, getter, grouper, indexer):
    """Check grouped aggregations over a time-based window of size ``value``.

    An hourly-indexed frame is emitted in chunks of 13 rows followed by an
    empty chunk. The first and last streamed results are compared with the
    same aggregation applied to the matching slices of the pandas frame.
    """
    # pd.DatetimeIndex(start=..., end=..., freq=...) was removed from pandas;
    # pd.date_range builds the same inclusive hourly index.
    index = pd.date_range('2000-01-01', '2000-01-03', freq='1h')
    df = pd.DataFrame({'x': np.arange(len(index), dtype=float),
                       'y': np.arange(len(index), dtype=float) % 2},
                      index=index)

    sdf = DataFrame(example=df)

    def f(x):
        return func(indexer(x.groupby(grouper(x))))

    L = f(sdf.window(value)).stream.gather().sink_to_list()

    # From here on ``value`` is used for timestamp arithmetic.
    value = pd.Timedelta(value)

    diff = 13
    for i in range(0, len(index), diff):
        sdf.emit(df.iloc[i: i + diff])
    sdf.emit(df.iloc[:0])

    assert len(L) == 5

    # Expected first window: the first chunk minus as many leading rows as
    # there are rows at or after (first timestamp + value).
    first = df.iloc[:diff]
    lost = first[first.index.min() + value:]
    first = first.iloc[len(lost):]

    assert_eq(L[0], f(first))

    # Expected last window: rows from one second after (last timestamp -
    # value) onwards.
    last = df.loc[index.max() - value + pd.Timedelta('1s'):]

    assert_eq(L[-1], f(last))
Beispiel #17
0
def test_window_sum_dataframe(stream):
    """Sum over a 4-row window while the same 3-row batch is emitted thrice.

    After every emit, all results produced so far are re-checked.
    """
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    sums = sdf.window(n=4).sum().stream.gather().sink_to_list()

    # Expected column sums after the first, second and third emit.
    expected_values = [[6, 15], [9, 21], [9, 21]]
    for emitted in range(1, len(expected_values) + 1):
        sdf.emit(df)
        for position in range(emitted):
            expected = pd.Series(expected_values[position], index=['x', 'y'])
            assert_eq(sums[position], expected)
Beispiel #18
0
def test_rolling_count_aggregations(op, window, m, pre_get, post_get, kwargs,
        stream):
    """Compare a streaming rolling aggregation with the pandas result.

    ``pre_get`` is applied before ``.rolling(window)`` and ``post_get``
    after it; ``op`` names the aggregation method and ``kwargs`` are passed
    to it. The frame is emitted in chunks of ``m`` rows and the concatenated
    streamed output must equal the aggregation over the whole frame.
    """
    # pd.DatetimeIndex(start=..., end=..., freq=...) was removed from pandas;
    # pd.date_range builds the same inclusive hourly index.
    index = pd.date_range('2000-01-01', '2000-01-03', freq='1h')
    df = pd.DataFrame({'x': np.arange(len(index))}, index=index)

    expected = getattr(post_get(pre_get(df).rolling(window)), op)(**kwargs)

    # The streaming frame only needs the schema, hence the empty example.
    sdf = DataFrame(example=df.iloc[:0], stream=stream)
    roll = getattr(post_get(pre_get(sdf).rolling(window)), op)(**kwargs)
    L = roll.stream.gather().sink_to_list()
    assert len(L) == 0

    for i in range(0, len(df), m):
        sdf.emit(df.iloc[i: i + m])

    assert len(L) > 1

    assert_eq(pd.concat(L), expected)
Beispiel #19
0
def test_groupby_aggregate_with_start_state(stream):
    """Check grouped sum/mean/count with and without a ``start`` state.

    A first set of aggregations is started with ``start=None`` and fed one
    batch. A second set is then seeded with the first results and fed another
    batch; its first output is compared with the combined totals.
    """
    def sink(node):
        # Collect everything the aggregation emits into a list.
        return node.stream.gather().sink_to_list()

    template = pd.DataFrame({'name': [], 'amount': []})
    grouped = DataFrame(stream, example=template).groupby(['name'])
    sums = sink(grouped.amount.sum(start=None))
    means = sink(grouped.amount.mean(with_state=True, start=None))
    counts = sink(grouped.amount.count(start=None))

    first_batch = pd.DataFrame({
        'name': ['Alice', 'Tom'],
        'amount': [50, 100]
    })
    stream.emit(first_batch)

    expected_amounts = pd.DataFrame({
        'name': ['Alice', 'Tom'],
        'amount': [50.0, 100.0]
    })
    expected_counts = pd.DataFrame({
        'name': ['Alice', 'Tom'],
        'amount': [1, 1]
    })
    assert assert_eq(sums[0].reset_index(), expected_amounts)
    # With with_state=True each output is indexed: [0] is reused below as the
    # start state, [1] is the value compared here.
    assert assert_eq(means[0][1].reset_index(), expected_amounts)
    assert assert_eq(counts[0].reset_index(), expected_counts)

    template = pd.DataFrame({'name': [], 'amount': []})
    grouped = DataFrame(stream, example=template).groupby(['name'])
    seeded_sums = sink(grouped.amount.sum(start=sums[0]))
    seeded_means = sink(
        grouped.amount.mean(with_state=True, start=means[0][0]))
    seeded_counts = sink(grouped.amount.count(start=counts[0]))

    second_batch = pd.DataFrame({
        'name': ['Alice', 'Tom', 'Linda'],
        'amount': [50, 100, 200]
    })
    stream.emit(second_batch)

    names = ['Alice', 'Linda', 'Tom']
    expected_seeded_sums = pd.DataFrame({
        'name': names,
        'amount': [100.0, 200.0, 200.0]
    })
    expected_seeded_means = pd.DataFrame({
        'name': names,
        'amount': [50.0, 200.0, 100.0]
    })
    expected_seeded_counts = pd.DataFrame({
        'name': names,
        'amount': [2, 1, 2]
    })
    assert assert_eq(seeded_sums[0].reset_index(), expected_seeded_sums)
    assert assert_eq(seeded_means[0][1].reset_index(), expected_seeded_means)
    assert assert_eq(seeded_counts[0].reset_index(), expected_seeded_counts)
Beispiel #20
0
def test_gc():
    """Check that replaced derived frames do not pile up as downstreams.

    A derived frame is built from ``sdf`` four times, each time rebinding the
    same local name. The number of downstreams of ``sdf.stream`` must stay at
    the value seen after the first build, and must drop to zero once the last
    derived frame is deleted and a garbage collection has run.

    NOTE(review): the function yields ``gen.sleep(...)``, so it is a
    generator; presumably it is driven by a coroutine test decorator that is
    not visible here.
    """
    sdf = sd.Random(freq='5ms', interval='100ms')
    a = DataFrame({'volatility': sdf.x.rolling('100ms').var(),
                            'sub': sdf.x - sdf.x.rolling('100ms').mean()})
    # Baseline: downstream count while exactly one derived frame is alive.
    n = len(sdf.stream.downstreams)
    yield gen.sleep(0.1)
    # Rebinding ``a`` drops this function's reference to the previous frame.
    a = DataFrame({'volatility': sdf.x.rolling('100ms').var(),
                            'sub': sdf.x - sdf.x.rolling('100ms').mean()})
    yield gen.sleep(0.1)
    a = DataFrame({'volatility': sdf.x.rolling('100ms').var(),
                            'sub': sdf.x - sdf.x.rolling('100ms').mean()})
    yield gen.sleep(0.1)
    a = DataFrame({'volatility': sdf.x.rolling('100ms').var(),
                            'sub': sdf.x - sdf.x.rolling('100ms').mean()})

    # Only the most recent derived frame should still be attached.
    assert len(sdf.stream.downstreams) == n
    del a
    # Force a collection so the count does not depend on collector timing.
    import gc; gc.collect()
    assert len(sdf.stream.downstreams) == 0
Beispiel #21
0
def test_display(stream):
    """Smoke test: displaying a streaming reduction must not raise."""
    for optional_module in ("ipywidgets", "IPython"):
        pytest.importorskip(optional_module)

    frame = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)

    total = sdf.x.sum()

    total._ipython_display_()
Beispiel #22
0
def test_attributes():
    """Existing columns resolve as attributes; unknown names do not."""
    frame = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=frame)

    # getattr with a default: the default comes back only for unknown names.
    fallback = -1
    assert getattr(sdf, "x", fallback) != fallback
    assert getattr(sdf, "z", fallback) == fallback

    sdf.x
    with pytest.raises(AttributeError):
        sdf.z
Beispiel #23
0
def test_set_index():
    """``set_index`` accepts a column name or a streaming expression."""
    df = cudf.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})

    sdf = DataFrame(example=df)

    # Index by column name.
    by_name = sdf.set_index('x').stream.sink_to_list()
    sdf.emit(df)
    assert_eq(by_name[0], df.set_index('x'))

    # Index by an expression derived from a streaming column.
    by_expression = sdf.set_index(sdf.y + 1).stream.sink_to_list()
    sdf.emit(df)
    assert_eq(by_expression[0], df.set_index(df.y + 1))
Beispiel #24
0
def test_repr_html(stream):
    """HTML reprs mention the object's type name and contain a '1'."""
    x_values = (np.arange(10) // 2).astype(float)
    df = cudf.DataFrame({"x": x_values, "y": [1.0] * 10})
    sdf = DataFrame(example=df, stream=stream)

    subjects = [sdf, sdf.y, sdf.y.mean()]
    for subject in subjects:
        rendered = subject._repr_html_()
        assert type(subject).__name__ in rendered
        assert "1" in rendered
Beispiel #25
0
def test_display(stream):
    """Smoke test: displaying a streaming reduction must not raise."""
    for optional_module in ('ipywidgets', 'IPython'):
        pytest.importorskip(optional_module)

    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=frame, stream=stream)

    total = sdf.x.sum()

    total._ipython_display_()
Beispiel #26
0
def test_attributes():
    """Existing columns resolve as attributes; unknown names do not."""
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=frame)

    # getattr with a default: the default comes back only for unknown names.
    fallback = -1
    assert getattr(sdf, 'x', fallback) != fallback
    assert getattr(sdf, 'z', fallback) == fallback

    sdf.x
    with pytest.raises(AttributeError):
        sdf.z
Beispiel #27
0
def test_rolling_count_aggregations(op, window, m, pre_get, post_get, kwargs,
                                    stream):
    """Compare a streaming rolling aggregation with the cudf result.

    ``pre_get`` is applied before ``.rolling(window)`` and ``post_get``
    after it; ``op`` names the aggregation method and ``kwargs`` are passed
    to it. The frame is emitted in chunks of ``m`` rows and the concatenated
    streamed output must equal the aggregation over the whole frame.
    """
    hourly = pd.date_range("2000-01-01", "2000-01-03", freq="1h")
    index = pd.DatetimeIndex(hourly)
    df = cudf.DataFrame({"x": np.arange(len(index))}, index=index)

    def aggregate(frame):
        rolled = post_get(pre_get(frame).rolling(window))
        return getattr(rolled, op)(**kwargs)

    expected = aggregate(df)

    sdf = DataFrame(example=df, stream=stream)
    results = aggregate(sdf).stream.gather().sink_to_list()
    assert len(results) == 0

    for start in range(0, len(df), m):
        stop = start + m
        sdf.emit(df.iloc[start:stop])

    assert len(results) > 1

    assert_eq(cudf.concat(results), expected)
Beispiel #28
0
def test_attributes():
    """Columns are listed by ``dir`` and resolve as attributes."""
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=frame)

    assert 'x' in dir(sdf)
    # A name that is not a column must not be advertised.
    assert 'z' not in dir(sdf)

    sdf.x
    with pytest.raises(AttributeError):
        sdf.z
Beispiel #29
0
def test_display(monkeypatch, capsys):
    """Check ``_ipython_display_`` with and without a usable ipywidgets.

    First the widget-based display is exercised and "Output()" is expected on
    stdout. Then ``ipywidgets.Output.__init__`` is patched to raise
    ImportError and the display output is expected to contain "DataFrame".
    """
    pytest.importorskip("ipywidgets")
    import ipywidgets
    df = pd.DataFrame({
        'x': (np.arange(10) // 2).astype(float),
        'y': [1.0] * 10
    })
    # No explicit stream is passed: this test has no ``stream`` argument, so
    # the name previously used here was not bound inside the function.
    a = DataFrame(example=df)

    # works by side effect of display()
    a._ipython_display_()
    assert "Output()" in capsys.readouterr().out

    def get(*_, **__):
        raise ImportError

    # Simulate a broken ipywidgets for the second display call.
    monkeypatch.setattr(ipywidgets.Output, "__init__", get)

    # Display the same streaming frame again (previously an undefined name).
    a._ipython_display_()
    assert "DataFrame" in capsys.readouterr().out
Beispiel #30
0
def test_setitem(stream):
    """Column assignment on a streaming frame mirrors pandas.

    The same three assignments are applied to the streaming frame (before any
    data flows) and to the pandas frame (after all data was emitted); the
    last streamed mean must equal the mean of the pandas frame.
    """
    df = pd.DataFrame({'x': list(range(10)), 'y': [1] * 10})

    sdf = DataFrame(example=df.iloc[:0], stream=stream)
    stream = sdf.stream

    def add_columns(frame):
        # Derived column, scalar column and multi-column assignment.
        frame['z'] = frame['x'] * 2
        frame['a'] = 10
        frame[['c', 'd']] = frame[['x', 'y']]

    add_columns(sdf)

    means = sdf.mean().stream.gather().sink_to_list()

    for start, stop in ((None, 3), (3, 7), (7, None)):
        stream.emit(df.iloc[start:stop])

    add_columns(df)

    assert_eq(means[-1], df.mean())