コード例 #1
0
def test_custom_aggregation():
    """Exercise a user-defined ``Aggregation`` with and without a window.

    ``Custom`` starts from a state of 0.  ``on_new`` adds 1 to the state and
    ``on_old`` subtracts 100; both return the state they were handed as the
    emitted value.  The same ten-row frame is emitted three times into a plain
    streaming dataframe and into one windowed with ``n=5``, and the collected
    outputs are compared against pinned expectations.
    """
    frame = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5})

    class Custom(Aggregation):
        def initial(self, new):
            # The starting state does not depend on the first frame.
            return 0

        def on_new(self, state, new):
            # Returns (next state, emitted value).
            previous = state
            return previous + 1, previous

        def on_old(self, state, new):
            # Returns (next state, emitted value).
            previous = state
            return previous - 100, previous

    def collect(select):
        # Build a fresh streaming dataframe, attach the aggregation to
        # whatever ``select`` returns, and push the frame through three times.
        source = DataFrame(example=frame)
        results = select(source).aggregate(Custom()).stream.sink_to_list()
        for _ in range(3):
            source.emit(frame)
        return results

    assert collect(lambda s: s) == [0, 1, 2]
    assert collect(lambda s: s.window(n=5)) == [1, -198, -397]
コード例 #2
0
def test_groupby_windowing_value(func, value, getter, grouper, indexer):
    """Compare a time-windowed streaming groupby against the same computation
    on plain pandas slices.

    ``func``, ``grouper`` and ``indexer`` are supplied by the caller and are
    composed as ``func(indexer(x.groupby(grouper(x))))``.  ``value`` is the
    window size, first handed to ``window`` as given and afterwards converted
    to a ``pd.Timedelta`` for slicing.  ``getter`` is accepted but not used in
    this body.
    """
    stamps = pd.date_range(start='2000-01-01', end='2000-01-03', freq='1h')
    df = pd.DataFrame(
        {
            'x': np.arange(len(stamps), dtype=float),
            'y': np.arange(len(stamps), dtype=float) % 2
        },
        index=stamps)

    sdf = DataFrame(example=df)

    def aggregate(frame):
        grouped = frame.groupby(grouper(frame))
        return func(indexer(grouped))

    results = aggregate(sdf.window(value)).stream.gather().sink_to_list()

    span = pd.Timedelta(value)
    one_ns = pd.Timedelta('1ns')

    # Feed the frame in 13-row slices, then finish with an empty frame.
    step = 13
    for offset in range(0, len(stamps), step):
        sdf.emit(df.iloc[offset:offset + step])
    sdf.emit(df.iloc[:0])

    assert len(results) == 5

    # First result: the part of the first slice that lies within ``span`` of
    # that slice's latest timestamp.
    head = df.iloc[:step]
    head = head[head.index.max() - span + one_ns:]
    assert_eq(results[0], aggregate(head))

    # Last result: the part of the whole frame within ``span`` of its end.
    tail = df.loc[stamps.max() - span + one_ns:]
    assert_eq(results[-1], aggregate(tail))
コード例 #3
0
ファイル: test_dataframes.py プロジェクト: vyasr/cudf
def test_groupby_windowing_n(func, n, getter, grouper, indexer):
    """Compare a row-count-windowed streaming groupby on a cudf frame against
    the same computation on direct slices of that frame.

    ``func``, ``grouper`` and ``indexer`` are supplied by the caller and are
    composed as ``func(indexer(x.groupby(grouper(x))))``; ``n`` is the window
    length in rows.  ``getter`` is accepted but not used in this body.
    """
    df = cudf.DataFrame({"x": np.arange(10, dtype=float), "y": [1.0, 2.0] * 5})
    sdf = DataFrame(example=df)

    def aggregate(frame):
        grouped = frame.groupby(grouper(frame))
        return func(indexer(grouped))

    results = aggregate(sdf.window(n=n)).stream.gather().sink_to_list()

    # Feed the frame in 3-row slices, then finish with an empty frame.
    step = 3
    for offset in range(0, 10, step):
        sdf.emit(df.iloc[offset:offset + step])
    sdf.emit(df.iloc[:0])

    assert len(results) == 5

    # First result: at most the last ``n`` rows of the first slice.
    head = df.iloc[max(0, step - n):step]
    assert_eq(results[0], aggregate(head))

    # Last result: the final ``n`` rows of the whole frame.
    tail = df.iloc[len(df) - n:]
    assert_eq(results[-1], aggregate(tail))
コード例 #4
0
ファイル: test_dataframes.py プロジェクト: vuule/cudf
def test_groupby_windowing_value(func, value, getter, grouper, indexer):
    """Compare a time-windowed streaming groupby on a cudf frame against the
    same computation on direct slices of that frame.

    ``func``, ``grouper`` and ``indexer`` are supplied by the caller and are
    composed as ``func(indexer(x.groupby(grouper(x))))``.  ``value`` is the
    window size and is converted to a ``pd.Timedelta`` before use.  ``getter``
    is accepted but not used in this body.
    """
    # ``pd.DatetimeIndex(start=..., end=..., freq=...)`` is no longer accepted
    # by pandas; ``pd.date_range`` is the supported way to build the same
    # hourly index.
    index = pd.date_range(start="2000-01-01", end="2000-01-03", freq="1h")
    df = cudf.DataFrame(
        {
            "x": np.arange(len(index), dtype=float),
            "y": np.arange(len(index), dtype=float) % 2,
        },
        index=index,
    )

    value = pd.Timedelta(value)

    sdf = DataFrame(example=df)

    def f(x):
        return func(indexer(x.groupby(grouper(x))))

    L = f(sdf.window(value)).stream.gather().sink_to_list()

    # Feed the frame in 13-row slices; one result is expected per slice.
    diff = 13
    for i in range(0, len(index), diff):
        sdf.emit(df.iloc[i:i + diff])

    assert len(L) == 4

    # Rows of the first slice at or after ``min + value`` are counted, and
    # that many rows are dropped from the front of the slice to form the
    # expected first window.
    first = df.iloc[:diff]
    lost = first.loc[first.index.min() + value:]
    first = first.iloc[len(lost):]

    g = f(first)
    assert_eq(L[0], g)

    # Expected last window: rows within ``value`` of the final timestamp.
    last = df.loc[index.max() - value + pd.Timedelta("1s"):]
    h = f(last)
    assert_eq(L[-1], h)
コード例 #5
0
ファイル: test_dataframes.py プロジェクト: nils-braun/streamz
def test_window_aggs_with_start_state(stream):
    """Check that a windowed sum can be resumed from a previously saved state.

    A 2-row windowed sum of ``amount`` is run with ``with_state=True`` so each
    output is indexed as ``[0]`` (state) and ``[1]`` (result).  After two
    emissions the result is 450 (200 + 250).  A second pipeline on a fresh
    ``Stream`` is started from the saved state; one more row of 50 yields 300
    (250 + 50).
    """
    def empty_example():
        return pd.DataFrame({'name': [], 'amount': []})

    sdf = DataFrame(stream, example=empty_example())
    output0 = (
        sdf.window(2, with_state=True, start=None)
        .amount.sum()
        .stream.gather()
        .sink_to_list()
    )

    stream.emit(pd.DataFrame({'name': ['Alice', 'Tom', 'Linda'], 'amount': [50, 100, 200]}))
    stream.emit(pd.DataFrame({'name': ['Bob'], 'amount': [250]}))
    assert output0[-1][1] == 450

    # Restart on a brand-new stream, seeding the window with the saved state.
    stream = Stream()
    sdf = DataFrame(stream, example=empty_example())
    saved_state = output0[-1][0]
    output1 = (
        sdf.window(2, with_state=True, start=saved_state)
        .amount.sum()
        .stream.gather()
        .sink_to_list()
    )
    stream.emit(pd.DataFrame({'name': ['Alice'], 'amount': [50]}))
    assert output1[-1][1] == 300
コード例 #6
0
def test_windowed_groupby_aggs_with_start_state(stream):
    """Check that a windowed groupby-sum on cudf frames can be resumed from a
    previously saved state.

    A 5-row windowed ``groupby(["name"]).amount.sum()`` is run with
    ``with_state=True`` so each output is indexed as ``[0]`` (state) and
    ``[1]`` (result).  Six rows are emitted in two frames; a second pipeline
    on a fresh ``Stream`` is then seeded with the saved state and receives
    four more rows.  The pinned result is consistent with only the five most
    recent rows being summed (Bob: 350 carried over + 200 new = 550).
    """
    def windowed_group_sum(source, start):
        # Same pipeline for both runs; only the stream and start state differ.
        example = cudf.DataFrame({"name": [], "amount": []})
        sdf = DataFrame(source, example=example)
        window = sdf.window(5, with_state=True, start=start)
        summed = window.groupby(["name"]).amount.sum()
        return summed.stream.gather().sink_to_list()

    output0 = windowed_group_sum(stream, None)

    first_batch = cudf.DataFrame(
        {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}
    )
    stream.emit(first_batch)
    second_batch = cudf.DataFrame(
        {"name": ["Alice", "Linda", "Bob"], "amount": [250, 300, 350]}
    )
    stream.emit(second_batch)

    # Resume on a new stream from the state captured by the first pipeline.
    resumed = Stream()
    output1 = windowed_group_sum(resumed, output0[-1][0])

    third_batch = cudf.DataFrame(
        {
            "name": ["Alice", "Linda", "Tom", "Bob"],
            "amount": [50, 100, 150, 200],
        }
    )
    resumed.emit(third_batch)

    expected = cudf.DataFrame(
        {
            "name": ["Alice", "Bob", "Linda", "Tom"],
            "amount": [50, 550, 100, 150],
        }
    )
    assert_eq(output1[-1][1].reset_index(), expected)
コード例 #7
0
ファイル: test_dataframes.py プロジェクト: jbcrail/streamz
def test_window_sum(stream):
    """Check the running sum of column ``x`` over a 4-row window.

    The same 3-row frame (x = 1, 2, 3) is emitted three times.  The first
    result is 6; afterwards the window holds the last four rows (3, 1, 2, 3),
    giving 9.  The collected list is checked after every emission.
    """
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    window_sum = sdf.window(n=4).x.sum()
    L = window_sum.stream.gather().sink_to_list()

    expected = [6, 9, 9]
    for count in range(1, len(expected) + 1):
        sdf.emit(df)
        assert L == expected[:count]
コード例 #8
0
ファイル: test_dataframes.py プロジェクト: nils-braun/streamz
def test_windowed_groupby_aggs_with_start_state(stream):
    """Check that a windowed groupby-sum on pandas frames can be resumed from
    a previously saved state.

    A 5-row windowed ``groupby(['name']).amount.sum()`` is run with
    ``with_state=True`` so each output is indexed as ``[0]`` (state) and
    ``[1]`` (result).  A second pipeline on a fresh ``Stream`` is seeded with
    the state saved by the first one, and its last result is compared with a
    pinned frame (Bob: 350 carried over + 200 new = 550.0).
    """
    def build(source, start):
        # Identical pipeline for both runs; only stream and start state vary.
        example = pd.DataFrame({'name': [], 'amount': []})
        sdf = DataFrame(source, example=example)
        grouped = sdf.window(5, with_state=True, start=start).groupby(['name'])
        return grouped.amount.sum().stream.gather().sink_to_list()

    output0 = build(stream, None)

    stream.emit(pd.DataFrame({'name': ['Alice', 'Tom', 'Linda'], 'amount': [50, 100, 200]}))
    stream.emit(pd.DataFrame({'name': ['Alice', 'Linda', 'Bob'], 'amount': [250, 300, 350]}))

    # Resume on a new stream from the state captured by the first pipeline.
    stream = Stream()
    output1 = build(stream, output0[-1][0])
    stream.emit(
        pd.DataFrame({'name': ['Alice', 'Linda', 'Tom', 'Bob'], 'amount': [50, 100, 150, 200]})
    )

    out_df1 = pd.DataFrame(
        {
            'name': ['Alice', 'Bob', 'Linda', 'Tom'],
            'amount': [50.0, 550.0, 100.0, 150.0],
        }
    )
    assert_eq(output1[-1][1].reset_index(), out_df1)
コード例 #9
0
ファイル: test_dataframes.py プロジェクト: salah93/streamz
def test_windowing_value_empty_intermediate_index(stream):
    """Check a "2h" windowed sum when filtering leaves some batches empty.

    Every emitted frame is first reduced to the rows whose ``amount`` equals
    5, so the first and fourth batches reach the window with no rows at all.
    Five hourly-indexed batches of three rows each are emitted and the
    collected sums are compared with a pinned list.
    """
    def preprocess(df):
        # Keep only the rows with amount == 5.
        return df.loc[df["amount"] == 5]

    source = stream.map(preprocess)

    example = pd.DataFrame({"amount": []})
    sdf = DataFrame(stream=source, example=example)

    output = sdf.window("2h").amount.sum().stream.gather().sink_to_list()

    # (amounts, timestamps) for each emitted batch, in emission order.
    batches = [
        ([1, 2, 3],
         ["2050-01-01 00:00:00", "2050-01-01 01:00:00", "2050-01-01 02:00:00"]),
        ([5, 5, 5],
         ["2050-01-01 03:00:00", "2050-01-01 04:00:00", "2050-01-01 05:00:00"]),
        ([4, 5, 6],
         ["2050-01-01 06:00:00", "2050-01-01 07:00:00", "2050-01-01 08:00:00"]),
        ([1, 2, 3],
         ["2050-01-01 09:00:00", "2050-01-01 10:00:00", "2050-01-01 11:00:00"]),
        ([5, 5, 5],
         ["2050-01-01 12:00:00", "2050-01-01 13:00:00", "2050-01-01 14:00:00"]),
    ]
    for amounts, stamps in batches:
        index = [pd.Timestamp(stamp) for stamp in stamps]
        stream.emit(pd.DataFrame({"amount": amounts}, index=index))

    assert_eq(output, [0, 10, 5, 5, 10])
コード例 #10
0
ファイル: test_dataframes.py プロジェクト: jbcrail/streamz
def test_window_full():
    """Check that an identity ``apply`` on a 4-row window yields the window's
    rows after each emission.

    A ten-row frame is emitted as rows 0-2, 3-7 and 8-9.  The expected outputs
    are rows 0-2 (fewer than four rows seen so far), rows 4-7, and the final
    four rows.
    """
    df = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5})
    sdf = DataFrame(example=df)

    windowed = sdf.window(n=4).apply(lambda x: x)
    L = windowed.stream.sink_to_list()

    for rows in (slice(None, 3), slice(3, 8), slice(8, None)):
        sdf.emit(df.iloc[rows])

    expected_rows = (slice(None, 3), slice(4, 8), slice(-4, None))
    for position, rows in enumerate(expected_rows):
        assert_eq(L[position], df.iloc[rows])
コード例 #11
0
ファイル: test_dataframes.py プロジェクト: jbcrail/streamz
def test_window_sum_dataframe(stream):
    """Check the column-wise sum of a whole frame over a 4-row window.

    The same 3-row frame is emitted three times.  The first result sums all
    three rows (x=6, y=15); later results sum the last four rows seen
    (x=9, y=21).  After every emission, all results collected so far are
    re-checked.
    """
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    L = sdf.window(n=4).sum().stream.gather().sink_to_list()

    expected = [[6, 15], [9, 21], [9, 21]]
    for emitted in range(1, len(expected) + 1):
        sdf.emit(df)
        for position in range(emitted):
            assert_eq(L[position], pd.Series(expected[position], index=['x', 'y']))
コード例 #12
0
def test_window_sum_dataframe(stream):
    """Check the column-wise sum of a whole cudf frame over a 4-row window.

    The same 3-row frame is emitted three times.  The first result sums all
    three rows (x=6, y=15); later results sum the last four rows seen
    (x=9, y=21).  After every emission, all results collected so far are
    re-checked.
    """
    df = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    L = sdf.window(n=4).sum().stream.gather().sink_to_list()

    expected = [[6, 15], [9, 21], [9, 21]]
    for emitted in range(1, len(expected) + 1):
        sdf.emit(df)
        for position in range(emitted):
            assert_eq(L[position], cudf.Series(expected[position], index=["x", "y"]))
コード例 #13
0
# One-row template frame with the same columns (x, y) as the frames emitted
# in the loop below.
samples = pd.DataFrame({'x': [0], 'y': [0]})

# Wrap the source stream as a streaming dataframe, passing `samples` as the
# `example` frame.
# NOTE(review): `source` is not defined in this excerpt -- presumably a stream
# object created earlier in the script; confirm.
sdf = DataFrame(source, example=samples)


def stest(r):
    """Print the current ``datetime.now()`` value followed by the item ``r``."""
    print(datetime.now())
    print(r)


# Take a window of size 2 over the streaming dataframe and request its full
# contents.
# NOTE(review): presumably each update yields the rows currently held in the
# window (the last 2 rows, not the last 3) -- verify against the streamz
# windowing documentation.
df = sdf.window(2).full()

# Register `stest` as a sink on the windowed result's stream.
df.stream.sink(stest)

for i in range(10):
    # Emit a three-row frame whose x and y values all equal the loop counter.
    # An alternative would be to emit iloc slices of an existing dataframe.
    source.emit(pd.DataFrame({'x': [i, i, i], 'y': [i, i, i]}))

    # Pause for a short while...
    sleep(0.2)

# Print a visual separator after the loop.
print()
print('--------------------------')
print()