def test_score_stream():
    """A dummy estimator whose ``score`` always returns 1 should emit one
    score per streamed batch via ``stream_score``."""

    class ConstantScorer(StreamEstimator):
        # Learning and prediction are no-ops; scoring always reports 1.
        def partial_fit(self, X, y):
            pass

        def predict(self, X):
            pass

        def score(self, X, y):
            return 1

    n_rows = 20
    X_example = pd.DataFrame({
        'name': [None] * n_rows,
        'amount': [None] * n_rows,
    })
    y_example = pd.Series([])
    X_stream = Stream()
    y_stream = Stream()
    X = DataFrame(X_stream, example=X_example)
    y = Series(y_stream, example=y_example)

    model = ConstantScorer()
    score_stream = model.stream_score(X, y)

    collected = []
    score_stream.stream.sink(collected.append)

    # Every batch contributes a score of 1.
    await_for(lambda: collected == [1] * n_rows, .1)
# Beispiel #2
def test_tcp_async():
    """Bytes sent by TCP clients appear, line-delimited, on the source stream."""
    port = 9876
    source = Source.from_tcp(port)
    received = source.sink_to_list()
    source.start()
    # Block until the TCP server is actually listening.
    yield await_for(lambda: source.server is not None, 2, period=0.02)

    try:
        # First client: one line, then immediate disconnect.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.connect(("localhost", port))
        sock.send(b'data\n')
        sock.close()

        # Second client stays connected...
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.connect(("localhost", port))
        sock.send(b'data\n')

        # ...while a third client sends concurrently.
        sock2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock2.connect(("localhost", port))
        sock2.send(b'data2\n')
        yield await_for(
            lambda: received == [b'data\n', b'data\n', b'data2\n'],
            2,
            period=0.01)
    finally:
        source.stop()
        sock.close()
        sock2.close()
# Beispiel #3
def test_from_kafka_thread():
    """Messages produced to Kafka arrive on the stream, and a closed
    consumer stops receiving new messages."""
    suffix = random.randint(0, 10000)
    consumer_conf = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'streamz-test%i' % suffix
    }
    with kafka_service() as kafka:
        kafka, TOPIC = kafka
        stream = Stream.from_kafka([TOPIC], consumer_conf)
        received = stream.sink_to_list()
        stream.start()
        for i in range(10):
            kafka.produce(TOPIC, b'value-%d' % i)
        kafka.flush()
        # It takes some time for messages to come back out of kafka.
        yield await_for(lambda: len(received) == 10, 10, period=0.1)

        assert received[-1] == b'value-9'
        kafka.produce(TOPIC, b'final message')
        kafka.flush()
        yield await_for(lambda: received[-1] == b'final message', 10,
                        period=0.1)

        stream._close_consumer()
        kafka.produce(TOPIC, b'lost message')
        kafka.flush()
        # Absolute sleep here, since we expect the output list *not* to change.
        yield gen.sleep(1)
        assert received[-1] == b'final message'
        stream._close_consumer()
# Beispiel #4
def test_kafka_dask_checkpointing_sync_nodes(c, s, w1, w2):
    '''
    Testing whether Dask's scatter and gather works in conformity with
    the reference counting checkpointing implementation.

    Three consumers run in sequence:

    * the first (group ``ARGS1``) reads all ten messages and checkpoints;
    * the second (same group ``ARGS1``) resumes after the checkpoint, so
      it must see nothing new;
    * the third (fresh group ``ARGS2``, ``auto.offset.reset='earliest'``)
      re-reads the whole topic from the beginning.
    '''
    j1 = random.randint(0, 10000)
    ARGS1 = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'streamz-test%i' % j1,
        'enable.auto.commit': False,
        'auto.offset.reset': 'earliest'
    }
    j2 = j1 + 1
    ARGS2 = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'streamz-test%i' % j2,
        'enable.auto.commit': False,
        'auto.offset.reset': 'earliest'
    }
    with kafka_service() as kafka:
        kafka, TOPIC = kafka
        for i in range(10):
            kafka.produce(TOPIC, b'value-%d' % i)
        kafka.flush()

        def _start_consumer(conf):
            # Build a Dask-backed batched consumer that keeps only
            # odd-valued records; returns (stream, output list).
            stream = Stream.from_kafka_batched(TOPIC,
                                               conf,
                                               asynchronous=True,
                                               dask=True)
            out = stream.map(split).gather().filter(
                lambda x: x[-1] % 2 == 1).sink_to_list()
            stream.start()
            return stream, out

        stream1, out1 = _start_consumer(ARGS1)
        yield await_for(lambda: any(out1) and out1[-1][-1] == 9,
                        10,
                        period=0.2)
        stream1.upstream.stopped = True

        # Same group: must resume past the checkpoint and see nothing.
        stream2, out2 = _start_consumer(ARGS1)
        time.sleep(5)
        assert len(out2) == 0
        stream2.upstream.stopped = True

        # Fresh group with 'earliest' reset: re-reads everything.
        stream3, out3 = _start_consumer(ARGS2)
        yield await_for(lambda: any(out3) and out3[-1][-1] == 9,
                        10,
                        period=0.2)
        stream3.upstream.stopped = True
# Beispiel #5
def test_from_file():
    """Tail a text file as a stream: existing lines are emitted on start
    (default mode), later appends are picked up, and ``from_end=True``
    skips everything written before the source started."""
    with tmpfile() as fn:
        with open(fn, 'wt') as f:
            # Seed the file with three JSON lines before the source exists.
            f.write('{"x": 1, "y": 2}\n')
            f.write('{"x": 2, "y": 2}\n')
            f.write('{"x": 3, "y": 2}\n')
            f.flush()

            source = Stream.from_textfile(fn,
                                          poll_interval=0.010,
                                          asynchronous=True,
                                          start=False)
            L = source.map(json.loads).pluck('x').sink_to_list()

            # Nothing flows until start() is called.
            assert L == []

            source.start()

            yield await_for(lambda: len(L) == 3, timeout=5)

            assert L == [1, 2, 3]

            # Lines appended while the source is running are also emitted.
            f.write('{"x": 4, "y": 2}\n')
            f.write('{"x": 5, "y": 2}\n')
            f.flush()

            start = time()
            while L != [1, 2, 3, 4, 5]:
                yield gen.sleep(0.01)
                assert time() < start + 2  # reads within 2s

            # Second source with from_end=True: pre-existing content
            # (lines 1-5 above) must NOT be replayed.
            source = Stream.from_textfile(fn,
                                          poll_interval=0.010,
                                          asynchronous=True,
                                          start=False,
                                          from_end=True)
            L = source.map(json.loads).pluck('x').sink_to_list()

            source.start()

            yield gen.sleep(0.10)

            assert L == []

            # Only lines written after start() appear.
            f.write('{"x": 6, "y": 2}\n')
            f.write('{"x": 7, "y": 2}\n')
            f.flush()

            yield await_for(lambda: len(L) == 2, timeout=5)

            assert L == [6, 7]
# Beispiel #6
def test_kafka_dask_batch(c, s, w1, w2):
    """A batched Kafka source routed through Dask delivers keyed records."""
    group_suffix = random.randint(0, 10000)
    consumer_conf = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'streamz-test%i' % group_suffix
    }
    with kafka_service() as kafka:
        kafka, TOPIC = kafka
        stream = Stream.from_kafka_batched(TOPIC,
                                           consumer_conf,
                                           keys=True,
                                           asynchronous=True,
                                           dask=True)
        received = stream.gather().sink_to_list()
        stream.start()
        # This frees the loop while dask workers report in.
        yield gen.sleep(5)
        assert isinstance(stream, DaskStream)
        for i in range(10):
            kafka.produce(TOPIC, b'value-%d' % i)
        kafka.flush()
        yield await_for(lambda: any(received), 10, period=0.2)
        assert {'key': None, 'value': b'value-1'} in received[0]
        stream.stop()
        yield gen.sleep(0)
        stream.upstream.upstream.consumer.close()
# Beispiel #7
def test_process():
    """Lines printed by a child process are emitted as byte strings."""
    cmd = ["python", "-c", "for i in range(4): print(i)"]
    source = Source.from_process(cmd)
    collected = source.sink_to_list()
    source.start()
    yield await_for(lambda: collected == [b'0\n', b'1\n', b'2\n', b'3\n'],
                    timeout=5)
    source.stop()
# Beispiel #8
def test_from_file_end():
    """With ``from_end=True`` only lines appended after start() are emitted."""
    with tmpfile() as fn:
        with open(fn, 'wt') as f:
            # Written before the source starts: must never appear.
            f.write('data1\n')
            f.flush()

            source = Stream.from_textfile(fn, poll_interval=0.010,
                                          start=False, from_end=True)
            collected = source.sink_to_list()
            source.start()
            assert collected == []
            yield await_for(lambda: source.started, 2, period=0.02)

            # Appended after start: should be picked up.
            f.write('data2\n')
            f.flush()
            yield await_for(lambda: collected == ['data2\n'], timeout=5,
                            period=0.1)
def test_stream_predict():
    """``stream_predict`` emits one prediction batch per emitted input
    frame, whether ``y_example`` is a Series or a DataFrame.

    Fix: the original used the ``pd.np`` alias, which was deprecated in
    pandas 0.25 and removed in pandas 2.0; ``np`` is already in scope
    (it was used for ``target_predictions``), so use it throughout.
    """
    n_rows = 100
    X_example = pd.DataFrame({
        'name': [None] * n_rows,
        'amount': [None] * n_rows
    })
    X_stream = Stream()
    X = DataFrame(X_stream, example=X_example)

    model = MyStreamingEstimator()
    example_data = pd.Series(np.ones(X_example.shape[0]))
    pred_series = model.stream_predict(X, y_example=pd.Series(example_data))
    pred_df = model.stream_predict(X,
                                   y_example=pd.DataFrame(data=example_data))

    pred_series_list, pred_df_list = [], []

    pred_series.stream.sink(pred_series_list.append)
    pred_df.stream.sink(pred_df_list.append)

    n_fits = 10
    for _ in range(n_fits):
        X_stream.emit(X_example)

    # Each emit triggers one predict call; the model counts them.
    ctr_predicate = lambda: model.predict_ctr == n_fits
    target_predictions = np.ones((X_example.shape[0], n_fits))

    def pred_series_predicate():
        # All streamed Series predictions, flattened, must equal all ones.
        return np.array_equal(
            np.concatenate(pred_series_list).reshape(-1),
            target_predictions.reshape(-1))

    def pred_df_predicate():
        # Same check for the DataFrame-shaped prediction stream.
        return np.array_equal(
            np.concatenate(pred_df_list).reshape(-1),
            target_predictions.reshape(-1))

    await_for(ctr_predicate, .1)
    await_for(pred_series_predicate, .1)
    await_for(pred_df_predicate, .1)
# Beispiel #10
def test_process_str():
    """A shell-style command string behaves the same as an argv list."""
    cmd = 'python -c "for i in range(4): print(i)"'
    source = Source.from_process(cmd)
    if sys.platform != "win32":
        # don't know why - something with pytest and new processes
        policy = asyncio.get_event_loop_policy()
        watcher = asyncio.SafeChildWatcher()
        policy.set_child_watcher(watcher)
        watcher.attach_loop(source.loop.asyncio_loop)
    collected = source.sink_to_list()
    source.start()
    yield await_for(lambda: collected == [b'0\n', b'1\n', b'2\n', b'3\n'],
                    timeout=5)
    source.stop()
# Beispiel #11
def test_process():
    """With ``with_end=True`` the child's output arrives as a single chunk
    once the process exits."""
    cmd = ["python", "-c", "for i in range(4): print(i, end='')"]
    source = Source.from_process(cmd, with_end=True)
    if sys.platform != "win32":
        # don't know why - something with pytest and new processes
        policy = asyncio.get_event_loop_policy()
        watcher = asyncio.SafeChildWatcher()
        policy.set_child_watcher(watcher)
        watcher.attach_loop(source.loop.asyncio_loop)
    collected = source.sink_to_list()
    source.start()
    yield await_for(lambda: collected == [b'0123'], timeout=5)
    source.stop()