def test_null_offsets(pipeline, kafka, clean_db):
    """
    Verify that offsets are stored as NULL if a consumer hasn't consumed any messages yet
    """
    kafka.create_topic('null_topic', partitions=4)
    pipeline.create_stream('null_stream', x='integer')
    pipeline.create_cv('null0', 'SELECT count(*) FROM null_stream')
    pipeline.consume_begin('null_topic', 'null_stream', group_id='null_offsets')

    # Write to a single partition so that only one partition's offsets are updated
    producer = kafka.get_producer('null_topic')
    producer.produce('1', partition_key='key')

    time.sleep(10)
    pipeline.consume_end()

    rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets WHERE "offset" IS NULL')
    assert len(rows) == 3

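# The assertions in the tests below are polled through an `eventually` helper that
# comes from the shared test harness and is not defined in this section. The sketch
# below is only a hypothetical illustration of the retry-until-timeout behaviour the
# tests assume; the name `_eventually_sketch` and the timeout/interval defaults are
# illustrative assumptions, not the harness's actual API.
import time


def _eventually_sketch(assertion_fn, timeout=30, interval=0.5):
    """Retry assertion_fn until it stops raising AssertionError or the timeout elapses."""
    deadline = time.time() + timeout
    while True:
        try:
            assertion_fn()
            return True
        except AssertionError:
            if time.time() >= deadline:
                raise
            time.sleep(interval)
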
def test_consume_stream_partitioned(pipeline, kafka, clean_db):
    """
    Verify that messages with a stream name as their partition key are properly mapped to streams
    """
    for n in range(4):
        pipeline.create_stream('stream%d' % n, x='integer')
        pipeline.create_cv('cv%d' % n, 'SELECT x, COUNT(*) FROM stream%d GROUP BY x' % n)

    kafka.create_topic('stream_partitioned_topic')
    pipeline.consume_begin_stream_partitioned('stream_partitioned_topic')

    def produce(producer, stream):
        for n in range(100):
            producer.produce(str(n), partition_key=stream)

    threads = []
    for n in range(4):
        stream = 'stream%d' % n
        producer = kafka.get_producer('stream_partitioned_topic')
        t = threading.Thread(target=produce, args=(producer, stream))
        t.daemon = True
        t.start()
        threads.append(t)

    # Join explicitly rather than via map(), which is lazy on Python 3 and would never run
    for t in threads:
        t.join()

    def messages_partitioned():
        for n in range(4):
            rows = pipeline.execute('SELECT sum(count) FROM cv%d' % n)
            assert rows and rows[0][0] == 100
            rows = pipeline.execute('SELECT count(*) FROM cv%d' % n)
            assert rows and rows[0][0] == 100

    assert eventually(messages_partitioned)

    pipeline.consume_end()

def test_consume_stream_partitioned_safety(pipeline, kafka, clean_db):
    """
    Produce without a key, or with a nonexistent stream key, and make sure things work properly
    """
    pipeline.create_stream('stream0', x='integer')
    pipeline.create_cv('cv', 'SELECT count(*) FROM stream0')
    kafka.create_topic('stream_partitioned_topic_safe')
    pipeline.consume_begin_stream_partitioned('stream_partitioned_topic_safe')

    producer = kafka.get_producer('stream_partitioned_topic_safe')
    for n in range(100):
        producer.produce(str(n), partition_key='')
        producer.produce(str(n), partition_key='invalid')
        producer.produce(str(n), partition_key='stream0')

    def messages_partitioned():
        rows = pipeline.execute('SELECT count FROM cv')
        assert rows and rows[0][0] == 100

    assert eventually(messages_partitioned)

    pipeline.consume_end()

def test_lag(pipeline, kafka, clean_db):
    """
    Verify that consumer lag is properly tracked
    """
    kafka.create_topic('lag_topic0', partitions=8)
    kafka.create_topic('lag_topic1', partitions=4)

    pipeline.create_stream('stream0', x='integer')
    pipeline.create_cv('lag0', 'SELECT count(*) FROM stream0')
    pipeline.create_stream('stream1', x='integer')
    pipeline.create_cv('lag1', 'SELECT count(*) FROM stream1')

    pipeline.consume_begin('lag_topic0', 'stream0')
    pipeline.consume_begin('lag_topic1', 'stream1')

    producer = kafka.get_producer('lag_topic0')
    for n in range(100):
        producer.produce(str(n), partition_key=str(n))

    producer = kafka.get_producer('lag_topic1')
    for n in range(100):
        producer.produce(str(n), partition_key=str(n))

    def counts0():
        rows = pipeline.execute('SELECT count FROM lag0')
        assert rows[0][0] == 100
        rows = pipeline.execute('SELECT count FROM lag1')
        assert rows[0][0] == 100

    assert eventually(counts0)

    pipeline.consume_end()

    # Now verify there is no reported lag
    rows = pipeline.execute('SELECT sum(lag) FROM pipeline_kafka.consumer_lag')
    assert rows
    assert rows[0][0] == 0

    # Now only start one consumer back up
    pipeline.consume_begin('lag_topic0', 'stream0')

    producer = kafka.get_producer('lag_topic0')
    for n in range(100):
        producer.produce(str(n), partition_key=str(n))

    producer = kafka.get_producer('lag_topic1')
    for n in range(100):
        producer.produce(str(n), partition_key=str(n))

    def counts1():
        rows = pipeline.execute('SELECT count FROM lag0')
        assert rows[0][0] == 200
        rows = pipeline.execute('SELECT count FROM lag1')
        assert rows[0][0] == 100

    assert eventually(counts1)

    # lag_topic0 should have no lag
    rows = pipeline.execute("SELECT sum(lag) FROM pipeline_kafka.consumer_lag WHERE topic = 'lag_topic0'")
    assert rows
    assert rows[0][0] == 0

    # lag_topic1 should have lag now since we didn't start its consumer
    rows = pipeline.execute("SELECT sum(lag) FROM pipeline_kafka.consumer_lag WHERE topic = 'lag_topic1'")
    assert rows
    assert rows[0][0] == 100

    pipeline.consume_end()

def test_consumers(pipeline, kafka, clean_db):
    """
    Verify that offsets are properly maintained when storing them locally across consumer restarts
    """
    pipeline.create_stream('stream0', x='integer')
    pipeline.create_cv('basic', 'SELECT x, COUNT(*) FROM stream0 GROUP BY x')
    kafka.create_topic('test_consumers', partitions=4)
    pipeline.consume_begin('test_consumers', 'stream0', parallelism=4)

    producer = kafka.get_producer('test_consumers')
    for n in range(1000):
        producer.produce(str(n))

    def messages_consumed0():
        rows = pipeline.execute('SELECT sum(count) FROM basic')
        assert rows and rows[0][0] == 1000
        rows = pipeline.execute('SELECT count(*) FROM basic')
        assert rows and rows[0][0] == 1000

    assert eventually(messages_consumed0)
    pipeline.consume_end()

    # Now verify offsets
    rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets ORDER BY partition')
    assert len(rows) == 4

    # Consumer IDs should be the same
    assert all(r[0] == rows[0][0] for r in rows)

    # Partitions
    assert rows[0][1] == 0
    assert rows[1][1] == 1
    assert rows[2][1] == 2
    assert rows[3][1] == 3

    # Offsets should sum to (messages_produced - num_partitions)
    assert sum(r[2] for r in rows) == 996

    # Now produce more messages while we're not consuming
    for n in range(1000):
        producer.produce(str(n))

    pipeline.consume_begin('test_consumers', 'stream0', parallelism=4)
    time.sleep(2)

    # Verify count
    def messages_consumed1():
        rows = pipeline.execute('SELECT sum(count) FROM basic')
        assert rows and rows[0][0] == 2000
        rows = pipeline.execute('SELECT count(*) FROM basic')
        assert rows and rows[0][0] == 1000

    assert eventually(messages_consumed1)
    pipeline.consume_end()

    rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets ORDER BY partition')
    assert len(rows) == 4

    # Consumer IDs should be the same
    assert all(r[0] == rows[0][0] for r in rows)

    # Partitions
    assert rows[0][1] == 0
    assert rows[1][1] == 1
    assert rows[2][1] == 2
    assert rows[3][1] == 3

    # Offsets should sum to (messages_produced - num_partitions)
    assert sum(r[2] for r in rows) == 1996

def test_grouped_consumer(pipeline, kafka, clean_db):
    """
    Verify that consumers with a group.id store offsets in Kafka and consume partitions correctly.
    """
    pipeline.create_stream('stream0', x='integer')
    pipeline.create_stream('stream1', x='integer')
    pipeline.create_cv('group0', "SELECT x, COUNT(*) FROM stream0 GROUP BY x")
    pipeline.create_cv('group1', "SELECT x, COUNT(*) FROM stream1 GROUP BY x")
    kafka.create_topic('topic0', partitions=4)
    kafka.create_topic('topic1', partitions=4)

    pipeline.consume_begin('topic0', 'stream0', group_id='group0')
    pipeline.consume_begin('topic1', 'stream1', group_id='group1')

    producer0 = kafka.get_producer('topic0')
    producer1 = kafka.get_producer('topic1')
    for n in range(1, 101):
        producer0.produce(str(n))
        producer1.produce(str(n))

    def counts():
        rows = pipeline.execute('SELECT sum(count) FROM group0')
        assert rows[0][0] == 100
        rows = pipeline.execute('SELECT sum(count) FROM group1')
        assert rows[0][0] == 100
        rows = pipeline.execute('SELECT count(*) FROM group0')
        assert rows[0][0] == 100
        rows = pipeline.execute('SELECT count(*) FROM group1')
        assert rows[0][0] == 100

    assert eventually(counts)

    pipeline.consume_end()

    # Verify offsets are also stored locally
    rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets')
    assert rows

    # Now produce some more
    for n in range(1, 101):
        producer0.produce(str(-n))
        producer1.produce(str(-n))

    pipeline.consume_begin('topic0', 'stream0', group_id='group0')
    pipeline.consume_begin('topic1', 'stream1', group_id='group1')

    # And verify that only the second batch of messages was read
    def counts_after_restart():
        rows = pipeline.execute('SELECT sum(x) FROM group0')
        assert rows[0][0] == 0
        rows = pipeline.execute('SELECT sum(x) FROM group1')
        assert rows[0][0] == 0
        rows = pipeline.execute('SELECT sum(count) FROM group0')
        assert rows[0][0] == 200
        rows = pipeline.execute('SELECT sum(count) FROM group1')
        assert rows[0][0] == 200
        rows = pipeline.execute('SELECT count(*) FROM group0')
        assert rows[0][0] == 200
        rows = pipeline.execute('SELECT count(*) FROM group1')
        assert rows[0][0] == 200

    assert eventually(counts_after_restart)

    # Verify that we can still begin consuming from a specific offset
    pipeline.create_cv('group2', "SELECT x FROM stream0")
    pipeline.consume_end()

    # Skip one row per partition
    pipeline.consume_begin('topic0', 'stream0', group_id='group2', start_offset=1)

    def from_offset():
        rows = pipeline.execute('SELECT count(*) FROM group2')
        assert rows[0][0] == 196

    assert eventually(from_offset)