def test_null_offsets(pipeline, kafka, clean_db):
  """
  Verify that offsets are stored as NULL if a consumer hasn't consumed any messages yet
  """
  kafka.create_topic('null_topic', partitions=4)
  pipeline.create_stream('null_stream', x='integer')
  pipeline.create_cv('null0', 'SELECT count(*) FROM null_stream')

  pipeline.consume_begin('null_topic', 'null_stream', group_id='null_offsets')

  # Write to a single partition so that only one partition's offsets are updated
  producer = kafka.get_producer('null_topic')
  producer.produce('1', partition_key='key')

  time.sleep(10)
  pipeline.consume_end()

  rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets WHERE "offset" IS NULL')
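  # Only the partition that received the message gets a non-NULL offset; the other 3 of the 4 partitions should still be NULL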
  assert len(rows) == 3
def test_consume_stream_partitioned(pipeline, kafka, clean_db):
  """
  Verify that messages with a stream name as their partition key
  are properly mapped to streams
  """
  for n in range(4):
    pipeline.create_stream('stream%d' % n, x='integer')
    pipeline.create_cv('cv%d' % n, 'SELECT x, COUNT(*) FROM stream%d GROUP BY x' % n)

  kafka.create_topic('stream_partitioned_topic')
  pipeline.consume_begin_stream_partitioned('stream_partitioned_topic')

  def produce(producer, stream):
    for n in range(100):
      producer.produce(str(n), partition_key=stream)

  threads = []
  for n in range(4):
    stream = 'stream%d' % n
    producer = kafka.get_producer('stream_partitioned_topic')
    t = threading.Thread(target=produce, args=(producer, stream))
    t.daemon = True
    t.start()
    threads.append(t)

  for t in threads:
    t.join()

  def messages_partitioned():
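    # Each stream should have received exactly its own 100 messages: 100 distinct x values, each counted once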
    for n in range(4):
      rows = pipeline.execute('SELECT sum(count) FROM cv%d' % n)
      assert rows and rows[0][0] == 100

      rows = pipeline.execute('SELECT count(*) FROM cv%d' % n)
      assert rows and rows[0][0] == 100

  assert eventually(messages_partitioned)
  pipeline.consume_end()
def test_consume_stream_partitioned_safety(pipeline, kafka, clean_db):
  """
  Produce messages with an empty or non-existent stream name as the partition
  key and make sure things still work properly
  """
  pipeline.create_stream('stream0', x='integer')
  pipeline.create_cv('cv', 'SELECT count(*) FROM stream0')

  kafka.create_topic('stream_partitioned_topic_safe')
  pipeline.consume_begin_stream_partitioned('stream_partitioned_topic_safe')

  producer = kafka.get_producer('stream_partitioned_topic_safe')
  for n in range(100):
    producer.produce(str(n), partition_key='')
    producer.produce(str(n), partition_key='invalid')
    producer.produce(str(n), partition_key='stream0')

  def messages_partitioned():
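    # Messages with an empty or unrecognized partition key have no matching stream, so only the 100 messages keyed to stream0 are counted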
    rows = pipeline.execute('SELECT count FROM cv')
    assert rows and rows[0][0] == 100

  assert eventually(messages_partitioned)
  pipeline.consume_end()
def test_lag(pipeline, kafka, clean_db):
  """
  Verify that consumer lag is properly tracked
  """
  kafka.create_topic('lag_topic0', partitions=8)
  kafka.create_topic('lag_topic1', partitions=4)

  pipeline.create_stream('stream0', x='integer')
  pipeline.create_cv('lag0', 'SELECT count(*) FROM stream0')

  pipeline.create_stream('stream1', x='integer')
  pipeline.create_cv('lag1', 'SELECT count(*) FROM stream1')

  pipeline.consume_begin('lag_topic0', 'stream0')
  pipeline.consume_begin('lag_topic1', 'stream1')

  producer = kafka.get_producer('lag_topic0')
  for n in range(100):
    producer.produce(str(n), partition_key=str(n))

  producer = kafka.get_producer('lag_topic1')
  for n in range(100):
    producer.produce(str(n), partition_key=str(n))

  def counts0():
    rows = pipeline.execute('SELECT count FROM lag0')
    assert rows[0][0] == 100

    rows = pipeline.execute('SELECT count FROM lag1')
    assert rows[0][0] == 100

  assert eventually(counts0)

  pipeline.consume_end()

  # Now verify there is no reported lag
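  # (consumer_lag reports the number of produced but not-yet-consumed messages per topic)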
  rows = pipeline.execute('SELECT sum(lag) FROM pipeline_kafka.consumer_lag')
  assert rows
  assert rows[0][0] == 0

  # Now only start one consumer back up
  pipeline.consume_begin('lag_topic0', 'stream0')

  producer = kafka.get_producer('lag_topic0')
  for n in range(100):
    producer.produce(str(n), partition_key=str(n))

  producer = kafka.get_producer('lag_topic1')
  for n in range(100):
    producer.produce(str(n), partition_key=str(n))

  def counts1():
    rows = pipeline.execute('SELECT count FROM lag0')
    assert rows[0][0] == 200

    rows = pipeline.execute('SELECT count FROM lag1')
    assert rows[0][0] == 100

  assert eventually(counts1)

  # lag_topic0 should have no lag
  rows = pipeline.execute("SELECT sum(lag) FROM pipeline_kafka.consumer_lag WHERE topic = 'lag_topic0'")
  assert rows
  assert rows[0][0] == 0

  # lag_topic1 should have lag now since we didn't start its consumer
  rows = pipeline.execute("SELECT sum(lag) FROM pipeline_kafka.consumer_lag WHERE topic = 'lag_topic1'")
  assert rows
  assert rows[0][0] == 100

  pipeline.consume_end()
def test_consumers(pipeline, kafka, clean_db):
  """
  Verify that locally stored offsets are properly maintained across consumer restarts
  """
  pipeline.create_stream('stream0', x='integer')
  pipeline.create_cv('basic', 'SELECT x, COUNT(*) FROM stream0 GROUP BY x')

  kafka.create_topic('test_consumers', partitions=4)
  pipeline.consume_begin('test_consumers', 'stream0', parallelism=4)

  producer = kafka.get_producer('test_consumers')
  for n in range(1000):
    producer.produce(str(n))

  def messages_consumed0():
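    # 1000 messages with distinct values 0-999 were produced, so both the total count and the number of groups should be 1000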
    rows = pipeline.execute('SELECT sum(count) FROM basic')
    assert rows and rows[0][0] == 1000

    rows = pipeline.execute('SELECT count(*) FROM basic')
    assert rows and rows[0][0] == 1000

  assert eventually(messages_consumed0)
  pipeline.consume_end()

  # Now verify offsets
  rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets ORDER BY partition')
  assert len(rows) == 4

  # Consumer IDs should be the same
  assert all(r[0] == rows[0][0] for r in rows)

  # Partitions
  assert rows[0][1] == 0
  assert rows[1][1] == 1
  assert rows[2][1] == 2
  assert rows[3][1] == 3

  # Stored offsets are zero-based (the last consumed message per partition), so they should sum to messages_produced - num_partitions (1000 - 4 = 996)
  assert sum(r[2] for r in rows) == 996

  # Now produce more messages while we're not consuming
  for n in range(1000):
    producer.produce(str(n))

  pipeline.consume_begin('test_consumers', 'stream0', parallelism=4)
  time.sleep(2)

  # Verify counts: the second batch repeats the same 0-999 values, so the total doubles but the number of groups stays at 1000
  def messages_consumed1():
    rows = pipeline.execute('SELECT sum(count) FROM basic')
    assert rows and rows[0][0] == 2000

    rows = pipeline.execute('SELECT count(*) FROM basic')
    assert rows and rows[0][0] == 1000

  assert eventually(messages_consumed1)
  pipeline.consume_end()

  rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets ORDER BY partition')
  assert len(rows) == 4

  # Consumer IDs should be the same
  assert all(r[0] == rows[0][0] for r in rows)

  # Partitions
  assert rows[0][1] == 0
  assert rows[1][1] == 1
  assert rows[2][1] == 2
  assert rows[3][1] == 3

  # Offsets should again sum to messages_produced - num_partitions (2000 - 4 = 1996)
  assert sum(r[2] for r in rows) == 1996
def test_grouped_consumer(pipeline, kafka, clean_db):
  """
  Verify that consumers with a group.id store offsets in Kafka and consume
  partitions correctly.
  """
  pipeline.create_stream('stream0', x='integer')
  pipeline.create_stream('stream1', x='integer')
  pipeline.create_cv('group0', "SELECT x, COUNT(*) FROM stream0 GROUP BY x")
  pipeline.create_cv('group1', "SELECT x, COUNT(*) FROM stream1 GROUP BY x")

  kafka.create_topic('topic0', partitions=4)
  kafka.create_topic('topic1', partitions=4)

  pipeline.consume_begin('topic0', 'stream0', group_id='group0')
  pipeline.consume_begin('topic1', 'stream1', group_id='group1')

  producer0 = kafka.get_producer('topic0')
  producer1 = kafka.get_producer('topic1')

  for n in range(1, 101):
    producer0.produce(str(n))
    producer1.produce(str(n))

  def counts():
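    # 100 distinct values (1-100) were produced to each topic, so each CV should have 100 groups with a total count of 100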
    rows = pipeline.execute('SELECT sum(count) FROM group0')
    assert rows[0][0] == 100

    rows = pipeline.execute('SELECT sum(count) FROM group1')
    assert rows[0][0] == 100

    rows = pipeline.execute('SELECT count(*) FROM group0')
    assert rows[0][0] == 100

    rows = pipeline.execute('SELECT count(*) FROM group1')
    assert rows[0][0] == 100

  assert eventually(counts)

  pipeline.consume_end()

  # Verify offsets are also stored locally
  rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets')
  assert rows

  # Now produce some more
  for n in range(1, 101):
    producer0.produce(str(-n))
    producer1.produce(str(-n))

  pipeline.consume_begin('topic0', 'stream0', group_id='group0')
  pipeline.consume_begin('topic1', 'stream1', group_id='group1')

  # And verify that the consumers resumed from their stored offsets, reading only the second batch of messages on top of the first
  def counts_after_restart():
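    # The first batch (1..100) and second batch (-1..-100) cancel out, so sum(x) is 0 only if each message was consumed exactly once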
    rows = pipeline.execute('SELECT sum(x) FROM group0')
    assert rows[0][0] == 0

    rows = pipeline.execute('SELECT sum(x) FROM group1')
    assert rows[0][0] == 0

    rows = pipeline.execute('SELECT sum(count) FROM group0')
    assert rows[0][0] == 200

    rows = pipeline.execute('SELECT sum(count) FROM group1')
    assert rows[0][0] == 200

    rows = pipeline.execute('SELECT count(*) FROM group0')
    assert rows[0][0] == 200

    rows = pipeline.execute('SELECT count(*) FROM group1')
    assert rows[0][0] == 200

  assert eventually(counts_after_restart)

  # Verify that we can still begin consuming from a specific offset
  pipeline.create_cv('group2', "SELECT x FROM stream0")

  pipeline.consume_end()
  # Skip one row per partition
  pipeline.consume_begin('topic0', 'stream0', group_id='group2', start_offset=1)

  def from_offset():
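    # 200 messages were produced to topic0 in total; starting at offset 1 skips one message on each of the 4 partitions, leaving 196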
    rows = pipeline.execute('SELECT count(*) FROM group2')
    assert rows[0][0] == 196

  assert eventually(from_offset)