class NativeKafkaConsumer(BackgroundTask):
    """Background task that consumes records from a fixed set of topic
    partitions until a target record count is reached, the task is stopped,
    or too many consecutive polls come back empty.

    This variant manages offsets itself: ``auto_offset_reset`` is set to a
    deliberately invalid value so that gaps / out-of-range reads raise
    ``OffsetOutOfRangeError`` instead of silently resetting, and the handler
    re-seeks to just past the last record already consumed.
    """

    def __init__(self, brokers, topic_partitions, num_records=1, batch_size=4092):
        # brokers: bootstrap server list passed straight to KafkaConsumer.
        # topic_partitions: TopicPartition objects to assign (no group rebalance).
        # num_records: stop once this many records have been accumulated.
        # batch_size: max_records per poll() call.
        super(NativeKafkaConsumer, self).__init__()
        self._topic_partitions = topic_partitions
        self._num_records = num_records
        self._brokers = brokers
        self._batch_size = batch_size
        # Consecutive empty polls tolerated before giving up.
        self._max_attempts = 20
        self.results = TopicsResultSet()

    def task_name(self):
        """Return a (probabilistically) unique client id for this worker."""
        return f"consumer-worker-{str(random.randint(0,9999))}"

    def _init_consumer(self):
        """Create a consumer assigned to the partitions, positioned at the start.

        Setting 'auto_offset_reset' to something other than "earliest" or
        "latest" makes the client raise "OffsetOutOfRangeError" when there is
        a gap in the log or a read of an offset that is too new / old,
        instead of silently jumping — the _run() loop handles that error.
        """
        consumer = KafkaConsumer(client_id=self.task_name(),
                                 bootstrap_servers=self._brokers,
                                 request_timeout_ms=1000,
                                 enable_auto_commit=False,
                                 auto_offset_reset="crash")
        # Manual assignment (not subscribe): no consumer-group coordination.
        consumer.assign(self._topic_partitions)
        for tps in self._topic_partitions:
            consumer.seek_to_beginning(tps)
        return consumer

    def _run(self):
        """Poll until stopped, the record target is met, or reads stall."""

        def stop_consume(empty_attempts):
            read_all = self.results.num_records() >= self._num_records
            waited_enough = empty_attempts >= self._max_attempts
            return self.is_finished() or read_all or waited_enough

        consumer = self._init_consumer()
        empty_reads = 0
        while not stop_consume(empty_reads):
            try:
                results = consumer.poll(timeout_ms=1000,
                                        max_records=self._batch_size)
                if results is None or len(results) == 0:
                    empty_reads += 1
                    time.sleep(1)
                else:
                    empty_reads = 0
                    self.results.append(results)
            except OffsetOutOfRangeError:
                # Ensure that the element at this offset is read, otherwise
                # there will be gaps in the result set. In other words this
                # class does manage its own offset to its subscriptions.
                time.sleep(1)
                empty_reads += 1
                for tp, values in self.results.rset.items():
                    # Resume one past the last record already captured.
                    offset = values[-1].offset + 1
                    consumer.seek(tp, offset)
def __init__(self, brokers, topic_partitions, num_records=1, batch_size=4092):
    """Record consumption parameters and start with an empty result set.

    brokers: bootstrap server list handed to the Kafka client later.
    topic_partitions: partitions this worker will read from.
    num_records: target number of records to consume (default 1).
    batch_size: upper bound on records fetched per poll (default 4092).
    """
    super(NativeKafkaConsumer, self).__init__()
    self._brokers = brokers
    self._topic_partitions = topic_partitions
    self._batch_size = batch_size
    self._num_records = num_records
    # Cap on consecutive unproductive read attempts.
    self._max_attempts = 20
    self.results = TopicsResultSet()
def __init__(self, brokers, topic_partitions, num_records=1, batch_size=4092):
    """Set up state and eagerly construct the underlying KafkaConsumer.

    brokers: bootstrap server list for the client.
    topic_partitions: partitions this worker will read from.
    num_records: target number of records to consume (default 1).
    batch_size: upper bound on records fetched per poll (default 4092).
    """
    super(NativeKafkaConsumer, self).__init__()
    self._topic_partitions = topic_partitions
    self._num_records = num_records
    self._batch_size = batch_size
    # Build the client configuration in one place, then instantiate.
    consumer_config = {
        "client_id": self.task_name(),
        "bootstrap_servers": brokers,
        "request_timeout_ms": 1000,
        "enable_auto_commit": False,
        "auto_offset_reset": "earliest",
    }
    self._consumer = KafkaConsumer(**consumer_config)
    self.results = TopicsResultSet()
class NativeKafkaConsumer(BackgroundTask):
    """Background task that reads from a fixed set of topic partitions until
    a record-count target is hit, the task is stopped externally, or a run of
    consecutive polls returns nothing.
    """

    def __init__(self, brokers, topic_partitions, num_records=1, batch_size=4092):
        """Record configuration; the consumer itself is built lazily in _run().

        brokers: bootstrap server list for the client.
        topic_partitions: partitions to assign (manual, no group rebalance).
        num_records: stop once this many records have been collected.
        batch_size: max_records handed to each poll() call.
        """
        super(NativeKafkaConsumer, self).__init__()
        self._brokers = brokers
        self._topic_partitions = topic_partitions
        self._num_records = num_records
        self._batch_size = batch_size
        # Give up after this many consecutive empty polls.
        self._max_attempts = 20
        self.results = TopicsResultSet()

    def task_name(self):
        """Return a randomized client id for this worker."""
        return f"consumer-worker-{str(random.randint(0,9999))}"

    def _init_consumer(self):
        """Build a consumer, assign the partitions, rewind to the start."""
        consumer = KafkaConsumer(client_id=self.task_name(),
                                 bootstrap_servers=self._brokers,
                                 request_timeout_ms=1000,
                                 enable_auto_commit=False,
                                 auto_offset_reset="latest")
        consumer.assign(self._topic_partitions)
        for partition in self._topic_partitions:
            consumer.seek_to_beginning(partition)
        return consumer

    def _run(self):
        """Main poll loop with three exit conditions as guard clauses."""
        consumer = self._init_consumer()
        idle_polls = 0
        while True:
            # Externally cancelled?
            if self.is_finished():
                break
            # Collected everything we were asked for?
            if self.results.num_records() >= self._num_records:
                break
            # Stalled for too long?
            if idle_polls >= self._max_attempts:
                break
            batch = consumer.poll(timeout_ms=1000,
                                  max_records=self._batch_size)
            if not batch:
                idle_polls += 1
                time.sleep(1)
            else:
                idle_polls = 0
                self.results.append(batch)
class NativeKafkaConsumer(BackgroundTask):
    """Background task that consumes from assigned topic partitions until a
    record-count target is met, the task is stopped, or ten consecutive polls
    come back empty.

    Fix: the original accumulated a running total of fetched records into a
    local ``total`` via ``functools.reduce``, but never read it — dead work
    on every non-empty poll. The accumulator has been removed; behavior is
    otherwise unchanged.
    """

    def __init__(self, brokers, topic_partitions, num_records=1, batch_size=4092):
        """Set up state and eagerly construct the underlying KafkaConsumer.

        brokers: bootstrap server list for the client.
        topic_partitions: partitions this worker will read from.
        num_records: stop once this many records have been collected.
        batch_size: max_records handed to each poll() call.
        """
        super(NativeKafkaConsumer, self).__init__()
        self._topic_partitions = topic_partitions
        self._num_records = num_records
        self._batch_size = batch_size
        self._consumer = KafkaConsumer(client_id=self.task_name(),
                                       bootstrap_servers=brokers,
                                       request_timeout_ms=1000,
                                       enable_auto_commit=False,
                                       auto_offset_reset="earliest")
        self.results = TopicsResultSet()

    def task_name(self):
        """Return a randomized client id for this worker."""
        return f"consumer-worker-{str(random.randint(0,9999))}"

    def _run(self):
        """Poll until stopped, the target is met, or the countdown expires."""

        def stop_consume(empty_iterations):
            read_all = self.results.num_records() >= self._num_records
            # Countdown style: starts at 10 and is decremented on each
            # empty poll; reaching 0 means we waited long enough.
            waited_enough = empty_iterations <= 0
            return self.is_finished() or read_all or waited_enough

        self._consumer.assign(self._topic_partitions)
        empty_iterations = 10
        while not stop_consume(empty_iterations):
            r = self._consumer.poll(timeout_ms=100,
                                    max_records=self._batch_size)
            if len(r) == 0:
                empty_iterations -= 1
                time.sleep(1)
            else:
                # Productive poll: reset the countdown and keep the records.
                empty_iterations = 10
                self.results.append(r)
class NativeKafkaConsumer(BackgroundTask):
    """Background task that consumes from assigned partitions until a
    per-topic record bound is satisfied (then drained), the task is stopped,
    or reads stall for too long.

    ``auto_offset_reset`` is set to a deliberately invalid value so the
    client surfaces out-of-range reads as errors rather than silently
    resetting the position — this class manages positioning itself via
    seek_to_beginning.
    """

    def __init__(self, brokers, topic_partitions, max_records_per_topic, batch_size=4092):
        # brokers: bootstrap server list for the client.
        # topic_partitions: TopicPartition objects to assign.
        # max_records_per_topic: dict mapping topic -> minimum records expected.
        # batch_size: max_records per poll() call.
        super(NativeKafkaConsumer, self).__init__()
        self._topic_partitions = topic_partitions
        self._max_records_per_topic = max_records_per_topic
        self._brokers = brokers
        self._batch_size = batch_size
        # Consecutive empty polls tolerated before the lower bound is met.
        self._max_attempts = 20
        self.results = TopicsResultSet()

    def task_name(self):
        """Return a randomized client id for this worker."""
        return f"consumer-worker-{str(random.randint(0,9999))}"

    def total_expected_records(self):
        """Return the sum of per-topic expected record counts."""
        return sum(self._max_records_per_topic.values())

    def _init_consumer(self):
        """Create a consumer assigned to the partitions, positioned at the start."""
        consumer = KafkaConsumer(client_id=self.task_name(),
                                 bootstrap_servers=self._brokers,
                                 request_timeout_ms=1000,
                                 enable_auto_commit=False,
                                 metadata_max_age_ms=5000,
                                 reconnect_backoff_max_ms=0,
                                 reconnect_backoff_ms=1000,
                                 auto_offset_reset="throw")
        consumer.assign(self._topic_partitions)
        for tps in self._topic_partitions:
            consumer.seek_to_beginning(tps)
        return consumer

    def _finished_consume(self):
        """True once every topic has reached its expected record count."""
        for topic, throughput in self._max_records_per_topic.items():
            if self.results.num_records_for_topic(topic) < throughput:
                return False
        return True

    def _run(self):
        """Poll with three exit conditions: stopped, drained, or stalled."""
        consumer = self._init_consumer()
        empty_reads = 0
        empty_reads_post_complete = 0
        while True:
            if self.is_finished():
                break  # User stopped background task
            if self._finished_consume():
                # The idea is to not stop consuming even if the bounds
                # have been reached, and stop when there really is no more
                # data — i.e. drain a few more iterations before exiting.
                empty_reads_post_complete += 1
                if empty_reads_post_complete >= 3:
                    break
            if empty_reads >= self._max_attempts:
                # However if a lower bound hasn't been reached, wait longer
                # possibly to avert situations where the log hasn't yet
                # been populated.
                break
            results = consumer.poll(timeout_ms=1000,
                                    max_records=self._batch_size)
            if results is None or len(results) == 0:
                empty_reads += 1
                time.sleep(1)
            else:
                # Productive poll: reset both idle counters and keep records.
                empty_reads = 0
                empty_reads_post_complete = 0
                self.results.append(results)