def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert ex.match('Consumer closed')
def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert 'Consumer closed' == str(ex.value)
def _build_kafka_consumer(self):
    """Set up the Kafka consumer."""
    try:
        consumer = Consumer(self.get_conf())
        consumer.subscribe([self.topic])
        consumer.assignment()
    except KafkaException:
        logger.warning(
            f"Error connecting to the Kafka consumer thread: {self}")
        raise
    else:
        return consumer
def test_send_offsets_committed_transaction(kafka_cluster):
    input_topic = kafka_cluster.create_topic("input_topic")
    output_topic = kafka_cluster.create_topic("output_topic")
    error_cb = prefixed_error_cb('test_send_offsets_committed_transaction')
    producer = kafka_cluster.producer({
        'client.id': 'producer1',
        'transactional.id': 'example_transactional_id',
        'error_cb': error_cb,
    })

    consumer_conf = {
        'group.id': str(uuid1()),
        'auto.offset.reset': 'earliest',
        'enable.auto.commit': False,
        'enable.partition.eof': True,
        'error_cb': error_cb,
    }
    consumer_conf.update(kafka_cluster.client_conf())
    consumer = Consumer(consumer_conf)

    kafka_cluster.seed_topic(input_topic)
    consumer.subscribe([input_topic])

    read_all_msgs(consumer)

    producer.init_transactions()
    transactional_produce(producer, output_topic, 100)

    consumer_position = consumer.position(consumer.assignment())
    group_metadata = consumer.consumer_group_metadata()
    print("=== Sending offsets {} to transaction ===".format(consumer_position))
    producer.send_offsets_to_transaction(consumer_position, group_metadata)
    producer.commit_transaction()

    producer2 = kafka_cluster.producer({
        'client.id': 'producer2',
        'transactional.id': 'example_transactional_id',
        'error_cb': error_cb,
    })

    # ensure offset commits are visible prior to sending FetchOffsets request
    producer2.init_transactions()

    committed_offsets = consumer.committed(consumer.assignment())
    print("=== Committed offsets for {} ===".format(committed_offsets))
    assert [tp.offset for tp in committed_offsets] == [100]
    consumer.close()
async def consume(topic_name):
    c = Consumer({
        "bootstrap.servers": "PLAINTEXT://localhost:9092",
        "group.id": "0",
        # "auto.offset.reset": "beginning"
    })

    topic_partition = TopicPartition(topic_name, 0, OFFSET_BEGINNING)
    # c.subscribe([topic_name])
    # c.subscribe([topic_name], on_assign=on_assign)
    c.assign([topic_partition])

    assignment = c.assignment()
    print(f"assignment: {assignment}")
    position = c.position([topic_partition])
    print(f"position: {position}")

    while True:
        message = c.poll(1.0)
        if message is None:
            print("no message received by consumer")
        elif message.error() is not None:
            print(f"error from consumer {message.error()}")
        else:
            print(f"consumed message {message.key()}: {message.value()}")
        await asyncio.sleep(1)
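# Usage sketch, not part of the original snippet: the coroutine above never
# returns, so a minimal driver just hands it to asyncio and stops on Ctrl-C.
# The topic name below is a placeholder.
import asyncio

if __name__ == "__main__":
    try:
        asyncio.run(consume("my-topic"))
    except KeyboardInterrupt:
        print("stopping consumer loop")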
def get_message_face(self):
    consumer_conf = {
        'bootstrap.servers': ','.join(self.config['kafka']['host']),
        'group.id': 'face_yisa_20200823',
        'enable.auto.commit': 'true',
        'default.topic.config': {
            'auto.offset.reset': 'largest'
        }
    }
    # Instantiate the consumer
    consumer = Consumer(consumer_conf)

    def print_assignment(consumer, partitions):
        logging.info("Assignment: {}".format(partitions))

    def print_revoke(consumer, partitions):
        logging.info("Revoke: {}".format(partitions))

    consumer.subscribe([self.config['kafka']["face_topic"]],
                       on_assign=print_assignment,
                       on_revoke=print_revoke)
    number_unassigned = 0
    number_pull = 0
    while 1:
        try:
            message = consumer.poll(timeout=5.0)
            if message is None:
                time.sleep(0.01)
                if not consumer.assignment():
                    number_unassigned += 1
                    if number_unassigned % 100 == 0:
                        logging.warning(
                            "Partition is not assigned. Check whether the "
                            "number of processes exceeds the partition count, "
                            "or whether the Kafka leader is healthy.")
                continue
            partition = message.partition()
            offset = message.offset()
            # logging.info('offset: {}'.format(str(offset)))
            value = message.value()
            if message.error():
                if message.error().code() == KafkaError._PARTITION_EOF:
                    pass
                else:
                    logging.error("kafka consumer error! {}".format(message.error()))
                continue
            number_pull += 1
            if value:
                messages = []
                row = json.loads(value)
                if isinstance(row, dict):
                    messages = [row]
                else:
                    messages = row
                for msg in messages:
                    self.message_queen.put(msg)
        except Queue.Empty:
            continue
        except Exception as e:
            logging.exception('Error reading from Kafka: {}'.format(str(e)))
            time.sleep(1)
def get_last_available_status_message(cons: Consumer, status_topic: str):
    """
    :param cons: The consumer, already assigned to the status topic.
    :param status_topic: Name of the status topic.
    :return: The last status message.
    """
    partitions = cons.assignment()
    _, hi = cons.get_watermark_offsets(partitions[0], cached=False, timeout=2.0)
    last_msg_offset = hi - 1
    cons.assign(
        [TopicPartition(status_topic, partition=0, offset=last_msg_offset)])
    status_msg, _ = poll_for_valid_message(cons, expected_file_identifier=None)
    return status_msg
def get_all_available_messages(consumer: Consumer):
    """
    Consumes all available messages from the topics subscribed to by the consumer.
    :param consumer: The consumer object
    :return: list of messages, empty if none available
    """
    messages = []
    low_offset, high_offset = consumer.get_watermark_offsets(
        consumer.assignment()[0], cached=False)
    number_of_messages_available = high_offset - low_offset
    while len(messages) < number_of_messages_available:
        message = consumer.poll(timeout=2.0)
        if message is None or message.error():
            continue
        messages.append(message)
    return messages
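# Usage sketch (illustrative broker/topic names, not from the original
# source): the helper above reads watermarks from the consumer's first
# assigned partition, so assign a partition before calling it.
from confluent_kafka import Consumer, TopicPartition, OFFSET_BEGINNING

c = Consumer({'bootstrap.servers': 'localhost:9092', 'group.id': 'drain-demo'})
c.assign([TopicPartition('status_topic', 0, OFFSET_BEGINNING)])
msgs = get_all_available_messages(c)
print(f"drained {len(msgs)} messages")
c.close()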
def test_basic_api():
    """ Basic API tests, these won't really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({
        'group.id': 'test',
        'socket.timeout.ms': '100',
        'session.timeout.ms': 1000,  # Avoid close() blocking too long
        'on_commit': dummy_commit_cb
    })

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(
        map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE), \
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    kc.close()
class KafkaClient(object):
    def __init__(self, kafka_bootstrap_servers, kafka_topic, guid=None,
                 partition=None):
        self.kafka_bootstrap_servers = kafka_bootstrap_servers
        self.kafka_topic = kafka_topic
        if partition:
            raise NotImplementedError("multiple partitions not supported yet")
        self.guid = guid
        if not self.guid:
            self.guid = str(uuid4())
        self.p = None
        self.c = None

    def produce(self, key, val):
        try:
            if not self.p:
                self.p = Producer({
                    'bootstrap.servers': self.kafka_bootstrap_servers,
                    'api.version.request': True
                })
            if not isinstance(key, bytes):
                raise TypeError(
                    'producing to kafka requires key to be raw bytes')
            if not isinstance(val, bytes) and val is not None:
                raise TypeError(
                    'producing to kafka requires val to be raw bytes or None')
            self.p.produce(topic=self.kafka_topic, value=val, key=key)
        except BufferError:
            self.p.flush()
            self.p.produce(topic=self.kafka_topic, value=val, key=key)

    def flush_producer(self):
        if self.p:
            self.p.flush()

    def consume(self):
        if not self.c:
            self.c = Consumer({
                'bootstrap.servers': self.kafka_bootstrap_servers,
                'group.id': self.guid,
                'api.version.request': True,
                'log.connection.close': False,
                'socket.keepalive.enable': True,
                'session.timeout.ms': 6000,
                'default.topic.config': {
                    'auto.offset.reset': 'smallest'
                }
            })
            self.c.subscribe([self.kafka_topic])

        # must perform an initial poll to get partition assignments
        first_message = True
        msg = self.c.poll(timeout=10.0)

        # grab watermarks from partition
        partitionobjs = self.c.assignment()
        partitions = {}
        for prt in partitionobjs:
            partition = prt.partition
            last_offset = self.c.get_watermark_offsets(prt)[1] - 1
            if last_offset < 0:  # if nothing in partition then this will be -1
                continue
            position = max(
                self.c.position([prt])[0].offset - 1, -1
            )  # if never read before then call returns -1001 for some reason
            if last_offset > position:
                partitions[partition] = last_offset

        # process partitions up to watermarks (but remember that we already
        # consumed a message, so need to yield that)
        while first_message or len(partitions) > 0:
            if not first_message:
                msg = self.c.poll(timeout=10.0)
            else:
                first_message = False
            if msg is None or msg.error():
                # NOTE: "if not msg" checks if message len = 0, which is
                # different from checking "if msg is None"
                continue  # ignore errors
            partition = msg.partition()
            if partition in partitions and msg.offset() >= partitions[partition]:
                # first check is because we might read past the watermark
                # for a partition that we're already done with... but that's ok
                del partitions[partition]
            yield msg.key(), msg.value(), msg.timestamp()[1]

    def __del__(self):
        self.flush_producer()
        if self.c:
            self.c.close()
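# Usage sketch (hypothetical broker and topic values, not from the original
# source): produce a couple of records, then drain whatever is available
# with the consume() generator above.
kc = KafkaClient('localhost:9092', 'demo-topic')
kc.produce(b'key-1', b'value-1')
kc.produce(b'key-2', b'value-2')
kc.flush_producer()
for key, val, ts in kc.consume():
    print(key, val, ts)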
def run(self) -> None:
    ac = ApiClient()
    api = public_api(self.api_host)

    # only used by container indexing query_stats code path
    es_client = elasticsearch.Elasticsearch(self.elasticsearch_backend)

    def fail_fast(err: Any, partitions: List[Any]) -> None:
        if err is not None:
            print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
            print("Bailing out...", file=sys.stderr)
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)
        for p in partitions:
            # check for partition-specific commit errors
            if p.error:
                print("Kafka consumer commit error: {}".format(p.error), file=sys.stderr)
                print("Bailing out...", file=sys.stderr)
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(p.error)
        # print("Kafka consumer commit successful")
        pass

    def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
        for p in partitions:
            if p.error:
                raise KafkaException(p.error)
        print(
            "Kafka partitions rebalanced: {} / {}".format(consumer, partitions),
            file=sys.stderr,
        )

    consumer_conf = self.kafka_config.copy()
    consumer_conf.update(
        {
            "group.id": self.consumer_group,
            "on_commit": fail_fast,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            "enable.auto.commit": True,
            "enable.auto.offset.store": False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            "max.poll.interval.ms": 60000,
            "default.topic.config": {
                "auto.offset.reset": "latest",
            },
        }
    )
    consumer = Consumer(consumer_conf)
    consumer.subscribe(
        [self.consume_topic],
        on_assign=on_rebalance,
        on_revoke=on_rebalance,
    )

    while True:
        batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval)
        if not batch:
            if not consumer.assignment():
                print("... no Kafka consumer partitions assigned yet", file=sys.stderr)
            print(
                "... nothing new from kafka, try again (interval: {})".format(
                    self.poll_interval
                ),
                file=sys.stderr,
            )
            continue
        print("... got {} kafka messages".format(len(batch)), file=sys.stderr)

        # first check errors on entire batch...
        for msg in batch:
            if msg.error():
                raise KafkaException(msg.error())

        # ... then process
        bulk_actions = []
        for msg in batch:
            json_str = msg.value().decode("utf-8")
            entity = entity_from_json(json_str, self.entity_type, api_client=ac)
            assert isinstance(entity, self.entity_type)
            if self.entity_type == ChangelogEntry:
                key = entity.index
                # might need to fetch from API
                if not (
                    entity.editgroup  # pylint: disable=no-member # (TODO)
                    and entity.editgroup.editor  # pylint: disable=no-member # (TODO)
                ):
                    entity = api.get_changelog_entry(entity.index)
            else:
                key = entity.ident  # pylint: disable=no-member # (TODO)
            if self.entity_type != ChangelogEntry and entity.state == "wip":
                print(
                    f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}",
                    file=sys.stderr,
                )
                continue
            if self.entity_type == ContainerEntity and self.query_stats:
                stats = query_es_container_stats(
                    entity.ident,
                    es_client=es_client,
                    es_index=self.elasticsearch_release_index,
                    merge_shadows=True,
                )
                doc_dict = container_to_elasticsearch(entity, stats=stats)
            else:
                doc_dict = self.transform_func(entity)
            # TODO: handle deletions from index
            bulk_actions.append(
                json.dumps(
                    {
                        "index": {
                            "_id": key,
                        },
                    }
                )
            )
            bulk_actions.append(json.dumps(doc_dict))

        # if only WIP entities, then skip
        if not bulk_actions:
            for msg in batch:
                consumer.store_offsets(message=msg)
            continue

        print(
            "Upserting, eg, {} (of {} {} in elasticsearch)".format(
                key, len(batch), self.entity_type.__name__
            ),
            file=sys.stderr,
        )
        elasticsearch_endpoint = "{}/{}/_bulk".format(
            self.elasticsearch_backend, self.elasticsearch_index
        )
        resp = requests.post(
            elasticsearch_endpoint,
            headers={"Content-Type": "application/x-ndjson"},
            data="\n".join(bulk_actions) + "\n",
        )
        resp.raise_for_status()
        if resp.json()["errors"]:
            desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
            print(desc, file=sys.stderr)
            print(resp.content, file=sys.stderr)
            raise Exception(desc)
        for msg in batch:
            # offsets are *committed* (to brokers) automatically, but need
            # to be marked as processed here
            consumer.store_offsets(message=msg)
            timestamp=record_json.get("phenomenonTime"),
            result=record_json.get("result"),
            topic=msg.topic(),
            partition=msg.partition(),
            offset=msg.offset(),
            **additional_attributes)

        # ingest the record into the StreamBuffer instance, instant emit
        if record.get("topic") == KAFKA_TOPIC_IN_1:  # Car1
            stream_buffer.ingest_left(record)  # with instant emit
        elif record.get("topic") == KAFKA_TOPIC_IN_2:  # Car2
            stream_buffer.ingest_right(record)
except KeyboardInterrupt:
    print("Gracefully stopping")
finally:
    ts_stop = time.time()

    # commit processed message offsets to the transaction
    kafka_producer.send_offsets_to_transaction(
        kafka_consumer.position(kafka_consumer.assignment()),
        kafka_consumer.consumer_group_metadata())
    # commit transaction
    kafka_producer.commit_transaction()
    # Leave group and commit offsets
    kafka_consumer.close()

    print(f"\nRecords in |{KAFKA_TOPIC_OUT}| = {stream_buffer.get_join_counter()}, "
          f"|{KAFKA_TOPIC_IN_1}| = {stream_buffer.get_left_counter()}, "
          f"|{KAFKA_TOPIC_IN_2}| = {stream_buffer.get_right_counter()}.")
    print(f"Joined time-series {ts_stop - st0:.5g} s long, "
          f"that is {stream_buffer.get_join_counter() / (ts_stop - st0):.6g} joins per second.")
def test_basic_api():
    """ Basic API tests, these won't really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test',
                   'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    partitions = list(map(lambda p: TopicPartition("test", p), range(0, 100, 3)))
    kc.assign(partitions)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE), str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        offsets = kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    kc.close()
class JournalClient:
    """A base client for the Software Heritage journal.

    The current implementation of the journal uses Apache Kafka
    brokers to publish messages under a given topic prefix, with each
    object type using a specific topic under that prefix. If the `prefix`
    argument is None (default value), it will take the default value
    `'swh.journal.objects'`.

    Clients subscribe to events specific to each object type as listed in the
    `object_types` argument (if unset, defaults to all existing kafka topic
    under the prefix).

    Clients can be sharded by setting the `group_id` to a common
    value across instances. The journal will share the message
    throughput across the nodes sharing the same group_id.

    Messages are processed by the `worker_fn` callback passed to the `process`
    method, in batches of maximum `batch_size` messages (defaults to 200).

    The objects passed to the `worker_fn` callback are the result of the kafka
    message converted by the `value_deserializer` function. By default (if this
    argument is not given), it will produce dicts (using the `kafka_to_value`
    function). The signature of the function is:

        `value_deserializer(object_type: str, kafka_msg: bytes) -> Any`

    If the value returned by `value_deserializer` is None, it is ignored and
    not passed to the `worker_fn` function.

    If set, the processing stops after processing `stop_after_objects`
    messages in total.

    `stop_on_eof` stops the processing when the client has reached the end of
    each partition in turn.

    `auto_offset_reset` sets the behavior of the client when the consumer
    group initializes: `'earliest'` (the default) processes all objects since
    the inception of the topics; `'latest'` only processes objects published
    after the client starts.

    Any other named argument is passed directly to KafkaConsumer().
    """

    def __init__(
        self,
        brokers: Union[str, List[str]],
        group_id: str,
        prefix: Optional[str] = None,
        object_types: Optional[List[str]] = None,
        privileged: bool = False,
        stop_after_objects: Optional[int] = None,
        batch_size: int = 200,
        process_timeout: Optional[float] = None,
        auto_offset_reset: str = "earliest",
        stop_on_eof: bool = False,
        value_deserializer: Optional[Callable[[str, bytes], Any]] = None,
        **kwargs,
    ):
        if prefix is None:
            prefix = DEFAULT_PREFIX
        if auto_offset_reset not in ACCEPTED_OFFSET_RESET:
            raise ValueError(
                "Option 'auto_offset_reset' only accept %s, not %s"
                % (ACCEPTED_OFFSET_RESET, auto_offset_reset))
        if batch_size <= 0:
            raise ValueError("Option 'batch_size' needs to be positive")
        if value_deserializer:
            self.value_deserializer = value_deserializer
        else:
            self.value_deserializer = lambda _, value: kafka_to_value(value)

        if isinstance(brokers, str):
            brokers = [brokers]

        debug_logging = rdkafka_logger.isEnabledFor(logging.DEBUG)
        if debug_logging and "debug" not in kwargs:
            kwargs["debug"] = "consumer"

        # Static group instance id management
        group_instance_id = os.environ.get("KAFKA_GROUP_INSTANCE_ID")
        if group_instance_id:
            kwargs["group.instance.id"] = group_instance_id

        if "group.instance.id" in kwargs:
            # When doing static consumer group membership, set a higher default
            # session timeout. The session timeout is the duration after which
            # the broker considers that a consumer has left the consumer group
            # for good, and triggers a rebalance. Considering our current
            # processing pattern, 10 minutes gives the consumer ample time to
            # restart before that happens.
            if "session.timeout.ms" not in kwargs:
                kwargs["session.timeout.ms"] = 10 * 60 * 1000  # 10 minutes

        if "session.timeout.ms" in kwargs:
            # When the session timeout is set, rdkafka requires the max poll
            # interval to be set to a higher value; the max poll interval is
            # rdkafka's way of figuring out whether the client's message
            # processing thread has stalled: when the max poll interval lapses
            # between two calls to consumer.poll(), rdkafka leaves the consumer
            # group and terminates the connection to the brokers.
            #
            # We default to 1.5 times the session timeout
            if "max.poll.interval.ms" not in kwargs:
                kwargs["max.poll.interval.ms"] = kwargs["session.timeout.ms"] // 2 * 3

        consumer_settings = {
            **kwargs,
            "bootstrap.servers": ",".join(brokers),
            "auto.offset.reset": auto_offset_reset,
            "group.id": group_id,
            "on_commit": _on_commit,
            "error_cb": _error_cb,
            "enable.auto.commit": False,
            "logger": rdkafka_logger,
        }

        self.stop_on_eof = stop_on_eof
        if self.stop_on_eof:
            consumer_settings["enable.partition.eof"] = True

        logger.debug("Consumer settings: %s", consumer_settings)

        self.consumer = Consumer(consumer_settings)
        if privileged:
            privileged_prefix = f"{prefix}_privileged"
        else:  # do not attempt to subscribe to privileged topics
            privileged_prefix = f"{prefix}"
        existing_topics = [
            topic
            for topic in self.consumer.list_topics(timeout=10).topics.keys()
            if (topic.startswith(f"{prefix}.")
                or topic.startswith(f"{privileged_prefix}."))
        ]
        if not existing_topics:
            raise ValueError(
                f"The prefix {prefix} does not match any existing topic "
                "on the kafka broker")

        if not object_types:
            object_types = list({topic.split(".")[-1] for topic in existing_topics})

        self.subscription = []
        unknown_types = []
        for object_type in object_types:
            topics = (f"{privileged_prefix}.{object_type}",
                      f"{prefix}.{object_type}")
            for topic in topics:
                if topic in existing_topics:
                    self.subscription.append(topic)
                    break
            else:
                unknown_types.append(object_type)
        if unknown_types:
            raise ValueError(
                f"Topic(s) for object types {','.join(unknown_types)} "
                "are unknown on the kafka broker")

        logger.debug(f"Upstream topics: {existing_topics}")
        self.subscribe()

        self.stop_after_objects = stop_after_objects
        self.eof_reached: Set[Tuple[str, str]] = set()
        self.batch_size = batch_size

        if process_timeout is not None:
            raise DeprecationWarning(
                "'process_timeout' argument is not supported anymore by "
                "JournalClient; please remove it from your configuration.",
            )

    def subscribe(self):
        """Subscribe to topics listed in self.subscription

        This can be overridden if you need, for instance, to manually assign
        partitions.
        """
        logger.debug(f"Subscribing to: {self.subscription}")
        self.consumer.subscribe(topics=self.subscription)

    def process(self, worker_fn):
        """Polls Kafka for a batch of messages, and calls the worker_fn
        with these messages.

        Args:
            worker_fn Callable[Dict[str, List[dict]]]: Function called with
                the messages as argument.
        """
        total_objects_processed = 0
        # timeout for message poll
        timeout = 1.0

        with statsd.status_gauge(
                JOURNAL_STATUS_METRIC,
                statuses=["idle", "processing", "waiting"]) as set_status:
            set_status("idle")
            while True:
                batch_size = self.batch_size
                if self.stop_after_objects:
                    if total_objects_processed >= self.stop_after_objects:
                        break

                    # clamp batch size to avoid overrunning stop_after_objects
                    batch_size = min(
                        self.stop_after_objects - total_objects_processed,
                        batch_size,
                    )

                set_status("waiting")
                for i in cycle(reversed(range(10))):
                    messages = self.consumer.consume(
                        timeout=timeout, num_messages=batch_size)
                    if messages:
                        break

                    # do check for an EOF condition iff we already consumed
                    # messages, otherwise we could detect an EOF condition
                    # before messages had a chance to reach us (e.g. in tests)
                    if total_objects_processed > 0 and self.stop_on_eof and i == 0:
                        at_eof = all(
                            (tp.topic, tp.partition) in self.eof_reached
                            for tp in self.consumer.assignment())
                        if at_eof:
                            break
                if messages:
                    set_status("processing")
                    batch_processed, at_eof = self.handle_messages(
                        messages, worker_fn)
                    set_status("idle")
                    # report the number of handled messages
                    statsd.increment(JOURNAL_MESSAGE_NUMBER_METRIC,
                                     value=batch_processed)
                    total_objects_processed += batch_processed
                if at_eof:
                    break

        return total_objects_processed

    def handle_messages(self, messages, worker_fn):
        objects: Dict[str, List[Any]] = defaultdict(list)
        nb_processed = 0

        for message in messages:
            error = message.error()
            if error is not None:
                if error.code() == KafkaError._PARTITION_EOF:
                    self.eof_reached.add((message.topic(), message.partition()))
                else:
                    _error_cb(error)
                continue
            if message.value() is None:
                # ignore message with no payload, these can be generated in tests
                continue
            nb_processed += 1
            object_type = message.topic().split(".")[-1]
            deserialized_object = self.deserialize_message(
                message, object_type=object_type)
            if deserialized_object is not None:
                objects[object_type].append(deserialized_object)

        if objects:
            worker_fn(dict(objects))
        self.consumer.commit()

        at_eof = self.stop_on_eof and all(
            (tp.topic, tp.partition) in self.eof_reached
            for tp in self.consumer.assignment())

        return nb_processed, at_eof

    def deserialize_message(self, message, object_type=None):
        return self.value_deserializer(object_type, message.value())

    def close(self):
        self.consumer.close()
        'group.id': group,
        'default.topic.config': {
            'auto.offset.reset': 'earliest'
        }
    })
    thesetopics = [
        tpmat.group(0)
        for tpmat in [
            pat.match(topic)
            for pat in topicFilter
            for topic in con.list_topics().topics
        ]
        if tpmat
    ]
    if thesetopics:
        con.assign(
            [TopicPartition(tp, partition, offset) for tp in thesetopics])
        didAssign = {tpp.topic for tpp in con.assignment()}
        diffAssign = set(thesetopics).difference(didAssign)
        if diffAssign:
            pe_log(
                f"Error, something awry: attempt to assign topics to consumer "
                f"group '{group}' did not assign topics: {diffAssign}")
        consumers.append((group, con))
        pi_log(
            f"Created consumer group '{group}' with topics {sorted(didAssign)}")
    else:
        pw_log(f"failed to render topics from topics list: '{topicslist}'")

# were there any consumer groups created? If not, no point in continuing
def run(self):
    ac = ApiClient()

    def fail_fast(err, partitions):
        if err is not None:
            print("Kafka consumer commit error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)
        for p in partitions:
            # check for partition-specific commit errors
            if p.error:
                print("Kafka consumer commit error: {}".format(p.error))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(p.error)
        # print("Kafka consumer commit successful")
        pass

    def on_rebalance(consumer, partitions):
        for p in partitions:
            if p.error:
                raise KafkaException(p.error)
        print("Kafka partitions rebalanced: {} / {}".format(
            consumer, partitions))

    consumer_conf = self.kafka_config.copy()
    consumer_conf.update({
        'group.id': self.consumer_group,
        'on_commit': fail_fast,
        # messages don't have offset marked as stored until pushed to
        # elastic, but we do auto-commit stored offsets to broker
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        # user code timeout; if no poll after this long, assume user code
        # hung and rebalance (default: 5min)
        'max.poll.interval.ms': 60000,
        'default.topic.config': {
            'auto.offset.reset': 'latest',
        },
    })
    consumer = Consumer(consumer_conf)
    consumer.subscribe(
        [self.consume_topic],
        on_assign=on_rebalance,
        on_revoke=on_rebalance,
    )

    while True:
        batch = consumer.consume(num_messages=self.batch_size,
                                 timeout=self.poll_interval)
        if not batch:
            if not consumer.assignment():
                print("... no Kafka consumer partitions assigned yet")
            print("... nothing new from kafka, try again (interval: {})".format(
                self.poll_interval))
            continue
        print("... got {} kafka messages".format(len(batch)))

        # first check errors on entire batch...
        for msg in batch:
            if msg.error():
                raise KafkaException(msg.error())

        # ... then process
        bulk_actions = []
        for msg in batch:
            json_str = msg.value().decode('utf-8')
            # HACK: work around a bug where container entities got published
            # to release_v03 topic
            if self.elasticsearch_document_name == "release":
                entity_dict = json.loads(json_str)
                if entity_dict.get('name') and not entity_dict.get('title'):
                    continue
            entity = entity_from_json(json_str, self.entity_type, api_client=ac)
            # TODO: handle deletions from index
            bulk_actions.append(json.dumps({
                "index": {
                    "_id": entity.ident,
                },
            }))
            bulk_actions.append(json.dumps(self.transform_func(entity)))

        print("Upserting, eg, {} (of {} releases in elasticsearch)".format(
            entity.ident, len(batch)))
        elasticsearch_endpoint = "{}/{}/{}/_bulk".format(
            self.elasticsearch_backend,
            self.elasticsearch_index,
            self.elasticsearch_document_name)
        resp = requests.post(
            elasticsearch_endpoint,
            headers={"Content-Type": "application/x-ndjson"},
            data="\n".join(bulk_actions) + "\n")
        resp.raise_for_status()
        if resp.json()['errors']:
            desc = "Elasticsearch errors from post to {}:".format(
                elasticsearch_endpoint)
            print(desc)
            print(resp.content)
            raise Exception(desc)
        for msg in batch:
            # offsets are *committed* (to brokers) automatically, but need
            # to be marked as processed here
            consumer.store_offsets(message=msg)
def main(args):
    brokers = args.brokers
    group_id = args.group_id
    input_topic = args.input_topic
    input_partition = args.input_partition
    output_topic = args.output_topic

    consumer = Consumer({
        'bootstrap.servers': brokers,
        'group.id': group_id,
        'auto.offset.reset': 'earliest',
        # Do not advance committed offsets outside of the transaction.
        # Consumer offsets are committed along with the transaction
        # using the producer's send_offsets_to_transaction() API.
        'enable.auto.commit': False,
        'enable.partition.eof': True,
    })

    # Prior to KIP-447 being supported each input partition requires
    # its own transactional producer, so in this example we use
    # assign() to a single partition rather than subscribe().
    # A more complex alternative is to dynamically create a producer per
    # partition in subscribe's rebalance callback.
    consumer.assign([TopicPartition(input_topic, input_partition)])

    producer = Producer({
        'bootstrap.servers': brokers,
        'transactional.id': 'eos-transactions.py'
    })

    # Initialize producer transaction.
    producer.init_transactions()
    # Start producer transaction.
    producer.begin_transaction()

    eof = {}
    msg_cnt = 0
    print("=== Starting Consume-Transform-Process loop ===")
    while True:
        # serve delivery reports from previous produce()s
        producer.poll(0)

        # read message from input_topic
        msg = consumer.poll(timeout=1.0)
        if msg is None:
            continue

        topic, partition = msg.topic(), msg.partition()
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                eof[(topic, partition)] = True
                print("=== Reached the end of {} [{}] at {} ===".format(
                    topic, partition, msg.offset()))
                if len(eof) == len(consumer.assignment()):
                    print("=== Reached end of input ===")
                    break
            continue

        # clear EOF if a new message has been received
        eof.pop((topic, partition), None)

        msg_cnt += 1

        # process message
        processed_key, processed_value = process_input(msg)

        # produce transformed message to output topic
        producer.produce(output_topic, processed_value, processed_key,
                         on_delivery=delivery_report)

        if msg_cnt % 100 == 0:
            print("=== Committing transaction with {} messages at input offset {} ===".format(
                msg_cnt, msg.offset()))
            # Send the consumer's position to transaction to commit
            # them along with the transaction, committing both
            # input and outputs in the same transaction is what provides EOS.
            producer.send_offsets_to_transaction(
                consumer.position(consumer.assignment()),
                consumer.consumer_group_metadata())

            # Commit the transaction
            producer.commit_transaction()

            # Begin new transaction
            producer.begin_transaction()
            msg_cnt = 0

    print("=== Committing final transaction with {} messages ===".format(msg_cnt))
    # commit processed message offsets to the transaction
    producer.send_offsets_to_transaction(
        consumer.position(consumer.assignment()),
        consumer.consumer_group_metadata())

    # commit transaction
    producer.commit_transaction()

    consumer.close()
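# Invocation sketch, not from the original source: the argument names are
# inferred from how main() reads them; the actual CLI wiring may differ.
import argparse

parser = argparse.ArgumentParser(description="Consume-transform-process with EOS")
parser.add_argument('brokers')
parser.add_argument('group_id')
parser.add_argument('input_topic')
parser.add_argument('input_partition', type=int)
parser.add_argument('output_topic')
main(parser.parse_args())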