def kafka_available():
    try:
        consumer_params = {
            'bootstrap.servers': '127.0.0.1',
            'group.id': 'kafka-unit-test',
            'auto.offset.reset': 'latest',
        }
        consumer = Consumer(consumer_params)
        consumer.list_topics(timeout=5)
        return True
    except KafkaException:
        return False
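A hedged usage sketch for a helper like the one above: gating integration tests with pytest. The marker name and the test below are illustrative assumptions, not part of the original snippet.

import pytest

# Hypothetical marker built on the kafka_available() helper above; skips
# integration tests when no broker answers on 127.0.0.1.
requires_kafka = pytest.mark.skipif(
    not kafka_available(), reason="no Kafka broker reachable on 127.0.0.1"
)

@requires_kafka
def test_broker_round_trip():
    ...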
def kafka_GetOffset(self, p_szTopicName, p_szGroupID=''):
    if self.__kafka_servers__ is None:
        raise SQLCliException(
            "Missed kafka server information. Please use set kafka server first ..")
    c = Consumer({
        'bootstrap.servers': self.__kafka_servers__,
        'group.id': p_szGroupID,
    })
    m_OffsetResults = []
    try:
        for pid in c.list_topics(topic=p_szTopicName).topics[p_szTopicName].partitions.keys():
            tp = TopicPartition(p_szTopicName, pid)
            (low, high) = c.get_watermark_offsets(tp)
            m_OffsetResults.append([pid, low, high])
        if len(m_OffsetResults) == 0:
            raise SQLCliException("Topic [" + p_szTopicName + "] does not exist!")
        return m_OffsetResults
    except KafkaException as ke:
        if "SQLCLI_DEBUG" in os.environ:
            print('traceback.print_exc():\n%s' % traceback.print_exc())
            print('traceback.format_exc():\n%s' % traceback.format_exc())
        raise ke
def desc_topic(args):
    c = Consumer({
        'bootstrap.servers': f'{args.broker}',
        'group.id': 'confluent-kafka-describe-topic',
    })
    topics = c.list_topics().topics
    if args.topic not in topics.keys():
        print(f'Topic "{args.topic}" not in cluster.')
    else:
        topic_metadata = topics[args.topic]
        partitions, leaders, replicas, isrs = [], [], [], []
        for metadata in topic_metadata.partitions.values():
            partitions.append(str(metadata.id))
            leaders.append(str(metadata.leader))
            replicas.append(str(metadata.replicas))
            isrs.append(str(metadata.isrs))
        partitions = ', '.join(partitions)
        leaders = ', '.join(leaders)
        replicas = ', '.join(replicas)
        isrs = ', '.join(isrs)
        print(f'Topic: {topic_metadata.topic}')
        print(f'Partition: {partitions}')
        print(f'Leader: {leaders}')
        print(f'Replica: {replicas}')
        print(f'ISRs: {isrs}')
    c.close()
def get_starting_offsets(topic_name):
    offsets_history_filename = get_topic_offsets_filename(topic_name)
    consumer = Consumer({
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'group.id': 'borisov_get_partitions_number',
    })
    n_partitions = len(consumer.list_topics().topics[topic_name].partitions)

    if os.path.isfile(offsets_history_filename):
        status, message = validate_offset_dump(topic_name, offsets_history_filename, n_partitions)
        if status == 'ERROR':
            raise Exception(message)
        else:
            return message
    else:
        starting_offsets_dict = {
            topic_name: {
                str(partition): -2 for partition in range(n_partitions)
            }
        }
        return json.dumps(starting_offsets_dict)
def test_storage_direct_writer_anonymized(
    kafka_prefix: str, kafka_server, consumer: Consumer
):
    writer_config = {
        "cls": "kafka",
        "brokers": [kafka_server],
        "client_id": "kafka_writer",
        "prefix": kafka_prefix,
        "anonymize": True,
    }
    storage_config: Dict[str, Any] = {
        "cls": "pipeline",
        "steps": [
            {"cls": "memory", "journal_writer": writer_config},
        ],
    }
    storage = get_storage(**storage_config)

    expected_messages = 0
    for obj_type, objs in TEST_OBJECTS.items():
        if obj_type == "origin_visit":
            # these have a non-consistent API and are unrelated to what we
            # want to test here
            continue
        method = getattr(storage, obj_type + "_add")
        method(objs)
        expected_messages += len(objs)

    existing_topics = set(
        topic
        for topic in consumer.list_topics(timeout=10).topics.keys()
        if topic.startswith(kafka_prefix)
    )
    assert existing_topics == {
        f"{kafka_prefix}.{obj_type}"
        for obj_type in (
            "content",
            "directory",
            "extid",
            "metadata_authority",
            "metadata_fetcher",
            "origin",
            "origin_visit",
            "origin_visit_status",
            "raw_extrinsic_metadata",
            "release",
            "revision",
            "snapshot",
            "skipped_content",
        )
    } | {
        f"{kafka_prefix}_privileged.{obj_type}"
        for obj_type in (
            "release",
            "revision",
        )
    }
class msgConsumer():
    def __init__(self, kafka_server, group_id):
        self.group_id = group_id
        conf = {
            'bootstrap.servers': kafka_server,
            'group.id': self.group_id,
            'default.topic.config': {
                'auto.offset.reset': 'smallest'
            }
        }
        self.streamReader = Consumer(conf)

    def topics(self):
        return list(self.streamReader.list_topics().topics.keys())

    def subscribe(self, topic):
        self.streamReader.subscribe([topic])

    def poll(self):
        try:
            msg = self.streamReader.poll(timeout=30)
            if msg is not None:
                return msg.value()
        except Exception:
            return None

    def close(self):
        self.streamReader.close()
def consume_topic(broker, topic, start_from_oldest=False, truncate=False):
    consumer = KafkaConsumer({
        "bootstrap.servers": broker,
        "group.id": f"get-topic-{time.time_ns()}",
        "auto.offset.reset": "earliest" if start_from_oldest else "latest",
    })
    metadata = consumer.list_topics(topic)
    if topic not in metadata.topics:
        raise Exception("Topic does not exist")
    topic_partitions = [
        TopicPartition(topic, p) for p in metadata.topics[topic].partitions
    ]
    consumer.assign(topic_partitions)
    while True:
        msg = consumer.poll(0.0)
        if msg:
            value = msg.value()[0:100] if truncate else msg.value()
            print(f"Timestamp: {msg.timestamp()[1]}\n{value}")
        time.sleep(0.1)
class KafkaQueryConsumer:
    """
    Wraps Kafka library consumer methods which query the broker for
    metadata and poll for single messages.
    It is a thin wrapper but allows a fake to be used in unit tests.
    """

    def __init__(self, broker: str):
        # Set "enable.auto.commit" to False, as we do not need to report to the
        # kafka broker where we got to (it usually does this in case of a
        # crash, but we simply restart the process and go and find the last
        # run_start message).
        #
        # Set "queued.min.messages" to 1 as we will consume backwards through
        # the partition one message at a time; we do not want to retrieve
        # multiple messages in the forward direction each time we step
        # backwards by 1 offset.
        conf = {
            "bootstrap.servers": broker,
            "group.id": "consumer_group_name",
            "auto.offset.reset": "latest",
            "enable.auto.commit": False,
            "queued.min.messages": 1,
        }
        self._consumer = Consumer(**conf)

    def get_topic_partitions(self, topic: str, offset: int = -1):
        metadata = self._consumer.list_topics(topic)
        return [
            TopicPartition(topic, partition[1].id, offset=offset)
            for partition in metadata.topics[topic].partitions.items()
        ]

    def seek(self, partition: TopicPartition):
        """
        Set offset in partition, the consumer will seek to that offset
        """
        self._consumer.seek(partition)

    def poll(self, timeout=2.):
        """
        Poll for a message from Kafka
        """
        return self._consumer.poll(timeout=timeout)

    def get_watermark_offsets(self, partition: TopicPartition) -> Tuple[int, int]:
        """
        Get the offset of the first and last available message in the given partition
        """
        return self._consumer.get_watermark_offsets(partition, cached=False)

    def assign(self, partitions: List[TopicPartition]):
        self._consumer.assign(partitions)

    def offsets_for_times(self, partitions: List[TopicPartition]):
        return self._consumer.offsets_for_times(partitions)
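A hedged sketch of how such a wrapper might be driven to read the newest message of each non-empty partition. It relies on the class above being importable; the broker address and topic name are assumptions, not from the original.

# Hypothetical driver for the wrapper above.
consumer = KafkaQueryConsumer("localhost:9092")  # assumed broker address
candidates = []
for partition in consumer.get_topic_partitions("run_topic"):  # assumed topic name
    low, high = consumer.get_watermark_offsets(partition)
    if high > low:  # skip partitions with no available messages
        partition.offset = high - 1  # offset of the newest message
        candidates.append(partition)
if candidates:
    consumer.assign(candidates)
    msg = consumer.poll(timeout=2.0)
    if msg is not None and msg.error() is None:
        print(msg.offset(), msg.value())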
def get_topics(broker):
    consumer = KafkaConsumer({
        "bootstrap.servers": broker,
        "group.id": f"get-topic-{time.time_ns()}",
        "auto.offset.reset": "latest",
    })
    metadata = consumer.list_topics()
    for n, v in metadata.topics.items():
        print(f"{n} {len(v.partitions)}")
def list_topics(args):
    c = Consumer({
        'bootstrap.servers': f'{args.broker}',
        'group.id': 'confluent-kafka-list-topic',
    })
    metadata = c.list_topics()
    c.close()
    for topic in metadata.topics.keys():
        print(topic)
class Kafka():
    def __init__(self, topic_name, group_id, auto_offset_reset):
        with open(config_file_path) as kafka_conf:
            self.conf = yaml.load(kafka_conf, Loader=yaml.FullLoader)
        self.group_id = group_id
        self.topic_name = topic_name
        self.auto_offset_reset = auto_offset_reset
        self.running_consumer = True
        self.c = Consumer({
            'bootstrap.servers': self.conf['bootstrap_servers'],
            'group.id': self.group_id,
            'auto.offset.reset': self.auto_offset_reset
        })
        self.c.subscribe([self.topic_name])
        print(self.c.list_topics())

    def consume(self):
        while self.running_consumer:
            a = 0
            msg = self.c.poll(1.0)
            if msg is None:
                print("empty message!")
                msg = "empty".encode('utf-8')
            else:
                a += 1
                msg = msg.value().decode('utf-8')
                print("message is : {}".format(msg))
                self.c.commit()
            if a % 10 == 0:
                self.running_consumer = False
        return msg

    def stop_consume(self):
        self.running_consumer = False
        time.sleep(10)
        self.consume()
def count_messages(bootstrap_servers):
    c = Consumer({
        'bootstrap.servers': bootstrap_servers,
        'group.id': 'group2',
        'enable.auto.commit': False,
        'auto.offset.reset': 'beginning'
    })
    metadata = c.list_topics()
    topics = metadata.topics
    for topic, topicMetadata in topics.items():
        for partition in topicMetadata.partitions:
            (low, high) = c.get_watermark_offsets(TopicPartition(topic, partition))
            print(f"{topic} {partition}: {high}")
class KafkaConsumer:
    def __init__(self, conf, group_id='kafka-rest-service'):
        conf = dict(conf)
        conf['group.id'] = group_id
        self.consumer = Consumer(conf)

    # @cached(cache=TTLCache(maxsize=1024, ttl=60))
    def get_topic_partition_count(self, topic_name):
        cmd = self.consumer.list_topics(topic_name)
        tmd = cmd.topics.get(topic_name, None)
        pcount = 0
        if tmd:
            pcount = len(tmd.partitions)
        return pcount

    # @cached(cache=TTLCache(maxsize=1024, ttl=60))
    def get_topic_offsets(self, topic_name):
        pcount = self.get_topic_partition_count(topic_name)
        if pcount == 0:
            return dict(error=f"Requested topic {topic_name} not found",
                        status="ERROR", report=None)

        part_status_map = {}
        for p in range(pcount):
            l, h = self.consumer.get_watermark_offsets(
                TopicPartition(topic_name, p))
            part_status_map[p] = [h, '1 month']

        def get_minute_report(minute, time_text):
            timestamp = (datetime.now() - timedelta(minutes=minute)).timestamp()
            timestamp = int(timestamp) * 1000
            partitions = [
                TopicPartition(topic_name, p, timestamp) for p in range(pcount)
            ]
            partitions = self.consumer.offsets_for_times(partitions)
            for par in partitions:
                if par.offset > -1:
                    part_status_map[par.partition][-1] = time_text

        get_minute_report(60 * 24 * 7, '1 week')
        get_minute_report(60 * 24, '1 day')
        get_minute_report(60, '1 hour')
        get_minute_report(10, '10 minutes')
        get_minute_report(1, '1 minute')

        part_status_map = {k: list(v) for k, v in part_status_map.items()}
        return dict(error=None, status="SUCCESS", topic=topic_name,
                    offsets=part_status_map)
def get_topics(kafka_broker):
    c_ = Consumer({
        'bootstrap.servers': kafka_broker,
        'group.id': "group" + str(uuid.uuid1()),
        'auto.offset.reset': 'earliest'
    })
    try:
        topics = c_.list_topics(timeout=2).topics
    except Exception:
        raise NoValidKafkaBroker("no valid broker: ", kafka_broker)
    return topics
class KafkaConsumer(object):
    """Consumer."""

    def __init__(self, kafka_url, topic, group_id):
        self.__kafka_url = kafka_url
        self.__topic = topic
        self.consumer = Consumer({
            'bootstrap.servers': self.__kafka_url,
            'group.id': group_id,
            'default.topic.config': {'auto.offset.reset': 'smallest'}
        })
        assert self.__topic in self.consumer.list_topics().topics, \
            'Kafka.Consumer.init: not found topic[{0}]'.format(self.__topic)
        self.consumer.subscribe([self.__topic])

    def run(self, callbacks=None):
        if callbacks is None:
            callbacks = []
        try:
            while True:
                msg = self.consumer.poll(1)
                if msg is None:
                    continue
                if not msg.error():
                    value = msg.value()
                    try:
                        data = json.loads(value.decode('utf-8'))
                    except Exception as ex:
                        print('[failed][kafka]json.loads message failed: {0}\nvalue: {1}'.format(ex, value))
                    else:
                        print('[succeed][kafka]message received from {0} [{1}] value: {2}'.format(
                            msg.topic(), msg.partition(), value))
                        for callback in callbacks:
                            callback(data)
                elif msg.error().code() != KafkaError._PARTITION_EOF:
                    break
        except KeyboardInterrupt:
            print('[failed][kafka]KeyboardInterrupt.')
        finally:
            self.consumer.close()
def consume():
    # `conf` and `topic_name` are assumed to be defined at module level.
    c = Consumer(conf)
    num_partitions = len(c.list_topics().topics[topic_name].partitions)
    # Manually assign every partition, starting from offset 0.
    topic_partitions = [
        TopicPartition(topic_name, partition_index, 0)
        for partition_index in range(num_partitions)
    ]
    c.assign(topic_partitions)
    while True:
        msg = c.poll(5)
        if msg is None:
            continue
        if msg.error():
            raise KafkaException(msg.error())
        print(msg.value())
def create_topic(brokers, topic, partition_count=1, replica_count=1):
    """Create a topic if it does not exist.

    Args:
        brokers (list): The 'host[:port]' list that the producer should
            contact to bootstrap initial cluster metadata.
        topic (str): Topic where the message will be published.
        partition_count (int): Specified partition number (default 1).
        replica_count (int): Specified replication factor (default 1).

    Returns:
        partitions (list): A list of partition numbers.
    """
    c = Consumer({
        'bootstrap.servers': ','.join(brokers),
        'group.id': 'a_consumer'
    })
    topics = c.list_topics().topics
    c.close()

    if topic in topics.keys():
        partitions = list(topics[topic].partitions.keys())
    else:
        new_topic = NewTopic(
            topic=topic,
            num_partitions=partition_count,
            replication_factor=replica_count,
        )
        admin = AdminClient({
            'bootstrap.servers': ','.join(brokers),
        })
        status = admin.create_topics([new_topic])
        while not status[topic].done():
            pass
        partitions = [p for p in range(partition_count)]
    return partitions
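A hedged call-site sketch for the helper above; the broker address and topic name are assumptions.

# Hypothetical call site: ensure a three-partition topic exists before producing.
partitions = create_topic(["localhost:9092"], "demo-events", partition_count=3)
print(f"'demo-events' partitions: {partitions}")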
def create_topic(self):
    """Creates the topic with the given topic name."""
    conf = {
        'bootstrap.servers': self.bootstrap_servers,
        'group.id': 'listTopics',
        'session.timeout.ms': 6000,
        'auto.offset.reset': 'latest'
    }
    consumer = Consumer(**conf)
    topics = consumer.list_topics().topics.keys()
    if self.topic not in topics:
        client = AdminClient({"bootstrap.servers": self.bootstrap_servers})
        futures = client.create_topics([
            NewTopic(topic=self.topic, num_partitions=1, replication_factor=1)
        ])
        for _, future in futures.items():
            try:
                future.result()
            except Exception:
                pass
group_id = 'consumer_group_' + sys.argv[1]

# Load config
config = yaml.load(open('./kafka_consumer.yaml'), Loader=yaml.FullLoader)
bootstrap_servers = ",".join(config['bootstrap_servers'])

# Initialize consumer
consumer = Consumer({
    'bootstrap.servers': bootstrap_servers,
    'group.id': group_id,
    'auto.offset.reset': 'end'
})

# Subscribe to all topics
topics = list(consumer.list_topics().topics.keys())
print("Total Topics Found: {}".format(len(topics)))
consumer.subscribe(topics)

count = 0
while 1:
    message = consumer.poll(1.0)
    if message is None:
        continue
    if message.error():
        print('ERROR: {}'.format(message.error()))
        continue
    print("[{}]: RECEIVE: {}:{}:{}: key={} value={}".format(
        datetime.now().isoformat(), message.topic(), message.partition(),
        message.offset(), message.key(), msgpack.unpackb(message.value())))
def get_cluster_metadata(broker: str, timeout: float) -> admin.ClusterMetadata:
    consumer = Consumer({'bootstrap.servers': broker, 'group.id': 'groupid'})
    ret = consumer.list_topics(timeout=timeout)
    consumer.close()
    return ret
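A brief, hedged call-site sketch; the broker address is an assumption. The returned ClusterMetadata maps topic names to TopicMetadata objects, so partition counts can be read directly.

# Hypothetical call site for the helper above.
metadata = get_cluster_metadata("localhost:9092", timeout=5.0)
for name, topic_md in metadata.topics.items():
    print(f"{name}: {len(topic_md.partitions)} partition(s)")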
def launch(api_key, port, data_directory=None, topic='announce'):
    logging.basicConfig(level=logging.DEBUG)

    # Initialize the database
    if data_directory is None:
        data_directory = os.getcwd()
    db = tinydb.TinyDB(os.path.join(data_directory, 'run_db.json'))

    logging.info('Constructing local consumer')
    consumer = Consumer({
        'bootstrap.servers': 'localhost:' + str(port),
        'group.id': 0,
        'auto.offset.reset': 'earliest',
        'sasl.username': '******',
        'sasl.password': api_key,
        'security.protocol': 'sasl_plaintext',
        'sasl.mechanism': 'PLAIN',
    })
    adm_client = AdminClient({
        'bootstrap.servers': 'localhost:' + str(port),
        'group.id': 0,
        'auto.offset.reset': 'earliest',
        'sasl.username': '******',
        'sasl.password': api_key,
        'security.protocol': 'sasl_plaintext',
        'sasl.mechanism': 'PLAIN',
    })

    # Clean up the Kafka board
    try:
        results = adm_client.delete_topics(
            list(consumer.list_topics().topics.keys()))
        for v in results.values():
            v.result()
    except ValueError:
        pass

    # Create the announce topic
    try:
        logging.info('Setting up announce topic')
        tp_future = adm_client.create_topics([NewTopic('announce', 1, 1)])
        tp_future['announce'].result()  # Wait for the future
        logging.info('Topic created!')
    except KafkaException as ex:
        logging.warning(ex)

    logging.info('Connecting to topic: %s', topic)
    consumer.subscribe([topic])

    # Main consumer loop
    while True:
        msg = consumer.poll(0.1)

        # Validate the message is good
        if msg is None:
            continue
        if msg.error():
            logging.error('Topic Consumer Error: %s', msg.error())
            continue

        logging.info('Processing Message')
        process_message(msg.value(), db, data_directory, api_key, port, adm_client)
class Kafka():
    def __init__(self, topic_name, group_id, auto_offset_reset, kafka_id):
        with open(config_file_path) as kafka_conf:
            self.conf = yaml.load(kafka_conf, Loader=yaml.FullLoader)
        self.mysql = Mysql()
        self.elasticsearch_instance = Elastic()
        self.group_id = group_id
        self.topic_name = topic_name
        self.auto_offset_reset = auto_offset_reset
        self.running_consumer = True
        self.kafka_id = kafka_id
        self.c = Consumer({
            'bootstrap.servers': self.conf['bootstrap_servers'],
            'group.id': self.group_id,
            'auto.offset.reset': self.auto_offset_reset
        })
        self.c.subscribe([self.topic_name])
        self.batch_size = float(self.conf['batch_size'])
        print(self.c.list_topics())
        print("{}th kafka_object has been created!".format(self.kafka_id))

    def consume(self, index, consumer_id):
        a = 0
        data = []
        self.index = index
        self.consumer_id = consumer_id
        self.old_consumer_record = self.mysql.get_list(consumer_id=self.consumer_id)
        while self.running_consumer:
            msg = self.c.poll(self.batch_size)
            if msg is None:
                a += 1
                print("empty message!")
            else:
                a += 1
                msg = msg.value().decode('utf-8')
                data.append(msg)
            if a % 10 == 0:
                self.running_consumer = False
                if len(data) > 5:
                    self.elasticsearch_instance.post(data=data, index=self.index)
                    print("elk_consume for index : {}".format(self.index))
                # Check for changes in MySQL
                data = []
                a = 0
                consumer_record = self.mysql.get_list(consumer_id=self.consumer_id)
                if consumer_record != self.old_consumer_record:
                    self.old_consumer_record = consumer_record
                    print("record has changed in database!")
                    self.c.close()
                    break
                else:
                    self.c.commit()
                    self.running_consumer = True
        return None
consumers = []
for group, (topicslist, partition, offset) in groupsTopics.items():
    topicFilter = [re.compile(pat) for pat in topicslist]
    con = Consumer({
        'bootstrap.servers': ",".join(bsServers),
        'group.id': group,
        'default.topic.config': {
            'auto.offset.reset': 'earliest'
        }
    })
    thesetopics = [
        tpmat.group(0)
        for tpmat in [
            pat.match(topic)
            for pat in topicFilter
            for topic in con.list_topics().topics
        ]
        if tpmat
    ]
    if thesetopics:
        con.assign(
            [TopicPartition(tp, partition, offset) for tp in thesetopics])
        didAssign = {tpp.topic for tpp in con.assignment()}
        diffAssign = set(thesetopics).difference(didAssign)
        if diffAssign:
            pe_log(
                f"Error, something awry: attempt to assign topics to consumer group '{group}' "
                f"did not assign topics: {diffAssign}"
            )
    consumers.append((group, con))
pi_log(
class ConsoleConsumer:
    def __init__(self, brokers, topic, offset, key_decoder, value_decoder,
                 registry_url, additional_properties):
        config = {
            'bootstrap.servers': brokers,
            'enable.partition.eof': 'true',
            'group.id': 'not-used',
            'auto.offset.reset': 'earliest',
            'enable.auto.commit': 'false'
        }
        self.consumer = Consumer({**additional_properties, **config})
        self.topic = topic
        self.offset = offset.lower()
        self.key_decoder = key_decoder.lower()
        self.value_decoder = value_decoder.lower()
        self.avro_serializer = None
        if registry_url:
            client = CachedSchemaRegistryClient(registry_url)
            self.avro_serializer = MessageSerializer(client)

    def run(self):
        try:
            partition_ends = 0
            total_parts, partitions = self._partitions()
            self.consumer.assign(partitions)

            while True:
                msg = self.consumer.poll(timeout=0.5)
                if msg is None:
                    continue

                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        eprint(f'{msg.topic()} reached end of partition [{msg.partition()}] at offset {msg.offset()}')
                        partition_ends += 1
                        if partition_ends == total_parts:
                            break
                    else:
                        raise KafkaException(msg.error())
                else:
                    record = {
                        'key': self._decode(self.key_decoder, msg.key()),
                        'payload': self._decode(self.value_decoder, msg.value()),
                        'topic': msg.topic(),
                        'partition': msg.partition(),
                        'offset': msg.offset(),
                        'timestamp': msg.timestamp()[1]
                    }
                    print(json.dumps(record))
        finally:
            self.consumer.close()

    def _partitions(self):
        parts = []
        topic_data = self.consumer.list_topics(topic=self.topic)
        total_parts = len(topic_data.topics[self.topic].partitions)
        for i in range(0, total_parts):
            partition = TopicPartition(self.topic, i, offset=OFFSET_BEGINNING)
            if self.offset == 'earliest':
                parts.append(partition)
            else:
                try:
                    start, end = self.consumer.get_watermark_offsets(
                        partition, timeout=0.5)
                    real_offset = int(self.offset)
                    ass_offset = (end + real_offset) if (real_offset < 0) else (start + real_offset)
                    parts.append(TopicPartition(self.topic, i, offset=ass_offset))
                except ValueError:
                    eprint(f"Could not parse offset: {self.offset}")
                    exit(1)
        return total_parts, parts

    def _decode(self, data_type, payload):
        if data_type == "avro":
            return self.avro_serializer.decode_message(payload)

        payload_str = payload.decode('utf-8')
        try:
            return json.loads(payload_str)
        except (JSONDecodeError, TypeError):
            return payload_str
def get_last_n_messages(self, n: int) -> Optional[List[Tuple[datetime.datetime, Dict]]]:
    '''
    Returns the last n published timestamps and messages, or None if no
    message has been published yet. If the configured topic has more than
    one partition, you may receive more messages than requested (at most
    partitions * n). You may receive fewer messages than requested if the
    broker has already cleared messages.

    :return: List of tuples with timestamp and message, or None if no
        message has been published yet
    '''
    consumer = Consumer({
        'bootstrap.servers': self.__kafka_bootstrap,
        'group.id': self.__import_id
    })
    partitions = consumer.list_topics(
        topic=self.__kafka_topic).topics[self.__kafka_topic].partitions.keys()
    self.__logger.debug("Found " + str(len(partitions)) +
                        " partition(s) of topic " + self.__kafka_topic)

    num_messages = 0
    topic_partitions = []
    for partition in partitions:
        high_low_offset = consumer.get_watermark_offsets(
            cimpl.TopicPartition(self.__kafka_topic, partition=partition))
        high_offset = high_low_offset[1]
        low_offset = high_low_offset[0]
        available_messages = high_offset - low_offset
        self.__logger.debug("Low/High offset of partition " + str(partition) +
                            " is " + str(low_offset) + "/" + str(high_offset))
        if high_offset > 0:  # Ignore partitions without data
            if available_messages >= n:
                offset = high_offset - n
                num_messages += n
            else:
                offset = low_offset
                num_messages += available_messages
            partition = cimpl.TopicPartition(self.__kafka_topic,
                                             partition=partition,
                                             offset=offset)
            topic_partitions.append(partition)
            self.__logger.debug("Setting offset of partition " + str(partition))

    if len(topic_partitions) == 0:  # No partition has any data
        return None

    consumer.assign(topic_partitions)
    consumer.commit(offsets=topic_partitions)

    tuples = []
    consumed_messages = 0
    batch_size = 10000
    self.__logger.debug("Consuming last " + str(num_messages) + " message(s)")
    while consumed_messages < num_messages:
        if consumed_messages + batch_size <= num_messages:
            to_consume = batch_size
        else:
            to_consume = num_messages - consumed_messages
        consumed_messages += to_consume
        self.__logger.debug("Consuming batch of " + str(to_consume) + " messages")
        msgs = consumer.consume(num_messages=to_consume, timeout=30)
        for msg in msgs:
            value = json.loads(msg.value())
            if 'time' not in value:
                self.__logger.warning(
                    "time field missing in message, is someone else using this topic? "
                    "Ignoring message")
                continue
            if 'value' not in value or not isinstance(value['value'], Dict):
                self.__logger.warning(
                    "value field missing or malformed in message, is someone else "
                    "using this topic? Ignoring message")
                continue
            try:
                date_time = datetime.datetime.strptime(value["time"],
                                                       "%Y-%m-%dT%H:%M:%SZ")
            except ValueError:
                self.__logger.warning(
                    "time field not in rfc3339 format, is someone else using this "
                    "topic? Ignoring message")
                continue
            tuples.append((date_time, value["value"]))

    consumer.close()
    return tuples
def test_basic_api():
    """ Basic API tests; these won't really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE), str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    kc.close()
class ConfluentKafkaMsgQAPI:
    """
    This class provides APIs to interact with the Kafka queue.
    """

    def __init__(self, is_producer=False, is_consumer=False,
                 perform_subscription=False, thread_identifier=None):
        if not is_producer and not is_consumer:
            logging_to_console_and_syslog(
                "ConfluentKafkaMsgQAPI: You need to pick either producer or consumer.")
        self.producer_instance = None
        self.consumer_instance = None
        self.broker_name = None
        self.topic = None
        self.producer_conf = None
        self.consumer_conf = None
        self.is_topic_created = False
        self.perform_subscription = perform_subscription
        self.thread_identifier = thread_identifier
        self.__read_environment_variables()
        if is_producer:
            self.__producer_connect()
        if is_consumer:
            self.__consumer_connect()

    def __read_environment_variables(self):
        """
        This method is used to read the environment variables defined in the OS.
        :return:
        """
        while self.broker_name is None or self.topic is None:
            time.sleep(2)
            logging_to_console_and_syslog(
                "ConfluentKafkaMsgQAPI: Trying to read the environment variables...")
            self.broker_name = os.getenv("broker_name_key", default=None)
            self.topic = os.getenv("topic_key", default=None)
        logging_to_console_and_syslog(
            "ConfluentKafkaMsgQAPI: broker_name={}".format(self.broker_name))
        logging_to_console_and_syslog(
            "ConfluentKafkaMsgQAPI: topic={}".format(self.topic))

    # Optional per-message delivery callback (triggered by poll() or flush())
    # when a message has been successfully delivered or permanently
    # failed delivery (after retries).
    @staticmethod
    def delivery_callback(err, msg):
        if err:
            logging_to_console_and_syslog('%% Message failed delivery: %s\n' % err)
        else:
            logging_to_console_and_syslog(
                '%% Message delivered to %s [%d] @ %s\n' %
                (msg.topic(), msg.partition(), str(msg.offset())))

    def __producer_connect(self):
        """
        This method tries to connect to the kafka broker based upon the type of kafka.
        :return:
        """
        while self.producer_instance is None:
            try:
                self.producer_conf = {'bootstrap.servers': self.broker_name}
                # Create Producer instance
                self.producer_instance = Producer(**self.producer_conf)
            except Exception:
                print("Exception in user code:")
                print("-" * 60)
                traceback.print_exc(file=sys.stdout)
                print("-" * 60)
                time.sleep(5)
            else:
                logging_to_console_and_syslog(
                    "ConfluentKafkaMsgQAPI: Successfully connected to "
                    "broker_name={}".format(self.broker_name))

    def __consumer_connect(self):
        status = False
        try:
            if self.perform_subscription:
                self.__consumer_connect_to_broker()
                self.__subscribe_to_a_topic()
            else:
                self.__consumer_connect_to_kafka_broker_and_to_a_topic()
            status = True
        except Exception:
            logging_to_console_and_syslog(
                "{}:Exception occurred while polling for "
                "a message from kafka Queue. {} ".format(
                    self.thread_identifier, sys.exc_info()[0]))
            print("{}:Exception in user code:".format(self.thread_identifier))
            print("-" * 60)
            traceback.print_exc(file=sys.stdout)
            print("-" * 60)
        return status

    def enqueue(self, filename):
        """
        This method tries to post a message to the pre-defined kafka topic.
        :param filename:
        :return status False or True:
        """
        status = False
        if filename is None or len(filename) == 0:
            logging_to_console_and_syslog(
                "ConfluentKafkaMsgQAPI: filename is None or invalid")
            return status
        if self.producer_instance is None:
            logging_to_console_and_syslog(
                "ConfluentKafkaMsgQAPI: instance is None")
            return status

        if not self.is_topic_created:
            try:
                if self.producer_instance.list_topics(self.topic, timeout=1.0):
                    logging_to_console_and_syslog(
                        "Found topic name = {} in the zookeeper.".format(self.topic))
                    self.is_topic_created = True
            except KafkaException:
                self.kafka_admin_client = admin.AdminClient(self.producer_conf)
                logging_to_console_and_syslog("Creating topic {}.".format(self.topic))
                ret = self.kafka_admin_client.create_topics(
                    new_topics=[admin.NewTopic(topic=self.topic, num_partitions=1)],
                    operation_timeout=1.0)
                logging_to_console_and_syslog("ret = {}".format(ret))

        # Asynchronously produce a message; the delivery report callback will
        # be triggered from poll() or flush() below, when the message has been
        # successfully delivered or has failed permanently.
        logging_to_console_and_syslog(
            "ConfluentKafkaMsgQAPI: Posting filename={} into "
            "kafka broker={}, topic={}".format(filename, self.broker_name, self.topic))
        value = filename.encode('utf-8')
        try:
            # Produce line (without newline)
            self.producer_instance.produce(
                self.topic, value, callback=ConfluentKafkaMsgQAPI.delivery_callback)
            status = True
        except BufferError:
            sys.stderr.write('%% Local producer queue is full '
                             '(%d messages awaiting delivery): try again\n' %
                             len(self.producer_instance))
            status = False
        except Exception:
            print("ConfluentKafkaMsgQAPI: Exception in user code:")
            print("-" * 60)
            traceback.print_exc(file=sys.stdout)
            print("-" * 60)
            status = False
        else:
            event = "ConfluentKafkaMsgQAPI: Posting filename={} into " \
                    "kafka broker={}, topic={}.".format(filename, self.broker_name, self.topic)
            logging_to_console_and_syslog(event)

        # Wait for any outstanding messages to be delivered and delivery report
        # callbacks to be triggered (serve the delivery callback queue).
        # NOTE: Since produce() is an asynchronous API this poll() call
        # will most likely not serve the delivery callback for the
        # last produce()d message.
        self.producer_instance.poll(timeout=0.1)
        # Wait until all messages have been delivered
        self.producer_instance.flush(timeout=0.1)
        return status

    def __consumer_connect_to_kafka_broker_and_to_a_topic(self):
        """
        This method tries to connect to the kafka broker.
        :return:
        """
        pass

    def __consumer_poll_for_new_messages(self):
        logging_to_console_and_syslog(
            "{}: Polling the kafka consumer instance for "
            "new messages in the topic {}.".format(self.thread_identifier, self.topic))
        # Read messages from Kafka, print to stdout
        try:
            while True:
                msg = self.consumer_instance.poll(timeout=1.0)
                if msg is None:
                    continue
                if msg.error():
                    raise KafkaException(msg.error())
                else:
                    # Proper message
                    sys.stderr.write('%% %s [%d] at offset %d with key %s:\n' %
                                     (msg.topic(), msg.partition(), msg.offset(),
                                      str(msg.key())))
                    print(msg.value())
        except KeyboardInterrupt:
            sys.stderr.write('%% Aborted by user\n')
        finally:
            # Close down consumer to commit final offsets.
            self.consumer_instance.close()
        return None

    def __consumer_connect_to_broker(self):
        """
        This method tries to connect to the kafka broker.
        :return:
        """
        if self.consumer_instance:
            return
        # Consumer configuration
        # See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
        consumer_conf = {
            'bootstrap.servers': self.broker_name,
            'group.id': 'group',
            'session.timeout.ms': 6000,
            'auto.offset.reset': 'earliest'
        }
        consumer_conf['stats_cb'] = stats_cb
        consumer_conf['statistics.interval.ms'] = 0

        # Create logger for consumer (logs will be emitted when poll() is called)
        logger = logging.getLogger('consumer')
        logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s'))
        logger.addHandler(handler)

        while self.consumer_instance is None:
            try:
                logging_to_console_and_syslog(
                    "Consumer:{}:Trying to connect to broker_name={}".format(
                        self.thread_identifier, self.broker_name))
                # Create Consumer instance
                # Hint: try debug='fetch' to generate some log messages
                self.consumer_instance = Consumer(consumer_conf, logger=logger)
            except Exception:
                logging_to_console_and_syslog(
                    "Consumer:{}:Exception in user code:".format(self.thread_identifier))
                logging_to_console_and_syslog("-" * 60)
                traceback.print_exc(file=sys.stdout)
                logging_to_console_and_syslog("-" * 60)
                time.sleep(5)

        logging_to_console_and_syslog(
            "Consumer:{}:Consumer successfully connected to broker_name={}".format(
                self.thread_identifier, self.broker_name))

    @staticmethod
    def print_assignment(consumer, partitions):
        logging_to_console_and_syslog('consumer = {}, Assignment {}:',
                                      repr(consumer), partitions)

    def __subscribe_to_a_topic(self):
        try:
            cluster_meta_data = self.consumer_instance.list_topics(self.topic, timeout=0.3)
            logging_to_console_and_syslog("ClusterMetaData={}".format(repr(cluster_meta_data)))
            if self.topic not in cluster_meta_data.topics.keys():
                logging_to_console_and_syslog(
                    "Topic {} is not found in the ClusterMetaData {}".format(
                        self.topic, repr(cluster_meta_data.topics.keys())))
                raise KafkaException

            def print_assignment(consumer, partitions):
                print('Assignment:', partitions)

            # Subscribe to the topic
            self.consumer_instance.subscribe([self.topic], on_assign=print_assignment)
        except Exception:
            logging_to_console_and_syslog(
                "Consumer:{}: Subscribed to topic {}.".format(
                    self.thread_identifier, self.topic))
        return True

    def __iterate_over_kafka_consumer_instance_messages(self):
        pass

    def dequeue(self):
        try:
            if self.perform_subscription:
                return self.__consumer_poll_for_new_messages()
            else:
                return self.__iterate_over_kafka_consumer_instance_messages()
        except Exception:
            logging_to_console_and_syslog(
                "ConfluentKafkaMsgQAPI:Exception occurred while polling for "
                "a message from kafka Queue. {} ".format(sys.exc_info()[0]))
            logging_to_console_and_syslog("ConfluentKafkaMsgQAPI:Exception in user code:")
            logging_to_console_and_syslog("-" * 60)
            traceback.print_exc(file=sys.stdout)
            logging_to_console_and_syslog("-" * 60)
        return None

    def cleanup(self):
        pass
    if check and arr[0] >= arr[1] // 2:
        fw.write(f"{line}")
    line = f.readline()

f.close()
fw.close()

consumer = Consumer({
    'bootstrap.servers': config['DEFAULT']['KafkaServer'],
    'group.id': 'mygroup',
    'client.id': 'client-1',
    'enable.auto.commit': True,
    'session.timeout.ms': 6000,
    'default.topic.config': {
        'auto.offset.reset': 'smallest'
    },
})
admin_client = AdminClient(
    {'bootstrap.servers': config["DEFAULT"]["KafkaServer"]})
clusterMetaData = consumer.list_topics()

# Gather topic names
topics_delete = []
for key in clusterMetaData.topics:
    if topic_header in key:
        topics_delete.append(key)

# Delete finished topics
admin_client.delete_topics(topics_delete)
def list_topics(c: Consumer, filter_by="ztf"):
    topics = c.list_topics().topics.keys()
    return list(filter(lambda x: filter_by in x, list(topics)))
class KafkaHandler(BaseHandler[KafkaHandlerConfig]):
    config_cls = KafkaHandlerConfig
    _eof_reached: Dict[int, bool]

    OFFSET_AT_FIRST_MESSAGE = OFFSET_BEGINNING
    OFFSET_AFTER_LAST_MESSAGE = OFFSET_END
    # hopefully this number won't get assigned any semantics by the Kafka devs any time soon
    OFFSET_AT_LAST_MESSAGE = -101

    def __init__(self, config: KafkaHandlerConfig):
        super().__init__(config)
        self._assignment_created = False
        self._seek = OFFSET_BEGINNING
        self._high_watermarks: Dict[int, int] = {}
        self._consumer: Optional[Consumer] = None
        self._producer: Optional[Producer] = None
        self._errors: List[KafkaError] = []

    def _get_producer(self) -> Producer:
        if self._producer is not None:
            return self._producer
        config_instance = esque_config.Config()
        with config_instance.temporary_context(self.config.esque_context):
            self._producer = Producer(
                config_instance.create_confluent_config(include_schema_registry=False))
        return self._producer

    def _get_consumer(self) -> Consumer:
        if self._consumer is not None:
            return self._consumer
        config_instance = esque_config.Config()
        with config_instance.temporary_context(self.config.esque_context):
            group_id = self.config.consumer_group_id
            self._consumer = Consumer({
                "group.id": group_id,
                "enable.partition.eof": True,
                "enable.auto.commit": False,
                **config_instance.create_confluent_config(include_schema_registry=False),
            })
        topic_metadata: TopicMetadata = self._consumer.list_topics(
            self.config.topic_name).topics[self.config.topic_name]
        if topic_metadata.error is not None:
            raise EsqueIOHandlerReadException(
                f"Topic {self.config.topic_name!r} not found.")
        self._eof_reached = {
            partition_id: False for partition_id in topic_metadata.partitions.keys()
        }
        for partition_id in topic_metadata.partitions.keys():
            self._high_watermarks[partition_id] = self._consumer.get_watermark_offsets(
                TopicPartition(topic=self.config.topic_name, partition=partition_id))[1]
        return self._consumer

    def get_serializer_configs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        raise EsqueIOSerializerConfigNotSupported

    def put_serializer_configs(self, config: Tuple[Dict[str, Any], Dict[str, Any]]) -> None:
        raise EsqueIOSerializerConfigNotSupported

    def write_message(self, binary_message: Union[BinaryMessage, StreamEvent]) -> None:
        self._produce_single_message(binary_message=binary_message)
        self._flush()

    def write_many_messages(self, message_stream: Iterable[Union[BinaryMessage, StreamEvent]]) -> None:
        for binary_message in message_stream:
            self._produce_single_message(binary_message=binary_message)
        self._flush()

    def _produce_single_message(self, binary_message: BinaryMessage) -> None:
        if isinstance(binary_message, StreamEvent):
            return
        partition_arg = {}
        partition = self._io_to_confluent_partition(binary_message.partition)
        if partition is not None:
            partition_arg["partition"] = partition
        self._get_producer().produce(
            topic=self.config.topic_name,
            value=binary_message.value,
            key=binary_message.key,
            headers=self._io_to_confluent_headers(binary_message.headers),
            timestamp=self._io_to_confluent_timestamp(binary_message.timestamp),
            on_delivery=self._delivery_callback,
            **partition_arg,
        )

    def _delivery_callback(self, err: Optional[KafkaError], msg: str):
        if err is None:
            return
        self._errors.append(err)

    def _flush(self):
        self._get_producer().flush()
        if self._errors:
            exception = EsqueIOHandlerWriteException(
                "The following exception(s) occurred while writing to Kafka:\n  "
                + "\n  ".join(map(str, self._errors)))
            self._errors.clear()
            raise exception

    @staticmethod
    def _io_to_confluent_partition(partition: int) -> Optional[int]:
        # TODO: introduce something like the config.send_timestamp flag to make it
        # possible to always return None here. This would allow for moving messages
        # between topics with different amounts of partitions without making them
        # unbalanced.
        if partition < 0:
            return None
        return partition

    def _io_to_confluent_timestamp(self, message_ts: datetime.datetime):
        return int(message_ts.timestamp() * 1000) if self.config.send_timestamp else 0

    @staticmethod
    def _io_to_confluent_headers(
        headers: List[MessageHeader],
    ) -> Optional[List[Tuple[str, Optional[bytes]]]]:
        if not headers:
            return None
        confluent_headers: List[Tuple[str, Optional[bytes]]] = []
        for header in headers:
            key = header.key
            if header.value is not None:
                value = header.value.encode("utf-8")
            else:
                value = None
            confluent_headers.append((key, value))
        return confluent_headers

    def read_message(self) -> Union[BinaryMessage, StreamEvent]:
        if not self._assignment_created:
            self._assign()

        consumed_message: Optional[Message] = None
        while consumed_message is None:
            consumed_message = self._get_consumer().poll(timeout=0.1)
            if consumed_message is None and all(self._eof_reached.values()):
                return TemporaryEndOfPartition(
                    "Reached end of all partitions",
                    partition=EndOfStream.ALL_PARTITIONS)

        # TODO: process other error cases (connection issues etc.)
        if consumed_message.error() is not None and \
                consumed_message.error().code() == KafkaError._PARTITION_EOF:
            self._eof_reached[consumed_message.partition()] = True
            return TemporaryEndOfPartition(
                "Reached end of partition", partition=consumed_message.partition())
        else:
            self._eof_reached[consumed_message.partition()] = False
            binary_message = self._confluent_to_binary_message(consumed_message)
            return binary_message

    def _confluent_to_binary_message(self, consumed_message: Message) -> BinaryMessage:
        binary_message = BinaryMessage(
            key=consumed_message.key(),
            value=consumed_message.value(),
            partition=consumed_message.partition(),
            offset=consumed_message.offset(),
            timestamp=self._confluent_to_io_timestamp(consumed_message),
            headers=self._confluent_to_io_headers(consumed_message.headers()),
        )
        return binary_message

    @staticmethod
    def _confluent_to_io_timestamp(consumed_message: Message) -> datetime.datetime:
        return datetime.datetime.fromtimestamp(
            consumed_message.timestamp()[1] / 1000, tz=datetime.timezone.utc)

    @staticmethod
    def _confluent_to_io_headers(
        confluent_headers: Optional[List[Tuple[str, Optional[bytes]]]],
    ) -> List[MessageHeader]:
        io_headers: List[MessageHeader] = []
        if confluent_headers is None:
            return io_headers
        for confluent_header in confluent_headers:
            key, value = confluent_header
            if value is not None:
                value = value.decode("utf-8")
            io_headers.append(MessageHeader(key, value))
        return io_headers

    def message_stream(self) -> Iterable[Union[BinaryMessage, StreamEvent]]:
        while True:
            yield self.read_message()

    def seek(self, position: int) -> None:
        self._seek = position

    def _assign(self) -> None:
        self._assignment_created = True
        if self._seek == self.OFFSET_AT_LAST_MESSAGE:
            self._get_consumer().assign([
                TopicPartition(topic=self.config.topic_name,
                               partition=partition_id,
                               offset=high_watermark - 1)
                for partition_id, high_watermark in self._high_watermarks.items()
            ])
        else:
            self._get_consumer().assign([
                TopicPartition(topic=self.config.topic_name,
                               partition=partition_id,
                               offset=self._seek)
                for partition_id in self._eof_reached.keys()
            ])

    def close(self) -> None:
        if self._consumer is not None:
            self._consumer.close()
            self._consumer = None
        if self._producer is not None:
            self._producer.flush()
            self._producer = None