def consume(self, count: int):
    consumer = DeserializingConsumer({
        'bootstrap.servers': self.brokers,
        'key.deserializer': StringDeserializer('utf_8'),
        'value.deserializer': self._make_deserializer(),
        'group.id': self.group,
        'auto.offset.reset': "earliest"
    })
    consumer.subscribe([self.topic])
    self.logger.info("Consuming %d %s records from topic %s with group %s",
                     count, self.schema_type.name, self.topic, self.group)
    while self.consumed < count:
        msg = consumer.poll(1)
        if msg is None:
            continue
        payload = msg.value()
        self.logger.debug("Consumed %d at %d", payload.val, msg.offset())
        assert payload.val == self.consumed
        self.consumed += 1

    consumer.close()
def _consume(self, on_consume):
    if isinstance(on_consume, types.FunctionType):
        callback = on_consume
    else:
        callback_cls = on_consume()
        callback = callback_cls.on_message

    consumer = DeserializingConsumer(self.kafka_config)
    consumer.subscribe([self.topic])
    q = Queue(maxsize=self.num_threads)
    msg = None
    while True:
        try:
            msg = consumer.poll(1)
            if msg is None:
                continue
            if msg.error():
                logger.error(
                    f'Worker for topic {self.topic} error: {msg.error()}')
                continue
            # q.put blocks when the queue is full, which rate-limits
            # how many worker threads run at once.
            q.put(msg)
            t = threading.Thread(
                target=_process_msg,
                args=(q, consumer, callback, self.topic),
            )
            t.start()
        except Exception as err:
            logger.error(
                f'Worker for topic {self.topic} terminated: {err}')
            logger.error(msg)
            consumer.close()
            break
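# The worker above hands each message to a _process_msg helper on its own
# thread, but that helper is not shown in the snippet. The sketch below is an
# assumption about its shape (take the message off the queue, invoke the
# callback, commit the offset), not the original implementation.
def _process_msg(q, consumer, callback, topic):
    # topic is kept for logging parity with the caller's args tuple.
    msg = q.get(timeout=60)  # blocks; raises queue.Empty after 60 seconds
    try:
        callback(msg)
        consumer.commit(msg)  # commit this message's offset once handled
    finally:
        q.task_done()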
def main(args):
    topic = args.topic

    protobuf_deserializer = ProtobufDeserializer(user_pb2.User)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {'bootstrap.servers': args.bootstrap_servers,
                     'key.deserializer': string_deserializer,
                     'value.deserializer': protobuf_deserializer,
                     'group.id': args.group,
                     'auto.offset.reset': "earliest"}

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            user = msg.value()
            if user is not None:
                print(f"User record {msg.key()}:\n name: {user.name}\n"
                      f"\tfavorite_number: {user.favorite_number}\n"
                      f"\tfavorite_color: {user.favorite_color}\n")
        except KeyboardInterrupt:
            break

    consumer.close()
def kafpubsub(args):
    publisher = pubsub.PublisherClient()
    project_id = args.project
    kafka_topic = args.topic
    pubsub_topic = f'projects/{project_id}/topics/{kafka_topic}'
    try:
        publisher.create_topic(pubsub_topic)
    except AlreadyExists:
        pass  # No error needed if the topic already exists.

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_server,
        'group.id': args.group_id,
        'auto.offset.reset': args.auto_offset_reset
    }
    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([kafka_topic])
    logging.info(
        f'Publish Kafka ({args.bootstrap_server}) values to pubsub...')
    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            logging.debug(f'> {datetime.today()} | {msg.key()}\n')
            publisher.publish(pubsub_topic, msg.value())
        except KeyboardInterrupt:
            break

    consumer.close()
def main(args):
    topic = args.topic

    protobuf_deserializer = ProtobufDeserializer(user_pb2.User)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {'bootstrap.servers': args.bootstrap_servers,
                     'key.deserializer': string_deserializer,
                     'value.deserializer': protobuf_deserializer,
                     'group.id': args.group,
                     'auto.offset.reset': "earliest"}

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            user = msg.value()
            if user is not None:
                print("User record {}: name: {}\n"
                      "\tfavorite_number: {}\n"
                      "\tfavorite_color: {}\n"
                      .format(msg.key(), user.name,
                              user.favorite_number,
                              user.favorite_color))
        except KeyboardInterrupt:
            break

    consumer.close()
def run_consumer(shutdown_flag, clients, lock):
    print("Starting Kafka Consumer.")
    schema_registry_client = SchemaRegistryClient(
        {"url": "http://localhost:8081"})
    deserializer = AvroDeserializer(schema_registry_client)
    config = {
        "bootstrap.servers": "localhost:9092",
        "group.id": "dashboard-demo",
        "value.deserializer": deserializer
    }

    consumer = DeserializingConsumer(config)
    consumer.subscribe(["DASHBOARD"])

    while not shutdown_flag.done():
        msg = consumer.poll(0.2)
        if msg is None:
            print("Waiting...")
        elif msg.error():
            print(f"ERROR: {msg.error()}")
        else:
            value = msg.value()
            formatted = simplejson.dumps(value)
            print(f"Sending {formatted} to {clients}")
            with lock:
                websockets.broadcast(clients, formatted)

    print("Closing Kafka Consumer")
    consumer.close()
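# run_consumer() above expects a shutdown_flag exposing done(), a shared set
# of websocket clients, and a lock. A minimal wiring sketch, assuming a
# concurrent.futures.Future as the flag (the names here are illustrative,
# not from the original):
from concurrent.futures import Future
import threading

shutdown_flag = Future()
clients = set()
lock = threading.Lock()

consumer_thread = threading.Thread(target=run_consumer,
                                   args=(shutdown_flag, clients, lock))
consumer_thread.start()
# ... later, to stop the loop cleanly:
shutdown_flag.set_result(True)
consumer_thread.join()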
def receive():
    json_deserializer = JSONDeserializer(USER_SCHEMA,
                                         from_dict=dict_to_user)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {
        'bootstrap.servers': 'localhost:9092',
        'key.deserializer': string_deserializer,
        'value.deserializer': json_deserializer,
        'group.id': 'django-kafka',
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([USER_TOPIC])

    """
    The idea is to start the Kafka consumer when a message is sent to the
    Kafka producer, resulting in two queues: a task queue and a
    message/content queue. Multi-threading might be overkill for a simple
    application, hence the (temporary) for loop.
    """
    for x in range(200):
        try:
            msg = consumer.poll(timeout=5.0)
            if msg is not None:
                user = msg.value()
                if user is not None:
                    print("User record {}: username: {}\n"
                          "\tdata: {}\n".format(msg.key(),
                                                user.username,
                                                user.data))
        except Exception as e:
            print('An exception occurred: {}'.format(e))
            logging.error(traceback.format_exc())
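# receive() above relies on USER_SCHEMA, USER_TOPIC, and a dict_to_user
# helper defined elsewhere. A minimal sketch of the from_dict hook, assuming
# a simple User holder with the two fields printed above (the class shape is
# an assumption):
class User:
    def __init__(self, username=None, data=None):
        self.username = username
        self.data = data

def dict_to_user(obj, ctx):
    # ctx is the SerializationContext passed in by JSONDeserializer.
    if obj is None:
        return None
    return User(username=obj.get('username'), data=obj.get('data'))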
def main(args):
    topic = args.topic

    schema_str = """
    {
      "$schema": "http://json-schema.org/draft-07/schema#",
      "title": "User",
      "description": "A Confluent Kafka Python User",
      "type": "object",
      "properties": {
        "name": {
          "description": "User's name",
          "type": "string"
        },
        "favorite_number": {
          "description": "User's favorite number",
          "type": "number",
          "exclusiveMinimum": 0
        },
        "favorite_color": {
          "description": "User's favorite color",
          "type": "string"
        }
      },
      "required": ["name", "favorite_number", "favorite_color"]
    }
    """
    json_deserializer = JSONDeserializer(schema_str,
                                         from_dict=dict_to_user)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': json_deserializer,
        'group.id': args.group,
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            user = msg.value()
            if user is not None:
                print("User record {}: name: {}\n"
                      "\tfavorite_number: {}\n"
                      "\tfavorite_color: {}\n".format(msg.key(),
                                                      user.name,
                                                      user.favorite_number,
                                                      user.favorite_color))
        except KeyboardInterrupt:
            break

    consumer.close()
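# Minimal argparse wiring for main(args) above — a sketch only; the flag
# names are assumptions matching the attributes the function reads, and
# dict_to_user follows the same from_dict pattern sketched after the
# previous snippet:
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="JSON deserializer example")
    parser.add_argument('-b', dest='bootstrap_servers', default='localhost:9092')
    parser.add_argument('-t', dest='topic', default='example_serde_json')
    parser.add_argument('-g', dest='group', default='example_serde_json')
    main(parser.parse_args())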
def main(args):
    topic = args.topic
    outputtopic = args.outputtopic

    schema_str = EventSchema
    schema_enriched_event_str = EnrichedEventSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    avro_deserializer = AvroDeserializer(schema_str, schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')
    avro_serializer = AvroSerializer(schema_enriched_event_str,
                                     schema_registry_client)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "latest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    cluster = Cluster([args.host])
    session = cluster.connect("datascience")
    session.row_factory = dict_factory

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }
    producer = SerializingProducer(producer_conf)

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            evt = msg.value()
            enrich(evt, session, producer, outputtopic)
        except KeyboardInterrupt:
            break
        except Exception:
            print('Exception', sys.exc_info()[0])
            continue

    consumer.close()
def consume(consumer: DeserializingConsumer, timeout) -> iter:
    while True:
        # Wait for a message until the timeout is reached; if a message
        # arrives earlier, it is returned immediately.
        message = consumer.poll(timeout)
        # print('[kafka] polling...')
        if message is None:
            continue
        if message.error():
            print('Consumer error: {}'.format(message.error()))
            continue
        yield message
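# Example use of the consume() generator above — a minimal sketch; the
# bootstrap servers, group id, and topic are placeholders, not from the
# original:
if __name__ == '__main__':
    consumer = DeserializingConsumer({
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'example-group',
    })
    consumer.subscribe(['example-topic'])
    try:
        for message in consume(consumer, timeout=1.0):
            print(message.value())
    finally:
        consumer.close()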
def main():
    schema_registry_client = SchemaRegistryClient({'url': SCHEMA_REGISTRY_URL})
    avro_deserializer = AvroDeserializer(
        schema_registry_client=schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'key.deserializer': string_deserializer,
        'max.poll.interval.ms': MAX_POLL_INTERVAL_MS,
        'value.deserializer': avro_deserializer,
        'group.id': CONSUMER_GROUP
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([TOPIC])

    while True:
        try:
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            msg_value = msg.value()
            if msg_value is not None:
                try:
                    measurements = list(dict(msg_value).get("measurements"))
                    measurements_df = pd.DataFrame(measurements)

                    groups = measurements_df.groupby("tenant")
                    for _, group in groups:
                        tenant = group.iloc[0]['tenant']
                        device_registry = DeviceRegistry(tenant, AIRQO_BASE_URL)

                        group_measurements = list(group.to_dict(orient="records"))
                        for i in range(0, len(group_measurements),
                                       int(REQUEST_BODY_SIZE)):
                            measurements_list = group_measurements[
                                i:i + int(REQUEST_BODY_SIZE)]
                            device_registry.insert_events(measurements_list)
                except Exception as ex:
                    print(ex)
        except KeyboardInterrupt:
            break

    consumer.close()
def main(args):
    topic = args.topic

    schema_str = """
    {
        "namespace": "confluent.io.examples.serialization.avro",
        "name": "User",
        "type": "record",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "favorite_number", "type": "int"},
            {"name": "favorite_color", "type": "string"}
        ]
    }
    """

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    avro_deserializer = AvroDeserializer(schema_str,
                                         schema_registry_client,
                                         dict_to_user)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {'bootstrap.servers': args.bootstrap_servers,
                     'key.deserializer': string_deserializer,
                     'value.deserializer': avro_deserializer,
                     'group.id': args.group,
                     'auto.offset.reset': "earliest"}

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            user = msg.value()
            if user is not None:
                print("User record {}: name: {}\n"
                      "\tfavorite_number: {}\n"
                      "\tfavorite_color: {}\n"
                      .format(msg.key(), user.name,
                              user.favorite_number,
                              user.favorite_color))
        except KeyboardInterrupt:
            break

    consumer.close()
def main(args):
    topic = args.topic

    key_schema_str = open('schema/KeySchema.avsc', "r").read()
    value_schema_str = open('schema/ValueSchema.avsc', "r").read()

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    avro_key_deserializer = AvroDeserializer(key_schema_str,
                                             schema_registry_client,
                                             dict_to_user_quote_key)
    avro_value_deserializer = AvroDeserializer(value_schema_str,
                                               schema_registry_client,
                                               dict_to_user_quote_value)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': avro_key_deserializer,
        'value.deserializer': avro_value_deserializer,
        'group.id': args.group,
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            user_quote = msg.value()
            if user_quote is not None:
                print("User {} Quote record: product_id: {}\n"
                      "\tquoted_price: {}\n"
                      "\tquoted_quantity: {}\n"
                      "\tuser_note: {}\n".format(msg.key().user_id,
                                                 user_quote.product_id,
                                                 user_quote.quoted_price,
                                                 user_quote.quoted_quantity,
                                                 user_quote.user_note))
        except KeyboardInterrupt:
            break

    consumer.close()
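# The two from_dict hooks above are defined elsewhere. A minimal sketch using
# SimpleNamespace for attribute access (an assumption — the real code likely
# uses dedicated classes; field names follow the print statement above):
from types import SimpleNamespace

def dict_to_user_quote_key(obj, ctx):
    return SimpleNamespace(**obj) if obj is not None else None

def dict_to_user_quote_value(obj, ctx):
    return SimpleNamespace(**obj) if obj is not None else None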
def main():
    top = 20

    consumer = DeserializingConsumer({
        'bootstrap.servers': os.environ['KAFKA_BROKERS'],
        'security.protocol': 'SASL_SSL',
        'sasl.mechanism': 'SCRAM-SHA-512',
        'sasl.password': os.environ['KAFKA_PASS'],
        'sasl.username': os.environ['KAFKA_USER'],
        'ssl.ca.location': '/usr/local/share/ca-certificates/Yandex/YandexCA.crt',
        'group.id': 'group1',
        'key.deserializer': StringDeserializer(),
        'value.deserializer': LongDeserializer(),
    })

    consumer.subscribe(['streams-wordcount-output'])

    try:
        frequencies = []
        while True:
            msg = consumer.poll(1.0)
            if msg is None:
                if frequencies:
                    print('==============================================')
                    print(f'Current list of top {top} most frequent words:')
                    frequencies = sorted(frequencies, key=lambda x: x[1],
                                         reverse=True)
                    for frequency in frequencies[0:top]:
                        print(f'{frequency[0]}: {frequency[1]}')
                    frequencies.clear()
                continue
            elif msg.error():
                print('error: {}'.format(msg.error()))
            else:
                frequencies.append((msg.key(), msg.value()))
    except KeyboardInterrupt:
        pass
    finally:
        consumer.close()
def main():
    string_deserializer = StringDeserializer('utf_8')
    conf = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'bitcoin_group',
        'key.deserializer': string_deserializer,
        'value.deserializer': string_deserializer,
        'session.timeout.ms': 6000,
        'fetch.wait.max.ms': 5000,
        'auto.offset.reset': 'smallest',
        'enable.auto.commit': 'false',
        'fetch.min.bytes': 307200
    }

    consumer = DeserializingConsumer(conf)
    consumer.subscribe(['bitcoin-transaction'])
    messages = []
    try:
        while True:
            # poll() takes its timeout in seconds; keep it short so SIGINT
            # can be handled promptly.
            msg = consumer.poll(timeout=1.0)
            if msg is None:
                continue

            if msg.error():
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                                     (msg.topic(), msg.partition(), msg.offset()))
                else:
                    raise KafkaException(msg.error())
            else:
                obj = json.loads(msg.value())
                transaction = dict_to_transaction(obj)
                messages.append(transaction)
                if len(messages) > 100:
                    messages = sorted(messages,
                                      key=lambda x: x.price,
                                      reverse=True)[0:10]
                    print(messages)
                consumer.commit(asynchronous=False)
    except KeyboardInterrupt:
        sys.stderr.write('%% Aborted by user\n')
    finally:
        # Close down consumer to commit final offsets.
        consumer.close()
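# dict_to_transaction() above must yield objects with at least a numeric
# price attribute, since the loop sorts on it. A minimal sketch — the real
# mapping is an assumption:
from types import SimpleNamespace

def dict_to_transaction(obj):
    transaction = SimpleNamespace(**obj)
    transaction.price = float(transaction.price)  # sorting requires a number
    return transaction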
def run_consumer(container_manager):
    schema_registry_conf = {'url': config['kafka']['schema_registry']}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    avro_deserializer = AvroDeserializer(schemas.run_record_schema,
                                         schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')

    conf = {
        'bootstrap.servers': config['kafka']['servers'],
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': "runs-consumers",
        'auto.offset.reset': 'earliest',
        'enable.auto.commit': 'false'
    }

    consumer = DeserializingConsumer(conf)
    print('[+] Listening for incoming runs')

    try:
        consumer_topics = [config['kafka']['runs-topic']]
        consumer.subscribe(consumer_topics)

        while True:
            try:
                msg = consumer.poll(timeout=1.0)
                if msg is None:
                    continue

                if msg.error():
                    raise KafkaException(msg.error())
                else:
                    print('[-] Run initialization')
                    print(msg.value())
                    consumer.commit(asynchronous=False)
                    # handlers.handle_run_execution(container_manager, msg.value())
                    threading.Thread(
                        target=handlers.handle_run_execution,
                        args=(container_manager, msg.value())).start()
            except ConsumeError as e:
                print(f'[Exception] error_code: {e.code()} '
                      f'message: {e.message()} exception: {e}')
    finally:
        consumer.close()
def test_consumer(self):
    consumer_config = {
        'bootstrap.servers': self.conf['bootstrap.servers'],
        'key.deserializer': self.key_avro_deserializer,
        'value.deserializer': self.value_avro_deserializer,
        'group.id': '1',
        'auto.offset.reset': 'earliest'
    }
    offset = kafka_utils.offset - len(self.test_messages) + 1
    consumer = DeserializingConsumer(consumer_config)
    partitions = []
    partition = TopicPartition(topic=self.topic, partition=0, offset=offset)
    partitions.append(partition)
    consumer.assign(partitions)

    # Process messages
    result = []
    attempt = 0
    while len(result) < len(self.test_messages):
        try:
            msg = consumer.poll(1.0)
            attempt += 1
            if msg is None:
                print("no message received")
                if attempt >= 10:
                    break
            elif msg.error():
                break
            else:
                value_object = msg.value()
                text = value_object.text
                print("adding {} to result".format(text))
                result.append(text)
        except KeyboardInterrupt:
            break
        except SerializerError:
            break

    # Leave group and commit final offsets
    consumer.close()
    assert result == self.test_messages
def main(args):
    topic = args.topic
    schema_str = MetricSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    avro_deserializer = AvroDeserializer(schema_str, schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group,
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    client = InfluxDBClient(host=args.host_influx, port=8086,
                            username='******', password='******')

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            timespent = msg.value()
            if timespent is not None:
                print("time ==>", timespent)
                print(timespent["metricName"])
                print(timespent["time"])
                client.switch_database('datascience')
                json_body = [{
                    "measurement": "metric",
                    "fields": {
                        "name": timespent["metricName"],
                        "value": timespent["time"]
                    }
                }]
                client.write_points(json_body)
        except KeyboardInterrupt:
            break

    consumer.close()
class Consumer:
    def __init__(self, bootstrap_servers: str, topic: str, group: str,
                 callback: Callable[[Message], None],
                 value_deserializer=None, poll_timeout: float = 1.0,
                 config=None):
        consumer_config = {
            "bootstrap.servers": bootstrap_servers,
            "group.id": group,
            "value.deserializer": value_deserializer
        }
        if config:
            consumer_config.update(config)
        self.consumer = DeserializingConsumer(consumer_config)
        self.topic = topic
        self.callback = callback
        self.poll_timeout = poll_timeout

    def start(self):
        logger.info("Starting Kafka consumer")
        self.consumer.subscribe([self.topic])
        while True:
            message = self.consumer.poll(self.poll_timeout)
            if message is None:
                continue
            if message.error():
                print(f"Consumer error: {message.error()}")
                continue
            self.callback(message)

    def close(self):
        logger.info("Closing Kafka consumer")
        self.consumer.close()
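# Minimal wiring for the Consumer class above — a sketch; the handler, topic,
# and group names are placeholders, not from the original:
from confluent_kafka.serialization import StringDeserializer

def handle(message):
    print(message.key(), message.value())

consumer = Consumer(bootstrap_servers='localhost:9092',
                    topic='example-topic',
                    group='example-group',
                    callback=handle,
                    value_deserializer=StringDeserializer('utf_8'))
try:
    consumer.start()  # blocks until interrupted
except KeyboardInterrupt:
    consumer.close()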
def consume():
    reusableConsumer = DeserializingConsumer(getConfigs())
    reusableConsumer.subscribe(["myprototopic"])

    while True:
        try:
            msg = reusableConsumer.poll(0.1)
            if msg is None:
                continue
            else:
                key = msg.key()
                value = msg.value()
                print("Received msg name: {}, fav food: {}, times eaten: {}"
                      .format(value.name, value.favoriteFood, value.timesEaten))
        except KeyboardInterrupt:
            break

    print("Closing Consumer")
    reusableConsumer.close()
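# getConfigs() above is defined elsewhere. Judging by the attributes read in
# the loop (name, favoriteFood, timesEaten), it returns a config whose value
# deserializer decodes a protobuf message. A sketch under that assumption —
# the meal_pb2 module and Meal message are hypothetical placeholders:
from confluent_kafka.schema_registry.protobuf import ProtobufDeserializer
from confluent_kafka.serialization import StringDeserializer

def getConfigs():
    return {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'proto-consumer-group',
        'auto.offset.reset': 'earliest',
        'key.deserializer': StringDeserializer('utf_8'),
        # meal_pb2 is hypothetical; substitute the real generated module.
        'value.deserializer': ProtobufDeserializer(meal_pb2.Meal),
    }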
def receive_record(args):
    """Receives Record using a DeserializingConsumer & AvroDeserializer."""
    topics = [args.topic.rstrip()]

    schema_registry_config = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_config)

    avro_deserializer = AvroDeserializer(schema_registry_client,
                                         DATA_SCHEMA,
                                         dict_to_data)
    string_deserializer = StringDeserializer('utf_8')

    consumer_config = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group,
        'auto.offset.reset': 'earliest'
    }

    consumer = DeserializingConsumer(consumer_config)
    consumer.subscribe(topics)

    print(f'Consuming data records from topic(s) {topics}. ^C to exit.')
    while True:
        try:
            # SIGINT can't be handled while polling; keep the timeout bounded.
            msg = consumer.poll(10.0)
            if msg is None:
                print('\t---Waiting. . .')
                continue

            data = msg.value()
            if data is not None:
                print(f'Data record {msg.key()}:\n'
                      f'\tValues: {data}')
        except KeyboardInterrupt:
            break

    print('\nClosing consumer.')
    consumer.close()
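# dict_to_data() above converts the decoded Avro dict into a domain value.
# Since the loop only prints it, a pass-through is enough; a minimal sketch
# (the real mapping depends on DATA_SCHEMA):
def dict_to_data(obj, ctx):
    return obj  # or construct a dataclass from the fields in DATA_SCHEMA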
def main(args):
    topic = args.topic

    schema_registry_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)
    schema_obj = schema_registry_client.get_latest_version(
        subject_name='example_serde_json-value')

    json_deserializer = JSONDeserializer(schema_obj.schema.schema_str,
                                         from_dict=dict_to_user)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': json_deserializer,
        'group.id': args.group,
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            user = msg.value()
            if user is not None:
                print(f"User record {msg.key()}:\n name: {user.name}\n"
                      f"\tfavorite_number: {user.favorite_number}\n"
                      f"\tfavorite_color: {user.favorite_color}\n")
        except KeyboardInterrupt:
            break

    consumer.close()
class Broker:
    def __init__(self, consumer_topic, producer_topic, client_id,
                 bootstrap_servers, consumer_proto_class, producer_proto_class,
                 processor, max_thread_calls):
        self.consumer_topic = consumer_topic
        self.producer_topic = producer_topic
        self.client_id = client_id
        self.bootstrap_servers = bootstrap_servers
        self.consumer_proto_class = consumer_proto_class
        self.producer_proto_class = producer_proto_class
        self.processor = processor
        self.max_thread_calls = max_thread_calls

        self.kafka_consumer = DeserializingConsumer({
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.client_id,
            'auto.offset.reset': "earliest",
            'value.deserializer': self.deserialize
        })
        self.kafka_consumer.subscribe([self.consumer_topic])

        self.kafka_producer = SerializingProducer({
            'bootstrap.servers': self.bootstrap_servers,
            'queue.buffering.max.messages': 500000,
            'value.serializer': self.serialize
        })

        self.thread_queue = deque(maxlen=self.max_thread_calls)
        self.latest_thread_queue_id = 1

    def deserialize(self, bytes_message, _):
        message = image_pb2.ImageInfo()
        message.ParseFromString(bytes_message)
        return message

    def serialize(self, message, _):
        return message.SerializeToString()

    def get_thread_id(self):
        result = self.latest_thread_queue_id
        if result == self.max_thread_calls:
            self.latest_thread_queue_id = 1
        else:
            self.latest_thread_queue_id += 1
        return result

    def is_thread_queue_full(self):
        return len(self.thread_queue) == self.max_thread_calls

    def produce_when_ready(self, thread_id, message):
        # Spin until this thread reaches the tail of the queue, preserving
        # the original message ordering on the output topic.
        while self.thread_queue[-1] != thread_id:
            logging.warning("Thread {} got stuck in queue".format(thread_id))
            # time.sleep(0.01)
            self.kafka_producer.poll(0.0)
        self.kafka_producer.produce(topic=self.producer_topic, value=message)
        self.thread_queue.pop()

    def call_processor(self, thread_id, value, start_time):
        result = self.processor.process(value)
        self.produce_when_ready(thread_id, result)
        # start_time is the message timestamp in milliseconds.
        logging.debug("Total time for thread " + str(thread_id) + " is " +
                      str(time.time() - start_time / 1000))

    def run(self):
        while True:
            try:
                if self.is_thread_queue_full():
                    logging.warning("Thread queue is full, waiting for "
                                    "previous threads to finish")
                    continue

                msg = self.kafka_consumer.poll(1.0)
                if msg is None or msg.value() is None:
                    logging.warning("No messages from kafka")
                    continue

                caller_thread_id = self.get_thread_id()
                caller_thread = threading.Thread(
                    target=self.call_processor,
                    args=(caller_thread_id, msg.value(), msg.timestamp()[1]))
                self.thread_queue.appendleft(caller_thread_id)
                caller_thread.start()
            except KeyboardInterrupt:
                break

        self.kafka_consumer.close()
        self.kafka_producer.flush()
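# The Broker above delegates work to processor.process(value), where value is
# a decoded image_pb2.ImageInfo. A minimal sketch of that interface — the
# class name is illustrative; real processors would run inference, resizing,
# etc. and return a message the serializer can handle:
class EchoProcessor:
    def process(self, image_info):
        return image_info  # pass the message through unchanged

# broker = Broker(..., processor=EchoProcessor(), max_thread_calls=8)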
def main(args):
    topic = args.topic
    outputtopic = args.outputtopic

    schema_enriched_event_str = EnrichedEventSchema
    schema_metrics = MetricSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    string_deserializer = StringDeserializer('utf_8')
    avro_serializer = AvroSerializer(schema_metrics, schema_registry_client)

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }
    producer = SerializingProducer(producer_conf)

    avro_deserializer = AvroDeserializer(schema_enriched_event_str,
                                         schema_registry_client)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    cluster = Cluster([args.host])
    session = cluster.connect("datascience")
    session.row_factory = dict_factory

    client_influxdb = InfluxDBClient('35.181.155.182', 8086, "dbsaleh2")

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            evt = msg.value()
            idPersonne = evt["idPersonne"]

            rows = session.execute(GET_ENRICHED_EVENT_QUERY, (idPersonne,))
            if rows:
                # print(idPersonne, f"rows={rows.all().__len__()}")
                # stat_process(idPersonne, rows)
                # som = rec_process(rows, 0, 0)
                # print("some", som)

                # row["csp"] = get_value_column_enriched_data(row, "csp")
                # row["paysNaissance"] = get_value_column_enriched_data(row, "paysNaissance")

                # get_value_column_event_content
                # row['appVersion'] = get_value_column_event_content(row, "appVersion")
                # row['montant'] = get_value_column_event_content(row, "montant")
                # row['androidID'] = get_value_column_event_content(row, "androidID")

                # del rows[0]['eventContent']
                pass

            # Compute elapsed time outside the if-block so it is always bound.
            elapsed_time = time.time() - start
            # producer.produce(topic=outputtopic, key=str(uuid4()),
            #                  value={'metricName': "hystorize", 'time': elapsed_time},
            #                  on_delivery=delivery_report)
            # producer.flush()

            metrics = [{
                "measurement": "metrics",
                "fields": {
                    "metricName": "score",
                    "timeforscore": elapsed_time
                }
            }]
            print(elapsed_time)

            client_influxdb.write_points(metrics, database="dbsaleh2")

            producer.produce(topic=outputtopic,
                             value={'metricName': "score", 'time': elapsed_time},
                             on_delivery=delivery_report)
            producer.flush()
        except KeyboardInterrupt:
            break
        except Exception:
            print('Exception')
            continue

    consumer.close()
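# delivery_report above is the usual confluent-kafka produce callback,
# defined elsewhere in these examples. A minimal sketch:
def delivery_report(err, msg):
    if err is not None:
        print('Delivery failed for record {}: {}'.format(msg.key(), err))
    else:
        print('Record delivered to {} [{}] at offset {}'.format(
            msg.topic(), msg.partition(), msg.offset()))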
def main():
    sr_conf = {'url': SCHEMA_REGISTRY_URL}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    schema_str = """
    {
      "namespace": "io.confluent.ksql.avro_schemas",
      "name": "User",
      "type": "record",
      "fields": [
        {"name": "DATESTAMP", "type": "string"},
        {"name": "TIMESTAMP", "type": "string"},
        {"name": "MILLISEC", "type": "string"},
        {"name": "LOGLEVEL", "type": "string"},
        {"name": "REQUESTID", "type": "string"},
        {"name": "RECORDFORMATVERSION", "type": "string"},
        {"name": "SOURCEIP", "type": "string"},
        {"name": "DNSDOMAIN", "type": "string"},
        {"name": "MESSAGETYPE", "type": "string"},
        {"name": "OPERATION", "type": "string"},
        {"name": "AUTHUSER", "type": "string"},
        {"name": "AUTHDOMAIN", "type": "string"},
        {"name": "HTTPCODE", "type": "string"},
        {"name": "SOURCEBYTES", "type": "string"},
        {"name": "RESPONSEBYTES", "type": "string"},
        {"name": "ELAPSEDTIME", "type": "string"},
        {"name": "DOMAIN", "type": "string"},
        {"name": "BUCKET", "type": "string"},
        {"name": "OBJECT", "type": "string"}
      ]
    }
    """
    avro_deserializer = AvroDeserializer(schema_str, schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = {
        'bootstrap.servers': bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': group,
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            record = msg.value()
            if record is not None:
                if record['OPERATION'] == "POST" and record['DOMAIN'] != "%28none%29":
                    urllistraw = ("http://" + record['DOMAIN'] + "/" +
                                  record['BUCKET'] + "/" + record['OBJECT'])
                    urllist = urllistraw[:-1]
                    print(urllist)
                    r = requests.head(urllist)
                    print(r.headers)
                else:
                    continue
        except KeyboardInterrupt:
            break

    consumer.close()
def main(args):
    topic = args.topic
    outputtopic = args.outputtopic

    schema_str = EventSchema
    schema_enriched_event_str = EnrichedEventSchema

    sr_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(sr_conf)

    avro_deserializer = AvroDeserializer(schema_str, schema_registry_client)
    string_deserializer = StringDeserializer('utf_8')
    avro_serializer = AvroSerializer(schema_enriched_event_str,
                                     schema_registry_client)

    consumer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.deserializer': string_deserializer,
        'value.deserializer': avro_deserializer,
        'group.id': args.group + str(random.Random()),
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe([topic])

    cluster = Cluster([args.host])
    session = cluster.connect("datascience")
    session.row_factory = dict_factory

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }
    producer = SerializingProducer(producer_conf)

    loop = asyncio.get_event_loop()

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            start = time.time()
            msg = consumer.poll(1.0)
            if msg is None:
                continue

            evt = msg.value()
            print("msg=>", evt)

            def enrich(evt):
                print("evt", evt)
                if evt is not None:
                    print("récupérer dans kafka")  # i.e. "fetched from Kafka"
                    row = session.execute(
                        GET_ENRICHED_DATA_QUERY,
                        (evt["EventHeader"]["acteurDeclencheur"]["idPersonne"],)
                    ).one()
                    if row:
                        evt['EnrichedData'] = row
                        # evt['EventBusinessContext'] = evt["EventBusinessContext"][1]
                        EnrichedEvent = {
                            "eventId": evt["EventHeader"]["eventId"],
                            "dateTimeRef": evt["EventHeader"]["dateTimeRef"],
                            "nomenclatureEv": evt["EventHeader"]["nomenclatureEv"],
                            "canal": evt["EventHeader"]["canal"],
                            "media": evt["EventHeader"]["media"],
                            "schemaVersion": evt["EventHeader"]["schemaVersion"],
                            "headerVersion": evt["EventHeader"]["headerVersion"],
                            "serveur": evt["EventHeader"]["serveur"],
                            "adresseIP": evt["EventHeader"]["acteurDeclencheur"]["adresseIP"],
                            "idTelematique": evt["EventHeader"]["acteurDeclencheur"]["idTelematique"],
                            "idPersonne": evt["EventHeader"]["acteurDeclencheur"]["idPersonne"],
                            "dateNaissance": row["dateNaissance"],
                            "paysResidence": row["paysResidence"],
                            "paysNaissance": row["paysNaissance"],
                            "revenusAnnuel": row["revenusAnnuel"],
                            "csp": row["csp"],
                            "EventBusinessContext": evt["EventBusinessContext"]
                        }

                        producer.produce(topic=outputtopic,
                                         key=str(uuid4()),
                                         value=EnrichedEvent,
                                         on_delivery=delivery_report)
                        producer.flush()

            async_enrich = async_wrap(enrich)
            loop.run_until_complete(async_enrich(evt))
        except KeyboardInterrupt:
            break
        except Exception:
            print('Exception')
            continue

    consumer.close()
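# async_wrap() above is not shown in the snippet; the name matches the common
# recipe that turns a blocking function into a coroutine by running it in an
# executor. A sketch under that assumption:
import asyncio
import functools

def async_wrap(func):
    @functools.wraps(func)
    async def run(*args, loop=None, executor=None, **kwargs):
        if loop is None:
            loop = asyncio.get_event_loop()
        pfunc = functools.partial(func, *args, **kwargs)
        return await loop.run_in_executor(executor, pfunc)
    return run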
class KafkaAvroConsumer:

    def __init__(self, consumer_name, value_schema,
                 topic_name="kafka-avro-producer",
                 groupID='KafkaAvroConsumer', autocommit=True):
        # Consumer name for logging purposes
        self.logging_prefix = '[' + consumer_name + '][KafkaAvroConsumer]'

        # Schema Registry configuration
        self.schema_registry_conf = EventBackboneConfig.getSchemaRegistryConf()
        # Schema Registry Client
        self.schema_registry_client = SchemaRegistryClient(self.schema_registry_conf)

        # Key Deserializer
        self.key_deserializer = StringDeserializer('utf_8')

        # Get Schema for the value
        self.schema_id_value = self.schema_registry_client.get_latest_version(
            topic_name + "-value").schema_id
        # print('The Schema ID for the value is: {}'.format(self.schema_id_value))
        self.value_schema = self.schema_registry_client.get_schema(
            self.schema_id_value).schema_str
        print(self.logging_prefix + ' - Value Subject: {}'.format(topic_name))
        print(self.logging_prefix + ' - Value Schema:')
        print(self.logging_prefix + ' - -------------\n')
        print(self.logging_prefix + ' - ' + self.value_schema + '\n')

        # Value Deserializer
        # Presenting the schema to the Avro Deserializer is needed at the
        # moment. In the future it might change.
        # https://github.com/confluentinc/confluent-kafka-python/issues/834
        self.value_deserializer = AvroDeserializer(self.value_schema,
                                                   self.schema_registry_client)

        # Get the consumer configuration
        self.consumer_conf = EventBackboneConfig.getConsumerConfiguration(
            groupID, autocommit, self.key_deserializer, self.value_deserializer)
        # Create the consumer
        self.consumer = DeserializingConsumer(self.consumer_conf)
        # Print consumer configuration
        EventBackboneConfig.printConsumerConfiguration(
            self.logging_prefix, self.consumer_conf,
            self.schema_registry_conf['url'])
        # Subscribe to the topic
        self.consumer.subscribe([topic_name])

    def traceResponse(self, msg):
        print(self.logging_prefix +
              ' - New event received\n\tTopic: {}\n\tPartition: {}\n\tOffset: {}\n\tkey: {}\n\tvalue: {}\n'
              .format(msg.topic(), msg.partition(), msg.offset(),
                      msg.key(), msg.value()))

    # Polls for the next event
    def pollNextEvent(self):
        # Poll for messages
        msg = self.consumer.poll(timeout=POLL_TIMEOUT)
        # Validate the returned message
        if msg is None:
            print(self.logging_prefix + ' - [INFO] - No new messages on the topic')
            return None
        elif msg.error():
            if "PARTITION_EOF" in str(msg.error()):
                print(self.logging_prefix + ' - [INFO] - End of partition')
            else:
                print(self.logging_prefix + ' - [ERROR] - Consumer error: {}'.format(msg.error()))
            return None
        else:
            # Print the message
            self.traceResponse(msg)
        return msg.value()

    # Polls for the next event but returns the raw event
    def pollNextRawEvent(self):
        records = self.consumer.poll(timeout=POLL_TIMEOUT)
        if records is None:
            return None
        if records.error():
            # Stop reading if we find end of partition in the error message
            if "PARTITION_EOF" in str(records.error()):
                return None
            else:
                print(self.logging_prefix + ' - [ERROR] - Consumer error: {}'.format(records.error()))
                return None
        else:
            self.traceResponse(records)
        return records

    def commitEvent(self, event):
        self.consumer.commit(event)

    def close(self):
        self.consumer.close()
string_deserializer = StringDeserializer('utf_8')

consumer_conf = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': "groupid1234",
    'key.deserializer': string_deserializer,
    'value.deserializer': string_deserializer
}

consumer = DeserializingConsumer(consumer_conf)
consumer.subscribe(['test-topic-profileus1'])

msg = None  # so the except block can reference it even on the first poll
while True:
    try:
        msg = consumer.poll(10)
        print("poll returned {}".format(msg))
    except Exception as e:
        print("Message deserialization failed for {}: {}".format(msg, e))
        break

    if msg is None:
        continue

    if msg.error():
        print("AvroConsumer error: {}".format(msg.error()))
        continue

    print("Topic: {} Message val: {}".format(msg.topic(), msg.value()))
from confluent_kafka import DeserializingConsumer

if __name__ == '__main__':
    consumer_conf = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'kafka-client',
        'auto.offset.reset': "earliest"
    }

    consumer = DeserializingConsumer(consumer_conf)
    consumer.subscribe(['dbserver1.inventory.customers'])
    print('Kafka Client Listening...')

    while True:
        try:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            print(msg.value())
            print()
        except KeyboardInterrupt:
            break

    consumer.close()
class KafkaAvroConsumer:

    def __init__(self, value_schema, topic_name="kafka-avro-producer",
                 groupID='KafkaAvroConsumer', autocommit=True):
        # Schema Registry configuration
        self.schema_registry_conf = self.getSchemaRegistryConf()
        # Schema Registry Client
        self.schema_registry_client = SchemaRegistryClient(self.schema_registry_conf)

        # Key Deserializer
        self.key_deserializer = StringDeserializer('utf_8')
        # Value Deserializer
        # Presenting the schema to the Avro Deserializer is needed at the
        # moment. In the future it might change.
        # https://github.com/confluentinc/confluent-kafka-python/issues/834
        self.value_deserializer = AvroDeserializer(value_schema,
                                                   self.schema_registry_client)

        # Get the consumer configuration
        self.consumer_conf = self.getConsumerConfiguration(groupID, autocommit)
        # Create the consumer
        self.consumer = DeserializingConsumer(self.consumer_conf)
        # Subscribe to the topic
        self.consumer.subscribe([topic_name])

    def getSchemaRegistryConf(self):
        try:
            # For IBM Event Streams on IBM Cloud and on OpenShift, the Schema
            # Registry URL is some sort of
            # https://KAFKA_USER:KAFKA_PASSWORD@SCHEMA_REGISTRY_URL
            # Make sure the SCHEMA_REGISTRY_URL you provide is in the form
            # described above.
            url = os.environ['SCHEMA_REGISTRY_URL']
            # If we are talking to ES on prem, it uses an SSL self-signed
            # certificate. Therefore, we need the CA public certificate for
            # the SSL connection to happen.
            if os.path.isfile(os.getenv('KAFKA_CERT', '/certs/es-cert.pem')):
                ssl = os.getenv('KAFKA_CERT', '/certs/es-cert.pem')
                return {'url': url, 'ssl.ca.location': ssl}
            return {'url': url}
        except KeyError:
            print('[KafkaAvroConsumer] - [ERROR] - There is no SCHEMA_REGISTRY_URL environment variable')
            exit(1)

    def getConsumerConfiguration(self, groupID, autocommit):
        try:
            options = {
                'bootstrap.servers': os.environ['KAFKA_BROKERS'],
                'group.id': groupID,
                'key.deserializer': self.key_deserializer,
                'value.deserializer': self.value_deserializer,
                'auto.offset.reset': "earliest",
                'enable.auto.commit': autocommit,
            }
            if os.getenv('KAFKA_PASSWORD', '') != '':
                # Set security protocol common to ES on prem and on IBM Cloud
                options['security.protocol'] = 'SASL_SSL'
                # Depending on the Kafka User, we will know whether we are
                # talking to ES on prem or on IBM Cloud.
                # If we are connecting to ES on IBM Cloud, the SASL mechanism
                # is plain.
                if os.getenv('KAFKA_USER', '') == 'token':
                    options['sasl.mechanisms'] = 'PLAIN'
                # If we are connecting to ES on OCP, the SASL mechanism is
                # scram-sha-512.
                else:
                    options['sasl.mechanisms'] = 'SCRAM-SHA-512'
                # Set the SASL username and password
                options['sasl.username'] = os.getenv('KAFKA_USER', '')
                options['sasl.password'] = os.getenv('KAFKA_PASSWORD', '')
            # If we are talking to ES on prem, it uses an SSL self-signed
            # certificate. Therefore, we need the CA public certificate for
            # the SSL connection to happen.
            if os.path.isfile(os.getenv('KAFKA_CERT', '/certs/es-cert.pem')):
                options['ssl.ca.location'] = os.getenv('KAFKA_CERT', '/certs/es-cert.pem')
            # Print out the consumer configuration
            self.printConsumerConfiguration(options)
            return options
        except KeyError as error:
            print('[KafkaAvroConsumer] - [ERROR] - A required environment variable does not exist: ' + str(error))
            exit(1)

    def printConsumerConfiguration(self, options):
        # Print out consumer config for debugging purposes
        print("[KafkaAvroConsumer] - This is the configuration for the consumer:")
        print("[KafkaAvroConsumer] - -------------------------------------------")
        print('[KafkaAvroConsumer] - Bootstrap Server: {}'.format(options['bootstrap.servers']))
        print('[KafkaAvroConsumer] - Schema Registry url: {}'.format(self.schema_registry_conf['url'].split('@')[-1]))
        if os.getenv('KAFKA_PASSWORD', '') != '':
            # Obfuscate password
            if len(options['sasl.password']) > 3:
                obfuscated_password = (options['sasl.password'][0] + "*****" +
                                       options['sasl.password'][-1])
            else:
                obfuscated_password = "******"
            print('[KafkaAvroConsumer] - Security Protocol: {}'.format(options['security.protocol']))
            print('[KafkaAvroConsumer] - SASL Mechanism: {}'.format(options['sasl.mechanisms']))
            print('[KafkaAvroConsumer] - SASL Username: {}'.format(options['sasl.username']))
            print('[KafkaAvroConsumer] - SASL Password: {}'.format(obfuscated_password))
        if os.path.isfile(os.getenv('KAFKA_CERT', '/certs/es-cert.pem')):
            print('[KafkaAvroConsumer] - SSL CA Location: {}'.format(options['ssl.ca.location']))
        print('[KafkaAvroConsumer] - Offset Reset: {}'.format(options['auto.offset.reset']))
        print('[KafkaAvroConsumer] - Autocommit: {}'.format(options['enable.auto.commit']))
        print("[KafkaAvroConsumer] - -------------------------------------------")

    def traceResponse(self, msg):
        print('[KafkaConsumer] - Topic {} partition [{}] at offset {}:\n\tkey: {}\n\tvalue: {}'
              .format(msg.topic(), msg.partition(), msg.offset(),
                      msg.key(), msg.value()))

    # Polls for the next event
    def pollNextEvent(self):
        # Poll for messages
        msg = self.consumer.poll(timeout=10.0)
        # Validate the returned message
        if msg is None:
            print("[KafkaAvroConsumer] - [INFO] - No new messages on the topic")
            return None
        elif msg.error():
            if "PARTITION_EOF" in str(msg.error()):
                print("[KafkaAvroConsumer] - [INFO] - End of partition")
            else:
                print("[KafkaAvroConsumer] - [ERROR] - Consumer error: {}".format(msg.error()))
            return None
        else:
            # Print the message
            self.traceResponse(msg)
        return msg.value()

    # Polls for events until it finds one where msg.value()[keyname] == keyID
    def pollNextEventKeyIdKeyName(self, keyID, keyname):
        gotIt = False
        while not gotIt:
            msg = self.consumer.poll(timeout=10.0)
            # Continue if we have not received a message yet
            if msg is None:
                continue
            if msg.error():
                print("[KafkaAvroConsumer] - [ERROR] - Consumer error: {}".format(msg.error()))
                # Stop reading if we find end of partition in the error message
                if "PARTITION_EOF" in str(msg.error()):
                    gotIt = True
                continue
            self.traceResponse(msg)
            # If we've found our event based on keyname and keyID, stop reading messages
            if msg.value()[keyname] == keyID:
                gotIt = True
        return msg.value()

    # Polls for events until it finds an event with the same key
    def pollNextEventByKey(self, keyID):
        if str(keyID) == "":
            print("[KafkaAvroConsumer] - [ERROR] - Consumer error: Key is an empty string")
            return None
        gotIt = False
        while not gotIt:
            msg = self.consumer.poll(timeout=10.0)
            # Continue if we have not received a message yet
            if msg is None:
                continue
            if msg.error():
                print("[KafkaAvroConsumer] - [ERROR] - Consumer error: {}".format(msg.error()))
                # Stop reading if we find end of partition in the error message
                if "PARTITION_EOF" in str(msg.error()):
                    gotIt = True
                continue
            self.traceResponse(msg)
            # If we've found our event based on the key, stop reading messages
            if msg.key() == keyID:
                gotIt = True
        return msg.value()

    # Polls for the next event but returns the raw event
    def pollNextRawEvent(self):
        msg = self.consumer.poll(timeout=5.0)
        if msg is None:
            return None
        if msg.error():
            # Stop reading if we find end of partition in the error message
            if "PARTITION_EOF" in str(msg.error()):
                return None
            else:
                print("[KafkaAvroConsumer] - [ERROR] - Consumer error: {}".format(msg.error()))
                return None
        return msg

    # Polls for events endlessly
    def pollEvents(self):
        gotIt = False
        while not gotIt:
            msg = self.consumer.poll(timeout=10.0)
            if msg is None:
                continue
            if msg.error():
                print("[KafkaAvroConsumer] - [ERROR] - Consumer error: {}".format(msg.error()))
                if "PARTITION_EOF" in str(msg.error()):
                    gotIt = True
                continue
            self.traceResponse(msg)

    def close(self):
        self.consumer.close()