def consume(self, topic):
    # consumer = KafkaConsumer(topic, group_id=self.configs["group_id"], bootstrap_servers=self.configs["zookeeper"].split(","), auto_commit_enable=False)
    client = KafkaClient(self.configs["broker_list"].split(","))
    consumer = SimpleConsumer(topic=topic, group=self.configs["group_id"], client=client, auto_commit=False)
    while True:
        # get_messages() returns OffsetAndMessage tuples: message[0] is the
        # offset, message[1] is the Message with key and value attributes.
        for message in consumer.get_messages(10):
            # print("[%s.consumer] %s-part-%d: value=%s" % (self.configs["group_id"], topic, message[0], message[1]))
            print("[%s.consumer] %s-offset-%d: key=%s value=%s"
                  % (self.configs["group_id"], topic, message[0], message[1].key, message[1].value))
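# For reference, the commented-out line above sketches the newer high-level
# consumer. A minimal, hypothetical equivalent using kafka-python's
# KafkaConsumer (>= 1.0 argument names) is shown below; the topic, group and
# broker address are placeholders.
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "my-topic",                          # placeholder topic
    group_id="my-group",                 # placeholder consumer group
    bootstrap_servers="localhost:9092",  # placeholder broker address
    enable_auto_commit=False,
)
for record in consumer:
    # Each ConsumerRecord carries its topic, partition, offset, key and value.
    print("[my-group] %s-part-%d offset=%d key=%s value=%s"
          % (record.topic, record.partition, record.offset, record.key, record.value))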
def assert_kafka(self, expected_file_name):
    # print("reading server " + config.KAFKA_SERVER + " on topic: " + config.KAFKA_TOPIC)
    kafka_client = KafkaClient(config.KAFKA_SERVER)
    # SimpleConsumer takes its timeout in seconds... hence 1, allowing all
    # messages to appear but not hanging too long.
    consumer = SimpleConsumer(kafka_client, b"my_group", config.KAFKA_TOPIC.encode("utf8"),
                              iter_timeout=1)
    # seek(1, 0) means to start processing from the beginning (the 0) but skip
    # 1 message from this index (the first msg). We bypass the first message
    # since it is just used to autostart the topic.
    consumer.seek(1, 0)
    actual = ""
    for msg in consumer:
        # The linefeed at the end is not really needed, but it makes for more
        # readable error reports.
        actual += msg.message.value.decode("utf8") + "\n"
    expected = pkg_resources.resource_string(__name__, expected_file_name).decode("utf8")
    t_assert.equal(actual, expected)
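# For reference: SimpleConsumer.seek(offset, whence) interprets `whence` much
# like file seeks. A minimal sketch, assuming `consumer` is an existing
# SimpleConsumer instance:
consumer.seek(0, 0)   # absolute: start from the earliest available message
consumer.seek(1, 0)   # skip the first message, as assert_kafka() does above
consumer.seek(5, 1)   # relative: jump 5 messages ahead of the current position
consumer.seek(0, 2)   # jump to the end of the log, i.e. only new messages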
def set_consumer_partition(self, consumerPartitions):
    if not consumerPartitions:
        logger.warning('consumer partitions can not be empty')
        return
    if self.consumer:
        self.consumer.commit()
        self.consumer.stop()
        self.consumer = None
    self.consumerPartitions = consumerPartitions
    try:
        self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup,
                                       self.kafkaTopic, partitions=self.consumerPartitions)
    except KafkaError as e:
        logger.warning('Exception {}'.format(e))
        logger.debug(traceback.format_exc())
        self.reconnect()
    except Exception as e:
        logger.warning('Exception {}'.format(e))
        logger.debug(traceback.format_exc())
def test_writer():
    topic = '%s-mutations' % (uuid.uuid1().hex,)

    client = KafkaClient('kafka')
    producer = SimpleProducer(client)
    writer = KafkaWriter(producer, topic)

    inputs = list(transaction)
    writer.push(inputs)

    consumer = SimpleConsumer(client, 'test', topic, auto_offset_reset='smallest')

    outputs = map(
        writer.codec.decode,
        map(
            operator.attrgetter('message.value'),
            list(consumer.get_messages(count=3)),
        ),
    )

    assert outputs == inputs
def __init__(self, kafkaHost=None, kafkaGroup=None, kafkaTopic=None,
             consumerType=NON_CONSUMER, consumerPartitions=[],
             producerType=NON_PRODUCER, producerPartitions=[]):
    self.kafkaHost = kafkaHost
    self.kafkaGroup = kafkaGroup
    self.kafkaTopic = kafkaTopic
    self.consumerPartitions = consumerPartitions
    self.producerPartitions = producerPartitions
    self.connect(kafkaHost)
    try:
        if producerType == self.SIMPLE_PRODUCER:
            self.producer = SimpleProducer(self.kafkaClient, async=False,
                                           req_acks=KeyedProducer.ACK_NOT_REQUIRED)
        elif producerType == self.FIXED_PRODUCER:
            self.producer = FixedProducer(self.kafkaClient, producerPartitions[0],
                                          async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
        elif producerType == self.USER_PRODUCER:
            self.producer = UserProducer(self.kafkaClient, async=False,
                                         req_acks=KeyedProducer.ACK_NOT_REQUIRED)
        elif producerType == self.NON_PRODUCER:
            self.producer = None
        else:
            raise Exception("wrong producer type {}".format(producerType))

        if consumerType == self.SIMPLE_CONSUMER:
            if not consumerPartitions:
                self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup, self.kafkaTopic)
            else:
                self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup,
                                               self.kafkaTopic, partitions=self.consumerPartitions)
            logger.debug('consumer is listening on {}@{}'.format(self.kafkaTopic, self.consumerPartitions))
        elif consumerType == self.NON_CONSUMER:
            self.consumer = None
        else:
            raise Exception("wrong consumer type {}".format(consumerType))
    except Exception as e:
        logger.warning('Exception {}'.format(e))
        logger.debug(traceback.format_exc())
        self.consumer = None
        self.producer = None
        self.kafkaClient = None
def run(self):
    client = KafkaClient("localhost:9092")
    consumer = SimpleConsumer(client, "test-group", "topic.test.min.v1",
                              max_buffer_size=None)
    self.valid = 0
    self.invalid = 0
    m_len = len("Hello master wayne" * 10)
    consumer.seek(0, 0)
    for message in consumer:
        try:
            if len(message.message.value) == m_len:
                self.valid += 1
            else:
                self.invalid += 1
        except:
            print "Reset Offset"
            consumer.seek(0, 0)
def test_handler():
    topic = '%s-mutations' % (uuid.uuid1().hex,)
    codec = BinaryCodec(Message)

    client = KafkaClient('kafka')
    producer = SimpleProducer(client)
    writer = KafkaWriter(producer, topic, codec)

    inputs = list(transaction)
    writer.push(inputs)

    consumer = SimpleConsumer(client, 'test', topic, auto_offset_reset='smallest')

    outputs = map(
        codec.decode,
        map(
            operator.attrgetter('message.value'),
            list(consumer.get_messages(count=3)),
        ),
    )

    assert outputs == inputs
class KafkaBroker(object):
    USER_PRODUCER = 0
    FIXED_PRODUCER = 1
    SIMPLE_PRODUCER = 2
    NON_PRODUCER = 3
    SIMPLE_CONSUMER = 0
    NON_CONSUMER = 1
    SOCKET_TIMEOUT = 60  # seconds

    def __init__(self, kafkaHost=None, kafkaGroup=None, kafkaTopic=None,
                 consumerType=NON_CONSUMER, consumerPartitions=[],
                 producerType=NON_PRODUCER, producerPartitions=[]):
        self.kafkaHost = kafkaHost
        self.kafkaGroup = kafkaGroup
        self.kafkaTopic = kafkaTopic
        self.consumerPartitions = consumerPartitions
        self.producerPartitions = producerPartitions
        self.connect(kafkaHost)
        try:
            if producerType == self.SIMPLE_PRODUCER:
                self.producer = SimpleProducer(self.kafkaClient, async=False,
                                               req_acks=KeyedProducer.ACK_NOT_REQUIRED)
            elif producerType == self.FIXED_PRODUCER:
                self.producer = FixedProducer(self.kafkaClient, producerPartitions[0],
                                              async=False, req_acks=KeyedProducer.ACK_NOT_REQUIRED)
            elif producerType == self.USER_PRODUCER:
                self.producer = UserProducer(self.kafkaClient, async=False,
                                             req_acks=KeyedProducer.ACK_NOT_REQUIRED)
            elif producerType == self.NON_PRODUCER:
                self.producer = None
            else:
                raise Exception("wrong producer type {}".format(producerType))

            if consumerType == self.SIMPLE_CONSUMER:
                if not consumerPartitions:
                    self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup, self.kafkaTopic)
                else:
                    self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup,
                                                   self.kafkaTopic, partitions=self.consumerPartitions)
                logger.debug('consumer is listening on {}@{}'.format(self.kafkaTopic, self.consumerPartitions))
            elif consumerType == self.NON_CONSUMER:
                self.consumer = None
            else:
                raise Exception("wrong consumer type {}".format(consumerType))
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.consumer = None
            self.producer = None
            self.kafkaClient = None

    def close(self):
        if self.consumer:
            self.consumer.commit()
            self.consumer.stop()
            self.consumer = None
        if self.producer:
            self.producer.stop()
            self.producer = None
        if self.kafkaClient:
            self.kafkaClient.close()
            self.kafkaClient = None
        logger.info('Kafka connection closed')

    def connect(self, kafkaHost, countdown=COUNT_DOWN):
        if countdown == 0:
            logger.error('kafka server can not be connected in {} times'.format(COUNT_DOWN))
            return
        try:
            self.kafkaClient = KafkaClient(kafkaHost, timeout=self.SOCKET_TIMEOUT)
        except:
            logger.warning('try to connect kafka server again {}'.format(countdown))
            self.connect(kafkaHost, countdown - 1)
        logger.info('Kafka client connected {}'.format(self.kafkaClient))

    def reconnect(self, countdown=COUNT_DOWN):
        if countdown == 0:
            logger.error('kafka server can not be connected in {} times'.format(COUNT_DOWN))
            return
        try:
            self.kafkaClient.reinit()
        except:
            self.reconnect(countdown - 1)

    def produce(self, op, name, **kwargs):
        # TODO: when name is None, the operation is propagated to all partitions
        if not op or not name:
            logger.warning('op or name must not be empty')
            return
        try:
            dictMessage = dict(kwargs)
            dictMessage['op'] = op
            dictMessage['name'] = name
            encodedMessage = simplejson.dumps(dictMessage)
            self.producer.send(self.kafkaTopic, name, encodedMessage)
        except KafkaError as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())

    def echo(self, message=''):
        self.produce('Echo', 'testing', message=message)

    def set_consumer_partition(self, consumerPartitions):
        if not consumerPartitions:
            logger.warning('consumer partitions can not be empty')
            return
        if self.consumer:
            self.consumer.commit()
            self.consumer.stop()
            self.consumer = None
        self.consumerPartitions = consumerPartitions
        try:
            self.consumer = SimpleConsumer(self.kafkaClient, self.kafkaGroup,
                                           self.kafkaTopic, partitions=self.consumerPartitions)
        except KafkaError as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())

    def is_consumer_ready(self):
        if not self.consumer:
            logger.warning('Consumer is not ready yet')
            return False
        return True

    def seek(self, skip):
        if self.is_consumer_ready():
            if skip == -1:
                self.consumer.seek(0, 2)
            else:
                self.consumer.seek(skip, 1)

    def commit(self):
        if self.is_consumer_ready():
            self.consumer.commit()

    def consume_one(self):
        if not self.is_consumer_ready():
            return None
        try:
            message = self.consumer.get_message()
            if not message:
                return None
            logger.debug('received message {}'.format(message.message.value))
            return message.message.value
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
            return None

    def consume(self, count=10):
        if not self.is_consumer_ready():
            return []
        try:
            messages = self.consumer.get_messages(count=count)
            return [message.message.value for message in messages]
        except Exception as e:
            logger.warning('Exception {}'.format(e))
            logger.debug(traceback.format_exc())
            self.reconnect()
            return []
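# Usage illustration for the KafkaBroker class above; a minimal sketch with
# placeholder host, group and topic names.
broker = KafkaBroker(kafkaHost='localhost:9092', kafkaGroup='my-group',
                     kafkaTopic='my-topic',
                     consumerType=KafkaBroker.SIMPLE_CONSUMER,
                     producerType=KafkaBroker.SIMPLE_PRODUCER)
broker.echo('hello')            # produce a JSON message {'op': 'Echo', 'name': 'testing', ...}
print(broker.consume(count=5))  # fetch up to 5 raw message values
broker.commit()                 # commit the group's consumer offsets
broker.close()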
def consume(self, state):
    """
    Starts consuming from the configured Kafka topic given a possibly
    existing ``pgshovel.interfaces.replication_pb2:State``.

    If the provided ``state`` does not contain a
    ``stream_state.consumer_state`` value, the ``KafkaStream`` attempts to
    start reading from the Kafka topic after first "priming" the stream.
    Priming involves consuming messages from the topic looking for a
    ``BeginOperation``. Any message that is not a ``BeginOperation`` is
    dropped until a ``BeginOperation`` is seen or the ``prime_threshold``
    is reached, the latter of which raises a
    ``pgshovel.streams.utilities:UnableToPrimeError`` error.

    In general, it makes sense to set the ``prime_threshold`` to a value
    high enough to exceed the maximum transaction size you expect to see
    in your data. Generally speaking, a ``prime_threshold`` can
    effectively be infinite (you could construct the stream with
    ``float('inf')``); however, the lack of a ``BeginOperation`` in the
    stream would then cause the stream to hang, possibly forever, so the
    ``prime_threshold`` config parameter is provided to raise an exception
    if this unexpected behavior occurs.
    """
    consumer = SimpleConsumer(KafkaClient(self.hosts), None, self.topic)

    # You can only update one offset at a time with kafka-python, plus
    # dealing with reconstituting global order from a partitioned stream is
    # hard, so we don't really need to deal with it right now.
    assert len(consumer.offsets) == 1

    decoded = imap(
        lambda (offset, msg): (offset, self.codec.decode(msg.value)),
        consumer
    )

    if state.stream_state.HasField('consumer_state'):
        # Seeking to a direct offset was not in the PyPI release of
        # kafka-python when this was implemented:
        # https://github.com/mumrah/kafka-python/pull/412
        current = consumer.offsets[0]
        offset = state.stream_state.consumer_state.offset + 1
        delta = offset - current
        logger.debug('Moving to previous replication log offset: %s (current position: %s)...', offset, current)
        consumer.seek(delta, 1)
        assert consumer.offsets[0] == offset
    else:
        logger.info('No consumer state provided, will attempt to prime to a BeginOperation')
        # The call to ``prime_for_batch_start`` "primes" the stream by
        # dropping messages until it sees a message that is an instance of
        # one of the types in
        # ``pgshovel.replication.validation.TRANSACTION_START_EVENT_TYPES``.
        decoded = prime_for_batch_start(
            max_messages=self.prime_threshold,
            stream=decoded,
        )

    for offset, message in decoded:
        state = validate_state(state, offset, message)
        # XXX: This is necessary because of a bug in protocol buffer oneof.
        state = type(state).FromString(state.SerializeToString())
        yield state, offset, message
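# Simplified, hypothetical sketch of the "priming" behavior described in the
# docstring above (not the real pgshovel implementation): drop messages until
# a BeginOperation is seen, or give up after max_messages.
def prime_for_batch_start_sketch(max_messages, stream):
    dropped = 0
    for offset, message in stream:
        if isinstance(message, BeginOperation):  # transaction start found
            yield offset, message
            break
        dropped += 1
        if dropped >= max_messages:
            raise UnableToPrimeError('no BeginOperation in %d messages' % dropped)
    # Hand the rest of the stream through untouched.
    for offset, message in stream:
        yield offset, message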
def _mp_consume(client, group, topic, message_queue, size, events, **consumer_options):
    """
    A child process worker which consumes messages based on the
    notifications given by the controller process

    NOTE: Ideally, this should have been a method inside the Consumer
    class. However, multiprocessing module has issues in windows. The
    functionality breaks unless this function is kept outside of a class
    """

    # Initial interval for retries in seconds.
    interval = 1
    while not events.exit.is_set():
        try:
            # Make the child processes open separate socket connections
            client.reinit()

            # We will start consumers without auto-commit. Auto-commit will be
            # done by the master controller process.
            consumer = SimpleConsumer(client, group, topic,
                                      auto_commit=False,
                                      auto_commit_every_n=None,
                                      auto_commit_every_t=None,
                                      **consumer_options)

            # Ensure that the consumer provides the partition information
            consumer.provide_partition_info()

            while True:
                # Wait till the controller indicates us to start consumption
                events.start.wait()

                # If we are asked to quit, do so
                if events.exit.is_set():
                    break

                # Consume messages and add them to the queue. If the controller
                # indicates a specific number of messages, follow that advice
                count = 0

                message = consumer.get_message()
                if message:
                    while True:
                        try:
                            message_queue.put(message, timeout=FULL_QUEUE_WAIT_TIME_SECONDS)
                            break
                        except queue.Full:
                            if events.exit.is_set():
                                break

                    count += 1

                    # We have reached the required size. The controller might have
                    # more than what he needs. Wait for a while.
                    # Without this logic, it is possible that we run into a big
                    # loop consuming all available messages before the controller
                    # can reset the 'start' event
                    if count == size.value:
                        events.pause.wait()

                else:
                    # In case we did not receive any message, give up the CPU for
                    # a while before we try again
                    time.sleep(NO_MESSAGES_WAIT_TIME_SECONDS)

            consumer.stop()

        except KafkaError as e:
            # Retry with exponential backoff
            log.exception("Problem communicating with Kafka, retrying in %d seconds...", interval)
            time.sleep(interval)
            interval = interval * 2 if interval * 2 < MAX_BACKOFF_SECONDS else MAX_BACKOFF_SECONDS
def _mp_consume(client, group, topic, message_queue, size, events, **consumer_options):
    """
    A child process worker which consumes messages based on the
    notifications given by the controller process

    NOTE: Ideally, this should have been a method inside the Consumer
    class. However, multiprocessing module has issues in windows. The
    functionality breaks unless this function is kept outside of a class
    """

    # Initial interval for retries in seconds.
    interval = 1
    while not events.exit.is_set():
        try:
            # Make the child processes open separate socket connections
            client.reinit()

            # We will start consumers without auto-commit. Auto-commit will be
            # done by the master controller process.
            consumer = SimpleConsumer(client, group, topic,
                                      auto_commit=False,
                                      auto_commit_every_n=None,
                                      auto_commit_every_t=None,
                                      **consumer_options)

            # Ensure that the consumer provides the partition information
            consumer.provide_partition_info()

            while True:
                # Wait till the controller indicates us to start consumption
                events.start.wait()

                # If we are asked to quit, do so
                if events.exit.is_set():
                    break

                # Consume messages and add them to the queue. If the controller
                # indicates a specific number of messages, follow that advice
                count = 0

                message = consumer.get_message()
                if message:
                    while True:
                        try:
                            message_queue.put(message, timeout=FULL_QUEUE_WAIT_TIME_SECONDS)
                            break
                        except queue.Full:
                            if events.exit.is_set():
                                break

                    count += 1

                    # We have reached the required size. The controller might have
                    # more than what he needs. Wait for a while.
                    # Without this logic, it is possible that we run into a big
                    # loop consuming all available messages before the controller
                    # can reset the 'start' event
                    if count == size.value:
                        events.pause.wait()

                else:
                    # In case we did not receive any message, give up the CPU for
                    # a while before we try again
                    time.sleep(NO_MESSAGES_WAIT_TIME_SECONDS)

            consumer.stop()

        except KafkaError as e:
            # Retry with exponential backoff
            log.error("Problem communicating with Kafka (%s), retrying in %d seconds..." % (e, interval))
            time.sleep(interval)
            interval = interval * 2 if interval * 2 < MAX_BACKOFF_SECONDS else MAX_BACKOFF_SECONDS
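# Hedged sketch of the controller side that the _mp_consume worker above
# expects: an Events container of multiprocessing.Event objects, a shared
# `size` value and a queue. The names are assumptions based on the worker code.
import multiprocessing
from collections import namedtuple

Events = namedtuple('Events', ['start', 'pause', 'exit'])

events = Events(start=multiprocessing.Event(),
                pause=multiprocessing.Event(),
                exit=multiprocessing.Event())
size = multiprocessing.Value('i', 0)      # batch size each worker should honour
message_queue = multiprocessing.Queue()   # workers put consumed messages here

# Ask the workers for a batch of 10, drain the queue, then pause them.
size.value = 10
events.pause.clear()
events.start.set()        # workers wake up and begin consuming
# ... drain message_queue here ...
events.start.clear()
events.pause.set()        # workers that reached the batch size stop spinning
# On shutdown:
events.exit.set()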
def consumeLatest(self, topic):
    consumer = SimpleConsumer(topic=topic, group=self.group_id, client=self.client,
                              auto_commit=True)
    consumer.get_message(timeout=1)
    consumer.stop()