def __init__(self, client, group, topic, partitions=None, auto_commit=True,
             auto_commit_every_n=AUTO_COMMIT_MSG_COUNT,
             auto_commit_every_t=AUTO_COMMIT_INTERVAL):

    self.client = client
    self.topic = kafka_bytestring(topic)
    self.group = None if group is None else kafka_bytestring(group)
    self.client.load_metadata_for_topics(topic)
    self.offsets = {}

    if partitions is None:
        partitions = self.client.get_partition_ids_for_topic(topic)
    else:
        assert all(isinstance(x, numbers.Integral) for x in partitions)

    # Variables for handling offset commits
    self.commit_lock = Lock()
    self.commit_timer = None
    self.count_since_commit = 0
    self.auto_commit = auto_commit
    self.auto_commit_every_n = auto_commit_every_n
    self.auto_commit_every_t = auto_commit_every_t

    # Set up the auto-commit timer
    if auto_commit is True and auto_commit_every_t is not None:
        self.commit_timer = ReentrantTimer(auto_commit_every_t, self.commit)
        self.commit_timer.start()

    # Set initial offsets
    if self.group is not None:
        self.fetch_last_known_offsets(partitions)
    else:
        for partition in partitions:
            self.offsets[partition] = 0

    # Register a cleanup handler
    def cleanup(obj):
        obj.stop()
    self._cleanup_func = cleanup
    atexit.register(cleanup, self)

    self.partition_info = False  # Do not return partition info in msgs
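# Usage sketch (assumption, not part of this module): a minimal illustration of
# how the constructor above is typically driven. The class names `KafkaClient`
# and `SimpleConsumer` and all argument values are assumed here for
# illustration; only the keyword arguments mirror the signature above.
#
#   client = KafkaClient('localhost:9092')               # hypothetical bootstrap host
#   consumer = SimpleConsumer(client, 'my-group', 'my-topic',
#                             auto_commit=True,
#                             auto_commit_every_n=100,    # commit after 100 messages
#                             auto_commit_every_t=5000)   # or on a timer via
#                                                         # ReentrantTimer(auto_commit_every_t, self.commit)
#
#   # With group=None, no offsets are fetched from the cluster: every partition
#   # simply starts at offset 0 and no commit timer is created.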
def _consume_topic_partition(self, topic, partition):
    topic = kafka_bytestring(topic)
    if not isinstance(partition, int):
        raise KafkaConfigurationError('Unknown partition type (%s) '
                                      '-- expected int' % type(partition))

    if topic not in self._client.topic_partitions:
        raise UnknownTopicOrPartitionError("Topic %s not found in broker metadata" % topic)
    if partition not in self._client.get_partition_ids_for_topic(topic):
        raise UnknownTopicOrPartitionError("Partition %d not found in Topic %s "
                                           "in broker metadata" % (partition, topic))
    logger.info("Configuring consumer to fetch topic '%s', partition %d", topic, partition)
    self._topics.append((topic, partition))
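# Behaviour sketch (assumption): example calls showing what the validation
# above accepts and rejects. Topic names are illustrative only.
#
#   self._consume_topic_partition('topic1', 0)     # appends (b'topic1', 0) to self._topics
#   self._consume_topic_partition('topic1', '0')   # raises KafkaConfigurationError (partition must be int)
#   self._consume_topic_partition('missing', 0)    # raises UnknownTopicOrPartitionError (topic not in metadata)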
def __init__(self, hosts, client_id=CLIENT_ID,
             timeout=DEFAULT_SOCKET_TIMEOUT_SECONDS,
             correlation_id=0):
    # We need one connection to bootstrap
    self.client_id = kafka_bytestring(client_id)
    self.timeout = timeout
    self.hosts = collect_hosts(hosts)
    self.correlation_id = correlation_id

    # create connections only when we need them
    self.conns = {}
    self.brokers = {}            # broker_id -> BrokerMetadata
    self.topics_to_brokers = {}  # TopicAndPartition -> BrokerMetadata
    self.topic_partitions = {}   # topic -> partition -> PartitionMetadata

    self.load_metadata_for_topics()  # bootstrap with all metadata
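# Usage sketch (assumption): the constructor above bootstraps by loading cluster
# metadata immediately. Assuming the surrounding class is the client (called
# `KafkaClient` here for illustration; host names and values are illustrative):
#
#   client = KafkaClient(hosts='kafka1:9092,kafka2:9092',
#                        client_id='my-app',
#                        timeout=30)   # socket timeout, in seconds
#
#   # After __init__ returns, client.brokers, client.topic_partitions and
#   # client.topics_to_brokers are populated from the metadata response, while
#   # the per-broker connections in client.conns are still created lazily.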
def send_messages(self, topic, partition, *msg):
    """
    Helper method to send produce requests

    @param: topic, name of topic for produce request -- type str
    @param: partition, partition number for produce request -- type int
    @param: *msg, one or more message payloads -- type bytes
    @returns: ResponseRequest returned by server -- raises on error

    Note that msg type *must* be encoded to bytes by the user. Passing a
    unicode message will not work; encode it before calling send_messages,
    for example via `unicode_message.encode('utf-8')`.

    All messages produced via this method will set the message 'key' to None.
    """
    topic = kafka_bytestring(topic)
    return self._send_messages(topic, partition, *msg)
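# Usage sketch (assumption): send_messages requires bytes payloads, as the
# docstring notes. Assuming `producer` is an instance of the surrounding
# producer class (its name is not shown in this file), and topic/partition
# values are illustrative:
#
#   producer.send_messages('my-topic', 0,
#                          b'already-bytes',
#                          u'unicode text'.encode('utf-8'))
#
#   # Passing an unencoded unicode/str payload would fail downstream, since
#   # _send_messages expects bytes for each message value.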
def _get_commit_offsets(self):
    logger.info("Consumer fetching stored offsets")
    for topic_partition in self._topics:
        (resp,) = self._client.send_offset_fetch_request(
            kafka_bytestring(self._config['group_id']),
            [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
            fail_on_error=False)
        try:
            check_error(resp)
        # API spec says server won't set an error here
        # but 0.8.1.1 does actually...
        except UnknownTopicOrPartitionError:
            pass

        # -1 offset signals no commit is currently stored
        if resp.offset == -1:
            self._offsets.commit[topic_partition] = None

        # Otherwise we committed the stored offset
        # and need to fetch the next one
        else:
            self._offsets.commit[topic_partition] = resp.offset
def commit(self):
    """Store consumed message offsets (marked via task_done())
    to kafka cluster for this consumer_group.

    Returns:
        True on success, or False if no offsets were found for commit

    Note:
        this functionality requires server version >=0.8.1.1
        https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
    """
    if not self._config['group_id']:
        logger.warning('Cannot commit without a group_id!')
        raise KafkaConfigurationError(
            'Attempted to commit offsets '
            'without a configured consumer group (group_id)'
        )

    # API supports storing metadata with each commit
    # but for now it is unused
    metadata = b''

    offsets = self._offsets.task_done
    commits = []
    for topic_partition, task_done_offset in six.iteritems(offsets):

        # Skip if None
        if task_done_offset is None:
            continue

        # Commit offsets as the next offset to fetch
        # which is consistent with the Java Client
        # task_done is marked by messages consumed,
        # so add one to mark the next message for fetching
        commit_offset = (task_done_offset + 1)

        # Skip if no change from previous committed
        if commit_offset == self._offsets.commit[topic_partition]:
            continue

        commits.append(
            OffsetCommitRequest(topic_partition[0], topic_partition[1],
                                commit_offset, metadata)
        )

    if commits:
        logger.info('committing consumer offsets to group %s', self._config['group_id'])
        resps = self._client.send_offset_commit_request(
            kafka_bytestring(self._config['group_id']), commits,
            fail_on_error=False
        )

        for r in resps:
            check_error(r)
            topic_partition = (r.topic, r.partition)
            task_done = self._offsets.task_done[topic_partition]
            self._offsets.commit[topic_partition] = (task_done + 1)

        if self._config['auto_commit_enable']:
            self._reset_auto_commit()

        return True

    else:
        logger.info('No new offsets found to commit in group %s', self._config['group_id'])
        return False
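# Usage sketch (assumption): commit() only stores offsets that have been marked
# via task_done(), and it commits `task_done + 1` (the next offset to fetch).
# A rough consumption loop, assuming `consumer` is an instance of this class
# configured with a group_id, and `process` is a hypothetical user function:
#
#   for msg in consumer.fetch_messages():
#       process(msg)             # hypothetical application logic
#       consumer.task_done(msg)  # mark this message's offset as processed
#   consumer.commit()            # returns False if there is nothing new to commit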
def fetch_messages(self):
    """Sends FetchRequests for all topic/partitions set for consumption

    Returns:
        Generator that yields KafkaMessage structs
        after deserializing with the configured `deserializer_class`

    Note:
        Refreshes metadata on errors, and resets fetch offset on
        OffsetOutOfRange, per the configured `auto_offset_reset` policy

    See Also:
        Key KafkaConsumer configuration parameters:
        * `fetch_message_max_bytes`
        * `fetch_wait_max_ms`
        * `fetch_min_bytes`
        * `deserializer_class`
        * `auto_offset_reset`
    """
    max_bytes = self._config['fetch_message_max_bytes']
    max_wait_time = self._config['fetch_wait_max_ms']
    min_bytes = self._config['fetch_min_bytes']

    if not self._topics:
        raise KafkaConfigurationError('No topics or partitions configured')

    if not self._offsets.fetch:
        raise KafkaConfigurationError(
            'No fetch offsets found when calling fetch_messages'
        )

    fetches = [FetchRequest(topic, partition,
                            self._offsets.fetch[(topic, partition)],
                            max_bytes)
               for (topic, partition) in self._topics]

    # send_fetch_request will batch topic/partition requests by leader
    responses = self._client.send_fetch_request(
        fetches,
        max_wait_time=max_wait_time,
        min_bytes=min_bytes,
        fail_on_error=False
    )

    for resp in responses:

        if isinstance(resp, FailedPayloadsError):
            logger.warning('FailedPayloadsError attempting to fetch data')
            self._refresh_metadata_on_error()
            continue

        topic = kafka_bytestring(resp.topic)
        partition = resp.partition
        try:
            check_error(resp)
        except OffsetOutOfRangeError:
            logger.warning('OffsetOutOfRange: topic %s, partition %d, '
                           'offset %d (Highwatermark: %d)',
                           topic, partition,
                           self._offsets.fetch[(topic, partition)],
                           resp.highwaterMark)
            # Reset offset
            self._offsets.fetch[(topic, partition)] = (
                self._reset_partition_offset((topic, partition))
            )
            continue

        except NotLeaderForPartitionError:
            logger.warning("NotLeaderForPartitionError for %s - %d. "
                           "Metadata may be out of date",
                           topic, partition)
            self._refresh_metadata_on_error()
            continue

        except RequestTimedOutError:
            logger.warning("RequestTimedOutError for %s - %d",
                           topic, partition)
            continue

        # Track server highwater mark
        self._offsets.highwater[(topic, partition)] = resp.highwaterMark

        # Yield each message
        # Kafka-python could raise an exception during iteration
        # we are not catching -- user will need to address
        for (offset, message) in resp.messages:
            # deserializer_class could raise an exception here
            val = self._config['deserializer_class'](message.value)
            msg = KafkaMessage(topic, partition, offset, message.key, val)

            # in some cases the server will return earlier messages
            # than we requested. skip them per kafka spec
            if offset < self._offsets.fetch[(topic, partition)]:
                logger.debug('message offset less than fetched offset '
                             'skipping: %s', msg)
                continue
            # Only increment fetch offset
            # if we safely got the message and deserialized
            self._offsets.fetch[(topic, partition)] = offset + 1

            # Then yield to user
            yield msg
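# Usage sketch (assumption): fetch_messages() is a generator, so nothing is sent
# to the brokers until it is iterated, and any exception raised by
# `deserializer_class` propagates to the caller, per the comments above.
# Setting the deserializer directly on _config below is illustrative only; in
# practice it would normally be supplied through the consumer's configuration.
#
#   consumer._config['deserializer_class'] = lambda raw: raw.decode('utf-8')  # illustrative
#   try:
#       for msg in consumer.fetch_messages():
#           print(msg.topic, msg.partition, msg.offset, msg.value)
#   except UnicodeDecodeError:
#       # a payload that is not valid utf-8 surfaces here, not inside the library
#       pass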
def set_topic_partitions(self, *topics):
    """
    Set the topic/partitions to consume
    Optionally specify offsets to start from

    Accepts types:

    * str (utf-8): topic name (will consume all available partitions)
    * tuple: (topic, partition)
    * dict:
        - { topic: partition }
        - { topic: [partition list] }
        - { topic: (partition tuple,) }

    Optionally, offsets can be specified directly:

    * tuple: (topic, partition, offset)
    * dict:  { (topic, partition): offset, ... }

    Example:

    .. code:: python

        kafka = KafkaConsumer()

        # Consume topic1-all; topic2-partition2; topic3-partition0
        kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})

        # Consume topic1-0 starting at offset 12, and topic2-1 at offset 45
        # using tuples --
        kafka.set_topic_partitions(("topic1", 0, 12), ("topic2", 1, 45))

        # using dict --
        kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })

    """
    self._topics = []
    self._client.load_metadata_for_topics()

    # Setup offsets
    self._offsets = OffsetsStruct(fetch=dict(),
                                  commit=dict(),
                                  highwater=dict(),
                                  task_done=dict())

    # Handle different topic types
    for arg in topics:

        # Topic name str -- all partitions
        if isinstance(arg, (six.string_types, six.binary_type)):
            topic = kafka_bytestring(arg)

            for partition in self._client.get_partition_ids_for_topic(topic):
                self._consume_topic_partition(topic, partition)

        # (topic, partition [, offset]) tuple
        elif isinstance(arg, tuple):
            topic = kafka_bytestring(arg[0])
            partition = arg[1]
            self._consume_topic_partition(topic, partition)
            if len(arg) == 3:
                offset = arg[2]
                self._offsets.fetch[(topic, partition)] = offset

        # { topic: partitions, ... } dict
        elif isinstance(arg, dict):
            for key, value in six.iteritems(arg):

                # key can be string (a topic)
                if isinstance(key, (six.string_types, six.binary_type)):
                    topic = kafka_bytestring(key)

                    # topic: partition
                    if isinstance(value, int):
                        self._consume_topic_partition(topic, value)

                    # topic: [ partition1, partition2, ... ]
                    elif isinstance(value, (list, tuple)):
                        for partition in value:
                            self._consume_topic_partition(topic, partition)
                    else:
                        raise KafkaConfigurationError(
                            'Unknown topic type '
                            '(dict key must be int or list/tuple of ints)'
                        )

                # (topic, partition): offset
                elif isinstance(key, tuple):
                    topic = kafka_bytestring(key[0])
                    partition = key[1]
                    self._consume_topic_partition(topic, partition)
                    self._offsets.fetch[(topic, partition)] = value

        else:
            raise KafkaConfigurationError('Unknown topic type (%s)' % type(arg))

    # If we have a consumer group, try to fetch stored offsets
    if self._config['group_id']:
        self._get_commit_offsets()

    # Update missing fetch/commit offsets
    for topic_partition in self._topics:

        # Commit offsets default is None
        if topic_partition not in self._offsets.commit:
            self._offsets.commit[topic_partition] = None

        # Skip if we already have a fetch offset from user args
        if topic_partition not in self._offsets.fetch:

            # Fetch offsets default is (1) commit
            if self._offsets.commit[topic_partition] is not None:
                self._offsets.fetch[topic_partition] = self._offsets.commit[topic_partition]

            # or (2) auto reset
            else:
                self._offsets.fetch[topic_partition] = self._reset_partition_offset(topic_partition)

    # highwater marks (received from server on fetch response)
    # and task_done (set locally by user)
    # should always get initialized to None
    self._reset_highwater_offsets()
    self._reset_task_done_offsets()

    # Reset message iterator in case we were in the middle of one
    self._reset_message_iterator()
def load_metadata_for_topics(self, *topics):
    """
    Fetch broker and topic-partition metadata from the server,
    and update internal data:
    broker list, topic/partition list, and topic/partition -> broker map

    This method should be called after receiving any error

    Arguments:
        *topics (optional): If a list of topics is provided,
            the metadata refresh will be limited to the specified topics only.

    Exceptions:
    ----------
    If the broker is configured to not auto-create topics,
    expect UnknownTopicOrPartitionError for topics that don't exist

    If the broker is configured to auto-create topics,
    expect LeaderNotAvailableError for new topics
    until partitions have been initialized.

    Exceptions *will not* be raised in a full refresh (i.e. no topic list)
    In this case, error codes will be logged as errors

    Partition-level errors will also not be raised here
    (a single partition w/o a leader, for example)
    """
    topics = [kafka_bytestring(t) for t in topics]

    if topics:
        for topic in topics:
            self.reset_topic_metadata(topic)
    else:
        self.reset_all_metadata()

    resp = self.send_metadata_request(topics)

    log.debug('Updating broker metadata: %s', resp.brokers)
    log.debug('Updating topic metadata: %s', resp.topics)

    self.brokers = dict([(broker.nodeId, broker)
                         for broker in resp.brokers])

    for topic_metadata in resp.topics:
        topic = topic_metadata.topic
        partitions = topic_metadata.partitions

        # Errors expected for new topics
        try:
            kafka_common.check_error(topic_metadata)
        except (UnknownTopicOrPartitionError, LeaderNotAvailableError) as e:

            # Raise if the topic was passed in explicitly
            if topic in topics:
                raise

            # Otherwise, just log the error and continue
            log.error('Error loading topic metadata for %s: %s', topic, type(e))
            continue

        self.topic_partitions[topic] = {}
        for partition_metadata in partitions:
            partition = partition_metadata.partition
            leader = partition_metadata.leader

            self.topic_partitions[topic][partition] = partition_metadata

            # Populate topics_to_brokers dict
            topic_part = TopicAndPartition(topic, partition)

            # Check for partition errors
            try:
                kafka_common.check_error(partition_metadata)

            # If No Leader, topics_to_brokers topic_partition -> None
            except LeaderNotAvailableError:
                log.error('No leader for topic %s partition %d', topic, partition)
                self.topics_to_brokers[topic_part] = None
                continue

            # If one of the replicas is unavailable -- ignore
            # this error code is provided for admin purposes only
            # we never talk to replicas, only the leader
            except ReplicaNotAvailableError:
                log.debug('Some (non-leader) replicas not available for topic %s partition %d',
                          topic, partition)

            # If Known Broker, topic_partition -> BrokerMetadata
            if leader in self.brokers:
                self.topics_to_brokers[topic_part] = self.brokers[leader]

            # If Unknown Broker, fake BrokerMetadata so we don't lose the id
            # (not sure how this could happen. server could be in bad state)
            else:
                self.topics_to_brokers[topic_part] = BrokerMetadata(
                    leader, None, None
                )
def get_partition_ids_for_topic(self, topic):
    topic = kafka_bytestring(topic)
    if topic not in self.topic_partitions:
        return []

    return sorted(list(self.topic_partitions[topic]))
def has_metadata_for_topic(self, topic):
    topic = kafka_bytestring(topic)
    return (topic in self.topic_partitions
            and len(self.topic_partitions[topic]) > 0)
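# Usage sketch (assumption): the two helpers above are read-only views over the
# metadata loaded by load_metadata_for_topics(). The `client` object name and
# topic name below are illustrative only:
#
#   client.load_metadata_for_topics('my-topic')    # refresh metadata for one topic
#   if client.has_metadata_for_topic('my-topic'):
#       partitions = client.get_partition_ids_for_topic('my-topic')  # e.g. [0, 1, 2]
#   else:
#       partitions = []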