def _get_commit_offsets(self):
    logger.info("Consumer fetching stored offsets")
    for topic_partition in self._topics:
        resps = []
        if self._config['offset_storage'] in ('zookeeper', 'dual'):
            resps += self._client.send_offset_fetch_request(
                kafka_bytestring(self._config['group_id']),
                [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                fail_on_error=False)
        if self._config['offset_storage'] in ('kafka', 'dual'):
            resps += self._client.send_offset_fetch_request_kafka(
                kafka_bytestring(self._config['group_id']),
                [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                fail_on_error=False)

        try:
            for r in resps:
                check_error(r)
        # API spec says the server won't set an error here
        # but 0.8.1.1 does actually...
        except UnknownTopicOrPartitionError:
            pass

        # -1 offset signals no commit is currently stored
        max_offset = max(r.offset for r in resps)
        if max_offset == -1:
            self._offsets.commit[topic_partition] = None

        # Otherwise we committed the stored offset
        # and need to fetch the next one
        else:
            self._offsets.commit[topic_partition] = max_offset
def _get_topic_offsets(topics, latest):
    """
    :param topics: list of topics
    :param latest: True to fetch latest offsets, False to fetch earliest available
    :return: dict: { (topic, partition): offset, ... }
    """
    # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetRequest
    # https://cfchou.github.io/blog/2015/04/23/a-closer-look-at-kafka-offsetrequest/
    assert set(topics) <= set(ALL)
    client = get_kafka_client()
    partition_meta = client.topic_partitions

    # only return the offset of the latest message in the partition
    num_offsets = 1
    time_value = -1 if latest else -2

    offsets = {}
    offset_requests = []
    for topic in topics:
        partitions = list(partition_meta.get(topic, {}))
        for partition in partitions:
            offsets[(kafka_bytestring(topic), partition)] = None
            offset_requests.append(
                OffsetRequest(kafka_bytestring(topic), partition, time_value, num_offsets))

    responses = client.send_offset_request(offset_requests)
    for r in responses:
        offsets[(kafka_bytestring(r.topic), r.partition)] = r.offsets[0]

    return offsets
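# Usage sketch (not part of the original source, assuming `ALL` holds the known
# topic names and `get_kafka_client()` returns a connected client): the helper
# above can be called twice to compare head and tail offsets per partition.
latest = _get_topic_offsets(['topic1'], latest=True)     # next offset to be written
earliest = _get_topic_offsets(['topic1'], latest=False)  # first offset still retained
for (topic, partition), head in latest.items():
    backlog = head - earliest[(topic, partition)]  # messages currently retained
    print(topic, partition, backlog)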
def _commit_offsets_to_watermark(
    kafka_client, group, topics,
    watermark, raise_on_error, offset_storage,
):
    topics = _verify_topics_and_partitions(kafka_client, topics, raise_on_error)

    watermark_offsets = get_topics_watermarks(kafka_client, topics, raise_on_error)

    if watermark == HIGH_WATERMARK:
        group_offset_reqs = [
            OffsetCommitRequest(
                kafka_bytestring(topic),
                partition,
                watermark_offsets[topic][partition].highmark,
                None
            )
            for topic, partitions in topics.iteritems()
            for partition in partitions
        ]
    elif watermark == LOW_WATERMARK:
        group_offset_reqs = [
            OffsetCommitRequest(
                kafka_bytestring(topic),
                partition,
                watermark_offsets[topic][partition].lowmark,
                None
            )
            for topic, partitions in topics.iteritems()
            for partition in partitions
        ]
    else:
        raise ValueError(
            "Unknown watermark: {watermark}".format(watermark=watermark)
        )

    if offset_storage == 'zookeeper' or not offset_storage:
        send_api = kafka_client.send_offset_commit_request
    elif offset_storage == 'kafka':
        send_api = kafka_client.send_offset_commit_request_kafka
    else:
        raise InvalidOffsetStorageError(offset_storage)

    status = []
    if group_offset_reqs:
        status = send_api(
            kafka_bytestring(group),
            group_offset_reqs,
            raise_on_error,
            callback=_check_commit_response_error
        )

    return filter(None, status)
def test_get_partitions_set(self, partitioner):
    with mock.patch('yelp_kafka.partitioner.get_kafka_topics',
                    autospec=True) as mock_topics:
        mock_topics.return_value = {
            kafka_bytestring('topic1'): [0, 1, 2, 3],
            kafka_bytestring('topic2'): [0, 1, 2],
            kafka_bytestring('topic3'): [0, 1, 2, 3],
        }
        actual = partitioner.get_partitions_set()
        assert actual == set([
            'topic1-0', 'topic1-1', 'topic1-2', 'topic1-3',
            'topic2-0', 'topic2-1', 'topic2-2'
        ])
def __init__(self, client, group, topic, partitions=None, auto_commit=True,
             auto_commit_every_n=AUTO_COMMIT_MSG_COUNT,
             auto_commit_every_t=AUTO_COMMIT_INTERVAL,
             offset_storage='zookeeper'):

    self.client = client
    self.topic = kafka_bytestring(topic)
    self.group = None if group is None else kafka_bytestring(group)
    self.client.load_metadata_for_topics(topic)
    self.offsets = {}

    if partitions is None:
        partitions = self.client.get_partition_ids_for_topic(topic)
    else:
        assert all(isinstance(x, numbers.Integral) for x in partitions)

    # Variables for handling offset commits
    self.commit_lock = Lock()
    self.commit_timer = None
    self.count_since_commit = 0
    self.auto_commit = auto_commit
    self.auto_commit_every_n = auto_commit_every_n
    self.auto_commit_every_t = auto_commit_every_t
    self.offset_storage = offset_storage

    # Set up the auto-commit timer
    if auto_commit is True and auto_commit_every_t is not None:
        self.commit_timer = ReentrantTimer(auto_commit_every_t, self.commit)
        self.commit_timer.start()

    # Set initial offsets
    if self.group is not None:
        self.fetch_last_known_offsets(partitions)
    else:
        for partition in partitions:
            self.offsets[partition] = 0

    # Register a cleanup handler
    def cleanup(obj):
        obj.stop()
    self._cleanup_func = cleanup
    atexit.register(cleanup, self)

    self.partition_info = False  # Do not return partition info in msgs
def _send_offset_commit_requests(self, offset_commit_request_list):
    if len(offset_commit_request_list) > 0:
        retry_on_exception(
            self._consumer_retry_policy,
            (FailedPayloadsError),
            self.kafka_client.send_offset_commit_request,
            group=kafka_bytestring(self.client_name),
            payloads=offset_commit_request_list
        )
def run(self, topic, message, hosts=None):
    """
    Simple round-robin synchronous producer to send one message to one topic.

    :param hosts: Kafka hostname(s) to connect in host:port format.
                  Comma-separated for several hosts.
    :type hosts: ``str``
    :param topic: Kafka Topic to publish the message on.
    :type topic: ``str``
    :param message: The message to publish.
    :type message: ``str``

    :returns: Response data: `topic`, target `partition` where message was sent,
              `offset` number and `error` code (hopefully 0).
    :rtype: ``dict``
    """
    if hosts:
        _hosts = hosts
    elif self.config.get('hosts', None):
        _hosts = self.config['hosts']
    else:
        raise ValueError("Need to define 'hosts' in either action or in config")

    # set default for empty value
    _client_id = self.config.get('client_id') or self.DEFAULT_CLIENT_ID

    client = KafkaClient(_hosts, client_id=_client_id)
    client.ensure_topic_exists(topic)
    producer = SimpleProducer(client)
    result = producer.send_messages(topic, kafka_bytestring(message))

    if result[0]:
        return result[0].__dict__
def commit_offsets(self, topic_to_partition_offset_map):
    """Commits offset information to kafka.  Allows lower-level control for
    committing offsets.  In general, :meth:`commit_message` or
    :meth:`commit_messages` should be used, but this can be useful when paired
    with :meth:`data_pipeline.position_data.PositionData.topic_to_last_position_info_map`.

    **Example**:
        The `topic_to_partition_offset_map` should be formatted like::

            {
              'topic1': {0: 83854, 1: 8943892},
              'topic2': {0: 190898}
            }

    Args:
        topic_to_partition_offset_map (Dict[str, Dict[int, int]]): Maps from
            topics to a partition and offset map for each topic.
    """
    topic_to_partition_offset_map = self._get_offsets_map_to_be_committed(
        topic_to_partition_offset_map)
    return self._send_offset_commit_requests(offset_commit_request_list=[
        OffsetCommitRequest(
            topic=kafka_bytestring(topic),
            partition=partition,
            offset=offset,
            metadata=None
        )
        for topic, partition_map in topic_to_partition_offset_map.iteritems()
        for partition, offset in partition_map.iteritems()
    ])
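# Usage sketch (assumed, not from the original source): with `consumer` being an
# instance of the class above, offsets can be committed per topic and partition
# using the map shape described in the docstring.
consumer.commit_offsets({
    'topic1': {0: 83854, 1: 8943892},
    'topic2': {0: 190898},
})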
def get_partitions_set(self):
    """Load partitions metadata from kafka and create a set containing
    "<topic>-<partition_id>"

    :returns: partitions for user topics
    :rtype: set
    :raises PartitionerError: if no partitions have been found
    """
    topic_partitions = get_kafka_topics(self.kafka_client)
    partitions = []
    missing_topics = set()
    for topic in self.topics:
        kafka_topic = kafka_bytestring(topic)
        if kafka_topic not in topic_partitions:
            missing_topics.add(topic)
        else:
            partitions += [
                "{0}-{1}".format(topic, p)
                for p in topic_partitions[kafka_topic]
            ]
    if missing_topics:
        self.log.info("Missing topics: %s", missing_topics)
    if not partitions:
        self.release_and_finish()
        raise PartitionerError(
            "No partitions found for topics: {topics}".format(
                topics=self.topics))
    return set(partitions)
def commit(self):
    """Store consumed message offsets (marked via task_done())
    to kafka cluster for this consumer_group.

    Returns:
        True on success, or False if no offsets were found for commit

    Note:
        this functionality requires server version >=0.8.1.1
        https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
    """
    if not self._config['group_id']:
        logger.warning('Cannot commit without a group_id!')
        raise KafkaConfigurationError(
            'Attempted to commit offsets without a configured consumer group (group_id)')

    # API supports storing metadata with each commit
    # but for now it is unused
    metadata = b''

    offsets = self._offsets.task_done
    commits = []
    for topic_partition, task_done_offset in six.iteritems(offsets):

        # Skip if None
        if task_done_offset is None:
            continue

        # Commit offsets as the next offset to fetch
        # which is consistent with the Java Client
        # task_done is marked by messages consumed,
        # so add one to mark the next message for fetching
        commit_offset = (task_done_offset + 1)

        # Skip if no change from previous committed
        if commit_offset == self._offsets.commit[topic_partition]:
            continue

        commits.append(
            OffsetCommitRequest(topic_partition[0], topic_partition[1],
                                commit_offset, metadata)
        )

    if commits:
        logger.info('committing consumer offsets to group %s',
                    self._config['group_id'])
        resps = self._client.send_offset_commit_request(
            kafka_bytestring(self._config['group_id']), commits,
            fail_on_error=False)

        for r in resps:
            check_error(r)
            topic_partition = (r.topic, r.partition)
            task_done = self._offsets.task_done[topic_partition]
            self._offsets.commit[topic_partition] = (task_done + 1)

        if self._config['auto_commit_enable']:
            self._reset_auto_commit()

        return True

    else:
        logger.info('No new offsets found to commit in group %s',
                    self._config['group_id'])
        return False
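# Usage sketch (assumed consumer API, not from the original source): messages are
# marked done as they are processed, then commit() stores the offsets for the group.
# `process` is a placeholder application handler.
for msg in consumer:
    process(msg)             # handle the message
    consumer.task_done(msg)  # mark it as processed
consumer.commit()            # store offsets for the consumer group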
def current_offset(self, topic, partition):
    try:
        offsets, = self.client.send_offset_request([
            OffsetRequest(kafka_bytestring(topic), partition, -1, 1)
        ])
    except:
        # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
        self.zk.child.dump_logs()
        self.server.child.dump_logs()
        raise
    else:
        return offsets.offsets[0]
def __init__(self, topic, config, partitions=None):
    self.log = logging.getLogger(self.__class__.__name__)
    if not isinstance(topic, six.string_types):
        raise TypeError("Topic must be a string")
    self.topic = kafka_bytestring(topic)
    if partitions and not isinstance(partitions, list):
        raise TypeError("Partitions must be a list")
    self.partitions = partitions
    self.kafka_consumer = None
    self.config = config
def _consume_topic_partition(self, topic, partition):
    topic = kafka_bytestring(topic)
    if not isinstance(partition, int):
        raise KafkaConfigurationError('Unknown partition type (%s) '
                                      '-- expected int' % type(partition))

    if topic not in self._client.topic_partitions:
        raise UnknownTopicOrPartitionError("Topic %s not found in broker metadata" % topic)
    if partition not in self._client.get_partition_ids_for_topic(topic):
        raise UnknownTopicOrPartitionError("Partition %d not found in Topic %s "
                                           "in broker metadata" % (partition, topic))
    logger.info("Configuring consumer to fetch topic '%s', partition %d", topic, partition)
    self._topics.append((topic, partition))
def __init__(self, hosts, client_id=CLIENT_ID,
             timeout=DEFAULT_SOCKET_TIMEOUT_SECONDS,
             correlation_id=0):
    # We need one connection to bootstrap
    self.client_id = kafka_bytestring(client_id)
    self.timeout = timeout
    self.hosts = collect_hosts(hosts)
    self.correlation_id = correlation_id

    # create connections only when we need them
    self.conns = {}
    self.brokers = {}            # broker_id -> BrokerMetadata
    self.topics_to_brokers = {}  # TopicAndPartition -> BrokerMetadata
    self.topic_partitions = {}   # topic -> partition -> PartitionMetadata

    self.load_metadata_for_topics()  # bootstrap with all metadata
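# Usage sketch (assumed, not from the original source): the constructor above
# bootstraps cluster metadata on creation, so the client is usable immediately.
# 'localhost:9092' and 'my-client' are placeholder values.
client = KafkaClient('localhost:9092', client_id='my-client')
print(client.has_metadata_for_topic('topic1'))
print(client.get_partition_ids_for_topic('topic1'))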
def get_kafka_ucr_pillow(pillow_id='kafka-ucr-main', ucr_division=None,
                         include_ucrs=None, exclude_ucrs=None, topics=None,
                         num_processes=1, process_num=0, **kwargs):
    topics = topics or KAFKA_TOPICS
    topics = [kafka_bytestring(t) for t in topics]
    return ConfigurableReportKafkaPillow(
        processor=ConfigurableReportPillowProcessor(
            data_source_provider=DynamicDataSourceProvider(),
            auto_repopulate_tables=False,
            ucr_division=ucr_division,
            include_ucrs=include_ucrs,
            exclude_ucrs=exclude_ucrs,
        ),
        pillow_name=pillow_id,
        topics=topics,
        num_processes=num_processes,
        process_num=process_num,
    )
def validate_offsets(expected_offsets):
    """
    Takes in a dictionary of offsets (topics to checkpoint numbers) and ensures
    they are all available in the current kafka feed
    """
    if expected_offsets:
        topics = {kafka_bytestring(x[0]) for x in expected_offsets.keys()}
        available_offsets = get_multi_topic_first_available_offsets(topics)
        for topic_partition, offset in expected_offsets.items():
            topic, partition = topic_partition
            if topic_partition not in available_offsets:
                raise UnavailableKafkaOffset(
                    "Invalid partition '{}' for topic '{}'".format(partition, topic))

            if expected_offsets[topic_partition] < available_offsets[topic_partition]:
                message = (
                    'First available topic offset for {}:{} is {} but needed {}.'
                ).format(topic, partition, available_offsets[topic_partition],
                         expected_offsets[topic_partition])
                raise UnavailableKafkaOffset(message)
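# Usage sketch (assumed key shape, not from the original source): keys are
# (topic, partition) tuples, matching how the function above unpacks them.
expected_offsets = {
    ('topic1', 0): 83854,
    ('topic1', 1): 8943892,
}
# Raises UnavailableKafkaOffset if a checkpointed offset has already been purged.
validate_offsets(expected_offsets)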
def get_kafka_ucr_static_pillow(pillow_id='kafka-ucr-static', ucr_division=None,
                                include_ucrs=None, exclude_ucrs=None, topics=None,
                                num_processes=1, process_num=0, **kwargs):
    topics = topics or KAFKA_TOPICS
    topics = [kafka_bytestring(t) for t in topics]
    return ConfigurableReportKafkaPillow(
        processor=ConfigurableReportPillowProcessor(
            data_source_provider=StaticDataSourceProvider(),
            auto_repopulate_tables=True,
            ucr_division=ucr_division,
            include_ucrs=include_ucrs,
            exclude_ucrs=exclude_ucrs,
            bootstrap_interval=7 * 24 * 60 * 60  # 1 week
        ),
        pillow_name=pillow_id,
        topics=topics,
        num_processes=num_processes,
        process_num=process_num,
        retry_errors=True
    )
def _get_commit_offsets(self):
    logger.info("Consumer fetching stored offsets")
    for topic_partition in self._topics:
        (resp,) = self._client.send_offset_fetch_request(
            kafka_bytestring(self._config['group_id']),
            [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
            fail_on_error=False)
        try:
            check_error(resp)
        # API spec says the server won't set an error here
        # but 0.8.1.1 does actually...
        except UnknownTopicOrPartitionError:
            pass

        # -1 offset signals no commit is currently stored
        if resp.offset == -1:
            self._offsets.commit[topic_partition] = None

        # Otherwise we committed the stored offset
        # and need to fetch the next one
        else:
            self._offsets.commit[topic_partition] = resp.offset
def test_switch_leader_keyed_producer(self):
    topic = self.topic

    producer = KeyedProducer(self.client, async=False)

    # Send 10 random messages
    for _ in range(10):
        key = random_string(3).encode('utf-8')
        msg = random_string(10).encode('utf-8')
        producer.send_messages(topic, key, msg)

    # kill leader for partition 0
    self._kill_leader(topic, 0)

    recovered = False
    started = time.time()
    timeout = 60
    while not recovered and (time.time() - started) < timeout:
        try:
            key = random_string(3).encode('utf-8')
            msg = random_string(10).encode('utf-8')
            producer.send_messages(topic, key, msg)
            if producer.partitioners[kafka_bytestring(topic)].partition(key) == 0:
                recovered = True
        except (FailedPayloadsError, ConnectionError):
            log.debug("caught exception sending message -- will retry")
            continue

    # Verify we successfully sent the message
    self.assertTrue(recovered)

    # send some more messages just to make sure no more exceptions
    for _ in range(10):
        key = random_string(3).encode('utf-8')
        msg = random_string(10).encode('utf-8')
        producer.send_messages(topic, key, msg)
def get_topics_watermarks(kafka_client, topics, raise_on_error=True):
    """ Get current topic watermarks.

    NOTE: This method does not refresh client metadata. It is up to the caller
    to avoid using stale metadata.

    If any partition leader is not available, the request fails for all the
    other topics. This is the tradeoff of sending all topic requests in batch
    and save both in performance and Kafka load.

    :param kafka_client: a connected KafkaToolClient
    :param topics: topic list or dict {<topic>: [partitions]}
    :param raise_on_error: if False the method ignores missing topics and
      missing partitions. It still may fail on the request send.
    :returns: a dict topic: partition: Part
    :raises:
      :py:class:`~kafka_utils.util.error.UnknownTopic`: upon missing
      topics and raise_on_error=True

      :py:class:`~kafka_utils.util.error.UnknownPartition`: upon missing
      partitions and raise_on_error=True

      FailedPayloadsError: upon send request error.
    """
    topics = _verify_topics_and_partitions(
        kafka_client,
        topics,
        raise_on_error,
    )
    highmark_offset_reqs = []
    lowmark_offset_reqs = []
    for topic, partitions in topics.iteritems():
        # Batch watermark requests
        for partition in partitions:
            # Request the latest offset
            highmark_offset_reqs.append(
                OffsetRequest(
                    kafka_bytestring(topic), partition, -1, max_offsets=1
                )
            )
            # Request the earliest offset
            lowmark_offset_reqs.append(
                OffsetRequest(
                    kafka_bytestring(topic), partition, -2, max_offsets=1
                )
            )

    watermark_offsets = {}

    if not (len(highmark_offset_reqs) + len(lowmark_offset_reqs)):
        return watermark_offsets

    # fail_on_error = False does not prevent network errors
    highmark_resps = kafka_client.send_offset_request(
        highmark_offset_reqs,
        fail_on_error=False,
        callback=_check_fetch_response_error,
    )
    lowmark_resps = kafka_client.send_offset_request(
        lowmark_offset_reqs,
        fail_on_error=False,
        callback=_check_fetch_response_error,
    )
    # At this point highmark and lowmark should ideally have the same length.
    assert len(highmark_resps) == len(lowmark_resps)
    aggregated_offsets = defaultdict(lambda: defaultdict(dict))
    for resp in highmark_resps:
        aggregated_offsets[resp.topic][resp.partition]['highmark'] = \
            resp.offsets[0]
    for resp in lowmark_resps:
        aggregated_offsets[resp.topic][resp.partition]['lowmark'] = \
            resp.offsets[0]

    for topic, partition_watermarks in aggregated_offsets.iteritems():
        for partition, watermarks in partition_watermarks.iteritems():
            watermark_offsets.setdefault(
                topic,
                {},
            )[partition] = PartitionOffsets(
                topic,
                partition,
                watermarks['highmark'],
                watermarks['lowmark'],
            )
    return watermark_offsets
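# Usage sketch (assumed, not from the original source): results are keyed
# topic -> partition, each value exposing the highmark/lowmark built above.
watermarks = get_topics_watermarks(kafka_client, {'topic1': [0, 1]})
for partition, offsets in watermarks['topic1'].items():
    retained = offsets.highmark - offsets.lowmark  # messages currently retained
    print('topic1', partition, retained)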
def has_metadata_for_topic(self, topic):
    topic = kafka_bytestring(topic)
    return (
        topic in self.topic_partitions
        and len(self.topic_partitions[topic]) > 0
    )
def fetch_messages(self):
    """Sends FetchRequests for all topic/partitions set for consumption

    Returns:
        Generator that yields KafkaMessage structs
        after deserializing with the configured `deserializer_class`

    Note:
        Refreshes metadata on errors, and resets fetch offset on
        OffsetOutOfRange, per the configured `auto_offset_reset` policy

    See Also:
        Key KafkaConsumer configuration parameters:
        * `fetch_message_max_bytes`
        * `fetch_max_wait_ms`
        * `fetch_min_bytes`
        * `deserializer_class`
        * `auto_offset_reset`
    """
    max_bytes = self._config['fetch_message_max_bytes']
    max_wait_time = self._config['fetch_wait_max_ms']
    min_bytes = self._config['fetch_min_bytes']

    if not self._topics:
        raise KafkaConfigurationError('No topics or partitions configured')

    if not self._offsets.fetch:
        raise KafkaConfigurationError(
            'No fetch offsets found when calling fetch_messages')

    fetches = [
        FetchRequest(topic, partition,
                     self._offsets.fetch[(topic, partition)],
                     max_bytes)
        for (topic, partition) in self._topics
    ]

    # send_fetch_request will batch topic/partition requests by leader
    responses = self._client.send_fetch_request(
        fetches,
        max_wait_time=max_wait_time,
        min_bytes=min_bytes,
        fail_on_error=False)

    for resp in responses:

        if isinstance(resp, FailedPayloadsError):
            logger.warning('FailedPayloadsError attempting to fetch data')
            self._refresh_metadata_on_error()
            continue

        topic = kafka_bytestring(resp.topic)
        partition = resp.partition
        try:
            check_error(resp)
        except OffsetOutOfRangeError:
            logger.warning(
                'OffsetOutOfRange: topic %s, partition %d, '
                'offset %d (Highwatermark: %d)',
                topic, partition,
                self._offsets.fetch[(topic, partition)],
                resp.highwaterMark)
            # Reset offset
            self._offsets.fetch[(topic, partition)] = (
                self._reset_partition_offset((topic, partition)))
            continue

        except NotLeaderForPartitionError:
            logger.warning(
                "NotLeaderForPartitionError for %s - %d. "
                "Metadata may be out of date", topic, partition)
            self._refresh_metadata_on_error()
            continue

        except RequestTimedOutError:
            logger.warning("RequestTimedOutError for %s - %d", topic, partition)
            continue

        # Track server highwater mark
        self._offsets.highwater[(topic, partition)] = resp.highwaterMark

        # Yield each message
        # Kafka-python could raise an exception during iteration
        # we are not catching -- user will need to address
        for (offset, message) in resp.messages:
            # deserializer_class could raise an exception here
            val = self._config['deserializer_class'](message.value)
            msg = KafkaMessage(topic, partition, offset, message.key, val)

            # in some cases the server will return earlier messages
            # than we requested. skip them per kafka spec
            if offset < self._offsets.fetch[(topic, partition)]:
                logger.debug(
                    'message offset less than fetched offset skipping: %s', msg)
                continue
            # Only increment fetch offset
            # if we safely got the message and deserialized
            self._offsets.fetch[(topic, partition)] = offset + 1

            # Then yield to user
            yield msg
def load_metadata_for_topics(self, *topics):
    """
    Fetch broker and topic-partition metadata from the server,
    and update internal data:
    broker list, topic/partition list, and topic/partition -> broker map

    This method should be called after receiving any error

    Arguments:
        *topics (optional): If a list of topics is provided,
            the metadata refresh will be limited to the specified topics only.

    Exceptions:
    ----------
    If the broker is configured to not auto-create topics,
    expect UnknownTopicOrPartitionError for topics that don't exist

    If the broker is configured to auto-create topics,
    expect LeaderNotAvailableError for new topics
    until partitions have been initialized.

    Exceptions *will not* be raised in a full refresh (i.e. no topic list)
    In this case, error codes will be logged as errors

    Partition-level errors will also not be raised here
    (a single partition w/o a leader, for example)
    """
    topics = [kafka_bytestring(t) for t in topics]

    if topics:
        for topic in topics:
            self.reset_topic_metadata(topic)
    else:
        self.reset_all_metadata()

    resp = self.send_metadata_request(topics)

    log.debug('Updating broker metadata: %s', resp.brokers)
    log.debug('Updating topic metadata: %s', resp.topics)

    self.brokers = dict([(broker.nodeId, broker)
                         for broker in resp.brokers])

    for topic_metadata in resp.topics:
        topic = topic_metadata.topic
        partitions = topic_metadata.partitions

        # Errors expected for new topics
        try:
            kafka.common.check_error(topic_metadata)
        except (UnknownTopicOrPartitionError, LeaderNotAvailableError) as e:

            # Raise if the topic was passed in explicitly
            if topic in topics:
                raise

            # Otherwise, just log a warning
            log.error('Error loading topic metadata for %s: %s', topic, type(e))
            continue

        self.topic_partitions[topic] = {}
        for partition_metadata in partitions:
            partition = partition_metadata.partition
            leader = partition_metadata.leader

            self.topic_partitions[topic][partition] = partition_metadata

            # Populate topics_to_brokers dict
            topic_part = TopicAndPartition(topic, partition)

            # Check for partition errors
            try:
                kafka.common.check_error(partition_metadata)

            # If No Leader, topics_to_brokers topic_partition -> None
            except LeaderNotAvailableError:
                log.error('No leader for topic %s partition %d', topic, partition)
                self.topics_to_brokers[topic_part] = None
                continue

            # If one of the replicas is unavailable -- ignore
            # this error code is provided for admin purposes only
            # we never talk to replicas, only the leader
            except ReplicaNotAvailableError:
                log.debug('Some (non-leader) replicas not available for topic %s partition %d',
                          topic, partition)

            # If Known Broker, topic_partition -> BrokerMetadata
            if leader in self.brokers:
                self.topics_to_brokers[topic_part] = self.brokers[leader]

            # If Unknown Broker, fake BrokerMetadata so we don't lose the id
            # (not sure how this could happen. server could be in bad state)
            else:
                self.topics_to_brokers[topic_part] = BrokerMetadata(
                    leader, None, None
                )
def __init__(self, group_id, cluster, **config):
    self.log = logging.getLogger(self.__class__.__name__)
    self._config = config
    self.cluster = cluster
    self.group_id = kafka_bytestring(group_id)
def current_offset(self, topic, partition):
    offsets, = self.client.send_offset_request(
        [OffsetRequest(kafka_bytestring(topic), partition, -1, 1)])
    return offsets.offsets[0]
def set_consumer_offsets(
    kafka_client,
    group,
    new_offsets,
    raise_on_error=True,
    offset_storage='zookeeper',
):
    """Set consumer offsets to the specified offsets.

    This method does not validate the specified offsets, it is up to
    the caller to specify valid offsets within a topic partition.

    If any partition leader is not available, the request fails for all the
    other topics. This is the tradeoff of sending all topic requests in batch
    and save both in performance and Kafka load.

    :param kafka_client: a connected KafkaToolClient
    :param group: kafka group_id
    :param new_offsets: dict {<topic>: {<partition>: <offset>}}
    :param raise_on_error: if False the method does not raise exceptions
      on errors encountered. It may still fail on the request send.
    :param offset_storage: String, one of {zookeeper, kafka}.
    :returns: a list of errors for each partition offset update that failed.
    :rtype: list [OffsetCommitError]
    :raises:
      :py:class:`kafka_utils.util.error.UnknownTopic`: upon missing
      topics and raise_on_error=True

      :py:class:`kafka_utils.util.error.UnknownPartition`: upon missing
      partitions and raise_on_error=True

      :py:class:`exceptions.TypeError`: upon badly formatted input
      new_offsets

      :py:class:`kafka_utils.util.error.InvalidOffsetStorageError`: upon
      unknown offset_storage choice.

      FailedPayloadsError: upon send request error.
    """
    valid_new_offsets = _verify_commit_offsets_requests(
        kafka_client, new_offsets, raise_on_error)

    group_offset_reqs = [
        OffsetCommitRequest(kafka_bytestring(topic), partition, offset, None)
        for topic, new_partition_offsets in valid_new_offsets.iteritems()
        for partition, offset in new_partition_offsets.iteritems()
    ]

    if offset_storage == 'zookeeper' or not offset_storage:
        send_api = kafka_client.send_offset_commit_request
    elif offset_storage == 'kafka':
        send_api = kafka_client.send_offset_commit_request_kafka
    else:
        raise InvalidOffsetStorageError(offset_storage)

    status = []
    if group_offset_reqs:
        status = send_api(
            kafka_bytestring(group),
            group_offset_reqs,
            raise_on_error,
            callback=_check_commit_response_error)

    return filter(None, status)
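# Usage sketch (assumed, not from the original source): rewind a consumer group
# to explicit offsets; the group name and offsets below are placeholder values.
errors = set_consumer_offsets(
    kafka_client,
    'my-group',
    {'topic1': {0: 100, 1: 250}},
    offset_storage='kafka',
)
if errors:
    print('Some offsets were not committed:', errors)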
def _kill_leader(self, topic, partition):
    leader = self.client.topics_to_brokers[
        TopicAndPartition(kafka_bytestring(topic), partition)]
    broker = self.brokers[leader.nodeId]
    broker.close()
    return broker
def get_current_consumer_offsets(
    kafka_client,
    group,
    topics,
    raise_on_error=True,
    offset_storage='zookeeper',
):
    """ Get current consumer offsets.

    NOTE: This method does not refresh client metadata. It is up to the caller
    to avoid using stale metadata.

    If any partition leader is not available, the request fails for all the
    other topics. This is the tradeoff of sending all topic requests in batch
    and save both in performance and Kafka load.

    :param kafka_client: a connected KafkaToolClient
    :param group: kafka group_id
    :param topics: topic list or dict {<topic>: [partitions]}
    :param raise_on_error: if False the method ignores missing topics and
      missing partitions. It still may fail on the request send.
    :param offset_storage: String, one of {zookeeper, kafka}.
    :returns: a dict topic: partition: offset
    :raises:
      :py:class:`kafka_utils.util.error.UnknownTopic`: upon missing
      topics and raise_on_error=True

      :py:class:`kafka_utils.util.error.UnknownPartition`: upon missing
      partitions and raise_on_error=True

      :py:class:`kafka_utils.util.error.InvalidOffsetStorageError`: upon
      unknown offset_storage choice.

      FailedPayloadsError: upon send request error.
    """
    topics = _verify_topics_and_partitions(kafka_client, topics, raise_on_error)

    group_offset_reqs = [
        OffsetFetchRequest(kafka_bytestring(topic), partition)
        for topic, partitions in topics.iteritems()
        for partition in partitions
    ]

    group_offsets = {}

    if offset_storage == 'zookeeper':
        send_api = kafka_client.send_offset_fetch_request
    elif offset_storage == 'kafka':
        send_api = kafka_client.send_offset_fetch_request_kafka
    else:
        raise InvalidOffsetStorageError(offset_storage)

    if group_offset_reqs:
        # fail_on_error = False does not prevent network errors
        group_resps = send_api(
            group=kafka_bytestring(group),
            payloads=group_offset_reqs,
            fail_on_error=False,
            callback=pluck_topic_offset_or_zero_on_unknown,
        )
        for resp in group_resps:
            group_offsets.setdefault(
                resp.topic,
                {},
            )[resp.partition] = resp.offset

    return group_offsets
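# Usage sketch (assumed, not from the original source): combining the helpers
# above to report consumer lag per partition for a placeholder group/topic.
committed = get_current_consumer_offsets(kafka_client, 'my-group', ['topic1'])
watermarks = get_topics_watermarks(kafka_client, ['topic1'])
for partition, offset in committed['topic1'].items():
    lag = watermarks['topic1'][partition].highmark - offset
    print('topic1', partition, lag)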
def send(self, topic, key, msg):
    topic = kafka_bytestring(topic)
    partition = self._next_partition(topic, key)
    return self._send_messages(topic, partition, msg, key=key)
def get_partition_ids_for_topic(self, topic):
    topic = kafka_bytestring(topic)
    if topic not in self.topic_partitions:
        return []

    return sorted(list(self.topic_partitions[topic]))
def set_topic_partitions(self, *topics):
    """
    Set the topic/partitions to consume
    Optionally specify offsets to start from

    Accepts types:

    * str (utf-8): topic name (will consume all available partitions)
    * tuple: (topic, partition)
    * dict:
        - { topic: partition }
        - { topic: [partition list] }
        - { topic: (partition tuple,) }

    Optionally, offsets can be specified directly:

    * tuple: (topic, partition, offset)
    * dict:  { (topic, partition): offset, ... }

    Example:

    .. code:: python

        kafka = KafkaConsumer()

        # Consume topic1-all; topic2-partition2; topic3-partition0
        kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})

        # Consume topic1-0 starting at offset 12, and topic2-1 at offset 45
        # using tuples --
        kafka.set_topic_partitions(("topic1", 0, 12), ("topic2", 1, 45))

        # using dict --
        kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })
    """
    self._topics = []
    self._client.load_metadata_for_topics()

    # Setup offsets
    self._offsets = OffsetsStruct(fetch=dict(),
                                  commit=dict(),
                                  highwater=dict(),
                                  task_done=dict())

    # Handle different topic types
    for arg in topics:

        # Topic name str -- all partitions
        if isinstance(arg, (six.string_types, six.binary_type)):
            topic = kafka_bytestring(arg)

            for partition in self._client.get_partition_ids_for_topic(topic):
                self._consume_topic_partition(topic, partition)

        # (topic, partition [, offset]) tuple
        elif isinstance(arg, tuple):
            topic = kafka_bytestring(arg[0])
            partition = arg[1]
            self._consume_topic_partition(topic, partition)
            if len(arg) == 3:
                offset = arg[2]
                self._offsets.fetch[(topic, partition)] = offset

        # { topic: partitions, ... } dict
        elif isinstance(arg, dict):
            for key, value in six.iteritems(arg):

                # key can be string (a topic)
                if isinstance(key, (six.string_types, six.binary_type)):
                    topic = kafka_bytestring(key)

                    # topic: partition
                    if isinstance(value, int):
                        self._consume_topic_partition(topic, value)

                    # topic: [ partition1, partition2, ... ]
                    elif isinstance(value, (list, tuple)):
                        for partition in value:
                            self._consume_topic_partition(topic, partition)
                    else:
                        raise KafkaConfigurationError(
                            'Unknown topic type '
                            '(dict key must be int or list/tuple of ints)')

                # (topic, partition): offset
                elif isinstance(key, tuple):
                    topic = kafka_bytestring(key[0])
                    partition = key[1]
                    self._consume_topic_partition(topic, partition)
                    self._offsets.fetch[(topic, partition)] = value

        else:
            raise KafkaConfigurationError('Unknown topic type (%s)' % type(arg))

    # If we have a consumer group, try to fetch stored offsets
    if self._config['group_id']:
        self._get_commit_offsets()

    # Update missing fetch/commit offsets
    for topic_partition in self._topics:

        # Commit offsets default is None
        if topic_partition not in self._offsets.commit:
            self._offsets.commit[topic_partition] = None

        # Skip if we already have a fetch offset from user args
        if topic_partition not in self._offsets.fetch:

            # Fetch offsets default is (1) commit
            if self._offsets.commit[topic_partition] is not None:
                self._offsets.fetch[topic_partition] = self._offsets.commit[topic_partition]

            # or (2) auto reset
            else:
                self._offsets.fetch[topic_partition] = self._reset_partition_offset(topic_partition)

    # highwater marks (received from server on fetch response)
    # and task_done (set locally by user)
    # should always get initialized to None
    self._reset_highwater_offsets()
    self._reset_task_done_offsets()

    # Reset message iterator in case we were in the middle of one
    self._reset_message_iterator()
raise TypeError("Can't get Kafka port from environment variable") try: env_kafka_port = int(env_kafka_port) except: raise TypeError("Couldn't turn Kafka port into a number") env_kafka_topic = os.environ.get('TOPIC') if env_kafka_topic is None: raise TypeError("Can't get Kafka topic from environment variable") env_consumer_group = os.environ.get('CONSUMER_GROUP') if env_consumer_group is None: raise TypeError("Can't get Kafka consumer group from environment variable") # Kafka settings consumer_group = kafka_bytestring(env_consumer_group) topic = kafka_bytestring(env_kafka_topic) kafka_host = "{0}:{1}".format(env_kafka_addr, env_kafka_port) # Create client print("Starting Kafka client: ", kafka_host, topic, consumer_group, flush=True) try: client = KafkaClient(kafka_host) except (LeaderNotAvailableError, ConnectionError, KafkaUnavailableError,) as leader_err: num_tries = 0 max_tries = 10 is_connected = False while not is_connected: print("Retrying to start consumer client: {0!s}".format(num_tries), flush=True) time.sleep(5)