def kafka_get_topics_offsets(host, topic, port=9092):
    """Return available partitions and their offsets for the given topic.

    A short-lived client is opened for the lookup and is always closed
    before returning, including when one of the requests raises.

    Args:
        host (str): Kafka host.
        topic (str): Kafka topic.
        port (int): Kafka port.

    Returns:
        [(int, int, int)]: [(partition, start_offset, end_offset)].
    """
    brokers = ['{}:{}'.format(host, port)]
    client = SimpleClient(brokers)
    try:
        partitions = client.get_partition_ids_for_topic(topic)
        # Request time -1 asks for the latest offset of each partition.
        offsets_responses_end = client.send_offset_request([
            OffsetRequestPayload(topic, partition, -1, 1)
            for partition in partitions
        ])
        # Request time -2 asks for the earliest available offset.
        offsets_responses_start = client.send_offset_request([
            OffsetRequestPayload(topic, partition, -2, 1)
            for partition in partitions
        ])
    finally:
        # Release the broker connections even if a request failed.
        client.close()

    # NOTE(review): pairing by position assumes both response lists come
    # back in the same partition order as the requests -- confirm.
    return [
        (start.partition, start.offsets[0], end.offsets[0])
        for start, end in zip(offsets_responses_start, offsets_responses_end)
    ]
def _get_highwater_offsets(self, kafka_hosts_ports):
    """
    Collect the highwater offset of every topic/partition the client knows.

    All partitions are queried, not only those with consumers, so that
    producer progress can be observed as well. Broker offset lookups are
    considered cheap, so no attempt is made to limit the request.

    Returns a dict mapping (topic, partition) to the first offset reported
    for that partition. The connection is closed before returning; a failure
    while closing is logged and otherwise ignored.
    """
    client = SimpleClient(kafka_hosts_ports, timeout=self.kafka_timeout)
    try:
        # One payload per (topic, partition), all sent through a single
        # send_offset_request call.
        payloads = []
        for topic, partition in client.topics_to_brokers.keys():
            payloads.append(OffsetRequestPayload(topic, partition, -1, 1))

        responses = client.send_offset_request(payloads)

        highwater_offsets = {}
        for response in responses:
            key = (response.topic, response.partition)
            highwater_offsets[key] = response.offsets[0]
    finally:
        try:
            client.close()
        except Exception:
            self.log.exception('Error cleaning up Kafka connection')
    return highwater_offsets
def get_logsize(client, topic, partitions=None):
    """Return the latest offset of each requested partition of ``topic``.

    Args:
        client: Kafka client exposing ``topic_partitions``,
            ``load_metadata_for_topics``, ``get_partition_ids_for_topic``
            and ``send_offset_request``.
        topic: Topic name.
        partitions: Optional iterable of partition ids. When ``None`` all
            partitions reported by the client are used. The caller's
            collection is never modified.

    Returns:
        dict: partition -> latest offset, or ``None`` for a partition whose
        entry in ``client.topic_partitions`` is ``-1``.
    """
    # client.topic_partitions maps topic -> partition -> leader. Load the
    # metadata before looking up partitions so that the lookup below can
    # see a topic that was not cached yet.
    if topic not in client.topic_partitions:
        client.load_metadata_for_topics(topic)

    if partitions is None:
        partitions = client.get_partition_ids_for_topic(topic)

    data = {}
    healthy_partitions = []
    for partition in partitions:
        if client.topic_partitions[topic][partition] == -1:
            # Problem partition: report it as None and do not query it.
            data[partition] = None
        else:
            healthy_partitions.append(partition)

    reqs = [
        OffsetRequestPayload(topic, partition, -1, 1)
        for partition in healthy_partitions
    ]
    for resp in client.send_offset_request(reqs):
        data[resp.partition] = resp.offsets[0]
    return data
def current_offset(self, topic, partition):
    """Return the first offset reported for ``topic``/``partition``.

    A single offset request with request time -1 and a limit of one offset
    is sent through ``self.client``. On any failure the logs of the
    ``self.zk`` and ``self.server`` child processes are dumped and the
    original exception is re-raised.
    """
    try:
        payload = OffsetRequestPayload(topic, partition, -1, 1)
        (response,) = self.client.send_offset_request([payload])
    except Exception:
        # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
        self.zk.child.dump_logs()
        self.server.child.dump_logs()
        raise
    return response.offsets[0]
def current_offset(client, topic, partition, kafka_broker=None):
    """Get the current offset of a topic's partition.

    Sends one offset request (request time -1, at most one offset) through
    ``client`` and returns the first offset of the single response. If the
    request fails and a truthy ``kafka_broker`` was supplied, its logs are
    dumped before the exception is re-raised.
    """
    try:
        payload = OffsetRequestPayload(topic, partition, -1, 1)
        (response,) = client.send_offset_request([payload])
    except Exception:
        # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
        if kafka_broker:
            kafka_broker.dump_logs()
        raise
    return response.offsets[0]
def reset_partition_offset(self, partition):
    """Reset a partition's offsets according to ``auto_offset_reset``.

    Arguments:
        partition (int): the partition whose offsets should be reset

    Returns:
        The new offset on success, None if the offset request failed
        with a KafkaError.

    Raises:
        OffsetOutOfRangeError: when ``auto_offset_reset`` is neither
            'largest' nor 'smallest' and no exception is being handled.
            If an exception is being handled, that exception is
            re-raised instead.
    """
    LATEST = -1
    EARLIEST = -2

    policy = self.auto_offset_reset
    if policy not in ('largest', 'smallest'):
        # Raise a reasonable exception type when called outside of an
        # exception context.
        if sys.exc_info() == (None, None, None):
            raise OffsetOutOfRangeError(
                'Cannot reset partition offsets without a '
                'valid auto_offset_reset setting '
                '(largest|smallest)')
        # Otherwise propagate the upstream exception unchanged: it usually
        # carries details about the request that triggered it.
        raise  # pylint: disable=E0704

    request_time = LATEST if policy == 'largest' else EARLIEST
    reqs = [OffsetRequestPayload(self.topic, partition, request_time, 1)]

    log.info('Resetting topic-partition offset to %s for %s:%d',
             self.auto_offset_reset, self.topic, partition)
    try:
        (resp, ) = self.client.send_offset_request(reqs)
    except KafkaError as e:
        log.error('%s sending offset request for %s:%d',
                  e.__class__.__name__, self.topic, partition)
        return None

    new_offset = resp.offsets[0]
    self.offsets[partition] = new_offset
    self.fetch_offsets[partition] = new_offset
    return new_offset
def setUp(self):
    """Prepare a topic and clients for an integration test.

    Calls ``skipTest`` when the KAFKA_VERSION environment variable is not
    set. Generates a topic name from the test id plus a random suffix when
    ``self.topic`` is empty, creates the clients when ``self.create_client``
    is set, then waits (30 second deadline) until topic metadata is
    available and every partition answers an offset request.

    Raises:
        KafkaTimeoutError: if metadata or partitions are not ready before
            the deadline.
    """
    super(KafkaIntegrationTestCase, self).setUp()
    if not os.environ.get('KAFKA_VERSION'):
        self.skipTest('Integration test requires KAFKA_VERSION')

    if not self.topic:
        self.topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:],
                                random_string(10))

    if self.create_client:
        self.client = SimpleClient('%s:%d' % (self.server.host,
                                              self.server.port))
        self.client_async = KafkaClient(
            bootstrap_servers='%s:%d' % (self.server.host, self.server.port))

    # NOTE(review): the waits below use self.client even when
    # create_client is false -- presumably it is provided elsewhere; confirm.
    timeout = time.time() + 30
    while time.time() < timeout:
        try:
            self.client.load_metadata_for_topics(
                self.topic, ignore_leadernotavailable=False)
            # Must check self.topic: a local ``topic`` only exists when the
            # name was generated above, so a preset topic used to fail here.
            if self.client.has_metadata_for_topic(self.topic):
                break
        except (LeaderNotAvailableError, InvalidTopicError):
            time.sleep(1)
    else:
        raise KafkaTimeoutError('Timeout loading topic metadata!')

    # Ensure topic partitions have been created on all brokers to avoid
    # UnknownPartitionErrors
    # TODO: It might be a good idea to move this to
    # self.client.ensure_topic_exists
    for partition in self.client.get_partition_ids_for_topic(self.topic):
        while True:
            try:
                req = OffsetRequestPayload(self.topic, partition, -1, 100)
                self.client.send_offset_request([req])
                break
            except (NotLeaderForPartitionError,
                    UnknownTopicOrPartitionError,
                    FailedPayloadsError):
                # Shares the deadline computed for the metadata wait.
                if time.time() > timeout:
                    raise KafkaTimeoutError(
                        'Timeout loading topic metadata!')
                time.sleep(.1)

    self._messages = {}
def topic_offsets(kafka_brokers, topic):
    """Return the latest offset of every partition of ``topic``.

    A temporary client is opened for the lookup and is always closed,
    including when the topic is unknown or the offset request raises.

    Args:
        kafka_brokers: Broker address or addresses; passed through
            ``insure_is_array`` before the client is created.
        topic: Topic name.

    Returns:
        dict: ``TopicPartition`` -> offset, where a falsy reported offset
        is stored as 0.

    Raises:
        KafkaException: if the client has no metadata entry for ``topic``.
    """
    client = SimpleClient(insure_is_array(kafka_brokers))
    try:
        topic_partitions = client.topic_partitions
        if topic not in topic_partitions:
            raise KafkaException("topic {} doesn't exists".format(topic))

        partitions = topic_partitions[topic]
        offset_requests = [
            OffsetRequestPayload(topic, p, -1, 1) for p in partitions.keys()
        ]
        offsets_responses = client.send_offset_request(offset_requests)
    finally:
        # Previously close() was only reached on the success path.
        client.close()

    partitions_and_offsets = {}
    for response in offsets_responses:
        if response.topic != topic:
            continue
        topic_partition = TopicPartition(topic=response.topic,
                                         partition=response.partition)
        # Falsy offsets (for example None) are normalised to 0.
        partitions_and_offsets[topic_partition] = response.offsets[0] or 0
    return partitions_and_offsets
def get_partition_offsets(self, topic, partition, request_time_ms,
                          max_num_offsets):
    """Fetch the available offsets of one topic/partition.

    Keyword Arguments:
        topic (str): topic to query
        partition (int): partition to query
        request_time_ms (int): ask for offsets of messages before this
            time (ms). Two values are special: -1 returns the latest
            offset (the offset of the next incoming message) and -2
            returns the earliest available offset. Offsets come back in
            descending order, so asking for the earliest offset always
            yields a single element.
        max_num_offsets (int): upper bound on the number of offsets in
            the OffsetResponse

    Returns:
        the list of offsets carried by the OffsetResponse for the given
        topic / partition.

    See:
        https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI
    """
    payload = OffsetRequestPayload(topic, partition, request_time_ms,
                                   max_num_offsets)
    responses = self._client.send_offset_request([payload])

    # Exactly one request was sent, so exactly one response is expected.
    (response, ) = responses
    check_error(response)

    # Sanity checks only; probably unnecessary.
    assert response.topic == topic
    assert response.partition == partition

    return response.offsets
def pending(self, partitions=None):
    """
    Return the number of pending messages.

    For each partition the latest offset is requested from the client and
    the consumer's current offset (``self.offsets``) is subtracted; the
    differences are summed.

    Keyword Arguments:
        partitions (list): partitions to include; by default every
            partition tracked in ``self.offsets``
    """
    if partitions is None:
        partitions = self.offsets.keys()

    payloads = [
        OffsetRequestPayload(self.topic, partition, -1, 1)
        for partition in partitions
    ]
    responses = self.client.send_offset_request(payloads)

    return sum(
        response.offsets[0] - self.offsets[response.partition]
        for response in responses
    )
def get_topics_watermarks(kafka_client, topics, raise_on_error=True):
    """ Get current topic watermarks.

    NOTE: This method does not refresh client metadata. It is up to the
    caller to avoid using stale metadata.

    If any partition leader is not available, the request fails for all the
    other topics. This is the tradeoff of sending all topic requests in one
    batch, which saves both time and Kafka load.

    :param kafka_client: a connected KafkaToolClient
    :param topics: topic list or dict {<topic>: [partitions]}
    :param raise_on_error: if False the method ignores missing topics and
      missing partitions. It still may fail on the request send.
    :returns: a dict topic: partition: PartitionOffsets
    :raises:
      :py:class:`~kafka_utils.util.error.UnknownTopic`: upon missing
      topics and raise_on_error=True

      :py:class:`~kafka_utils.util.error.UnknownPartition`: upon missing
      partitions and raise_on_error=True

      FailedPayloadsError: upon send request error.
    """
    topics = _verify_topics_and_partitions(kafka_client, topics, raise_on_error)

    # Flatten to (topic, partition) pairs so both batches share one ordering.
    topic_partition_pairs = [
        (topic, partition)
        for topic, partitions in six.iteritems(topics)
        for partition in partitions
    ]
    if not topic_partition_pairs:
        return {}

    # Request time -1 is the latest offset, -2 the earliest offset.
    highmark_offset_reqs = [
        OffsetRequestPayload(topic, partition, -1, max_offsets=1)
        for topic, partition in topic_partition_pairs
    ]
    lowmark_offset_reqs = [
        OffsetRequestPayload(topic, partition, -2, max_offsets=1)
        for topic, partition in topic_partition_pairs
    ]

    # fail_on_error = False does not prevent network errors
    highmark_resps = kafka_client.send_offset_request(
        highmark_offset_reqs,
        fail_on_error=False,
        callback=_check_fetch_response_error,
    )
    lowmark_resps = kafka_client.send_offset_request(
        lowmark_offset_reqs,
        fail_on_error=False,
        callback=_check_fetch_response_error,
    )

    # At this point highmark and lowmark should ideally have the same length.
    assert len(highmark_resps) == len(lowmark_resps)

    # topic -> partition -> {'highmark': ..., 'lowmark': ...}
    collected = defaultdict(lambda: defaultdict(dict))
    for label, responses in (('highmark', highmark_resps),
                             ('lowmark', lowmark_resps)):
        for resp in responses:
            collected[resp.topic][resp.partition][label] = resp.offsets[0]

    watermark_offsets = {}
    for topic, by_partition in six.iteritems(collected):
        per_topic = watermark_offsets.setdefault(topic, {})
        for partition, marks in six.iteritems(by_partition):
            per_topic[partition] = PartitionOffsets(
                topic,
                partition,
                marks['highmark'],
                marks['lowmark'],
            )
    return watermark_offsets
def seek(self, offset, whence=None, partition=None):
    """
    Alter the current offset in the consumer, similar to fseek

    Arguments:
        offset: how much to modify the offset
        whence: where to modify it from, default is None

            * None is an absolute offset
            * 0    is relative to the earliest available offset (head)
            * 1    is relative to the current offset
            * 2    is relative to the latest known offset (tail)

        partition: modify which partition, default is None.
            If partition is None, would modify all partitions. For
            whence 0 or 2 the offset is then divided between the
            partitions, with the remainder spread one apiece over the
            first partitions.

    Raises:
        ValueError: if whence is not one of None, 0, 1 or 2. Nothing is
            modified in that case.

    After a successful seek the fetch offsets are reset to the new
    offsets, count_since_commit is incremented, a commit is issued when
    auto_commit is set, and the message queue is replaced by an empty one.
    """
    if whence is None or whence == 1:
        targets = list(self.offsets) if partition is None else [partition]
        for target in targets:
            if whence is None:
                # absolute offset
                self.offsets[target] = offset
            else:
                # relative to the current position
                self.offsets[target] += offset
    elif whence in (0, 2):
        # relative to the beginning (-2, earliest) or the end (-1, latest)
        request_time = -2 if whence == 0 else -1
        if partition is None:
            targets = list(self.offsets)
            deltas = {}
            # Guard: with no partitions there is nothing to distribute,
            # and divmod by zero must be avoided.
            if targets:
                # divide the requested offset by the number of partitions
                # and hand the remainder out one apiece
                (delta, rem) = divmod(offset, len(targets))
                for index, target in enumerate(targets):
                    deltas[target] = delta + (1 if index < rem else 0)
        else:
            targets = [partition]
            deltas = {partition: offset}

        reqs = [
            OffsetRequestPayload(self.topic, target, request_time, 1)
            for target in targets
        ]
        if reqs:
            resps = self.client.send_offset_request(reqs)
            for resp in resps:
                self.offsets[resp.partition] = \
                    resp.offsets[0] + deltas[resp.partition]
    else:
        # %r rather than %d so a non-numeric whence still produces the
        # intended ValueError instead of a TypeError from the formatting.
        raise ValueError('Unexpected value for `whence`, %r' % (whence,))

    # Reset queue and fetch offsets since they are invalid
    self.fetch_offsets = self.offsets.copy()
    self.count_since_commit += 1
    if self.auto_commit:
        self.commit()

    self.queue = queue.Queue()