def fetch(self, max_size=None):
    """Fetch once from every broker partition we are still tracking.

    One ``kafka.fetch`` call is issued for each entry in
    ``self._bps_to_next_offsets``. The per-partition entry in
    ``self._stats`` is updated after each fetch, and the tracked offsets are
    then rebuilt from the ``next_offset`` values the fetches returned.
    A partition whose next offset is greater than its entry in
    ``self._end_broker_partitions`` is left out of the rebuilt mapping, so
    later calls no longer fetch from it.

    :param max_size: passed through unchanged to ``kafka.fetch``.
    :returns: a ``FetchResult`` built from the sorted MessageSets. When
        there was nothing to fetch the ``FetchResult`` is built from an
        empty list, so callers get the same type in every case.
    """
    log.debug("Fetch called on SimpleConsumer {0}".format(self.id))
    bps_to_offsets = self._bps_to_next_offsets

    # Do all the fetches we need to (this should get replaced with
    # multifetch or performance is going to suck wind later)...
    message_sets = []
    # We only iterate over those broker partitions for which we have offsets
    for bp in bps_to_offsets:
        offset = bps_to_offsets[bp]
        kafka = self._connections[bp.broker_id]
        offsets_msgs, next_offset = kafka.fetch(bp.topic,
                                                offset,
                                                partition=bp.partition,
                                                max_size=max_size)
        msg_set = MessageSet(bp, offset, offsets_msgs)
        msg_set.next_offset = next_offset

        # ConsumerStats fields: fetches, bytes, messages, max_fetch
        old_stats = self._stats[bp]
        self._stats[bp] = ConsumerStats(
            fetches=old_stats.fetches + 1,
            bytes=old_stats.bytes + msg_set.size,
            messages=old_stats.messages + len(msg_set),
            max_fetch=max(old_stats.max_fetch, msg_set.size))

        message_sets.append(msg_set)

    # sorted([]) is simply [], so no special case is needed for "nothing
    # fetched": the caller gets a FetchResult either way instead of a bare
    # list in the empty case.
    result = FetchResult(sorted(message_sets))

    # Filter out broker partitions whose end offsets we've exceeded
    self._bps_to_next_offsets = {}
    for msg_result in result:
        bp = msg_result.broker_partition
        next_offset = msg_result.next_offset
        end_offset = self._end_broker_partitions.get(bp, None)
        # No recorded end offset means the partition is never dropped.
        if end_offset is None or next_offset <= end_offset:
            self._bps_to_next_offsets[bp] = next_offset

    return result
def fetch(self, max_size=None, min_size=None, fetch_step=None):
    """Fetch once from each tracked broker partition and advance offsets.

    Every entry of ``self._bps_to_next_offsets`` triggers one
    ``kafka.fetch`` call; the resulting MessageSets are sorted and wrapped
    in a ``FetchResult``. ``self._stats`` is updated per partition as each
    fetch completes. Afterwards the tracked offsets are rebuilt from the
    result, leaving out any partition whose next offset is greater than its
    entry in ``self._end_broker_partitions``.

    :param max_size: passed through unchanged to ``kafka.fetch``.
    :param min_size: passed through unchanged to ``kafka.fetch``.
    :param fetch_step: passed through unchanged to ``kafka.fetch``.
    :returns: a ``FetchResult`` (built from an empty list when there was
        nothing to fetch).
    """
    log.debug("Fetch called on SimpleConsumer {0}".format(self.id))
    starting_points = self._bps_to_next_offsets

    # One request per partition for now; a multifetch would be the obvious
    # replacement if this ever becomes a bottleneck.
    collected = []
    for bp in starting_points:
        start = starting_points[bp]
        connection = self._connections[bp.broker_id]
        raw_messages = connection.fetch(bp.topic,
                                        start,
                                        partition=bp.partition,
                                        min_size=min_size,
                                        max_size=max_size,
                                        fetch_step=fetch_step)
        batch = MessageSet(bp, start, raw_messages)

        # Fold this fetch into the running totals for the partition.
        before = self._stats[bp]
        batch_bytes = batch.size
        self._stats[bp] = ConsumerStats(
            fetches=before.fetches + 1,
            bytes=before.bytes + batch_bytes,
            messages=before.messages + len(batch),
            max_fetch=max(before.max_fetch, batch_bytes))

        collected.append(batch)

    # sorted() of an empty list is an empty list, so a single expression
    # covers both the "got something" and "got nothing" cases.
    result = FetchResult(sorted(collected))

    # Rebuild the offset table, skipping partitions that ran past their end.
    self._bps_to_next_offsets = {}
    for fetched in result:
        bp = fetched.broker_partition
        upcoming = fetched.next_offset
        limit = self._end_broker_partitions.get(bp, None)
        if limit is not None and not (upcoming <= limit):
            continue
        self._bps_to_next_offsets[bp] = upcoming

    return result
def fetch(self, max_size=None, retry_limit=3, ignore_failures=False):
    """Return a FetchResult, which can be iterated over as a list of
    MessageSets. A MessageSet is returned for every broker partition that
    is successfully queried, even if that MessageSet is empty.

    Starting offsets are taken from ``self._bps_to_next_offsets``. If that
    mapping is empty, or some of its offsets are ``None``, the missing
    offsets are looked up through ``self._zk_util.offsets_for``. An offset
    that is still ``None`` after the lookup is replaced by the partition's
    ``latest_offset()``.

    After fetching, the ``next_offset`` of every returned MessageSet is
    stored back into ``self._bps_to_next_offsets`` and, when
    ``self._autocommit`` is set, ``self.commit_offsets()`` is called.

    FIXME: This is where the adjustment needs to happen. Regardless of
    whether a rebalance has occurred or not, we can very easily see if we
    are still responsible for the same partitions as we were the last time
    we ran, and set self._bps_to_next_offsets --> we just need to check if
    it's not None and if we still have the same offsets, and adjust
    accordingly.

    :param max_size: passed through unchanged to ``kafka.fetch``.
    :param retry_limit: kept for interface compatibility; this method does
        not read it.
    :param ignore_failures: when true, a ``KafkaError`` raised by the first
        fetch attempt on a partition (and not handled as
        ``OffsetOutOfRange``) is logged and that partition is skipped.
    :raises OffsetOutOfRange: when a fetch is out of range for a partition
        whose starting offset did NOT come from ZooKeeper in this call.
    :raises KafkaError: when a fetch fails and ``ignore_failures`` is false.
    """
    log.debug("Fetch called on ZKConsumer {0}".format(self.id))
    if self._needs_rebalance:
        self.rebalance()

    # Find where we're starting from. If we've already done a fetch, we use
    # our internal value. This is also all we can do in the case where
    # autocommit is off, since any value in ZK will be out of date.
    bps_to_offsets = dict(self._bps_to_next_offsets)

    # Broker partitions whose starting offset was read from ZooKeeper during
    # this call. Tracked per partition (rather than with one flag for the
    # whole fetch) so the stale-ZK recovery below is never applied to a
    # partition that is using our own internally tracked offset.
    bps_from_zk = set()

    # We need ZK if we know no partitions at all, or if some offsets are
    # unknown (None).
    if (not bps_to_offsets) or (None in bps_to_offsets.values()):
        if bps_to_offsets:
            # We have some offsets, but we've been made responsible for new
            # BrokerPartitions that we need to lookup.
            bps_needing_offsets = [bp
                                   for bp, offset in bps_to_offsets.items()
                                   if offset is None]
        else:
            # Otherwise, it's our first fetch, so we need everything
            bps_needing_offsets = self.broker_partitions

        zk_offsets = dict(self._zk_util.offsets_for(self.consumer_group,
                                                    self._id,
                                                    bps_needing_offsets))
        bps_to_offsets.update(zk_offsets)
        bps_from_zk = set(zk_offsets)

    # Do all the fetches we need to (this should get replaced with
    # multifetch or performance is going to suck wind later)...
    message_sets = []
    # We only iterate over those broker partitions for which we have offsets
    for bp in bps_to_offsets:
        offset = bps_to_offsets[bp]
        kafka = self._connections[bp.broker_id]
        partition = kafka.partition(bp.topic, bp.partition)

        if offset is None:
            offset = partition.latest_offset()

        try:
            offsets_msgs, next_offset = kafka.fetch(bp.topic,
                                                    offset,
                                                    partition=bp.partition,
                                                    max_size=max_size)
        # If our fetch fails because it's out of range, and the value came
        # from ZK originally (not our internal incrementing), we assume ZK
        # is somehow stale, so we just grab the latest and march on.
        except OffsetOutOfRange:
            if bp not in bps_from_zk:
                raise
            log.error("Offset {0} from ZooKeeper is out of range for {1}"
                      .format(offset, bp))
            offset = partition.latest_offset()
            log.error("Retrying with offset {0} for {1}"
                      .format(offset, bp))
            # NOTE(review): an error raised by this retry propagates even
            # when ignore_failures is set, because it is raised inside an
            # except clause -- confirm that is intended.
            offsets_msgs, next_offset = kafka.fetch(bp.topic,
                                                    offset,
                                                    partition=bp.partition,
                                                    max_size=max_size)
        except KafkaError as k_err:
            if not ignore_failures:
                raise
            log.error("Ignoring failed fetch on {0}".format(bp))
            log.exception(k_err)
            continue

        msg_set = MessageSet(bp, offset, offsets_msgs)
        msg_set.next_offset = next_offset

        # ConsumerStats fields: fetches, bytes, messages, max_fetch
        old_stats = self._stats[bp]
        self._stats[bp] = ConsumerStats(
            fetches=old_stats.fetches + 1,
            bytes=old_stats.bytes + msg_set.size,
            messages=old_stats.messages + len(msg_set),
            max_fetch=max(old_stats.max_fetch, msg_set.size))

        message_sets.append(msg_set)

    result = FetchResult(sorted(message_sets))

    # Now persist our new offsets
    for msg_set in result:
        self._bps_to_next_offsets[msg_set.broker_partition] = \
            msg_set.next_offset

    if self._autocommit:
        self.commit_offsets()

    return result
def fetch(self, max_size=None, retry_limit=3, ignore_failures=False):
    """Return a FetchResult, which can be iterated over as a list of
    MessageSets. A MessageSet is returned for every broker partition that
    is successfully queried, even if that MessageSet is empty.

    Starting offsets come from ``self._bps_to_next_offsets`` when that
    mapping is non-empty; otherwise they are requested from
    ``self._zk_util.offsets_for`` for all of ``self.broker_partitions``.
    After fetching, each MessageSet's ``next_offset`` is written back to
    ``self._bps_to_next_offsets``, offsets are committed when
    ``self._autocommit`` is set, and ``self._stats`` is replaced with
    totals that include this fetch.

    FIXME: This is where the adjustment needs to happen. Regardless of
    whether a rebalance has occurred or not, we can very easily see if we
    are still responsible for the same partitions as we were the last time
    we ran, and set self._bps_to_next_offsets --> we just need to check if
    it's not None and if we still have the same offsets, and adjust
    accordingly.

    :param max_size: passed through unchanged to ``kafka.fetch``.
    :param retry_limit: not read by this method.
    :param ignore_failures: when true, a ``KafkaError`` from the first
        fetch attempt on a partition is logged and the partition skipped.
    """
    log.debug("Fetch called on ZKConsumer {0}".format(self.id))
    if self._needs_rebalance:
        self.rebalance()

    # Work out the starting offsets. With nothing tracked yet this is the
    # first fetch, so ZooKeeper is asked; otherwise our own bookkeeping is
    # the only trustworthy source (ZK is out of date when autocommit is
    # off).
    first_fetch = not self._bps_to_next_offsets
    if first_fetch:
        start_offsets = self._zk_util.offsets_for(self.consumer_group,
                                                  self._id,
                                                  self.broker_partitions)
    else:
        start_offsets = self._bps_to_next_offsets

    # One request per partition for now; a multifetch would be the obvious
    # replacement later.
    collected = []
    for bp in start_offsets:
        start = start_offsets[bp]
        connection = self._connections[bp.broker_id]
        partition = connection.partition(bp.topic, bp.partition)

        if start is None:
            start = partition.latest_offset()

        try:
            raw_messages = connection.fetch(bp.topic,
                                            start,
                                            partition=bp.partition,
                                            max_size=max_size)
        except OffsetOutOfRange:
            # Only offsets that came from ZooKeeper are assumed stale; an
            # out-of-range value from our own counting is a real error.
            if not first_fetch:
                raise
            log.error(
                "Offset {0} from ZooKeeper is out of range for {1}".format(
                    start, bp))
            start = partition.latest_offset()
            log.error("Retrying with offset {0} for {1}".format(start, bp))
            raw_messages = connection.fetch(bp.topic,
                                            start,
                                            partition=bp.partition,
                                            max_size=max_size)
        except KafkaError as k_err:
            if not ignore_failures:
                raise
            log.error("Ignoring failed fetch on {0}".format(bp))
            log.exception(k_err)
            continue

        collected.append(MessageSet(bp, start, raw_messages))

    result = FetchResult(sorted(collected))

    # Remember where each partition should resume next time.
    for fetched in result:
        self._bps_to_next_offsets[fetched.broker_partition] = \
            fetched.next_offset

    if self._autocommit:
        self.commit_offsets()

    # Fold this fetch into the consumer-wide totals.
    before = self._stats
    self._stats = ConsumerStats(
        fetches=before.fetches + 1,
        bytes=before.bytes + result.num_bytes,
        messages=before.messages + result.num_messages,
        max_fetch=max(before.max_fetch, result.num_bytes))

    return result
def fetch(self, max_size=None, retry_limit=3, ignore_failures=False):
    """Return a FetchResult, which can be iterated over as a list of
    MessageSets. A MessageSet is returned for every broker partition that
    is successfully queried, even if that MessageSet is empty.

    Starting offsets are taken from ``self._bps_to_next_offsets``. If that
    mapping is empty, or some of its offsets are ``None``, the missing
    offsets are looked up through ``self._zk_util.offsets_for``. An offset
    that is still ``None`` after the lookup is replaced by the partition's
    ``latest_offset()``.

    After fetching, the ``next_offset`` of every returned MessageSet is
    stored back into ``self._bps_to_next_offsets`` and, when
    ``self._autocommit`` is set, ``self.commit_offsets()`` is called.

    FIXME: This is where the adjustment needs to happen. Regardless of
    whether a rebalance has occurred or not, we can very easily see if we
    are still responsible for the same partitions as we were the last time
    we ran, and set self._bps_to_next_offsets --> we just need to check if
    it's not None and if we still have the same offsets, and adjust
    accordingly.

    :param max_size: passed through unchanged to ``kafka.fetch``.
    :param retry_limit: kept for interface compatibility; this method does
        not read it.
    :param ignore_failures: when true, a ``KafkaError`` raised by the first
        fetch attempt on a partition (and not handled as
        ``OffsetOutOfRange``) is logged and that partition is skipped.
    :raises OffsetOutOfRange: when a fetch is out of range for a partition
        whose starting offset did NOT come from ZooKeeper in this call.
    :raises KafkaError: when a fetch fails and ``ignore_failures`` is false.
    """
    log.debug("Fetch called on ZKConsumer {0}".format(self.id))
    if self._needs_rebalance:
        self.rebalance()

    # Find where we're starting from. If we've already done a fetch, we use
    # our internal value. This is also all we can do in the case where
    # autocommit is off, since any value in ZK will be out of date.
    bps_to_offsets = dict(self._bps_to_next_offsets)

    # Broker partitions whose starting offset was read from ZooKeeper during
    # this call. Tracked per partition (rather than with one flag for the
    # whole fetch) so the stale-ZK recovery below is never applied to a
    # partition that is using our own internally tracked offset.
    bps_from_zk = set()

    # We need ZK if we know no partitions at all, or if some offsets are
    # unknown (None).
    if (not bps_to_offsets) or (None in bps_to_offsets.values()):
        if bps_to_offsets:
            # We have some offsets, but we've been made responsible for new
            # BrokerPartitions that we need to lookup.
            bps_needing_offsets = [bp
                                   for bp, offset in bps_to_offsets.items()
                                   if offset is None]
        else:
            # Otherwise, it's our first fetch, so we need everything
            bps_needing_offsets = self.broker_partitions

        zk_offsets = dict(self._zk_util.offsets_for(self.consumer_group,
                                                    self._id,
                                                    bps_needing_offsets))
        bps_to_offsets.update(zk_offsets)
        bps_from_zk = set(zk_offsets)

    # Do all the fetches we need to (this should get replaced with
    # multifetch or performance is going to suck wind later)...
    message_sets = []
    # We only iterate over those broker partitions for which we have offsets
    for bp in bps_to_offsets:
        offset = bps_to_offsets[bp]
        kafka = self._connections[bp.broker_id]
        partition = kafka.partition(bp.topic, bp.partition)

        if offset is None:
            offset = partition.latest_offset()

        try:
            offsets_msgs = kafka.fetch(bp.topic,
                                       offset,
                                       partition=bp.partition,
                                       max_size=max_size)
        # If our fetch fails because it's out of range, and the value came
        # from ZK originally (not our internal incrementing), we assume ZK
        # is somehow stale, so we just grab the latest and march on.
        except OffsetOutOfRange:
            if bp not in bps_from_zk:
                raise
            log.error(
                "Offset {0} from ZooKeeper is out of range for {1}".format(
                    offset, bp))
            offset = partition.latest_offset()
            log.error("Retrying with offset {0} for {1}".format(offset, bp))
            # NOTE(review): an error raised by this retry propagates even
            # when ignore_failures is set, because it is raised inside an
            # except clause -- confirm that is intended.
            offsets_msgs = kafka.fetch(bp.topic,
                                       offset,
                                       partition=bp.partition,
                                       max_size=max_size)
        except KafkaError as k_err:
            if not ignore_failures:
                raise
            log.error("Ignoring failed fetch on {0}".format(bp))
            log.exception(k_err)
            continue

        msg_set = MessageSet(bp, offset, offsets_msgs)

        # ConsumerStats fields: fetches, bytes, messages, max_fetch
        old_stats = self._stats[bp]
        self._stats[bp] = ConsumerStats(
            fetches=old_stats.fetches + 1,
            bytes=old_stats.bytes + msg_set.size,
            messages=old_stats.messages + len(msg_set),
            max_fetch=max(old_stats.max_fetch, msg_set.size))

        message_sets.append(msg_set)

    result = FetchResult(sorted(message_sets))

    # Now persist our new offsets
    for msg_set in result:
        self._bps_to_next_offsets[msg_set.broker_partition] = \
            msg_set.next_offset

    if self._autocommit:
        self.commit_offsets()

    return result