Example #1
    def fetch(self, max_size=None):
        log.debug("Fetch called on SimpleConsumer {0}".format(self.id))
        bps_to_offsets = self._bps_to_next_offsets
        
        # Do all the fetches we need to (this should get replaced with 
        # multifetch or performance is going to suck wind later)...
        message_sets = []
        # We only iterate over those broker partitions for which we have offsets
        for bp in bps_to_offsets:
            offset = bps_to_offsets[bp]
            kafka = self._connections[bp.broker_id]

            offsets_msgs, next_offset = kafka.fetch(bp.topic, 
                                                    offset,
                                                    partition=bp.partition,
                                                    max_size=max_size)

            msg_set = MessageSet(bp, offset, offsets_msgs)
            msg_set.next_offset = next_offset
            # Update per-partition stats: fetches, bytes, messages, max_fetch
            old_stats = self._stats[bp]
            self._stats[bp] = ConsumerStats(
                fetches=old_stats.fetches + 1,
                bytes=old_stats.bytes + msg_set.size,
                messages=old_stats.messages + len(msg_set),
                max_fetch=max(old_stats.max_fetch, msg_set.size))

            message_sets.append(msg_set)
        
        if message_sets:
            result = FetchResult(sorted(message_sets))
        else:
            result = FetchResult([])

        # Filter out broker partitions whose end offsets we've exceeded
        self._bps_to_next_offsets = {}
        for msg_result in result:
            bp = msg_result.broker_partition
            next_offset = msg_result.next_offset
            end_offset = self._end_broker_partitions.get(bp, None)

            if end_offset is None or next_offset <= end_offset:
                self._bps_to_next_offsets[bp] = next_offset
        
        return result
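
For context, a minimal usage sketch for this SimpleConsumer variant. Only fetch(max_size=...) and iterating the result as MessageSets are taken from the code above; the constructor arguments and the handler are illustrative assumptions.

    # Sketch only: SimpleConsumer's constructor signature is an assumption.
    consumer = SimpleConsumer("my-topic", broker_list)
    result = consumer.fetch(max_size=300 * 1024)
    for msg_set in result:      # one MessageSet per broker partition fetched
        handle(msg_set)         # application-defined handler (assumed)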
Example #2
    def fetch(self, max_size=None, min_size=None, fetch_step=None):
        log.debug("Fetch called on SimpleConsumer {0}".format(self.id))
        bps_to_offsets = self._bps_to_next_offsets

        # Do all the fetches we need to (this should get replaced with
        # multifetch or performance is going to suck wind later)...
        message_sets = []
        # We only iterate over those broker partitions for which we have offsets
        for bp in bps_to_offsets:
            offset = bps_to_offsets[bp]
            kafka = self._connections[bp.broker_id]

            offsets_msgs = kafka.fetch(bp.topic,
                                       offset,
                                       partition=bp.partition,
                                       min_size=min_size,
                                       max_size=max_size,
                                       fetch_step=fetch_step)

            msg_set = MessageSet(bp, offset, offsets_msgs)

            # Update per-partition stats: fetches, bytes, messages, max_fetch
            old_stats = self._stats[bp]
            self._stats[bp] = ConsumerStats(
                fetches=old_stats.fetches + 1,
                bytes=old_stats.bytes + msg_set.size,
                messages=old_stats.messages + len(msg_set),
                max_fetch=max(old_stats.max_fetch, msg_set.size))

            message_sets.append(msg_set)

        if message_sets:
            result = FetchResult(sorted(message_sets))
        else:
            result = FetchResult([])

        # Filter out broker partitions whose end offsets we've exceeded
        self._bps_to_next_offsets = {}
        for msg_result in result:
            bp = msg_result.broker_partition
            next_offset = msg_result.next_offset
            end_offset = self._end_broker_partitions.get(bp, None)

            if end_offset is None or next_offset <= end_offset:
                self._bps_to_next_offsets[bp] = next_offset

        return result
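
Because this variant filters _bps_to_next_offsets down to partitions that have not yet passed their recorded end offsets, repeated fetches drain a consumer whose end offsets are set. A sketch under that assumption; min_size and fetch_step are passed straight through to kafka.fetch, so the values here are illustrative only.

    # Drain up to the recorded end offsets (sketch; assumes
    # _end_broker_partitions was populated, otherwise this loops forever).
    while consumer._bps_to_next_offsets:
        result = consumer.fetch(max_size=1024 * 1024,
                                min_size=64 * 1024,
                                fetch_step=64 * 1024)
        for msg_set in result:
            handle(msg_set)     # application-defined handler (assumed)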
Example #3
    def fetch(self, max_size=None, retry_limit=3, ignore_failures=False):
        """Return a FetchResult, which can be iterated over as a list of 
        MessageSets. A MessageSet is returned for every broker partition that
        is successfully queried, even if that MessageSet is empty.

        FIXME: This is where the adjustment needs to happen. Regardless of 
        whether a rebalance has occurred or not, we can very easily see if we
        are still responsible for the same partitions as we were the last time
        we ran, and set self._bps_to_next_offsets --> we just need to check if
        it's not None and if we still have the same offsets, and adjust 
        accordingly.
        """
        def needs_offset_values_from_zk(bps_to_offsets):
            """We need to pull offset values from ZK if we have no 
            BrokerPartitions in our BPs -> Offsets mapping, or if some of those
            Offsets are unknown (None)"""
            return (not bps_to_offsets) or (None in bps_to_offsets.values())

        log.debug("Fetch called on ZKConsumer {0}".format(self.id))
        if self._needs_rebalance:
            self.rebalance()

        # Find where we're starting from. If we've already done a fetch, we use 
        # our internal value. This is also all we can do in the case where 
        # autocommit is off, since any value in ZK will be out of date.
        bps_to_offsets = dict(self._bps_to_next_offsets)
        offsets_pulled_from_zk = False

        if needs_offset_values_from_zk(bps_to_offsets):
            # We have some offsets, but we've been made responsible for new
            # BrokerPartitions that we need to look up.
            if bps_to_offsets:
                bps_needing_offsets = [bp for bp, offset in bps_to_offsets.items() 
                                       if offset is None]
            # Otherwise, it's our first fetch, so we need everything
            else:
                bps_needing_offsets = self.broker_partitions

            bps_to_offsets.update(self._zk_util.offsets_for(self.consumer_group,
                                                            self._id,
                                                            bps_needing_offsets))
            offsets_pulled_from_zk = True

        # Do all the fetches we need to (this should get replaced with 
        # multifetch or performance is going to suck wind later)...
        message_sets = []
        # We only iterate over those broker partitions for which we have offsets

        for bp in bps_to_offsets:
            offset = bps_to_offsets[bp]
            kafka = self._connections[bp.broker_id]
            partition = kafka.partition(bp.topic, bp.partition)

            if offset is None:
                offset = partition.latest_offset()
            
            try:
                offsets_msgs, next_offset = kafka.fetch(bp.topic, 
                                                        offset,
                                                        partition=bp.partition,
                                                        max_size=max_size)

            # If our fetch fails because it's out of range, and the values came
            # from ZK originally (not our internal incrementing), we assume ZK
            # is somehow stale, so we just grab the latest and march on.
            except OffsetOutOfRange:
                if offsets_pulled_from_zk:
                    log.error("Offset {0} from ZooKeeper is out of range for {1}"
                              .format(offset, bp))
                    offset = partition.latest_offset()
                    log.error("Retrying with offset {0} for {1}"
                              .format(offset, bp))
                    offsets_msgs, next_offset = kafka.fetch(bp.topic, 
                                                            offset,
                                                            partition=bp.partition,
                                                            max_size=max_size)
                else:
                    raise
            except KafkaError as k_err:
                if ignore_failures:
                    log.error("Ignoring failed fetch on {0}".format(bp))
                    log.exception(k_err)
                    continue
                else:
                    raise

            msg_set = MessageSet(bp, offset, offsets_msgs)
            msg_set.next_offset = next_offset
            # Update per-partition stats: fetches, bytes, messages, max_fetch
            old_stats = self._stats[bp]
            self._stats[bp] = ConsumerStats(
                fetches=old_stats.fetches + 1,
                bytes=old_stats.bytes + msg_set.size,
                messages=old_stats.messages + len(msg_set),
                max_fetch=max(old_stats.max_fetch, msg_set.size))

            message_sets.append(msg_set)
        
        result = FetchResult(sorted(message_sets))

        # Now persist our new offsets
        for msg_set in result:
            self._bps_to_next_offsets[msg_set.broker_partition] = msg_set.next_offset

        if self._autocommit:
            self.commit_offsets()

        return result
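
A sketch of driving this ZKConsumer variant with failures tolerated and offsets committed manually. fetch()'s keyword arguments, commit_offsets(), and the autocommit behavior come from the code above; the constructor arguments are assumptions.

    # Sketch only: ZKConsumer's constructor signature is an assumption.
    consumer = ZKConsumer(zk_hosts, "my-group", "my-topic", autocommit=False)
    # ignore_failures=True makes fetch() log and skip partitions that raise
    # KafkaError instead of propagating the exception.
    result = consumer.fetch(max_size=300 * 1024, ignore_failures=True)
    for msg_set in result:
        handle(msg_set)             # application-defined handler (assumed)
    consumer.commit_offsets()       # required here since autocommit is off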
Example #4
    def fetch(self, max_size=None, retry_limit=3, ignore_failures=False):
        """Return a FetchResult, which can be iterated over as a list of 
        MessageSets. A MessageSet is returned for every broker partition that
        is successfully queried, even if that MessageSet is empty.

        FIXME: This is where the adjustment needs to happen. Regardless of 
        whether a rebalance has occurred or not, we can very easily see if we
        are still responsible for the same partitions as we were the last time
        we ran, and set self._bps_to_next_offsets --> we just need to check if
        it's not None and if we still have the same offsets, and adjust 
        accordingly.
        """
        log.debug("Fetch called on ZKConsumer {0}".format(self.id))
        if self._needs_rebalance:
            self.rebalance()

        # Find where we're starting from...
        offsets_pulled_from_zk = False
        if self._bps_to_next_offsets:
            # We've already done a fetch, we use our internal value. This is
            # also all we can do in the case where autocommit is off, since any
            # value in ZK will be out of date
            bps_to_offsets = self._bps_to_next_offsets
        else:
            # In this case, it's our first fetch, and we need to ask ZooKeeper
            # for our start value. That being said, if the value from ZooKeeper
            # is out of range for any given partition, we'll simply start at the
            # most recent value for that partition.
            bps_to_offsets = self._zk_util.offsets_for(self.consumer_group,
                                                       self._id,
                                                       self.broker_partitions)
            offsets_pulled_from_zk = True

        # Do all the fetches we need to (this should get replaced with
        # multifetch or performance is going to suck wind later)...
        message_sets = []
        # We only iterate over those broker partitions for which we have offsets
        for bp in bps_to_offsets:
            offset = bps_to_offsets[bp]
            kafka = self._connections[bp.broker_id]
            partition = kafka.partition(bp.topic, bp.partition)

            if offset is None:
                offset = partition.latest_offset()

            try:
                offsets_msgs = kafka.fetch(bp.topic,
                                           offset,
                                           partition=bp.partition,
                                           max_size=max_size)

            # If our fetch fails because it's out of range, and the values came
            # from ZK originally (not our internal incrementing), we assume ZK
            # is somehow stale, so we just grab the latest and march on.
            except OffsetOutOfRange:
                if offsets_pulled_from_zk:
                    log.error("Offset {0} from ZooKeeper is out of range "
                              "for {1}".format(offset, bp))
                    offset = partition.latest_offset()
                    log.error("Retrying with offset {0} for {1}"
                              .format(offset, bp))
                    offsets_msgs = kafka.fetch(bp.topic,
                                               offset,
                                               partition=bp.partition,
                                               max_size=max_size)
                else:
                    raise
            except KafkaError as k_err:
                if ignore_failures:
                    log.error("Ignoring failed fetch on {0}".format(bp))
                    log.exception(k_err)
                    continue
                else:
                    raise

            message_sets.append(MessageSet(bp, offset, offsets_msgs))

        result = FetchResult(sorted(message_sets))

        # Now persist our new offsets
        for msg_set in result:
            self._bps_to_next_offsets[
                msg_set.broker_partition] = msg_set.next_offset

        if self._autocommit:
            self.commit_offsets()

        # Update aggregate stats: fetches, bytes, messages, max_fetch
        old_stats = self._stats
        self._stats = ConsumerStats(
            fetches=old_stats.fetches + 1,
            bytes=old_stats.bytes + result.num_bytes,
            messages=old_stats.messages + result.num_messages,
            max_fetch=max(old_stats.max_fetch, result.num_bytes))
        return result
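
Unlike the per-partition variants, this one keeps a single running ConsumerStats, updated from FetchResult.num_bytes and num_messages. A sketch of inspecting it; _stats is the internal attribute shown above, and reading it directly is an assumption (no public accessor appears in the code).

    result = consumer.fetch(max_size=300 * 1024)
    stats = consumer._stats     # internal attribute; direct access is assumed
    log.info("fetches={0} bytes={1} messages={2} max_fetch={3}".format(
        stats.fetches, stats.bytes, stats.messages, stats.max_fetch))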
Example #5
    def fetch(self, max_size=None, retry_limit=3, ignore_failures=False):
        """Return a FetchResult, which can be iterated over as a list of 
        MessageSets. A MessageSet is returned for every broker partition that
        is successfully queried, even if that MessageSet is empty.

        FIXME: This is where the adjustment needs to happen. Regardless of 
        whether a rebalance has occurred or not, we can very easily see if we
        are still responsible for the same partitions as we were the last time
        we ran, and set self._bps_to_next_offsets --> we just need to check if
        it's not None and if we still have the same offsets, and adjust 
        accordingly.
        """
        def needs_offset_values_from_zk(bps_to_offsets):
            """We need to pull offset values from ZK if we have no 
            BrokerPartitions in our BPs -> Offsets mapping, or if some of those
            Offsets are unknown (None)"""
            return (not bps_to_offsets) or (None in bps_to_offsets.values())

        log.debug("Fetch called on ZKConsumer {0}".format(self.id))
        if self._needs_rebalance:
            self.rebalance()

        # Find where we're starting from. If we've already done a fetch, we use
        # our internal value. This is also all we can do in the case where
        # autocommit is off, since any value in ZK will be out of date.
        bps_to_offsets = dict(self._bps_to_next_offsets)
        offsets_pulled_from_zk = False

        if needs_offset_values_from_zk(bps_to_offsets):
            # We have some offsets, but we've been made responsible for new
            # BrokerPartitions that we need to look up.
            if bps_to_offsets:
                bps_needing_offsets = [
                    bp for bp, offset in bps_to_offsets.items()
                    if offset is None
                ]
            # Otherwise, it's our first fetch, so we need everything
            else:
                bps_needing_offsets = self.broker_partitions

            bps_to_offsets.update(
                self._zk_util.offsets_for(self.consumer_group, self._id,
                                          bps_needing_offsets))
            offsets_pulled_from_zk = True

        # Do all the fetches we need to (this should get replaced with
        # multifetch or performance is going to suck wind later)...
        message_sets = []
        # We only iterate over those broker partitions for which we have offsets
        for bp in bps_to_offsets:
            offset = bps_to_offsets[bp]
            kafka = self._connections[bp.broker_id]
            partition = kafka.partition(bp.topic, bp.partition)

            if offset is None:
                offset = partition.latest_offset()

            try:
                offsets_msgs = kafka.fetch(bp.topic,
                                           offset,
                                           partition=bp.partition,
                                           max_size=max_size)

            # If our fetch fails because it's out of range, and the values came
            # from ZK originally (not our internal incrementing), we assume ZK
            # is somehow stale, so we just grab the latest and march on.
            except OffsetOutOfRange:
                if offsets_pulled_from_zk:
                    log.error("Offset {0} from ZooKeeper is out of range "
                              "for {1}".format(offset, bp))
                    offset = partition.latest_offset()
                    log.error("Retrying with offset {0} for {1}"
                              .format(offset, bp))
                    offsets_msgs = kafka.fetch(bp.topic,
                                               offset,
                                               partition=bp.partition,
                                               max_size=max_size)
                else:
                    raise
            except KafkaError as k_err:
                if ignore_failures:
                    log.error("Ignoring failed fetch on {0}".format(bp))
                    log.exception(k_err)
                    continue
                else:
                    raise

            msg_set = MessageSet(bp, offset, offsets_msgs)

            # Update per-partition stats: fetches, bytes, messages, max_fetch
            old_stats = self._stats[bp]
            self._stats[bp] = ConsumerStats(
                fetches=old_stats.fetches + 1,
                bytes=old_stats.bytes + msg_set.size,
                messages=old_stats.messages + len(msg_set),
                max_fetch=max(old_stats.max_fetch, msg_set.size))

            message_sets.append(msg_set)

        result = FetchResult(sorted(message_sets))

        # Now persist our new offsets
        for msg_set in result:
            self._bps_to_next_offsets[
                msg_set.broker_partition] = msg_set.next_offset

        if self._autocommit:
            self.commit_offsets()

        return result
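
Per the handler above, OffsetOutOfRange self-heals only when the offset was pulled from ZooKeeper; internally advanced offsets that go stale propagate to the caller. One illustrative recovery, assuming that clearing _bps_to_next_offsets forces the next fetch back through the ZooKeeper lookup path shown above:

    try:
        result = consumer.fetch(max_size=300 * 1024)
    except OffsetOutOfRange:
        # Assumption about usage, not an API: an empty mapping makes
        # needs_offset_values_from_zk() true, so offsets are re-pulled.
        consumer._bps_to_next_offsets = {}
        result = consumer.fetch(max_size=300 * 1024)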