Example #1
    def test_encode_fetch_request(self):
        requests = [
            FetchRequest("topic1", 0, 10, 1024),
            FetchRequest("topic2", 1, 20, 100)
        ]
        expect = ('\x00\x00\x00Y\x00\x01\x00\x00\x00\x00\x00\x03\x00\x07'
                  'client1\xff\xff\xff\xff\x00\x00\x00\x02\x00\x00\x00d\x00'
                  '\x00\x00\x02\x00\x06topic1\x00\x00\x00\x01\x00\x00\x00\x00'
                  '\x00\x00\x00\x00\x00\x00\x00\n\x00\x00\x04\x00\x00\x06'
                  'topic2\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x00\x00'
                  '\x00\x00\x14\x00\x00\x00d')
        encoded = KafkaProtocol.encode_fetch_request("client1", 3, requests, 2,
                                                     100)
        self.assertEqual(encoded, expect)
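
For reference, the FetchRequest constructed in all of these examples is a plain four-field payload struct; a minimal sketch of its shape (kafka-python defines it as a namedtuple along these lines):

from collections import namedtuple

# topic and partition address the log to read, offset is where to
# start, and max_bytes bounds the size of the returned message set.
FetchRequest = namedtuple("FetchRequest",
                          ["topic", "partition", "offset", "max_bytes"])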
Example #2
    def __iter_partition__(self, partition, offset):
        """
        Iterate over the messages in a partition. Create a FetchRequest
        to get back a batch of messages, yield them one at a time.
        After a batch is exhausted, start a new batch unless we've reached
        the end of this partition.
        """

        # The offset that is stored in the consumer is the offset that
        # we have consumed. In subsequent iterations, we are supposed to
        # fetch the next message (that is from the next offset)
        # However, for the 0th message, the offset should be as-is.
        # An OffsetFetchRequest to Kafka gives 0 for a new queue. This is
        # problematic, since 0 is offset of a message which we have not yet
        # consumed.
        if self.fetch_started[partition]:
            offset += 1

        fetch_size = self.fetch_min_bytes

        while True:
            # use MaxBytes = client's bufsize since we're only
            # fetching one topic + partition
            req = FetchRequest(
                self.topic, partition, offset, self.client.bufsize)

            (resp,) = self.client.send_fetch_request(
                [req],
                max_wait_time=self.fetch_max_wait_time,
                min_bytes=fetch_size)

            assert resp.topic == self.topic
            assert resp.partition == partition

            next_offset = None
            try:
                for message in resp.messages:
                    next_offset = message.offset

                    # update the offset before the message is yielded. This
                    # is so that the consumer state is not lost in certain
                    # cases.
                    #
                    # For eg: the message is yielded and consumed by the
                    # caller, but the caller does not come back into the
                    # generator again. The message will be consumed but the
                    # status will not be updated in the consumer
                    self.fetch_started[partition] = True
                    self.offsets[partition] = message.offset
                    yield message
            except ConsumerFetchSizeTooSmall:
                # Grow the fetch size geometrically and retry; the offset
                # has not been advanced, so the same batch is refetched.
                fetch_size = int(fetch_size * 1.5)
                log.warning(
                    "Fetch size too small, increasing to %d (1.5x) and retry",
                    fetch_size)
                continue
            except ConsumerNoMoreData as e:
                log.debug("Iteration was ended by %r", e)

            if next_offset is None:
                break
            else:
                offset = next_offset + 1
Example #3
    def test_consume_none(self):
        fetch = FetchRequest(self.topic, 0, 0, 1024)
        (fetch_resp, ) = yield from self.client.send_fetch_request([fetch])
        self.assertEqual(fetch_resp.error, 0)
        self.assertEqual(fetch_resp.topic, self.topic)
        self.assertEqual(fetch_resp.partition, 0)

        messages = list(fetch_resp.messages)
        self.assertEqual(len(messages), 0)
Example #4
    def assert_fetch_offset(self, partition, start_offset, expected_messages):
        # There should only be one response message from the server.
        # This will throw an exception if there's more than one.

        resp, = self.client.send_fetch_request(
            [FetchRequest(self.bytes_topic, partition, start_offset, 1024)])

        self.assertEqual(resp.error, 0)
        self.assertEqual(resp.partition, partition)
        messages = [x.message.value for x in resp.messages]

        self.assertEqual(messages, expected_messages)
        self.assertEqual(resp.highwaterMark,
                         start_offset + len(expected_messages))
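
A hypothetical call from a test body (the message values are assumed), checking both the payloads and the advancing high-water mark:

# Hypothetical: two messages were produced to partition 0 of
# self.bytes_topic beforehand.
self.assert_fetch_offset(0, 0, [b"message 1", b"message 2"])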
Example #5
    def test_encode_fetch_request(self):
        requests = [
            FetchRequest("topic1", 0, 10, 1024),
            FetchRequest("topic2", 1, 20, 100),
        ]

        header = "".join([
            struct.pack('>i', 89),  # The length of the message overall
            struct.pack('>h', 1),  # Msg Header, Message type = Fetch
            struct.pack('>h', 0),  # Msg Header, API version
            struct.pack('>i', 3),  # Msg Header, Correlation ID
            struct.pack('>h7s', 7, "client1"),  # Msg Header, The client ID
            struct.pack('>i', -1),  # Replica Id
            struct.pack('>i', 2),  # Max wait time
            struct.pack('>i', 100),  # Min bytes
            struct.pack('>i', 2),  # Num requests
        ])

        topic1 = "".join([
            struct.pack('>h6s', 6, 'topic1'),  # Topic
            struct.pack('>i', 1),  # Num Payloads
            struct.pack('>i', 0),  # Partition 0
            struct.pack('>q', 10),  # Offset
            struct.pack('>i', 1024),  # Max Bytes
        ])

        topic2 = "".join([
            struct.pack('>h6s', 6, 'topic2'),  # Topic
            struct.pack('>i', 1),  # Num Payloads
            struct.pack('>i', 1),  # Partition 1
            struct.pack('>q', 20),  # Offset
            struct.pack('>i', 100),  # Max Bytes
        ])

        expected1 = "".join([header, topic1, topic2])
        expected2 = "".join([header, topic2, topic1])

        encoded = KafkaProtocol.encode_fetch_request("client1", 3, requests, 2,
                                                     100)
        self.assertIn(encoded, [expected1, expected2])
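
Note that this test is Python 2 specific: under Python 3, struct.pack() returns bytes, so the str joins above would raise TypeError. A sketch of the same header assembled as bytes (values copied from the test):

import struct

header = b"".join([
    struct.pack('>i', 89),               # overall message length
    struct.pack('>h', 1),                # API key: Fetch
    struct.pack('>h', 0),                # API version
    struct.pack('>i', 3),                # correlation id
    struct.pack('>h7s', 7, b"client1"),  # length-prefixed client id
    struct.pack('>i', -1),               # replica id (-1 for consumers)
    struct.pack('>i', 2),                # max wait time
    struct.pack('>i', 100),              # min bytes
    struct.pack('>i', 2),                # number of topic/partition requests
])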
Example #6
def get_timestamp(k, p, current):
    buffer_size = 1024
    responses = k.send_fetch_request([
        FetchRequest(p['topic'].encode('utf-8'), p['partition'], current,
                     buffer_size)
    ])
    for resp in responses:
        for message in resp.messages:
            # in_array, array_index and field_name are free variables
            # resolved from the enclosing module in the original snippet.
            if in_array:
                return json.loads(
                    message.message.value)[array_index][field_name]
            else:
                return json.loads(message.message.value)[field_name]
Example #7
    def __iter_partition__(self, partition, offset):
        """
        Iterate over the messages in a partition. Create a FetchRequest
        to get back a batch of messages, yield them one at a time.
        After a batch is exhausted, start a new batch unless we've reached
        the end of this partition.
        """

        # The offset that is stored in the consumer is the offset that
        # we have consumed. In subsequent iterations, we are supposed to
        # fetch the next message (that is from the next offset)
        # However, for the 0th message, the offset should be as-is.
        # An OffsetFetchRequest to Kafka gives 0 for a new queue. This is
        # problematic, since 0 is offset of a message which we have not yet
        # consumed.
        if self.fetch_started[partition]:
            offset += 1

        while True:
            # TODO: configure fetch size
            req = FetchRequest(self.topic, partition, offset, 1024)

            (resp, ) = self.client.send_fetch_request(
                [req],
                max_wait_time=self.fetch_max_wait_time,
                min_bytes=self.fetch_min_bytes)

            assert resp.topic == self.topic
            assert resp.partition == partition

            next_offset = None
            for message in resp.messages:
                next_offset = message.offset

                # update the offset before the message is yielded. This is
                # so that the consumer state is not lost in certain cases.
                # For eg: the message is yielded and consumed by the caller,
                # but the caller does not come back into the generator again.
                # The message will be consumed but the status will not be
                # updated in the consumer
                self.fetch_started[partition] = True
                self.offsets[partition] = message.offset
                yield message
            if next_offset is None:
                break
            else:
                offset = next_offset + 1
Example #8
    def _fetch(self):
        # Create fetch request payloads for all the partitions
        partitions = dict(
            (p, self.buffer_size) for p in self.fetch_offsets.keys())
        while partitions:
            requests = []
            for partition, buffer_size in six.iteritems(partitions):
                requests.append(
                    FetchRequest(self.topic, partition,
                                 self.fetch_offsets[partition], buffer_size))
            # Send request
            responses = self.client.send_fetch_request(
                requests,
                max_wait_time=int(self.fetch_max_wait_time),
                min_bytes=self.fetch_min_bytes)

            retry_partitions = {}
            for resp in responses:
                partition = resp.partition
                buffer_size = partitions[partition]
                try:
                    for message in resp.messages:
                        # Put the message in our queue
                        self.queue.put((partition, message))
                        self.fetch_offsets[partition] = message.offset + 1
                except ConsumerFetchSizeTooSmall:
                    if (self.max_buffer_size is not None
                            and buffer_size == self.max_buffer_size):
                        log.error("Max fetch size %d too small",
                                  self.max_buffer_size)
                        raise
                    if self.max_buffer_size is None:
                        buffer_size *= 2
                    else:
                        buffer_size = min(buffer_size * 2,
                                          self.max_buffer_size)
                    log.warning(
                        "Fetch size too small, increase to %d (2x) "
                        "and retry", buffer_size)
                    retry_partitions[partition] = buffer_size
                except ConsumerNoMoreData as e:
                    log.debug("Iteration was ended by %r", e)
                except StopIteration:
                    # Stop iterating through this partition
                    log.debug("Done iterating over partition %s" % partition)
            partitions = retry_partitions
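
The ConsumerFetchSizeTooSmall handler above doubles a partition's buffer until the optional max_buffer_size cap is reached; the same policy as a standalone sketch (the function name is illustrative, not part of kafka-python):

def next_buffer_size(current, max_buffer_size=None):
    # Double the fetch buffer; clamp to the cap when one is configured.
    # Callers are expected to raise instead of retrying once the current
    # size already equals the cap, as _fetch does above.
    if max_buffer_size is None:
        return current * 2
    return min(current * 2, max_buffer_size)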
Example #9
    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_wait_max_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages')

        fetches = [
            FetchRequest(topic, partition,
                         self._offsets.fetch[(topic, partition)], max_bytes)
            for (topic, partition) in self._topics
        ]

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False)

        for resp in responses:

            if isinstance(resp, FailedPayloadsError):
                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = kafka_bytestring(resp.topic)
            partition = resp.partition
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning(
                    'OffsetOutOfRange: topic %s, partition %d, '
                    'offset %d (Highwatermark: %d)', topic, partition,
                    self._offsets.fetch[(topic, partition)],
                    resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[(topic, partition)] = (
                    self._reset_partition_offset((topic, partition)))
                continue

            except NotLeaderForPartitionError:
                logger.warning(
                    "NotLeaderForPartitionError for %s - %d. "
                    "Metadata may be out of date", topic, partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d", topic,
                               partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[(topic, partition)] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                val = self._config['deserializer_class'](message.value)
                msg = KafkaMessage(topic, partition, offset, message.key, val)

                # in some cases the server will return earlier messages
                # than we requested. skip them per kafka spec
                if offset < self._offsets.fetch[(topic, partition)]:
                    logger.debug(
                        'message offset less than fetched offset '
                        'skipping: %s', msg)
                    continue
                # Only increment fetch offset
                # if we safely got the message and deserialized
                self._offsets.fetch[(topic, partition)] = offset + 1

                # Then yield to user
                yield msg
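
The KafkaMessage struct yielded here is a plain namedtuple; a minimal sketch of its shape, with field names inferred from the constructor call above:

from collections import namedtuple

# topic/partition identify the source, offset is the message's position
# in the log, key is the raw key bytes, value the deserialized payload.
KafkaMessage = namedtuple("KafkaMessage",
                          ["topic", "partition", "offset", "key", "value"])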
Example #10
    def fetch_messages(self):
        """
        Sends FetchRequests for all topic/partitions set for consumption
        Returns a generator that yields KafkaMessage structs
        after deserializing with the configured `deserializer_class`

        Refreshes metadata on errors, and resets fetch offset on
        OffsetOutOfRange, per the configured `auto_offset_reset` policy

        Key configuration parameters:

        * `fetch_message_max_bytes`
        * `fetch_wait_max_ms`
        * `fetch_min_bytes`
        * `deserializer_class`
        * `auto_offset_reset`
        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        # Get current fetch offsets
        offsets = self._offsets.fetch
        if not offsets:
            if not self._topics:
                raise KafkaConfigurationError(
                    'No topics or partitions configured')
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages')

        fetches = []
        for topic_partition, offset in six.iteritems(offsets):
            fetches.append(
                FetchRequest(topic_partition[0], topic_partition[1], offset,
                             max_bytes))

        # client.send_fetch_request will collect topic/partition requests
        # by leader and send each group as a single FetchRequest to the
        # correct broker
        try:
            responses = self._client.send_fetch_request(
                fetches,
                max_wait_time=max_wait_time,
                min_bytes=min_bytes,
                fail_on_error=False)
        except FailedPayloadsError:
            logger.warning(
                'FailedPayloadsError attempting to fetch data from kafka')
            self._refresh_metadata_on_error()
            return

        for resp in responses:
            topic_partition = (resp.topic, resp.partition)
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning(
                    'OffsetOutOfRange: topic %s, partition %d, offset %d '
                    '(Highwatermark: %d)', resp.topic, resp.partition,
                    offsets[topic_partition], resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[
                    topic_partition] = self._reset_partition_offset(
                        topic_partition)
                continue

            except NotLeaderForPartitionError:
                logger.warning(
                    "NotLeaderForPartitionError for %s - %d. "
                    "Metadata may be out of date", resp.topic, resp.partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d", resp.topic,
                               resp.partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[topic_partition] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                msg = KafkaMessage(
                    resp.topic, resp.partition, offset, message.key,
                    self._config['deserializer_class'](message.value))

                # Only increment fetch offset if we safely got the
                # message and deserialized it
                self._offsets.fetch[topic_partition] = offset + 1

                # Then yield to user
                yield msg
Example #11
    def test_fetch_request(self):
        req = FetchRequest("my-topic", 0, 0, 1024)
        enc = KafkaClient.encode_fetch_request(req)
        expect = ("\x00\x01\x00\x08my-topic\x00\x00\x00\x00"
                  "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00")
        self.assertEqual(enc, expect)
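
The expected string is what appears to be the pre-0.8 per-request wire layout (request type 1 = fetch); a sketch unpacking it back into its fields (Python 2 byte strings, as in the test):

import struct

raw = ("\x00\x01\x00\x08my-topic\x00\x00\x00\x00"
       "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00")
req_type, topic_len = struct.unpack('>hh', raw[:4])  # 1 (FETCH), 8
topic = raw[4:4 + topic_len]                         # "my-topic"
partition, offset, max_size = struct.unpack(
    '>iqi', raw[4 + topic_len:])                     # 0, 0, 1024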
Example #12
    def _fetch(self):
        # Create fetch request payloads for all the partitions
        partitions = dict((p, self.buffer_size)
                          for p in self.fetch_offsets.keys())
        while partitions:
            requests = []
            for partition, buffer_size in six.iteritems(partitions):
                requests.append(FetchRequest(self.topic, partition,
                                             self.fetch_offsets[partition],
                                             buffer_size))
            # Send request
            responses = self.client.send_fetch_request(
                requests,
                max_wait_time=int(self.fetch_max_wait_time),
                min_bytes=self.fetch_min_bytes,
                fail_on_error=False
            )

            retry_partitions = {}
            for resp in responses:

                try:
                    check_error(resp)
                except (UnknownTopicOrPartitionError,
                        NotLeaderForPartitionError):
                    self.client.reset_topic_metadata(resp.topic)
                    raise
                except OffsetOutOfRangeError:
                    log.warning("OffsetOutOfRangeError for %s - %d. "
                                "Resetting partition offset...",
                                resp.topic, resp.partition)
                    self.reset_partition_offset(resp.partition)
                    # Retry this partition
                    retry_partitions[resp.partition] = partitions[resp.partition]
                    continue

                partition = resp.partition
                buffer_size = partitions[partition]
                try:
                    for message in resp.messages:
                        if message.offset < self.fetch_offsets[partition]:
                            log.debug('Skipping message %s because its '
                                      'offset is less than the consumer '
                                      'offset', message)
                            continue
                        # Put the message in our queue
                        self.queue.put((partition, message))
                        self.fetch_offsets[partition] = message.offset + 1
                except ConsumerFetchSizeTooSmall:
                    if (self.max_buffer_size is not None and
                            buffer_size == self.max_buffer_size):
                        log.error("Max fetch size %d too small",
                                  self.max_buffer_size)
                        raise
                    if self.max_buffer_size is None:
                        buffer_size *= 2
                    else:
                        buffer_size = min(buffer_size * 2,
                                          self.max_buffer_size)
                    log.warn("Fetch size too small, increase to %d (2x) "
                             "and retry", buffer_size)
                    retry_partitions[partition] = buffer_size
                except ConsumerNoMoreData as e:
                    log.debug("Iteration was ended by %r", e)
                except StopIteration:
                    # Stop iterating through this partition
                    log.debug("Done iterating over partition %s" % partition)
            partitions = retry_partitions
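
Several of the examples above funnel responses through check_error(); a hedged reconstruction of that helper (kafka_errors and UnknownError are assumed to come from kafka.common):

from kafka.common import kafka_errors, UnknownError  # assumed location

def check_error(response):
    # Failed payloads may arrive as exception instances directly when
    # fail_on_error is False; re-raise those as-is.
    if isinstance(response, Exception):
        raise response
    # Otherwise a non-zero error code selects an exception class;
    # error code 0 means success.
    if response.error:
        raise kafka_errors.get(response.error, UnknownError)(response)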