def _do_sasl_handshake(self):
    req_klass = self._version_info.pick_best(SaslHandShakeRequest)
    sasl_handshake = req_klass(self._sasl_mechanism)
    response = yield from self.send(sasl_handshake)
    error_type = Errors.for_code(response.error_code)
    if error_type is not Errors.NoError:
        error = error_type(self)
        self.close(reason=CloseReason.AUTH_FAILURE, exc=error)
        raise error

    if self._sasl_mechanism not in response.enabled_mechanisms:
        exc = Errors.UnsupportedSaslMechanismError(
            'Kafka broker does not support %s sasl mechanism. '
            'Enabled mechanisms are: %s'
            % (self._sasl_mechanism, response.enabled_mechanisms))
        self.close(reason=CloseReason.AUTH_FAILURE, exc=exc)
        raise exc

    assert self._sasl_mechanism == 'PLAIN'
    if self._security_protocol == 'SASL_PLAINTEXT':
        self.log.warning('Sending username and password in the clear')

    authenticator = self.authenticator_plain()
    auth_bytes = None
    while True:
        try:
            payload = authenticator.send(auth_bytes)
        except StopIteration:
            break

        if req_klass.API_VERSION == 0:
            auth_bytes = yield from self._send_sasl_token(payload)
        else:
            req_klass = self._version_info.pick_best(
                SaslAuthenticateRequest)
            req = req_klass(payload)
            resp = yield from self.send(req)
            error_type = Errors.for_code(resp.error_code)
            if error_type is not Errors.NoError:
                exc = error_type(resp.error_message)
                self.close(reason=CloseReason.AUTH_FAILURE, exc=exc)
                raise exc
            auth_bytes = resp.sasl_auth_bytes

    self.log.info('Authenticated as %s via PLAIN',
                  self._sasl_plain_username)
def handle_response(self, resp):
    txn_manager = self._sender._txn_manager
    group_id = self._group_id

    error_type = Errors.for_code(resp.error_code)
    if error_type is Errors.NoError:
        log.debug("Successfully added consumer group %s to transaction",
                  group_id)
        txn_manager.consumer_group_added(group_id)
        return
    elif (error_type is CoordinatorNotAvailableError or
            error_type is NotCoordinatorError):
        self._sender._coordinator_dead(CoordinationType.TRANSACTION)
    elif (error_type is CoordinatorLoadInProgressError or
            error_type is ConcurrentTransactions):
        # We will just retry after backoff
        pass
    elif error_type is InvalidProducerEpoch:
        raise ProducerFenced()
    elif error_type is InvalidTxnState:
        raise error_type()
    elif error_type is TransactionalIdAuthorizationFailed:
        raise error_type(txn_manager.transactional_id)
    elif error_type is GroupAuthorizationFailedError:
        txn_manager.error_transaction(error_type(self._group_id))
        return
    else:
        log.error(
            "Could not add consumer group due to unexpected error: %s",
            error_type)
        raise error_type()

    return self._default_backoff
def _check_api_version_response(self, response):
    # The logic here is to check the list of supported request versions
    # in descending order. As soon as we find one that works, return it
    test_cases = [
        # format (<broker version>, <api key>, <min request version>)
        ((2, 1, 0), MetadataRequest[0].API_KEY, 7),
        ((1, 1, 0), FetchRequest[0].API_KEY, 7),
        ((1, 0, 0), MetadataRequest[0].API_KEY, 5),
        ((0, 11, 0), MetadataRequest[0].API_KEY, 4),
        ((0, 10, 2), OffsetFetchRequest[0].API_KEY, 2),
        ((0, 10, 1), MetadataRequest[0].API_KEY, 2),
    ]

    error_type = Errors.for_code(response.error_code)
    assert error_type is Errors.NoError, "API version check failed"
    max_versions = dict([
        (api_key, max_version)
        for api_key, _, max_version in response.api_versions
    ])
    # Get the best match of test cases
    for broker_version, api_key, version in test_cases:
        if max_versions.get(api_key, -1) >= version:
            return broker_version

    # We know that ApiVersionResponse is only supported in 0.10+
    # so if all else fails, choose that
    return (0, 10, 0)
def _check_api_version_response(self, response):
    # The logic here is to check the list of supported request versions
    # in descending order. As soon as we find one that works, return it
    test_cases = [
        # format (<broker version>, <needed struct>)
        # TODO Requires unreleased version of python-kafka
        # ((2, 6, 0), DescribeClientQuotasRequest[0]),
        ((2, 5, 0), DescribeAclsRequest_v2),
        ((2, 4, 0), ProduceRequest[8]),
        ((2, 3, 0), FetchRequest[11]),
        ((2, 2, 0), OffsetRequest[5]),
        ((2, 1, 0), FetchRequest[10]),
        ((2, 0, 0), FetchRequest[8]),
        ((1, 1, 0), FetchRequest[7]),
        ((1, 0, 0), MetadataRequest[5]),
        ((0, 11, 0), MetadataRequest[4]),
        ((0, 10, 2), OffsetFetchRequest[2]),
        ((0, 10, 1), MetadataRequest[2]),
    ]

    error_type = Errors.for_code(response.error_code)
    assert error_type is Errors.NoError, "API version check failed"
    max_versions = {
        api_key: max_version
        for api_key, _, max_version in response.api_versions
    }
    # Get the best match of test cases
    for broker_version, struct in test_cases:
        if max_versions.get(struct.API_KEY, -1) >= struct.API_VERSION:
            return broker_version

    # We know that ApiVersionResponse is only supported in 0.10+
    # so if all else fails, choose that
    return (0, 10, 0)
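# Aside (illustrative sketch, not part of the client): the probes above walk
# the test cases newest-first and return the first broker version whose
# required request version the broker reports as supported. A minimal
# self-contained version of that idea, with hypothetical test cases using
# the Kafka protocol's Fetch (1) and Metadata (3) API keys:

FETCH_API_KEY = 1
METADATA_API_KEY = 3

_VERSION_TEST_CASES = [
    # (broker version, required api_key, required api_version), newest first
    ((1, 1, 0), FETCH_API_KEY, 7),
    ((1, 0, 0), METADATA_API_KEY, 5),
    ((0, 10, 1), METADATA_API_KEY, 2),
]


def guess_broker_version(api_versions):
    """api_versions: iterable of (api_key, min_version, max_version)."""
    max_versions = {key: max_v for key, _, max_v in api_versions}
    for broker_version, api_key, api_version in _VERSION_TEST_CASES:
        if max_versions.get(api_key, -1) >= api_version:
            return broker_version
    # Answering ApiVersionsRequest at all implies at least 0.10.0
    return (0, 10, 0)


# e.g. a broker advertising Fetch up to v7 is treated as 1.1.0 or newer
assert guess_broker_version([(FETCH_API_KEY, 0, 7)]) == (1, 1, 0)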
def _check_api_version_response(self, response):
    # The logic here is to check the list of supported request versions
    # in descending order. As soon as we find one that works, return it
    test_cases = [
        # format (<broker version>, <api key>, <min request version>)
        ((1, 0, 0), MetadataRequest[0].API_KEY, 5),
        ((0, 11, 0), MetadataRequest[0].API_KEY, 4),
        ((0, 10, 2), OffsetFetchRequest[0].API_KEY, 2),
        ((0, 10, 1), MetadataRequest[0].API_KEY, 2),
    ]

    error_type = Errors.for_code(response.error_code)
    assert error_type is Errors.NoError, "API version check failed"
    max_versions = dict([
        (api_key, max_version)
        for api_key, _, max_version in response.api_versions
    ])
    # Get the best match of test cases
    for broker_version, api_key, version in test_cases:
        if max_versions.get(api_key, -1) >= version:
            return broker_version

    # We know that ApiVersionResponse is only supported in 0.10+
    # so if all else fails, choose that
    return (0, 10, 0)
def handle_response(self, resp):
    txn_manager = self._sender._txn_manager
    group_id = self._group_id

    for topic, partitions in resp.errors:
        for partition, error_code in partitions:
            tp = TopicPartition(topic, partition)
            error_type = Errors.for_code(error_code)

            if error_type is Errors.NoError:
                offset = self._offsets[tp].offset
                log.debug(
                    "Offset %s for partition %s committed to group %s",
                    offset, tp, group_id)
                txn_manager.offset_committed(tp, offset, group_id)
            elif (error_type is CoordinatorNotAvailableError or
                    error_type is NotCoordinatorError or
                    # Copied from Java. Not sure why it's only in this case
                    error_type is RequestTimedOutError):
                self._sender._coordinator_dead(CoordinationType.GROUP)
                return self._default_backoff
            elif (error_type is CoordinatorLoadInProgressError or
                    error_type is UnknownTopicOrPartitionError):
                # We will just retry after backoff
                return self._default_backoff
            elif error_type is InvalidProducerEpoch:
                raise ProducerFenced()
            else:
                log.error(
                    "Could not commit offset for partition %s due to "
                    "unexpected error: %s", partition, error_type)
                raise error_type()
def handle_response(self, resp):
    txn_manager = self._sender._txn_manager

    error_type = Errors.for_code(resp.error_code)
    if error_type is Errors.NoError:
        txn_manager.complete_transaction()
        return
    elif (error_type is CoordinatorNotAvailableError or
            error_type is NotCoordinatorError):
        self._sender._coordinator_dead(CoordinationType.TRANSACTION)
    elif (error_type is CoordinatorLoadInProgressError or
            error_type is ConcurrentTransactions):
        # We will just retry after backoff
        pass
    elif error_type is InvalidProducerEpoch:
        raise ProducerFenced()
    elif error_type is InvalidTxnState:
        raise error_type()
    else:
        log.error(
            "Could not end transaction due to unexpected error: %s",
            error_type)
        raise error_type()

    return self._default_backoff
def handle_response(self, resp): txn_manager = self._sender._txn_manager error_type = Errors.for_code(resp.error_code) if error_type is Errors.NoError: log.debug( "Successfully found PID=%s EPOCH=%s for Producer %s", resp.producer_id, resp.producer_epoch, self._sender.client._client_id) self._sender._txn_manager.set_pid_and_epoch( resp.producer_id, resp.producer_epoch) return elif (error_type is CoordinatorNotAvailableError or error_type is NotCoordinatorError): self._sender._coordinator_dead(CoordinationType.TRANSACTION) elif (error_type is CoordinatorLoadInProgressError or error_type is ConcurrentTransactions): pass elif error_type is TransactionalIdAuthorizationFailed: raise error_type(txn_manager.transactional_id) else: log.error( "Unexpected error during InitProducerIdRequest: %s", error_type) raise error_type() return self._default_backoff
async def coordinator_lookup(self, coordinator_type, coordinator_key):
    """ Lookup which node in the cluster is the coordinator for a certain
        role (Transaction coordinator or Group coordinator atm.)

        NOTE: Client keeps track of all coordination nodes separately, as
        they all have different sockets and ids.
    """
    node_id = self.get_random_node()
    assert node_id is not None, "Did we not perform bootstrap?"

    log.debug("Sending FindCoordinator request for key %s to broker %s",
              coordinator_key, node_id)
    if self.api_version > (0, 11):
        request = FindCoordinatorRequest[1](coordinator_key,
                                            coordinator_type)
    else:
        # Group coordination only
        assert coordinator_type == CoordinationType.GROUP, \
            "No transactions for older brokers"
        request = FindCoordinatorRequest[0](coordinator_key)
    resp = await self.send(node_id, request)
    log.debug("Received group coordinator response %s", resp)
    error_type = Errors.for_code(resp.error_code)
    if error_type is not Errors.NoError:
        err = error_type()
        raise err
    self.cluster.add_coordinator(
        resp.coordinator_id, resp.host, resp.port, rack=None,
        purpose=(coordinator_type, coordinator_key))
    return resp.coordinator_id
def coordinator_lookup(self, coordinator_type, coordinator_key):
    """ Lookup which node in the cluster is the coordinator for a certain
        role (Transaction coordinator or Group coordinator atm.)

        NOTE: Client keeps track of all coordination nodes separately, as
        they all have different sockets and ids.
    """
    node_id = self.get_random_node()
    assert node_id is not None, "Did we not perform bootstrap?"

    log.debug(
        "Sending FindCoordinator request for key %s to broker %s",
        coordinator_key, node_id)
    if self.api_version > (0, 11):
        request = FindCoordinatorRequest[1](
            coordinator_key, coordinator_type)
    else:
        # Group coordination only
        assert coordinator_type == CoordinationType.GROUP, \
            "No transactions for older brokers"
        request = FindCoordinatorRequest[0](coordinator_key)
    resp = yield from self.send(node_id, request)
    log.debug("Received group coordinator response %s", resp)
    error_type = Errors.for_code(resp.error_code)
    if error_type is not Errors.NoError:
        err = error_type()
        raise err
    self.cluster.add_coordinator(
        resp.coordinator_id, resp.host, resp.port, rack=None,
        purpose=(coordinator_type, coordinator_key))
    return resp.coordinator_id
def handle_response(self, response):
    for topic, partitions in response.topics:
        for partition_info in partitions:
            global_error = None
            log_start_offset = None
            if response.API_VERSION < 2:
                partition, error_code, offset = partition_info
                # Mimic CREATE_TIME to take user provided timestamp
                timestamp = -1
            elif 2 <= response.API_VERSION <= 4:
                partition, error_code, offset, timestamp = partition_info
            elif 5 <= response.API_VERSION <= 7:
                (
                    partition, error_code, offset, timestamp,
                    log_start_offset
                ) = partition_info
            else:
                # the ignored parameter is record_error of type
                # list[(batch_index: int, error_message: str)]
                (
                    partition, error_code, offset, timestamp,
                    log_start_offset, _, global_error
                ) = partition_info
            tp = TopicPartition(topic, partition)
            error = Errors.for_code(error_code)
            batch = self._batches.get(tp)
            if batch is None:
                continue

            if error is Errors.NoError:
                batch.done(offset, timestamp, log_start_offset)
            elif error is DuplicateSequenceNumber:
                # If we have received a duplicate sequence error,
                # it means that the sequence number has advanced
                # beyond the sequence of the current batch, and we
                # haven't retained batch metadata on the broker to
                # return the correct offset and timestamp.
                #
                # The only thing we can do is to return success to
                # the user and not return a valid offset and
                # timestamp.
                batch.done(offset, timestamp, log_start_offset)
            elif not self._can_retry(error(), batch):
                if error is InvalidProducerEpoch:
                    exc = ProducerFenced()
                elif error is TopicAuthorizationFailedError:
                    exc = error(topic)
                else:
                    exc = error()
                batch.failure(exception=exc)
            else:
                log.warning(
                    "Got error produce response on topic-partition"
                    " %s, retrying. Error: %s", tp, global_error or error)
                # Ok, we can retry this batch
                if getattr(error, "invalid_metadata", False):
                    self._client.force_metadata_update()
                self._to_reenqueue.append(batch)
def handle_response(self, resp):
    txn_manager = self._sender._txn_manager

    unauthorized_topics = set()
    for topic, partitions in resp.errors:
        for partition, error_code in partitions:
            tp = TopicPartition(topic, partition)
            error_type = Errors.for_code(error_code)

            if error_type is Errors.NoError:
                log.debug("Added partition %s to transaction", tp)
                txn_manager.partition_added(tp)
            elif (error_type is CoordinatorNotAvailableError or
                    error_type is NotCoordinatorError):
                self._sender._coordinator_dead(
                    CoordinationType.TRANSACTION)
                return self._default_backoff
            elif error_type is ConcurrentTransactions:
                # See KAFKA-5477: There is some time between commit and
                # actual transaction marker write, that will produce this
                # ConcurrentTransactions. We don't want the 100ms latency
                # in that case.
                if not txn_manager.txn_partitions:
                    return BACKOFF_OVERRIDE
                else:
                    return self._default_backoff
            elif (error_type is CoordinatorLoadInProgressError or
                    error_type is UnknownTopicOrPartitionError):
                return self._default_backoff
            elif error_type is InvalidProducerEpoch:
                raise ProducerFenced()
            elif (error_type is InvalidProducerIdMapping or
                    error_type is InvalidTxnState):
                raise error_type()
            elif error_type is TopicAuthorizationFailedError:
                unauthorized_topics.add(topic)
            elif error_type is OperationNotAttempted:
                pass
            elif error_type is TransactionalIdAuthorizationFailed:
                raise error_type(txn_manager.transactional_id)
            else:
                log.error(
                    "Could not add partition %s due to unexpected error:"
                    " %s", partition, error_type)
                raise error_type()

    if unauthorized_topics:
        txn_manager.error_transaction(
            TopicAuthorizationFailedError(unauthorized_topics))
    return
def _send_req(self, node_id, request, *, group):
    """Send a request to a Kafka node and mark the coordinator as `dead`
    on error.
    """
    try:
        resp = yield from self._client.send(node_id, request, group=group)
    except Errors.KafkaError as err:
        log.error(
            'Error sending %s to node %s [%s] -- marking coordinator dead',
            request.__class__.__name__, node_id, err)
        self.coordinator_dead()
        raise err
    else:
        if not hasattr(resp, 'error_code'):
            return resp
        error_type = Errors.for_code(resp.error_code)
        if error_type is Errors.NoError:
            return resp
        else:
            raise error_type()
def _do_init_pid(self):
    init_pid_req = InitProducerIdRequest[0](
        transactional_id=self._transactional_id,
        transaction_timeout_ms=self._transaction_timeout_ms)

    node_id = self.client.get_random_node()
    try:
        resp = yield from self.client.send(node_id, init_pid_req)
    except KafkaError as err:
        log.debug("Could not send InitProducerIdRequest: %r", err)
        return False

    error = Errors.for_code(resp.error_code)
    if error is Errors.NoError:
        self._txn_manager.set_pid_and_epoch(resp.producer_id,
                                            resp.producer_epoch)
        # Just in case we got bad values from broker
        return self._txn_manager.has_pid()
    else:
        log.debug("Got an error for InitProducerIdRequest: %r", error)
        return False
def _proc_offsets_fetch_request(self, node_id, request):
    response = yield from self._send_req(
        node_id, request, group=ConnectionGroup.COORDINATION)

    offsets = {}
    for topic, partitions in response.topics:
        for partition, offset, metadata, error_code in partitions:
            tp = TopicPartition(topic, partition)
            error_type = Errors.for_code(error_code)
            if error_type is not Errors.NoError:
                error = error_type()
                log.debug("Error fetching offset for %s: %s", tp, error)
                if error_type is Errors.GroupLoadInProgressError:
                    # just retry
                    raise error
                elif error_type is Errors.NotCoordinatorForGroupError:
                    # re-discover the coordinator and retry
                    self.coordinator_dead()
                    raise error
                elif error_type in (Errors.UnknownMemberIdError,
                                    Errors.IllegalGenerationError):
                    # need to re-join group
                    self._subscription.mark_for_reassignment()
                    raise error
                elif error_type is Errors.UnknownTopicOrPartitionError:
                    log.warning(
                        "OffsetFetchRequest -- unknown topic %s", topic)
                    continue
                else:
                    log.error("Unknown error fetching offsets for %s: %s",
                              tp, error)
                    raise error
            elif offset >= 0:
                # record the position with the offset
                # (-1 indicates no committed offset to fetch)
                offsets[tp] = OffsetAndMetadata(offset, metadata)
            else:
                log.debug(
                    "No committed offset for partition %s", tp)

    return offsets
def _proc_fetch_request(self, assignment, node_id, request): needs_wakeup = False try: response = yield from self._client.send(node_id, request) except Errors.KafkaError as err: log.error("Failed fetch messages from %s: %s", node_id, err) return False except asyncio.CancelledError: # Either `close()` or partition unassigned. Either way the result # is no longer of interest. return False if not assignment.active: log.debug( "Discarding fetch response since the assignment changed during" " fetch") return False fetch_offsets = {} for topic, partitions in request.topics: for partition, offset, _ in partitions: fetch_offsets[TopicPartition(topic, partition)] = offset for topic, partitions in response.topics: for partition, error_code, highwater, *part_data in partitions: tp = TopicPartition(topic, partition) error_type = Errors.for_code(error_code) fetch_offset = fetch_offsets[tp] tp_state = assignment.state_value(tp) if not tp_state.has_valid_position or \ tp_state.position != fetch_offset: log.debug( "Discarding fetch response for partition %s " "since its offset %s does not match the current " "position", tp, fetch_offset) continue if error_type is Errors.NoError: tp_state.highwater = highwater # part_data also contains lso, aborted_transactions. # message_set is last records = MemoryRecords(part_data[-1]) if records.has_next(): log.debug( "Adding fetched record for partition %s with" " offset %d to buffered record list", tp, fetch_offset) message_iterator = self._unpack_records(tp, records) self._records[tp] = FetchResult( tp, message_iterator=message_iterator, assignment=assignment, backoff=self._prefetch_backoff, fetch_offset=fetch_offset, loop=self._loop) # We added at least 1 successful record needs_wakeup = True elif records.size_in_bytes() > 0: # we did not read a single message from a non-empty # buffer because that message's size is larger than # fetch size, in this case record this exception err = RecordTooLargeError( "There are some messages at [Partition=Offset]: " "%s=%s whose size is larger than the fetch size %s" " and hence cannot be ever returned. " "Increase the fetch size, or decrease the maximum " "message size the broker will allow.", tp, fetch_offset, self._max_partition_fetch_bytes) self._set_error(tp, err) tp_state.consumed_to(tp_state.position + 1) needs_wakeup = True elif error_type in (Errors.NotLeaderForPartitionError, Errors.UnknownTopicOrPartitionError): self._client.force_metadata_update() elif error_type is Errors.OffsetOutOfRangeError: if self._default_reset_strategy != \ OffsetResetStrategy.NONE: tp_state.await_reset(self._default_reset_strategy) else: err = Errors.OffsetOutOfRangeError({tp: fetch_offset}) self._set_error(tp, err) needs_wakeup = True log.info( "Fetch offset %s is out of range for partition %s," " resetting offset", fetch_offset, tp) elif error_type is Errors.TopicAuthorizationFailedError: log.warning("Not authorized to read from topic %s.", tp.topic) err = Errors.TopicAuthorizationFailedError(tp.topic) self._set_error(tp, err) needs_wakeup = True else: log.warning('Unexpected error while fetching data: %s', error_type.__name__) return needs_wakeup
def _proc_offset_request(self, node_id, topic_data): if self._client.api_version < (0, 10, 1): version = 0 # Version 0 had another field `max_offsets`, set it to `1` for topic, part_data in topic_data.items(): topic_data[topic] = [(part, ts, 1) for part, ts in part_data] else: version = 1 request = OffsetRequest[version](-1, list(topic_data.items())) response = yield from self._client.send(node_id, request) res_offsets = {} for topic, part_data in response.topics: for part, error_code, *partition_info in part_data: partition = TopicPartition(topic, part) error_type = Errors.for_code(error_code) if error_type is Errors.NoError: if response.API_VERSION == 0: offsets = partition_info[0] assert len(offsets) <= 1, \ 'Expected OffsetResponse with one offset' if offsets: offset = offsets[0] log.debug( "Handling v0 ListOffsetResponse response for " "%s. Fetched offset %s", partition, offset) res_offsets[partition] = (offset, None) else: res_offsets[partition] = (UNKNOWN_OFFSET, None) else: timestamp, offset = partition_info log.debug( "Handling ListOffsetResponse response for " "%s. Fetched offset %s, timestamp %s", partition, offset, timestamp) res_offsets[partition] = (offset, timestamp) elif error_type is Errors.UnsupportedForMessageFormatError: # The message format on the broker side is before 0.10.0, # we will simply put None in the response. log.debug( "Cannot search by timestamp for partition %s " "because the message format version is before " "0.10.0", partition) elif error_type is Errors.NotLeaderForPartitionError: log.debug( "Attempt to fetch offsets for partition %s " "failed " "due to obsolete leadership information, retrying.", partition) raise error_type(partition) elif error_type is Errors.UnknownTopicOrPartitionError: log.warning( "Received unknown topic or partition error in " "ListOffset request for partition %s. The " "topic/partition may not exist or the user may not " "have Describe access to it.", partition) raise error_type(partition) else: log.warning( "Attempt to fetch offsets for partition %s failed due " "to: %s", partition, error_type) raise error_type(partition) return res_offsets
def _proc_fetch_request(self, assignment, node_id, request): needs_wakeup = False try: response = yield from self._client.send(node_id, request) except Errors.KafkaError as err: log.error("Failed fetch messages from %s: %s", node_id, err) return False except asyncio.CancelledError: # Either `close()` or partition unassigned. Either way the result # is no longer of interest. return False if not assignment.active: log.debug( "Discarding fetch response since the assignment changed during" " fetch") return False fetch_offsets = {} for topic, partitions in request.topics: for partition, offset, _ in partitions: fetch_offsets[TopicPartition(topic, partition)] = offset for topic, partitions in response.topics: for partition, error_code, highwater, *part_data in partitions: tp = TopicPartition(topic, partition) error_type = Errors.for_code(error_code) fetch_offset = fetch_offsets[tp] tp_state = assignment.state_value(tp) if not tp_state.has_valid_position or \ tp_state.position != fetch_offset: log.debug( "Discarding fetch response for partition %s " "since its offset %s does not match the current " "position", tp, fetch_offset) continue if error_type is Errors.NoError: if request.API_VERSION >= 4: aborted_transactions = part_data[-2] lso = part_data[-3] else: aborted_transactions = None lso = None tp_state.highwater = highwater tp_state.lso = lso # part_data also contains lso, aborted_transactions. # message_set is last records = MemoryRecords(part_data[-1]) if records.has_next(): log.debug( "Adding fetched record for partition %s with" " offset %d to buffered record list", tp, fetch_offset) partition_records = PartitionRecords( tp, records, aborted_transactions, fetch_offset, self._key_deserializer, self._value_deserializer, self._check_crcs, self._isolation_level) self._records[tp] = FetchResult( tp, partition_records=partition_records, assignment=assignment, backoff=self._prefetch_backoff, loop=self._loop) # We added at least 1 successful record needs_wakeup = True elif records.size_in_bytes() > 0: # we did not read a single message from a non-empty # buffer because that message's size is larger than # fetch size, in this case record this exception err = RecordTooLargeError( "There are some messages at [Partition=Offset]: " "%s=%s whose size is larger than the fetch size %s" " and hence cannot be ever returned. " "Increase the fetch size, or decrease the maximum " "message size the broker will allow.", tp, fetch_offset, self._max_partition_fetch_bytes) self._set_error(tp, err) tp_state.consumed_to(tp_state.position + 1) needs_wakeup = True elif error_type in (Errors.NotLeaderForPartitionError, Errors.UnknownTopicOrPartitionError): self._client.force_metadata_update() elif error_type is Errors.OffsetOutOfRangeError: if self._default_reset_strategy != \ OffsetResetStrategy.NONE: tp_state.await_reset(self._default_reset_strategy) else: err = Errors.OffsetOutOfRangeError({tp: fetch_offset}) self._set_error(tp, err) needs_wakeup = True log.info( "Fetch offset %s is out of range for partition %s," " resetting offset", fetch_offset, tp) elif error_type is Errors.TopicAuthorizationFailedError: log.warning( "Not authorized to read from topic %s.", tp.topic) err = Errors.TopicAuthorizationFailedError(tp.topic) self._set_error(tp, err) needs_wakeup = True else: log.warning('Unexpected error while fetching data: %s', error_type.__name__) return needs_wakeup
def _send_produce_req(self, node_id, batches):
    """ Create a produce request for this node.

        If the producer is configured with `retries` > 0 and the produce
        response contains "failed" partitions, the produce request for
        those partitions will be resent to the broker up to `retries`
        times with `retry_timeout_ms` timeouts.

        Arguments:
            node_id (int): kafka broker identifier
            batches (dict): dictionary of {TopicPartition: MessageBatch}
    """
    t0 = self._loop.time()

    topics = collections.defaultdict(list)
    for tp, batch in batches.items():
        topics[tp.topic].append((tp.partition, batch.get_data_buffer()))

    if self.client.api_version >= (0, 10):
        version = 2
    elif self.client.api_version == (0, 9):
        version = 1
    else:
        version = 0

    request = ProduceRequest[version](
        required_acks=self._acks,
        timeout=self._request_timeout_ms,
        topics=list(topics.items()))

    reenqueue = []
    try:
        response = yield from self.client.send(node_id, request)
    except KafkaError as err:
        log.warning("Got error produce response: %s", err)
        if getattr(err, "invalid_metadata", False):
            self.client.force_metadata_update()

        for batch in batches.values():
            if not self._can_retry(err, batch):
                batch.failure(exception=err)
            else:
                reenqueue.append(batch)
    else:
        # noacks, just mark batches as "done"
        if request.required_acks == 0:
            for batch in batches.values():
                batch.done_noack()
        else:
            for topic, partitions in response.topics:
                for partition_info in partitions:
                    if response.API_VERSION < 2:
                        partition, error_code, offset = partition_info
                        # Mimic CREATE_TIME to take user provided timestamp
                        timestamp = -1
                    else:
                        partition, error_code, offset, timestamp = \
                            partition_info
                    tp = TopicPartition(topic, partition)
                    error = Errors.for_code(error_code)
                    batch = batches.pop(tp, None)
                    if batch is None:
                        continue

                    if error is Errors.NoError:
                        batch.done(offset, timestamp)
                    elif not self._can_retry(error(), batch):
                        batch.failure(exception=error())
                    else:
                        log.warning(
                            "Got error produce response on topic-partition"
                            " %s, retrying. Error: %s", tp, error)
                        # Ok, we can retry this batch
                        if getattr(error, "invalid_metadata", False):
                            self.client.force_metadata_update()
                        reenqueue.append(batch)

    if reenqueue:
        # Wait backoff before reenqueue
        yield from asyncio.sleep(self._retry_backoff, loop=self._loop)

        for batch in reenqueue:
            self._message_accumulator.reenqueue(batch)
        # If some error started metadata refresh we have to wait before
        # trying again
        yield from self.client._maybe_wait_metadata()

    # if batches for this node were processed in less than the linger time,
    # wait for the remaining time
    sleep_time = self._linger_time - (self._loop.time() - t0)
    if sleep_time > 0:
        yield from asyncio.sleep(sleep_time, loop=self._loop)

    self._in_flight.remove(node_id)
def commit_offsets(self, offsets):
    """Commit specific offsets asynchronously.

    Arguments:
        offsets (dict {TopicPartition: OffsetAndMetadata}): what to commit

    Raises error on failure
    """
    self._subscription.needs_fetch_committed_offsets = True
    if not offsets:
        log.debug('No offsets to commit')
        return True

    if (yield from self.coordinator_unknown()):
        raise Errors.GroupCoordinatorNotAvailableError()
    node_id = self.coordinator_id

    # create the offset commit request
    offset_data = collections.defaultdict(list)
    for tp, offset in offsets.items():
        offset_data[tp.topic].append(
            (tp.partition, offset.offset, offset.metadata))

    request = OffsetCommitRequest(
        self.group_id,
        self.generation,
        self.member_id,
        OffsetCommitRequest.DEFAULT_RETENTION_TIME,
        [(topic, tp_offsets)
         for topic, tp_offsets in offset_data.items()])

    log.debug("Sending offset-commit request with %s for group %s to %s",
              offsets, self.group_id, node_id)

    response = yield from self._send_req(
        node_id, request, group=ConnectionGroup.COORDINATION)

    unauthorized_topics = set()
    for topic, partitions in response.topics:
        for partition, error_code in partitions:
            tp = TopicPartition(topic, partition)
            offset = offsets[tp]

            error_type = Errors.for_code(error_code)
            if error_type is Errors.NoError:
                log.debug("Committed offset %s for partition %s",
                          offset, tp)
                if self._subscription.is_assigned(tp):
                    partition = self._subscription.assignment[tp]
                    partition.committed = offset.offset
            elif error_type is Errors.GroupAuthorizationFailedError:
                log.error("OffsetCommit failed for group %s - %s",
                          self.group_id, error_type.__name__)
                raise error_type()
            elif error_type is Errors.TopicAuthorizationFailedError:
                unauthorized_topics.add(topic)
            elif error_type in (Errors.OffsetMetadataTooLargeError,
                                Errors.InvalidCommitOffsetSizeError):
                # raise the error to the user
                log.info(
                    "OffsetCommit failed for group %s on partition %s"
                    " due to %s, will retry",
                    self.group_id, tp, error_type.__name__)
                raise error_type()
            elif error_type is Errors.GroupLoadInProgressError:
                # just retry
                log.info(
                    "OffsetCommit failed for group %s because group is"
                    " initializing (%s), will retry",
                    self.group_id, error_type.__name__)
                raise error_type()
            elif error_type in (Errors.GroupCoordinatorNotAvailableError,
                                Errors.NotCoordinatorForGroupError,
                                Errors.RequestTimedOutError):
                log.info(
                    "OffsetCommit failed for group %s due to a"
                    " coordinator error (%s), will find new coordinator"
                    " and retry", self.group_id, error_type.__name__)
                self.coordinator_dead()
                raise error_type()
            elif error_type in (Errors.UnknownMemberIdError,
                                Errors.IllegalGenerationError,
                                Errors.RebalanceInProgressError):
                # need to re-join group
                error = error_type(self.group_id)
                log.error(
                    "OffsetCommit failed for group %s due to group"
                    " error (%s), will rejoin", self.group_id, error)
                self._subscription.mark_for_reassignment()
                raise error
            else:
                log.error(
                    "OffsetCommit failed for group %s on partition %s"
                    " with offset %s: %s",
                    self.group_id, tp, offset, error_type.__name__)
                raise error_type()

    if unauthorized_topics:
        log.error("OffsetCommit failed for unauthorized topics %s",
                  unauthorized_topics)
        raise Errors.TopicAuthorizationFailedError(unauthorized_topics)
def _proc_offset_request(self, node_id, topic_data): if self._client.api_version < (0, 10, 1): version = 0 # Version 0 had another field `max_offsets`, set it to `1` for topic, part_data in topic_data.items(): topic_data[topic] = [(part, ts, 1) for part, ts in part_data] else: version = 1 request = OffsetRequest[version](-1, list(topic_data.items())) response = yield from self._client.send(node_id, request) res_offsets = {} for topic, part_data in response.topics: for part, error_code, *partition_info in part_data: partition = TopicPartition(topic, part) error_type = Errors.for_code(error_code) if error_type is Errors.NoError: if response.API_VERSION == 0: offsets = partition_info[0] assert len(offsets) <= 1, \ 'Expected OffsetResponse with one offset' if offsets: offset = offsets[0] log.debug( "Handling v0 ListOffsetResponse response for " "%s. Fetched offset %s", partition, offset) res_offsets[partition] = (offset, None) else: res_offsets[partition] = (UNKNOWN_OFFSET, None) else: timestamp, offset = partition_info log.debug( "Handling ListOffsetResponse response for " "%s. Fetched offset %s, timestamp %s", partition, offset, timestamp) res_offsets[partition] = (offset, timestamp) elif error_type is Errors.UnsupportedForMessageFormatError: # The message format on the broker side is before 0.10.0, # we will simply put None in the response. log.debug("Cannot search by timestamp for partition %s " "because the message format version is before " "0.10.0", partition) elif error_type is Errors.NotLeaderForPartitionError: log.debug( "Attempt to fetch offsets for partition %s ""failed " "due to obsolete leadership information, retrying.", partition) raise error_type(partition) elif error_type is Errors.UnknownTopicOrPartitionError: log.warning( "Received unknown topic or partition error in " "ListOffset request for partition %s. The " "topic/partition may not exist or the user may not " "have Describe access to it.", partition) raise error_type(partition) else: log.warning( "Attempt to fetch offsets for partition %s failed due " "to: %s", partition, error_type) raise error_type(partition) return res_offsets
def _do_sasl_handshake(self):
    # NOTE: We will only fallback to v0.9 gssapi scheme if user explicitly
    # stated, that api_version is "0.9"
    if self._version_hint and self._version_hint < (0, 10):
        handshake_klass = None
        assert self._sasl_mechanism == 'GSSAPI', (
            "Only GSSAPI supported for v0.9"
        )
    else:
        handshake_klass = self._version_info.pick_best(
            SaslHandShakeRequest)

        sasl_handshake = handshake_klass(self._sasl_mechanism)
        response = yield from self.send(sasl_handshake)

        error_type = Errors.for_code(response.error_code)
        if error_type is not Errors.NoError:
            error = error_type(self)
            self.close(reason=CloseReason.AUTH_FAILURE, exc=error)
            raise error

        if self._sasl_mechanism not in response.enabled_mechanisms:
            exc = Errors.UnsupportedSaslMechanismError(
                'Kafka broker does not support %s sasl mechanism. '
                'Enabled mechanisms are: %s'
                % (self._sasl_mechanism, response.enabled_mechanisms))
            self.close(reason=CloseReason.AUTH_FAILURE, exc=exc)
            raise exc

    assert self._sasl_mechanism in ('PLAIN', 'GSSAPI')
    if self._security_protocol == 'SASL_PLAINTEXT' and \
            self._sasl_mechanism == 'PLAIN':
        self.log.warning(
            'Sending username and password in the clear')

    if self._sasl_mechanism == 'GSSAPI':
        authenticator = self.authenticator_gssapi()
    else:
        authenticator = self.authenticator_plain()

    if handshake_klass is not None and sasl_handshake.API_VERSION > 0:
        auth_klass = self._version_info.pick_best(SaslAuthenticateRequest)
    else:
        auth_klass = None

    auth_bytes = None
    expect_response = True

    while True:
        res = yield from authenticator.step(auth_bytes)
        if res is None:
            break
        payload, expect_response = res

        # Before Kafka 1.0.0 Authentication bytes for SASL were sent
        # without a Kafka Header, only with Length. This made error
        # handling hard, so they made SaslAuthenticateRequest to properly
        # pass error messages to clients on source of error.
        if auth_klass is None:
            auth_bytes = yield from self._send_sasl_token(
                payload, expect_response)
        else:
            req = auth_klass(payload)
            resp = yield from self.send(req)
            error_type = Errors.for_code(resp.error_code)
            if error_type is not Errors.NoError:
                exc = error_type(resp.error_message)
                self.close(reason=CloseReason.AUTH_FAILURE, exc=exc)
                raise exc
            auth_bytes = resp.sasl_auth_bytes

    if self._sasl_mechanism == 'GSSAPI':
        self.log.info(
            'Authenticated as %s via GSSAPI', self.sasl_principal)
    else:
        self.log.info('Authenticated as %s via PLAIN',
                      self._sasl_plain_username)
async def _do_sasl_handshake(self):
    # NOTE: We will only fallback to v0.9 gssapi scheme if user explicitly
    # stated, that api_version is "0.9"
    if self._version_hint and self._version_hint < (0, 10):
        handshake_klass = None
        assert self._sasl_mechanism == 'GSSAPI', (
            "Only GSSAPI supported for v0.9")
    else:
        handshake_klass = self._version_info.pick_best(
            SaslHandShakeRequest)

        sasl_handshake = handshake_klass(self._sasl_mechanism)
        response = await self.send(sasl_handshake)

        error_type = Errors.for_code(response.error_code)
        if error_type is not Errors.NoError:
            error = error_type(self)
            self.close(reason=CloseReason.AUTH_FAILURE, exc=error)
            raise error

        if self._sasl_mechanism not in response.enabled_mechanisms:
            exc = Errors.UnsupportedSaslMechanismError(
                'Kafka broker does not support %s sasl mechanism. '
                'Enabled mechanisms are: %s'
                % (self._sasl_mechanism, response.enabled_mechanisms))
            self.close(reason=CloseReason.AUTH_FAILURE, exc=exc)
            raise exc

    assert self._sasl_mechanism in (
        'PLAIN', 'GSSAPI', 'SCRAM-SHA-256', 'SCRAM-SHA-512', 'OAUTHBEARER')
    if self._security_protocol == 'SASL_PLAINTEXT' and \
            self._sasl_mechanism == 'PLAIN':
        self.log.warning('Sending username and password in the clear')

    if self._sasl_mechanism == 'GSSAPI':
        authenticator = self.authenticator_gssapi()
    elif self._sasl_mechanism.startswith('SCRAM-SHA-'):
        authenticator = self.authenticator_scram()
    elif self._sasl_mechanism == 'OAUTHBEARER':
        authenticator = self.authenticator_oauth()
    else:
        authenticator = self.authenticator_plain()

    if handshake_klass is not None and sasl_handshake.API_VERSION > 0:
        auth_klass = self._version_info.pick_best(SaslAuthenticateRequest)
    else:
        auth_klass = None

    auth_bytes = None
    expect_response = True

    while True:
        res = await authenticator.step(auth_bytes)
        if res is None:
            break
        payload, expect_response = res

        # Before Kafka 1.0.0 Authentication bytes for SASL were sent
        # without a Kafka Header, only with Length. This made error
        # handling hard, so they made SaslAuthenticateRequest to properly
        # pass error messages to clients on source of error.
        if auth_klass is None:
            auth_bytes = await self._send_sasl_token(
                payload, expect_response)
        else:
            req = auth_klass(payload)
            resp = await self.send(req)
            error_type = Errors.for_code(resp.error_code)
            if error_type is not Errors.NoError:
                exc = error_type(resp.error_message)
                self.close(reason=CloseReason.AUTH_FAILURE, exc=exc)
                raise exc
            auth_bytes = resp.sasl_auth_bytes

    if self._sasl_mechanism == 'GSSAPI':
        self.log.info('Authenticated as %s via GSSAPI',
                      self.sasl_principal)
    elif self._sasl_mechanism == 'OAUTHBEARER':
        self.log.info('Authenticated via OAUTHBEARER')
    else:
        self.log.info('Authenticated as %s via PLAIN',
                      self._sasl_plain_username)
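# Aside (illustrative sketch, not aiokafka's actual implementation): the
# handshake loops above assume an authenticator whose step() coroutine
# returns a (payload, expect_response) tuple per round and None once
# authentication is finished. A minimal sketch of that contract for the
# single-round PLAIN mechanism, whose token is
# "authzid NUL authcid NUL password" (RFC 4616):

class PlainAuthenticatorSketch:

    def __init__(self, username, password):
        self._username = username
        self._password = password
        self._done = False

    async def step(self, auth_bytes):
        if self._done:
            # Returning None tells the handshake loop to stop
            return None
        self._done = True
        payload = b'\0'.join([
            b'',                              # authzid, left empty
            self._username.encode('utf-8'),   # authcid
            self._password.encode('utf-8'),   # password
        ])
        # PLAIN has no server challenge, but the broker still responds
        return payload, True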
def _proc_fetch_request(self, node_id, request):
    needs_wakeup = False
    needs_position_update = []
    try:
        response = yield from self._client.send(node_id, request)
    except Errors.KafkaError as err:
        log.error("Failed to fetch messages from %s: %s", node_id, err)
        return False
    finally:
        self._in_flight.remove(node_id)

    fetch_offsets = {}
    for topic, partitions in request.topics:
        for partition, offset, _ in partitions:
            fetch_offsets[TopicPartition(topic, partition)] = offset

    for topic, partitions in response.topics:
        for partition, error_code, highwater, raw_batch in partitions:
            tp = TopicPartition(topic, partition)
            error_type = Errors.for_code(error_code)
            if not self._subscriptions.is_fetchable(tp):
                # this can happen when a rebalance happened
                log.debug(
                    "Ignoring fetched records for partition %s"
                    " since it is no longer fetchable", tp)

            elif error_type is Errors.NoError:
                tp_assignment = self._subscriptions.assignment[tp]
                tp_assignment.highwater = highwater

                # `drop_pending_message_set` is set after a seek to another
                # position. If we request the *new* position we have to
                # drop this flag, so we catch future seek's.
                fetch_offset = fetch_offsets[tp]
                if fetch_offset == tp_assignment.position:
                    tp_assignment.drop_pending_message_set = False

                records = MemoryRecords(raw_batch)
                if records.has_next():
                    log.debug(
                        "Adding fetched record for partition %s with"
                        " offset %d to buffered record list",
                        tp, fetch_offset)
                    message_iterator = self._unpack_records(tp, records)
                    self._records[tp] = FetchResult(
                        tp, records=message_iterator,
                        subscriptions=self._subscriptions,
                        backoff=self._prefetch_backoff,
                        loop=self._loop)
                    # We added at least 1 successful record
                    needs_wakeup = True
                elif records.size_in_bytes() > 0:
                    # we did not read a single message from a non-empty
                    # buffer because that message's size is larger than
                    # fetch size, in this case record this exception
                    err = RecordTooLargeError(
                        "There are some messages at [Partition=Offset]: "
                        "%s=%s whose size is larger than the fetch size %s"
                        " and hence cannot be ever returned. "
                        "Increase the fetch size, or decrease the maximum "
                        "message size the broker will allow.",
                        tp, fetch_offset, self._max_partition_fetch_bytes)
                    self._set_error(tp, err)
                    needs_wakeup = True
                    self._subscriptions.assignment[tp].position += 1

            elif error_type in (Errors.NotLeaderForPartitionError,
                                Errors.UnknownTopicOrPartitionError):
                self._client.force_metadata_update()
            elif error_type is Errors.OffsetOutOfRangeError:
                fetch_offset = fetch_offsets[tp]
                if self._subscriptions.has_default_offset_reset_policy():
                    self._subscriptions.need_offset_reset(tp)
                    needs_position_update.append(tp)
                else:
                    err = Errors.OffsetOutOfRangeError({tp: fetch_offset})
                    self._set_error(tp, err)
                    needs_wakeup = True
                log.info(
                    "Fetch offset %s is out of range for partition %s,"
                    " resetting offset", fetch_offset, tp)
            elif error_type is Errors.TopicAuthorizationFailedError:
                log.warning("Not authorized to read from topic %s.",
                            tp.topic)
                err = Errors.TopicAuthorizationFailedError(tp.topic)
                self._set_error(tp, err)
                needs_wakeup = True
            else:
                log.warning('Unexpected error while fetching data: %s',
                            error_type.__name__)

    if needs_position_update:
        try:
            yield from self.update_fetch_positions(needs_position_update)
        except Exception:  # pragma: no cover
            log.error("Unexpected error updating fetch positions",
                      exc_info=True)

    return needs_wakeup
def update_metadata(self, metadata):
    """Update cluster state given a MetadataResponse.

    Arguments:
        metadata (MetadataResponse): broker response to a metadata request

    Returns: None
    """
    if not metadata.brokers:
        log.warning("No broker metadata found in MetadataResponse")

    _new_brokers = {}
    for broker in metadata.brokers:
        if metadata.API_VERSION == 0:
            node_id, host, port = broker
            rack = None
        else:
            node_id, host, port, rack = broker
        _new_brokers.update({
            node_id: BrokerMetadata(node_id, host, port, rack)
        })

    if metadata.API_VERSION == 0:
        _new_controller = None
    else:
        _new_controller = _new_brokers.get(metadata.controller_id)

    _new_partitions = {}
    _new_broker_partitions = collections.defaultdict(set)
    _new_unauthorized_topics = set()
    _new_internal_topics = set()

    for topic_data in metadata.topics:
        if metadata.API_VERSION == 0:
            error_code, topic, partitions = topic_data
            is_internal = False
        else:
            error_code, topic, is_internal, partitions = topic_data
        if is_internal:
            _new_internal_topics.add(topic)
        error_type = Errors.for_code(error_code)
        if error_type is Errors.NoError:
            _new_partitions[topic] = {}
            for p_error, partition, leader, replicas, isr in partitions:
                _new_partitions[topic][partition] = PartitionMetadata(
                    topic=topic, partition=partition, leader=leader,
                    replicas=replicas, isr=isr, error=p_error)
                if leader != -1:
                    _new_broker_partitions[leader].add(
                        TopicPartition(topic, partition))

        elif error_type is Errors.LeaderNotAvailableError:
            log.warning("Topic %s is not available during auto-create"
                        " initialization", topic)
        elif error_type is Errors.UnknownTopicOrPartitionError:
            log.error("Topic %s not found in cluster metadata", topic)
        elif error_type is Errors.TopicAuthorizationFailedError:
            log.error("Topic %s is not authorized for this client", topic)
            _new_unauthorized_topics.add(topic)
        elif error_type is Errors.InvalidTopicError:
            log.error("'%s' is not a valid topic name", topic)
        else:
            log.error("Error fetching metadata for topic %s: %s",
                      topic, error_type)

    with self._lock:
        self._brokers = _new_brokers
        self.controller = _new_controller
        self._partitions = _new_partitions
        self._broker_partitions = _new_broker_partitions
        self.unauthorized_topics = _new_unauthorized_topics
        self.internal_topics = _new_internal_topics

    now = time.time() * 1000
    self._last_refresh_ms = now
    self._last_successful_refresh_ms = now

    log.debug("Updated cluster metadata to %s", self)

    for listener in self._listeners:
        listener(self)
def _send_produce_req(self, node_id, batches):
    """ Create a produce request for this node.

        If the producer is configured with `retries` > 0 and the produce
        response contains "failed" partitions, the produce request for
        those partitions will be resent to the broker up to `retries`
        times with `retry_timeout_ms` timeouts.

        Arguments:
            node_id (int): kafka broker identifier
            batches (dict): dictionary of {TopicPartition: MessageBatch}
    """
    self._in_flight.add(node_id)
    t0 = self._loop.time()
    while True:
        topics = collections.defaultdict(list)
        for tp, batch in batches.items():
            topics[tp.topic].append((tp.partition, batch.data()))

        if self.client.api_version >= (0, 10):
            version = 2
        elif self.client.api_version == (0, 9):
            version = 1
        else:
            version = 0

        request = ProduceRequest[version](
            required_acks=self._acks,
            timeout=self._request_timeout_ms,
            topics=list(topics.items()))

        try:
            response = yield from self.client.send(node_id, request)
        except KafkaError as err:
            for batch in batches.values():
                if not err.retriable or batch.expired():
                    batch.done(exception=err)
            log.warning(
                "Got error produce response: %s", err)
            if not err.retriable:
                break
        else:
            if response is None:
                # noacks, just "done" batches
                for batch in batches.values():
                    batch.done()
                break

            for topic, partitions in response.topics:
                for partition_info in partitions:
                    if response.API_VERSION < 2:
                        partition, error_code, offset = partition_info
                    else:
                        partition, error_code, offset, _ = partition_info
                    tp = TopicPartition(topic, partition)
                    error = Errors.for_code(error_code)
                    batch = batches.pop(tp, None)
                    if batch is None:
                        continue

                    if error is Errors.NoError:
                        batch.done(offset)
                    elif not getattr(error, 'retriable', False) or \
                            batch.expired():
                        batch.done(exception=error())
                    else:
                        # Ok, we can retry this batch
                        batches[tp] = batch
                        log.warning(
                            "Got error produce response on topic-partition"
                            " %s, retrying. Error: %s", tp, error)

        if batches:
            yield from asyncio.sleep(
                self._retry_backoff, loop=self._loop)
        else:
            break

    # if batches for this node were processed in less than the linger time,
    # wait for the remaining time
    sleep_time = self._linger_time - (self._loop.time() - t0)
    if sleep_time > 0:
        yield from asyncio.sleep(sleep_time, loop=self._loop)

    self._in_flight.remove(node_id)