def test_send_produce_request_raises_when_noleader(self, protocol, conn):
    """Send producer request raises LeaderUnavailableError if leader is not available"""
    # The mocked connection only has to return something other than None.
    conn.recv.return_value = 'response'

    # Metadata the mocked protocol decoder hands back: two brokers and one
    # topic whose partitions both carry -1 in the third field (presumably
    # the "no leader" marker -- confirm against PartitionMetadata).
    broker_map = {
        0: BrokerMetadata(0, 'broker_1', 4567),
        1: BrokerMetadata(1, 'broker_2', 5678),
    }
    topic_map = {
        'topic_noleader': {
            0: PartitionMetadata('topic_noleader', 0, -1, [], []),
            1: PartitionMetadata('topic_noleader', 1, -1, [], []),
        },
    }
    protocol.decode_metadata_response.return_value = (broker_map, topic_map)

    client = KafkaClient(hosts=['broker_1:4567'])

    payload = [create_message("a"), create_message("b")]
    produce_requests = [ProduceRequest("topic_noleader", 0, payload)]

    with self.assertRaises(LeaderUnavailableError):
        client.send_produce_request(produce_requests)
def test_send_produce_request_raises_when_noleader(self, protocol, conn):
    """Send producer request raises LeaderNotAvailableError if leader is not available"""
    # The mocked connection only has to return something other than None.
    conn.recv.return_value = 'response'

    broker_list = [
        BrokerMetadata(0, 'broker_1', 4567),
        BrokerMetadata(1, 'broker_2', 5678),
    ]
    # Both partitions of the topic are reported with the NO_LEADER code.
    leaderless_partitions = [
        PartitionMetadata('topic_noleader', partition_id, -1, [], [], NO_LEADER)
        for partition_id in (0, 1)
    ]
    topic_list = [
        TopicMetadata('topic_noleader', NO_ERROR, leaderless_partitions),
    ]
    protocol.decode_metadata_response.return_value = MetadataResponse(
        broker_list, topic_list)

    client = KafkaClient(hosts=['broker_1:4567'])

    payload = [create_message("a"), create_message("b")]
    produce_requests = [ProduceRequest("topic_noleader", 0, payload)]

    with self.assertRaises(LeaderNotAvailableError):
        client.send_produce_request(produce_requests)
def test_send_produce_request_raises_when_topic_unknown(self, protocol, conn):
    """Send producer request raises UnknownTopicOrPartitionError if the topic metadata carries that error"""
    # The mocked connection only has to return something other than None.
    conn.recv.return_value = 'response'

    broker_list = [
        BrokerMetadata(0, 'broker_1', 4567),
        BrokerMetadata(1, 'broker_2', 5678),
    ]
    # The only topic in the metadata is flagged as unknown and has no
    # partitions at all.
    unknown_topic = TopicMetadata(
        'topic_doesnt_exist', UNKNOWN_TOPIC_OR_PARTITION, [])
    protocol.decode_metadata_response.return_value = MetadataResponse(
        broker_list, [unknown_topic])

    client = KafkaClient(hosts=['broker_1:4567'])

    payload = [create_message("a"), create_message("b")]
    produce_requests = [ProduceRequest("topic_doesnt_exist", 0, payload)]

    with self.assertRaises(UnknownTopicOrPartitionError):
        client.send_produce_request(produce_requests)
def test_send_produce_request_raises_when_noleader(self, protocol, conn):
    """Send producer request raises LeaderNotAvailableError if leader is not available"""
    # Anything but None will do as the mocked connection's reply.
    conn.recv.return_value = 'response'

    first_broker = BrokerMetadata(0, 'broker_1', 4567)
    second_broker = BrokerMetadata(1, 'broker_2', 5678)

    # Two partitions, each reported with the NO_LEADER code.
    partition_zero = PartitionMetadata('topic_noleader', 0, -1, [], [], NO_LEADER)
    partition_one = PartitionMetadata('topic_noleader', 1, -1, [], [], NO_LEADER)
    topic_meta = TopicMetadata(
        'topic_noleader', NO_ERROR, [partition_zero, partition_one])

    protocol.decode_metadata_response.return_value = MetadataResponse(
        [first_broker, second_broker], [topic_meta])

    client = KafkaClient(hosts=['broker_1:4567'])

    msgs = [create_message("a"), create_message("b")]
    to_send = [ProduceRequest("topic_noleader", 0, msgs)]

    with self.assertRaises(LeaderNotAvailableError):
        client.send_produce_request(to_send)
def test_send_produce_request_raises_when_topic_unknown(self, protocol, conn):
    """Send producer request raises UnknownTopicOrPartitionError if the topic metadata carries that error"""
    # Anything but None will do as the mocked connection's reply.
    conn.recv.return_value = "response"

    first_broker = BrokerMetadata(0, "broker_1", 4567)
    second_broker = BrokerMetadata(1, "broker_2", 5678)

    # The metadata lists the topic with the unknown-topic error code and
    # an empty partition list.
    topic_meta = TopicMetadata(
        "topic_doesnt_exist", UNKNOWN_TOPIC_OR_PARTITION, [])

    protocol.decode_metadata_response.return_value = MetadataResponse(
        [first_broker, second_broker], [topic_meta])

    client = KafkaClient(hosts=["broker_1:4567"])

    msgs = [create_message("a"), create_message("b")]
    to_send = [ProduceRequest("topic_doesnt_exist", 0, msgs)]

    with self.assertRaises(UnknownTopicOrPartitionError):
        client.send_produce_request(to_send)
def produce_messages(self):
    """Produce sample messages.

    Builds one batch of ``self.batch_size`` messages, each drawn at random
    from ``self.sample_messages``, and sends that same batch
    ``self.batches`` times to partition 0 of ``self.config.kafka_topic``.
    Progress is logged after every batch.

    The Kafka client is closed on the way out, including when a send
    raises (sends are issued with ``fail_on_error=True``).
    """
    # TODO: Support different kafka port
    kafka = KafkaClient(self.config.kafka_host)
    try:
        total_messages = self.batches * self.batch_size
        messages_batch = [
            create_message(random.choice(self.sample_messages))
            for _ in range(self.batch_size)
        ]
        for i in range(self.batches):
            # TODO: Support writing to all partitions
            req = ProduceRequest(topic=self.config.kafka_topic, partition=0,
                                 messages=messages_batch)
            kafka.send_produce_request(payloads=[req], fail_on_error=True)
            # Batch ``i`` has just gone out, so ``i + 1`` batches have been
            # sent in total; the final log line therefore reports the
            # full total.
            sent_messages = (i + 1) * self.batch_size
            logging.info('Created %s out of %s sample messages',
                         sent_messages, total_messages)
    finally:
        kafka.close()
def produce_messages(self):
    """Produce sample messages.

    Builds one batch of ``self.batch_size`` messages, each drawn at random
    from ``self.sample_messages``, and sends that same batch
    ``self.batches`` times to partition 0 of ``self.config.kafka_topic``.
    Progress is logged after every batch.

    The Kafka client is closed on the way out, including when a send
    raises (sends are issued with ``fail_on_error=True``).
    """
    # TODO: Support different kafka port
    kafka = KafkaClient(self.config.kafka_host)
    try:
        total_messages = self.batches * self.batch_size
        messages_batch = [
            create_message(random.choice(self.sample_messages))
            for _ in range(self.batch_size)
        ]
        for i in range(self.batches):
            # TODO: Support writing to all partitions
            req = ProduceRequest(topic=self.config.kafka_topic, partition=0,
                                 messages=messages_batch)
            kafka.send_produce_request(payloads=[req], fail_on_error=True)
            # Batch ``i`` has just gone out, so ``i + 1`` batches have been
            # sent in total; the final log line therefore reports the
            # full total.
            sent_messages = (i + 1) * self.batch_size
            logging.info('Created %s out of %s sample messages',
                         sent_messages, total_messages)
    finally:
        kafka.close()
def test_send_produce_request_raises_when_noleader(self, protocol, conn):
    """Send producer request raises LeaderUnavailableError if leader is not available"""
    # Anything but None will do as the mocked connection's reply.
    conn.recv.return_value = 'response'

    # Metadata returned by the mocked decoder: brokers keyed by id, and a
    # single topic whose two partitions carry -1 in the third field
    # (presumably the "no leader" marker -- confirm against
    # PartitionMetadata).
    brokers_by_id = {
        0: BrokerMetadata(0, 'broker_1', 4567),
        1: BrokerMetadata(1, 'broker_2', 5678),
    }
    partitions_by_id = {
        0: PartitionMetadata('topic_noleader', 0, -1, [], []),
        1: PartitionMetadata('topic_noleader', 1, -1, [], []),
    }
    topics_by_name = {'topic_noleader': partitions_by_id}
    protocol.decode_metadata_response.return_value = (
        brokers_by_id, topics_by_name)

    client = KafkaClient(hosts=['broker_1:4567'])

    msgs = [create_message(b"a"), create_message(b"b")]
    to_send = [ProduceRequest("topic_noleader", 0, msgs)]

    with self.assertRaises(LeaderUnavailableError):
        client.send_produce_request(to_send)
def low_level():
    '''Send one message through the low-level client API and print the reply.

    Connects to ``KAFKA_SERVER``, sends a single produce request for
    partition 1 of ``topic1`` and prints the fields of the first response.
    The client is closed even if the send raises (it is issued with
    ``fail_on_error=True``).
    '''
    from kafka import KafkaClient, create_message
    from kafka.common import ProduceRequest

    kafka = KafkaClient(KAFKA_SERVER)
    try:
        req = ProduceRequest(topic=b'topic1', partition=1,
                             messages=[create_message(b'some message')])
        resps = kafka.send_produce_request(payloads=[req], fail_on_error=True)
    finally:
        kafka.close()

    # Single-argument print calls behave the same on Python 2 and 3.
    first = resps[0]
    print(first.topic)      # b'topic1'
    print(first.partition)  # 1
    print(first.error)      # 0 (hopefully)
    print(first.offset)     # offset of the first message sent in this request
class KafkaProducer(object):
    """The KafkaProducer deals with buffering messages that need to be published
    into Kafka, preparing them for publication, and ultimately publishing them.

    Messages are buffered per topic and flushed when the configured time
    limit or buffer size is reached (see ``_is_ready_to_flush``), or when
    ``flush_buffered_messages`` / ``close`` is called explicitly.

    Args:
        producer_position_callback (function): The producer position callback
            is called when the KafkaProducer is instantiated, and every time
            messages are published to notify the producer of current position
            information of successfully published messages.
        dry_run (Optional[bool]): When dry_run mode is on, the producer won't
            talk to real Kafka topic, nor to real Schematizer.  Default to
            False.
    """

    @cached_property
    def envelope(self):
        """Envelope handed to ``_prepare`` together with each message."""
        return Envelope()

    def __init__(self, producer_position_callback, dry_run=False):
        self.producer_position_callback = producer_position_callback
        self.dry_run = dry_run
        # NOTE: the client is constructed regardless of ``dry_run``.
        self.kafka_client = KafkaClient(get_config().cluster_config.broker_list)
        self.position_data_tracker = PositionDataTracker()
        # First reset: creates the buffer and reports the initial position.
        self._reset_message_buffer()
        self.skip_messages_with_pii = get_config().skip_messages_with_pii
        self._publish_retry_policy = RetryPolicy(
            ExpBackoffPolicy(with_jitter=True),
            max_retry_count=get_config().producer_max_publish_retry_count
        )
        self._automatic_flush_enabled = True

    @contextmanager
    def disable_automatic_flushing(self):
        """Prevents the producer from flushing automatically (e.g. for timeouts
        or batch size) while the context manager is open.
        """
        try:
            self._automatic_flush_enabled = False
            yield
        finally:
            self._automatic_flush_enabled = True

    def wake(self):
        """Should be called periodically if we're not otherwise waking up by
        publishing, to ensure that messages are actually published.
        """
        # if we haven't woken up in a while, we may need to flush messages
        self._flush_if_necessary()

    def publish(self, message):
        """Buffer ``message`` for publication, flushing if a limit is reached.

        Messages flagged as containing PII are logged and dropped when
        ``skip_messages_with_pii`` is set.
        """
        if message.contains_pii and self.skip_messages_with_pii:
            logger.info(
                "Skipping a PII message - "
                "uuid hex: {0}, "
                "schema_id: {1}, "
                "timestamp: {2}, "
                "type: {3}".format(
                    message.uuid_hex,
                    message.schema_id,
                    message.timestamp,
                    message.message_type.name
                )
            )
            return
        self._add_message_to_buffer(message)
        self.position_data_tracker.record_message_buffered(message)
        self._flush_if_necessary()

    def flush_buffered_messages(self):
        """Publish everything currently buffered, then reset the buffer.

        In dry-run mode the requests are only recorded in the position data
        tracker instead of being sent.
        """
        produce_method = (self._publish_produce_requests_dry_run
                          if self.dry_run else self._publish_produce_requests)
        produce_method(self._generate_produce_requests())
        self._reset_message_buffer()

    def close(self):
        """Flush the remaining messages and close the Kafka client.

        The client is closed even when the final flush raises, so a failed
        flush no longer leaks the connection; the exception still
        propagates to the caller.
        """
        try:
            self.flush_buffered_messages()
        finally:
            self.kafka_client.close()

    def _publish_produce_requests(self, requests):
        """It will try to publish all the produce requests for topics, and
        retry a number of times until either all the requests are successfully
        published or it can no longer retry, in which case, the exception will
        be thrown.

        Each time the requests that are successfully published in the previous
        round will be removed from the requests and won't be published again.

        Raises:
            MaxRetryError: if the retry handler still reports unpublished
                requests once retrying has finished.
        """
        unpublished_requests = list(requests)
        retry_handler = RetryHandler(self.kafka_client, unpublished_requests)

        def has_requests_to_be_sent():
            return bool(retry_handler.requests_to_be_sent)

        retry_handler = retry_on_condition(
            retry_policy=self._publish_retry_policy,
            retry_conditions=[Predicate(has_requests_to_be_sent)],
            func_to_retry=self._publish_requests,
            use_previous_result_as_param=True,
            retry_handler=retry_handler
        )
        if retry_handler.has_unpublished_request:
            raise MaxRetryError(last_result=retry_handler)

    def _publish_requests(self, retry_handler):
        """Main function to publish message requests.  This function is wrapped
        with retry function and will be retried based on specified retry policy

        Args:
            retry_handler: :class:`data_pipeline._producer_retry.RetryHandler`
                that determines which messages should be retried next time.

        Returns:
            The same ``retry_handler``, updated with the outcome of this
            attempt.
        """
        if not retry_handler.requests_to_be_sent:
            return retry_handler

        responses = self._try_send_produce_requests(
            retry_handler.requests_to_be_sent
        )

        retry_handler.update_requests_to_be_sent(
            responses,
            self.position_data_tracker.topic_to_kafka_offset_map
        )
        self._record_success_requests(retry_handler.success_topic_stats_map)
        return retry_handler

    def _try_send_produce_requests(self, requests):
        """Send ``requests``; return the responses, or ``[]`` if sending raised."""
        # Either it throws exceptions and none of them succeeds, or it returns
        # responses of all the requests (success or fail response).
        try:
            return self.kafka_client.send_produce_request(
                payloads=requests,
                acks=get_config().kafka_client_ack_count,
                fail_on_error=False
            )
        except Exception:
            # Exceptions like KafkaUnavailableError, LeaderNotAvailableError,
            # UnknownTopicOrPartitionError, etc., are not controlled by
            # `fail_on_error` flag and could be thrown from the kafka client,
            # and fail all the requests.  We will retry all the requests until
            # either all of them are successfully published or it exceeds the
            # maximum retry criteria.
            # Log the failure so it is not swallowed without a trace.
            logger.exception(
                "Failed to send produce requests; all of them will be retried."
            )
            return []

    def _record_success_requests(self, success_topic_stats_map):
        """Record published topics in the tracker and drop them from the buffer."""
        for topic_partition, stats in success_topic_stats_map.iteritems():
            topic = topic_partition.topic_name
            # Invariant: a topic's whole buffer is sent as one request.
            assert stats.message_count == len(self.message_buffer[topic])
            self.position_data_tracker.record_messages_published(
                topic=topic,
                offset=stats.original_offset,
                message_count=stats.message_count
            )
            self.message_buffer.pop(topic)

    def _publish_produce_requests_dry_run(self, requests):
        """Dry-run counterpart of ``_publish_produce_requests``."""
        for request in requests:
            self._publish_single_request_dry_run(request)

    def _publish_single_request_dry_run(self, request):
        """Record ``request`` as published at offset -1 without sending it."""
        topic = request.topic
        message_count = len(request.messages)
        self.position_data_tracker.record_messages_published(
            topic,
            -1,
            message_count
        )

    def _is_ready_to_flush(self):
        """Whether automatic flushing is enabled and a limit has been reached.

        The limits are the configured flush time limit (measured from the
        last buffer reset) and the configured buffer size.
        """
        time_limit = get_config().kafka_producer_flush_time_limit_seconds
        return (self._automatic_flush_enabled and (
            (time.time() - self.start_time) >= time_limit or
            self.message_buffer_size >= get_config().kafka_producer_buffer_size
        ))

    def _flush_if_necessary(self):
        if self._is_ready_to_flush():
            self.flush_buffered_messages()

    def _add_message_to_buffer(self, message):
        """Prepare ``message`` and append it to its topic's buffer."""
        topic = message.topic
        message = self._prepare_message(message)

        self.message_buffer[topic].append(message)
        self.message_buffer_size += 1

    def _generate_produce_requests(self):
        """Build one ProduceRequest (partition 0) per buffered topic."""
        return [
            ProduceRequest(topic=topic, partition=0, messages=messages)
            for topic, messages in self._generate_prepared_topic_and_messages()
        ]

    def _generate_prepared_topic_and_messages(self):
        return self.message_buffer.iteritems()

    def _prepare_message(self, message):
        return _prepare(_EnvelopeAndMessage(envelope=self.envelope, message=message))

    def _reset_message_buffer(self):
        """Report the current position, then start a fresh, empty buffer.

        The callback fires on the very first reset (no buffer exists yet)
        and whenever the buffer being discarded held messages.
        """
        if not hasattr(self, 'message_buffer_size') or self.message_buffer_size > 0:
            self.producer_position_callback(self.position_data_tracker.get_position_data())

        self.start_time = time.time()
        self.message_buffer = defaultdict(list)
        self.message_buffer_size = 0
stream.add_filter('record-type', 'ribs') stream.add_filter('record-type', 'updates') stream.add_interval_filter(last_ts, 0) # Start the stream stream.start() client = KafkaClient(args.our_servers.split(",")) count = 0 for batch in group_by_n( messages_from_internal(iterate_stream(stream, args.collector)), 1000): req = ProduceRequest("rib-{}".format(args.collector), 0, batch) for msg in reversed(req.messages): if msg.value is None: continue last_timestamp = json.loads(msg.value)["timestamp"] break count += len(batch) logger.info("sending %i", count) res = client.send_produce_request([req]) try: # this is a bit buggy but it will do for now with open(save_file, "w") as f: f.write(str(last_timestamp)) except: logger.warning("could not write offsets to %s", save_file) pass
class KafkaProducer(object):
    """The KafkaProducer deals with buffering messages that need to be published
    into Kafka, preparing them for publication, and ultimately publishing them.

    Messages are buffered per topic and flushed when the configured time
    limit or buffer size is reached (see ``_is_ready_to_flush``), or when
    ``flush_buffered_messages`` / ``close`` is called explicitly.

    Args:
        producer_position_callback (function): The producer position callback
            is called when the KafkaProducer is instantiated, and every time
            messages are published to notify the producer of current position
            information of successfully published messages.
        dry_run (Optional[bool]): When dry_run mode is on, the producer won't
            talk to real Kafka topic, nor to real Schematizer.  Default to
            False.
    """

    @cached_property
    def envelope(self):
        """Envelope handed to ``_prepare`` together with each message."""
        return Envelope()

    def __init__(self, producer_position_callback, dry_run=False):
        self.producer_position_callback = producer_position_callback
        self.dry_run = dry_run
        # NOTE: the client is constructed regardless of ``dry_run``.
        self.kafka_client = KafkaClient(get_config().cluster_config.broker_list)
        self.position_data_tracker = PositionDataTracker()
        # First reset: creates the buffer and reports the initial position.
        self._reset_message_buffer()
        self.skip_messages_with_pii = get_config().skip_messages_with_pii
        self._publish_retry_policy = RetryPolicy(
            ExpBackoffPolicy(with_jitter=True),
            max_retry_count=get_config().producer_max_publish_retry_count
        )
        self._automatic_flush_enabled = True

    @contextmanager
    def disable_automatic_flushing(self):
        """Prevents the producer from flushing automatically (e.g. for timeouts
        or batch size) while the context manager is open.
        """
        try:
            self._automatic_flush_enabled = False
            yield
        finally:
            self._automatic_flush_enabled = True

    def wake(self):
        """Should be called periodically if we're not otherwise waking up by
        publishing, to ensure that messages are actually published.
        """
        # if we haven't woken up in a while, we may need to flush messages
        self._flush_if_necessary()

    def publish(self, message):
        """Buffer ``message`` for publication, flushing if a limit is reached.

        Messages flagged as containing PII are logged and dropped when
        ``skip_messages_with_pii`` is set.
        """
        if message.contains_pii and self.skip_messages_with_pii:
            logger.info(
                "Skipping a PII message - "
                "uuid hex: {0}, "
                "schema_id: {1}, "
                "timestamp: {2}, "
                "type: {3}".format(
                    message.uuid_hex,
                    message.schema_id,
                    message.timestamp,
                    message.message_type.name
                )
            )
            return
        self._add_message_to_buffer(message)
        self.position_data_tracker.record_message_buffered(message)
        self._flush_if_necessary()

    def flush_buffered_messages(self):
        """Publish everything currently buffered, then reset the buffer.

        In dry-run mode the requests are only recorded in the position data
        tracker instead of being sent.
        """
        produce_method = (self._publish_produce_requests_dry_run
                          if self.dry_run else self._publish_produce_requests)
        produce_method(self._generate_produce_requests())
        self._reset_message_buffer()

    def close(self):
        """Flush the remaining messages and close the Kafka client.

        The client is closed even when the final flush raises, so a failed
        flush no longer leaks the connection; the exception still
        propagates to the caller.
        """
        try:
            self.flush_buffered_messages()
        finally:
            self.kafka_client.close()

    def _publish_produce_requests(self, requests):
        """It will try to publish all the produce requests for topics, and
        retry a number of times until either all the requests are successfully
        published or it can no longer retry, in which case, the exception will
        be thrown.

        Each time the requests that are successfully published in the previous
        round will be removed from the requests and won't be published again.

        Raises:
            MaxRetryError: if the retry handler still reports unpublished
                requests once retrying has finished.
        """
        unpublished_requests = list(requests)
        retry_handler = RetryHandler(self.kafka_client, unpublished_requests)

        def has_requests_to_be_sent():
            return bool(retry_handler.requests_to_be_sent)

        retry_handler = retry_on_condition(
            retry_policy=self._publish_retry_policy,
            retry_conditions=[Predicate(has_requests_to_be_sent)],
            func_to_retry=self._publish_requests,
            use_previous_result_as_param=True,
            retry_handler=retry_handler
        )
        if retry_handler.has_unpublished_request:
            raise MaxRetryError(last_result=retry_handler)

    def _publish_requests(self, retry_handler):
        """Main function to publish message requests.  This function is wrapped
        with retry function and will be retried based on specified retry policy

        Args:
            retry_handler: :class:`data_pipeline._producer_retry.RetryHandler`
                that determines which messages should be retried next time.

        Returns:
            The same ``retry_handler``, updated with the outcome of this
            attempt.
        """
        if not retry_handler.requests_to_be_sent:
            return retry_handler

        responses = self._try_send_produce_requests(
            retry_handler.requests_to_be_sent
        )

        # Refresh the tracked offsets of the topics that answered before the
        # retry handler compares them against the responses.
        topics_watermarks = self._populate_topics_to_offset_map(responses)
        self.position_data_tracker.topic_to_kafka_offset_map.update(
            topics_watermarks
        )

        retry_handler.update_requests_to_be_sent(
            responses,
            self.position_data_tracker.topic_to_kafka_offset_map
        )
        self._record_success_requests(retry_handler.success_topic_stats_map)
        return retry_handler

    def _populate_topics_to_offset_map(self, responses):
        """Map each topic found in ``responses`` to partition 0's high watermark.

        Only ``ProduceResponse`` items are considered.  When there are none
        (for instance because sending failed and produced no responses) an
        empty mapping is returned without querying Kafka.
        """
        topics_from_responses = [
            response.topic for response in responses
            if isinstance(response, ProduceResponse)
        ]
        if not topics_from_responses:
            # Nothing to look up; skip the watermark request entirely.
            return {}

        topics_watermarks = get_topics_watermarks(
            kafka_client=self.kafka_client,
            topics=topics_from_responses,
            raise_on_error=True
        )
        topics_watermarks = {
            topic: partition_offsets[0].highmark
            for topic, partition_offsets in topics_watermarks.iteritems()
        }
        return topics_watermarks

    def _try_send_produce_requests(self, requests):
        """Send ``requests``; return the responses, or ``[]`` if sending raised."""
        # Either it throws exceptions and none of them succeeds, or it returns
        # responses of all the requests (success or fail response).
        try:
            return self.kafka_client.send_produce_request(
                payloads=requests,
                acks=get_config().kafka_client_ack_count,
                fail_on_error=False
            )
        except Exception:
            # Exceptions like KafkaUnavailableError, LeaderNotAvailableError,
            # UnknownTopicOrPartitionError, etc., are not controlled by
            # `fail_on_error` flag and could be thrown from the kafka client,
            # and fail all the requests.  We will retry all the requests until
            # either all of them are successfully published or it exceeds the
            # maximum retry criteria.
            # Log the failure so it is not swallowed without a trace.
            logger.exception(
                "Failed to send produce requests; all of them will be retried."
            )
            return []

    def _record_success_requests(self, success_topic_stats_map):
        """Record published topics in the tracker and drop them from the buffer."""
        for topic_partition, stats in success_topic_stats_map.iteritems():
            topic = topic_partition.topic_name
            # Invariant: a topic's whole buffer is sent as one request.
            assert stats.message_count == len(self.message_buffer[topic])
            self.position_data_tracker.record_messages_published(
                topic=topic,
                offset=stats.original_offset,
                message_count=stats.message_count
            )
            self.message_buffer.pop(topic)

    def _publish_produce_requests_dry_run(self, requests):
        """Dry-run counterpart of ``_publish_produce_requests``."""
        for request in requests:
            self._publish_single_request_dry_run(request)

    def _publish_single_request_dry_run(self, request):
        """Record ``request`` as published at offset -1 without sending it."""
        topic = request.topic
        message_count = len(request.messages)
        self.position_data_tracker.record_messages_published(
            topic,
            -1,
            message_count
        )

    def _is_ready_to_flush(self):
        """Whether automatic flushing is enabled and a limit has been reached.

        The limits are the configured flush time limit (measured from the
        last buffer reset) and the configured buffer size.
        """
        time_limit = get_config().kafka_producer_flush_time_limit_seconds
        return (self._automatic_flush_enabled and (
            (time.time() - self.start_time) >= time_limit or
            self.message_buffer_size >= get_config().kafka_producer_buffer_size
        ))

    def _flush_if_necessary(self):
        if self._is_ready_to_flush():
            self.flush_buffered_messages()

    def _add_message_to_buffer(self, message):
        """Prepare ``message`` and append it to its topic's buffer."""
        topic = message.topic
        message = self._prepare_message(message)

        self.message_buffer[topic].append(message)
        self.message_buffer_size += 1

    def _generate_produce_requests(self):
        """Build one ProduceRequest (partition 0) per buffered topic."""
        return [
            ProduceRequest(topic=topic, partition=0, messages=messages)
            for topic, messages in self._generate_prepared_topic_and_messages()
        ]

    def _generate_prepared_topic_and_messages(self):
        return self.message_buffer.iteritems()

    def _prepare_message(self, message):
        return _prepare(_EnvelopeAndMessage(envelope=self.envelope, message=message))

    def _reset_message_buffer(self):
        """Report the current position, then start a fresh, empty buffer.

        The callback fires on the very first reset (no buffer exists yet)
        and whenever the buffer being discarded held messages.
        """
        if not hasattr(self, 'message_buffer_size') or self.message_buffer_size > 0:
            self.producer_position_callback(self.position_data_tracker.get_position_data())

        self.start_time = time.time()
        self.message_buffer = defaultdict(list)
        self.message_buffer_size = 0
relation.labels(args.collector).inc() filter_out = True if "direct" in msg: connected.labels(args.collector).inc() filter_out = True if msg.get("caida_private", False) is True: caida_private.labels(args.collector).inc() filter_out = True if msg.get("caida_as2org", False) is True: caida_as2org.labels(args.collector).inc() filter_out = True if msg.get("caida_relation", False) is True: caida_relation.labels(args.collector).inc() filter_out = True if msg.get("caida_cone", False) is True: caida_cone.labels(args.collector).inc() filter_out = True if msg.get("caida_as2rel", False) is True: caida_as2rel.labels(args.collector).inc() filter_out = True all_events.labels(args.collector, filter_out).inc() if filter_out: continue abnormal.labels(args.collector).inc() client.send_produce_request([ProduceRequest("conflicts", PARTITIONS[args.collector], [create_message(json.dumps(msg))])])
#Listing A.1.5
from kafka import KafkaClient, SimpleProducer

kafka = KafkaClient("localhost:9092")

# Synchronous sending is SimpleProducer's default, so the ``async`` keyword
# (a reserved word from Python 3.7 on) is not passed explicitly.
producer = SimpleProducer(kafka,
                          req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT,
                          ack_timeout=2000)

producer.send_messages("test-replicated-topic", "Hello Kafka Cluster!")
producer.send_messages("test-replicated-topic", "Message to be replicated.")
producer.send_messages("test-replicated-topic", "And so is this!")

#Listing A.1.8
from kafka import KafkaClient
from kafka.common import ProduceRequest
from kafka.protocol import KafkaProtocol, create_message

kafka = KafkaClient("localhost:9092")

# The ``with`` block closes the data file once every line has been sent,
# or as soon as a send fails.
with open('A1.data', 'r') as f:
    for line in f:
        # The first tab-separated field decides which of the three
        # partitions receives the message.
        s = line.split("\t")[0]
        # NOTE(review): on Python 3, str hashes are randomized per process
        # unless PYTHONHASHSEED is fixed, so this assignment is only
        # stable within a single run.
        part = abs(hash(s)) % 3
        req = ProduceRequest(topic="click-streams", partition=part,
                             messages=[create_message(s)])
        resps = kafka.send_produce_request(payloads=[req], fail_on_error=True)
for message in consumer:
    print(message)

#Listing A.1.5
from kafka import KafkaClient, SimpleProducer

kafka = KafkaClient("localhost:9092")

# Synchronous sending is SimpleProducer's default, so the ``async`` keyword
# (a reserved word from Python 3.7 on) is not passed explicitly.
producer = SimpleProducer(kafka,
                          req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT,
                          ack_timeout=2000)

producer.send_messages("test-replicated-topic", "Hello Kafka Cluster!")
producer.send_messages("test-replicated-topic", "Message to be replicated.")
producer.send_messages("test-replicated-topic", "And so is this!")

#Listing A.1.8
from kafka import KafkaClient
from kafka.common import ProduceRequest
from kafka.protocol import KafkaProtocol, create_message

kafka = KafkaClient("localhost:9092")

# The ``with`` block closes the data file once every line has been sent,
# or as soon as a send fails.
with open('A1.data', 'r') as f:
    for line in f:
        # The first tab-separated field decides which of the three
        # partitions receives the message.
        s = line.split("\t")[0]
        # NOTE(review): on Python 3, str hashes are randomized per process
        # unless PYTHONHASHSEED is fixed, so this assignment is only
        # stable within a single run.
        part = abs(hash(s)) % 3
        req = ProduceRequest(topic="click-streams", partition=part,
                             messages=[create_message(s)])
        resps = kafka.send_produce_request(payloads=[req], fail_on_error=True)
bootstrap_servers=args.ripe_servers.split(",")) save_file = "offsets-{}".format(args.collector) if args.from_beginning: logger.info("starting from scratch") offsets = {("raw-{}".format(args.collector), i): 0 for i in range(0, 10)} consumer.set_topic_partitions(offsets) elif os.path.exists(save_file): with open(save_file, "r") as f: offsets = cPickle.load(f) logger.info("loading offsets from file: %s", offsets) consumer.set_topic_partitions(offsets) else: logger.info("starting from last messages") client = KafkaClient(args.our_servers.split(",")) count = 0 for batch in group_by_n(messages_from_internal(iterate_messages(consumer, args.collector)), 1000): req = ProduceRequest("rib-{}".format(args.collector), 0, batch) count += len(batch) logger.info("sending %i", count) res = client.send_produce_request([req]) offsets = consumer.offsets("fetch") try: # this is a bit buggy but it will do for now with open(save_file, "w") as f: f.write(cPickle.dumps(offsets)) except: logger.warning("could not write offsets to %s", save_file) pass