def synchronize_hosts(select_query, event_producer, chunk_size, config, interrupt=lambda: False):
    query = select_query.order_by(Host.id)
    host_list = query.limit(chunk_size).all()

    while len(host_list) > 0 and not interrupt():
        for host in host_list:
            serialized_host = serialize_host(host, Timestamps.from_config(config), EGRESS_HOST_FIELDS)
            event = build_event(EventType.updated, serialized_host)
            insights_id = host.canonical_facts.get("insights_id")
            headers = message_headers(EventType.updated, insights_id)
            # in case of a failed update event, event_producer logs the message.
            event_producer.write_event(event, str(host.id), headers, Topic.events)
            synchronize_host_count.inc()
            yield host.id

        try:
            # pace the events production speed as flush completes sending all buffered records.
            event_producer._kafka_producer.flush(300)
        except KafkaTimeoutError:
            raise KafkaTimeoutError(
                f"KafkaTimeoutError: failure to flush {chunk_size} records within 300 seconds"
            )

        # load next chunk using keyset pagination
        host_list = query.filter(Host.id > host_list[-1].id).limit(chunk_size).all()
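# A minimal usage sketch (an assumption, not taken from the source): it shows how
# the generator above might be driven. The caller paces work simply by iterating,
# and can stop between chunks via the `interrupt` callback. `query`, `producer`,
# and `inventory_config` are hypothetical stand-ins for the real objects.
import threading

def run_host_sync(query, producer, inventory_config, chunk_size=1000, stop_event=None):
    stop_event = stop_event or threading.Event()
    synced = 0
    for _host_id in synchronize_hosts(query, producer, chunk_size, inventory_config,
                                      interrupt=stop_event.is_set):
        synced += 1
    return synced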
def ensure_topic_exists(self, topic, timeout=30):
    start_time = time.time()
    while not self.has_metadata_for_topic(topic):
        if time.time() > start_time + timeout:
            raise KafkaTimeoutError('Unable to create topic {0}'.format(topic))
        self.load_metadata_for_topics(topic, ignore_leadernotavailable=True)
        time.sleep(.5)
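# Hedged usage sketch (not from the source): call the ensure_topic_exists()
# helper above before producing, and treat a KafkaTimeoutError as "topic never
# appeared in metadata" rather than letting it propagate. `client` is a
# hypothetical object exposing the method above.
import logging

from kafka.errors import KafkaTimeoutError

log = logging.getLogger(__name__)

def topic_is_ready(client, topic, timeout=30):
    try:
        client.ensure_topic_exists(topic, timeout=timeout)
        return True
    except KafkaTimeoutError:
        log.warning("topic %r not available after %ss; skipping produce", topic, timeout)
        return False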
def setUp(self):
    super(KafkaIntegrationTestCase, self).setUp()
    if not os.environ.get('KAFKA_VERSION'):
        self.skipTest('Integration test requires KAFKA_VERSION')

    if not self.topic:
        topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10))
        self.topic = topic

    if self.create_client:
        self.client = SimpleClient('%s:%d' % (self.server.host, self.server.port))
        self.client_async = KafkaClient(
            bootstrap_servers='%s:%d' % (self.server.host, self.server.port))

    timeout = time.time() + 30
    while time.time() < timeout:
        try:
            self.client.load_metadata_for_topics(
                self.topic, ignore_leadernotavailable=False)
            if self.client.has_metadata_for_topic(topic):
                break
        except (LeaderNotAvailableError, InvalidTopicError):
            time.sleep(1)
    else:
        raise KafkaTimeoutError('Timeout loading topic metadata!')

    # Ensure topic partitions have been created on all brokers to avoid UnknownPartitionErrors
    # TODO: It might be a good idea to move this to self.client.ensure_topic_exists
    for partition in self.client.get_partition_ids_for_topic(self.topic):
        while True:
            try:
                req = OffsetRequestPayload(self.topic, partition, -1, 100)
                self.client.send_offset_request([req])
                break
            except (NotLeaderForPartitionError, UnknownTopicOrPartitionError, FailedPayloadsError) as e:
                if time.time() > timeout:
                    raise KafkaTimeoutError('Timeout loading topic metadata!')
                time.sleep(.1)

    self._messages = {}
def test_bulk_write_error(self, mocked_logger):
    transport_error = KafkaTimeoutError('mocked error')
    es_index_error_ctx = mock.patch(
        'time_execution.backends.kafka.KafkaProducer.send',
        side_effect=transport_error)
    metrics = [1, 2, 3]
    with es_index_error_ctx:
        self.backend.bulk_write(metrics)
        mocked_logger.warning.assert_called_once_with(
            'bulk_write metrics %r failure %r', metrics, transport_error)
def test_send_sync_timeout_error(self):
    sut = KafkaWriter(config, str)
    future_mock = Mock()
    future_mock.get.side_effect = KafkaTimeoutError()
    sut._producer = MagicMock()
    sut._producer.send.return_value = future_mock

    with self.assertRaises(KafkaTimeoutError):
        sut.send_sync(msg)

    sut._producer.send.assert_called_once_with(topic, msg)
    future_mock.get.assert_called_once_with(timeout=timeout)
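# A plausible shape for the send_sync() under test (an assumption inferred from
# the mocks above, not the project's actual source): with kafka-python,
# KafkaProducer.send() returns a future whose .get(timeout=...) re-raises
# KafkaTimeoutError when the record is not delivered in time.
class KafkaWriterSketch:
    def __init__(self, producer, topic, timeout=10):
        self._producer = producer  # e.g. kafka.KafkaProducer(bootstrap_servers=...)
        self._topic = topic
        self._timeout = timeout

    def send_sync(self, msg):
        # Blocks until the broker acknowledges the record or the timeout expires.
        future = self._producer.send(self._topic, msg)
        return future.get(timeout=self._timeout)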
def test_write_error_warning(self, mocked_logger):
    transport_error = KafkaTimeoutError('mocked error')
    es_index_error_ctx = mock.patch(
        'time_execution.backends.kafka.KafkaProducer.send',
        side_effect=transport_error)
    frozen_time_ctx = freeze_time('2016-07-13')

    with es_index_error_ctx, frozen_time_ctx:
        self.backend.write(name='test:metric', value=None)
        mocked_logger.warning.assert_called_once_with(
            'writing metric %r failure %r',
            {
                'timestamp': datetime(2016, 7, 13),
                'value': None,
                'name': 'test:metric',
            },
            transport_error,
        )
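# Hedged sketch of the error-handling contract both logging tests above exercise
# (an assumption about the backend, not its actual implementation): a failed
# KafkaProducer.send() is swallowed and logged via logger.warning() so that
# emitting metrics can never break the instrumented caller.
import logging

logger = logging.getLogger(__name__)

class KafkaMetricsBackendSketch:
    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic

    def write(self, **metric):
        try:
            self.producer.send(self.topic, metric)
        except Exception as exc:
            logger.warning('writing metric %r failure %r', metric, exc)

    def bulk_write(self, metrics):
        try:
            for metric in metrics:
                self.producer.send(self.topic, metric)
        except Exception as exc:
            logger.warning('bulk_write metrics %r failure %r', metrics, exc)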
def test_process_item(self, e):
    item = self._get_item()
    spider = MagicMock()
    spider.name = "link"

    # test normal send, no appid topics
    self.pipe.process_item(item, spider)
    expected = '{"appid":"app","attrs":{},"body":"text","crawlid":"crawlid","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
    self.pipe.producer.send.assert_called_once_with('prefix.crawled_firehose', expected)
    self.pipe.producer.send.reset_mock()

    # test normal send, with appids
    item = self._get_item()
    self.pipe.appid_topics = True
    self.pipe.process_item(item, spider)
    self.pipe.producer.send.assert_called_with('prefix.crawled_app', expected)
    self.pipe.producer.send.reset_mock()

    # test base64 encode
    item = self._get_item()
    self.pipe.appid_topics = False
    self.pipe.use_base64 = True
    self.pipe.process_item(item, spider)
    expected = '{"appid":"app","attrs":{},"body":"dGV4dA==","crawlid":"crawlid","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
    self.pipe.producer.send.assert_called_once_with('prefix.crawled_firehose', expected)

    # test kafka exception
    item = self._get_item()
    copy = deepcopy(item)
    copy['success'] = False
    copy['exception'] = 'traceback'
    # send should not crash the pipeline
    self.pipe.producer.send = MagicMock(side_effect=KafkaTimeoutError('bad kafka'))
    ret_val = self.pipe.process_item(item, spider)
def send(topic, value):
    raise KafkaTimeoutError(TEST_ERROR)
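# Minimal usage sketch for the stub above (an assumption, not from the source):
# attach it to a mocked producer so code under test sees a KafkaTimeoutError
# without a real broker. TEST_ERROR is a hypothetical module-level constant.
from unittest.mock import MagicMock

TEST_ERROR = 'simulated timeout'

def make_failing_producer():
    producer = MagicMock()
    producer.send.side_effect = send  # reuse the stub defined above
    return producer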
def test_process_item(self, e):
    item = self._get_item()
    spider = MagicMock()
    spider.name = "link"

    # test normal send, no appid topics
    self.pipe.process_item(item, spider)
    expected = '{"appid":"app","attrs":{},"body":"text","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
    self.pipe.producer.send.assert_called_once_with(
        'prefix.crawled_firehose', expected)
    self.pipe.producer.send.reset_mock()

    # test normal send, with appids
    item = self._get_item()
    self.pipe.appid_topics = True
    self.pipe.process_item(item, spider)
    self.pipe.producer.send.assert_called_with('prefix.crawled_app', expected)
    self.pipe.producer.send.reset_mock()

    # test base64 encode
    item = self._get_item()
    self.pipe.appid_topics = False
    self.pipe.use_base64 = True
    self.pipe.process_item(item, spider)
    expected = '{"appid":"app","attrs":{},"body":"dGV4dA==","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
    self.pipe.producer.send.assert_called_with('prefix.crawled_firehose', expected)

    # test base64 encode/decode with utf-8 encoding
    item = self._get_internationalized_utf8_item()
    self.pipe.appid_topics = False
    self.pipe.use_base64 = True
    self.pipe.process_item(item, spider)
    expected = '{"appid":"app","attrs":{},"body":"VGhpcyBpcyBhIHRlc3QgLSDOkc+Fz4TPjCDOtc6vzr3Osc65IM6tzr3OsSDPhM61z4PPhCAtIOi\\/meaYr+S4gOS4qua1i+ivlSAtIOOBk+OCjOOBr+ODhuOCueODiOOBp+OBmQ==","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
    self.pipe.producer.send.assert_called_with('prefix.crawled_firehose', expected)

    # unpack the arguments used for the previous assertion call
    call_args, call_kwargs = self.pipe.producer.send.call_args
    crawl_args_dict = ujson.loads(call_args[1])
    decoded_string = base64.b64decode(crawl_args_dict['body']).decode(
        crawl_args_dict['encoding'])
    self.assertEquals(decoded_string, item.get('body'))

    # test base64 encode/decode with iso encoding
    item = self._get_internationalized_iso_item()
    self.pipe.appid_topics = False
    self.pipe.use_base64 = True
    self.pipe.process_item(item, spider)
    expected = '{"appid":"app","attrs":{},"body":"4fX0\\/CDl3+3h6SDd7eEg9OXz9Ao=","crawlid":"crawlid","encoding":"iso-8859-7","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
    self.pipe.producer.send.assert_called_with('prefix.crawled_firehose', expected)

    # unpack the arguments used for the previous assertion call
    call_args, call_kwargs = self.pipe.producer.send.call_args
    crawl_args_dict = ujson.loads(call_args[1])
    decoded_string = base64.b64decode(crawl_args_dict['body']).decode(
        crawl_args_dict['encoding'])
    self.assertEquals(decoded_string, item.get('body').decode(item.get('encoding')))
    # Test again against the original (before it was encoded in iso) string
    self.assertEquals(decoded_string, u"αυτό είναι ένα τεστ\n")

    # test kafka exception
    item = self._get_item()
    copy = deepcopy(item)
    copy['success'] = False
    copy['exception'] = 'traceback'
    # send should not crash the pipeline
    self.pipe.producer.send = MagicMock(
        side_effect=KafkaTimeoutError('bad kafka'))
    ret_val = self.pipe.process_item(item, spider)