Example #1
def synchronize_hosts(select_query,
                      event_producer,
                      chunk_size,
                      config,
                      interrupt=lambda: False):
    query = select_query.order_by(Host.id)
    host_list = query.limit(chunk_size).all()

    while len(host_list) > 0 and not interrupt():
        for host in host_list:
            serialized_host = serialize_host(host,
                                             Timestamps.from_config(config),
                                             EGRESS_HOST_FIELDS)
            event = build_event(EventType.updated, serialized_host)
            insights_id = host.canonical_facts.get("insights_id")
            headers = message_headers(EventType.updated, insights_id)
            # in case of a failed update event, event_producer logs the message.
            event_producer.write_event(event, str(host.id), headers,
                                       Topic.events)
            synchronize_host_count.inc()

            yield host.id

        try:
            # pace event production: flush() blocks until all buffered records have been sent.
            event_producer._kafka_producer.flush(300)
        except KafkaTimeoutError:
            raise KafkaTimeoutError(
                f"KafkaTimeoutError: failure to flush {chunk_size} records within 300 seconds"
            )

        # load next chunk using keyset pagination
        host_list = query.filter(
            Host.id > host_list[-1].id).limit(chunk_size).all()
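
The chunk reload at the end uses keyset pagination (filtering on Host.id greater than the last id seen) rather than OFFSET, so each query stays cheap on large tables. A self-contained sketch of that pattern, with an in-memory stand-in for the query:

def paginate(fetch_chunk, chunk_size):
    # fetch_chunk(last_id, n) returns up to n rows strictly after last_id
    last_id = None
    chunk = fetch_chunk(last_id, chunk_size)
    while chunk:
        for row in chunk:
            yield row
        last_id = chunk[-1]
        chunk = fetch_chunk(last_id, chunk_size)

rows = list(range(10))
fetch = lambda after, n: [r for r in rows if after is None or r > after][:n]
assert list(paginate(fetch, 3)) == rows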
Example #2
    def ensure_topic_exists(self, topic, timeout=30):
        start_time = time.time()

        while not self.has_metadata_for_topic(topic):
            if time.time() > start_time + timeout:
                raise KafkaTimeoutError('Unable to create topic {0}'.format(topic))
            self.load_metadata_for_topics(topic, ignore_leadernotavailable=True)
            time.sleep(.5)
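
A hedged usage sketch, assuming kafka-python 1.x (SimpleClient is the legacy client and was removed in 2.0; the broker address and topic name are illustrative):

from kafka import SimpleClient
from kafka.errors import KafkaTimeoutError

client = SimpleClient('localhost:9092')
try:
    client.ensure_topic_exists('my-topic', timeout=30)
except KafkaTimeoutError:
    # metadata for the topic never appeared within the timeout
    raise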
Example #3
    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            self.skipTest('Integration test requires KAFKA_VERSION')

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:],
                               random_string(10))
            self.topic = topic

        if self.create_client:
            self.client = SimpleClient('%s:%d' %
                                       (self.server.host, self.server.port))
            self.client_async = KafkaClient(
                bootstrap_servers='%s:%d' %
                (self.server.host, self.server.port))

        timeout = time.time() + 30
        while time.time() < timeout:
            try:
                self.client.load_metadata_for_topics(
                    self.topic, ignore_leadernotavailable=False)
                if self.client.has_metadata_for_topic(self.topic):
                    break
            except (LeaderNotAvailableError, InvalidTopicError):
                time.sleep(1)
        else:
            raise KafkaTimeoutError('Timeout loading topic metadata!')

        # Ensure topic partitions have been created on all brokers to avoid UnknownPartitionErrors
        # TODO: It might be a good idea to move this to self.client.ensure_topic_exists
        for partition in self.client.get_partition_ids_for_topic(self.topic):
            while True:
                try:
                    req = OffsetRequestPayload(self.topic, partition, -1, 100)
                    self.client.send_offset_request([req])
                    break
                except (NotLeaderForPartitionError,
                        UnknownTopicOrPartitionError,
                        FailedPayloadsError) as e:
                    if time.time() > timeout:
                        raise KafkaTimeoutError(
                            'Timeout loading topic metadata!')
                    time.sleep(.1)

        self._messages = {}
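
Examples #2 and #3 rely on the same deadline idiom: a wall-clock cutoff in the loop condition, with while/else turning loop exhaustion into a timeout error. Distilled into a self-contained helper (the name and defaults are assumptions, not taken from the code above):

import time

from kafka.errors import KafkaTimeoutError

def wait_until(predicate, timeout=30.0, interval=0.5):
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            break
        time.sleep(interval)
    else:
        # the condition went false without a break: the deadline passed
        raise KafkaTimeoutError('condition not met within %.1f seconds' % timeout)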
Example #4
    def test_bulk_write_error(self, mocked_logger):
        transport_error = KafkaTimeoutError('mocked error')
        es_index_error_ctx = mock.patch(
            'time_execution.backends.kafka.KafkaProducer.send',
            side_effect=transport_error)
        metrics = [1, 2, 3]
        with es_index_error_ctx:
            self.backend.bulk_write(metrics)
            mocked_logger.warning.assert_called_once_with(
                'bulk_write metrics %r failure %r', metrics, transport_error)
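
The test asserts that bulk_write logs a warning rather than propagating the producer error. A sketch of the shape such a backend plausibly has (a hypothetical class, not time_execution's actual source):

import logging

logger = logging.getLogger(__name__)

class SketchKafkaBackend(object):  # hypothetical stand-in
    def __init__(self, producer, topic='metrics'):
        self.producer = producer
        self.topic = topic

    def bulk_write(self, metrics):
        try:
            for metric in metrics:
                self.producer.send(self.topic, metric)
        except Exception as exc:
            # swallow transport errors so metric reporting never breaks the app
            logger.warning('bulk_write metrics %r failure %r', metrics, exc)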
Example #5
    def test_send_sync_timeout_error(self):
        sut = KafkaWriter(config, str)
        future_mock = Mock()
        future_mock.get.side_effect = KafkaTimeoutError()
        sut._producer = MagicMock()
        sut._producer.send.return_value = future_mock

        with self.assertRaises(KafkaTimeoutError):
            sut.send_sync(msg)

        # these assertions must sit outside the assertRaises block; the raised
        # exception exits the block early, so inside it they would never run
        sut._producer.send.assert_called_once_with(topic, msg)
        future_mock.get.assert_called_once_with(timeout=timeout)
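
The test exercises kafka-python's synchronous-send idiom: send() returns a future, and future.get(timeout) raises KafkaTimeoutError if the broker does not acknowledge in time. A hedged sketch of what send_sync presumably wraps (KafkaWriter's real implementation may differ):

def send_sync(producer, topic, msg, timeout=10):
    # send() is asynchronous; get() blocks until the record is acknowledged
    # or raises KafkaTimeoutError once `timeout` seconds elapse
    future = producer.send(topic, msg)
    return future.get(timeout=timeout)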
Example #6
    def test_write_error_warning(self, mocked_logger):
        transport_error = KafkaTimeoutError('mocked error')
        es_index_error_ctx = mock.patch(
            'time_execution.backends.kafka.KafkaProducer.send',
            side_effect=transport_error)
        frozen_time_ctx = freeze_time('2016-07-13')

        with es_index_error_ctx, frozen_time_ctx:
            self.backend.write(name='test:metric', value=None)
            mocked_logger.warning.assert_called_once_with(
                'writing metric %r failure %r',
                {
                    'timestamp': datetime(2016, 7, 13),
                    'value': None,
                    'name': 'test:metric'
                },
                transport_error,
            )
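
freeze_time here is freezegun's context manager; it pins datetime so the timestamp in the asserted payload is deterministic. A minimal self-contained illustration:

from datetime import datetime
from freezegun import freeze_time

with freeze_time('2016-07-13'):
    # every "now" inside the context is pinned to the frozen instant
    assert datetime.utcnow() == datetime(2016, 7, 13)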
Example #7
    def test_process_item(self, e):
        item = self._get_item()
        spider = MagicMock()
        spider.name = "link"

        # test normal send, no appid topics
        self.pipe.process_item(item, spider)
        expected = '{"appid":"app","attrs":{},"body":"text","crawlid":"crawlid","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
        self.pipe.producer.send.assert_called_once_with('prefix.crawled_firehose',
                                                        expected)
        self.pipe.producer.send.reset_mock()

        # test normal send, with appids
        item = self._get_item()
        self.pipe.appid_topics = True
        self.pipe.process_item(item, spider)
        self.pipe.producer.send.assert_called_with('prefix.crawled_app',
                                                    expected)
        self.pipe.producer.send.reset_mock()

        # test base64 encode
        item = self._get_item()
        self.pipe.appid_topics = False
        self.pipe.use_base64 = True
        self.pipe.process_item(item, spider)
        expected = '{"appid":"app","attrs":{},"body":"dGV4dA==","crawlid":"crawlid","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
        self.pipe.producer.send.assert_called_once_with('prefix.crawled_firehose',
                                                        expected)

        # test kafka exception
        item = self._get_item()
        copy = deepcopy(item)
        copy['success'] = False
        copy['exception'] = 'traceback'

        # send should not crash the pipeline
        self.pipe.producer.send = MagicMock(side_effect=KafkaTimeoutError('bad kafka'))
        ret_val = self.pipe.process_item(item, spider)
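
The last step only works if process_item catches producer errors instead of letting them propagate. A sketch of that guard, as a hypothetical simplified pipeline (Scrapy pipelines must return the item either way):

import traceback

from kafka.errors import KafkaTimeoutError

class SketchPipeline(object):  # hypothetical, not the pipeline under test
    def __init__(self, producer, topic, serialize):
        self.producer = producer
        self.topic = topic
        self._serialize = serialize

    def process_item(self, item, spider):
        message = self._serialize(item)
        try:
            self.producer.send(self.topic, message)
        except KafkaTimeoutError:
            # record the failure on the item instead of crashing the pipeline
            item['success'] = False
            item['exception'] = traceback.format_exc()
        return item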
Example #8
def send(topic, value):
    raise KafkaTimeoutError(TEST_ERROR)
Example #9
    def test_process_item(self, e):
        item = self._get_item()
        spider = MagicMock()
        spider.name = "link"

        # test normal send, no appid topics
        self.pipe.process_item(item, spider)
        expected = '{"appid":"app","attrs":{},"body":"text","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
        self.pipe.producer.send.assert_called_once_with(
            'prefix.crawled_firehose', expected)
        self.pipe.producer.send.reset_mock()

        # test normal send, with appids
        item = self._get_item()
        self.pipe.appid_topics = True
        self.pipe.process_item(item, spider)
        self.pipe.producer.send.assert_called_with('prefix.crawled_app',
                                                   expected)
        self.pipe.producer.send.reset_mock()

        # test base64 encode
        item = self._get_item()
        self.pipe.appid_topics = False
        self.pipe.use_base64 = True
        self.pipe.process_item(item, spider)
        expected = '{"appid":"app","attrs":{},"body":"dGV4dA==","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
        self.pipe.producer.send.assert_called_with('prefix.crawled_firehose',
                                                   expected)

        # test base64 encode/decode with utf-8 encoding
        item = self._get_internationalized_utf8_item()
        self.pipe.appid_topics = False
        self.pipe.use_base64 = True
        self.pipe.process_item(item, spider)
        expected = '{"appid":"app","attrs":{},"body":"VGhpcyBpcyBhIHRlc3QgLSDOkc+Fz4TPjCDOtc6vzr3Osc65IM6tzr3OsSDPhM61z4PPhCAtIOi\\/meaYr+S4gOS4qua1i+ivlSAtIOOBk+OCjOOBr+ODhuOCueODiOOBp+OBmQ==","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
        self.pipe.producer.send.assert_called_with('prefix.crawled_firehose',
                                                   expected)
        # unpack the arguments used for the previous assertion call
        call_args, call_kwargs = self.pipe.producer.send.call_args
        crawl_args_dict = ujson.loads(call_args[1])
        decoded_string = base64.b64decode(crawl_args_dict['body']).decode(
            crawl_args_dict['encoding'])
        self.assertEqual(decoded_string, item.get('body'))

        # test base64 encode/decode with iso encoding
        item = self._get_internationalized_iso_item()
        self.pipe.appid_topics = False
        self.pipe.use_base64 = True
        self.pipe.process_item(item, spider)
        expected = '{"appid":"app","attrs":{},"body":"4fX0\\/CDl3+3h6SDd7eEg9OXz9Ao=","crawlid":"crawlid","encoding":"iso-8859-7","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}'
        self.pipe.producer.send.assert_called_with('prefix.crawled_firehose',
                                                   expected)
        # unpack the arguments used for the previous assertion call
        call_args, call_kwargs = self.pipe.producer.send.call_args
        crawl_args_dict = ujson.loads(call_args[1])
        decoded_string = base64.b64decode(crawl_args_dict['body']).decode(
            crawl_args_dict['encoding'])
        self.assertEqual(decoded_string,
                         item.get('body').decode(item.get('encoding')))
        # Test again against the original (before it was encoded in iso) string
        self.assertEqual(decoded_string, u"αυτό είναι ένα τεστ\n")

        # test kafka exception
        item = self._get_item()
        copy = deepcopy(item)
        copy['success'] = False
        copy['exception'] = 'traceback'

        # send should not crash the pipeline
        self.pipe.producer.send = MagicMock(
            side_effect=KafkaTimeoutError('bad kafka'))
        ret_val = self.pipe.process_item(item, spider)
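
The iso-8859-7 sections verify a base64-plus-charset round trip: the body is encoded with its declared charset, base64-wrapped for transport, then reversed on the consumer side. The same round trip in isolation (runnable as-is):

import base64

original = u"αυτό είναι ένα τεστ\n"
payload = base64.b64encode(original.encode('iso-8859-7')).decode('ascii')
assert base64.b64decode(payload).decode('iso-8859-7') == original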