def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    test_worker = ConsumerWorker(
        self.dataset,
        producer=producer,
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    test_worker.flush_batch(
        [
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[("1", {"project_id": 1})],
            ),
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[("2", {"project_id": 2})],
            ),
        ]
    )

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ("event-replacements", b"1", b'{"project_id": 1}'),
        ("event-replacements", b"2", b'{"project_id": 2}'),
    ]
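# NOTE: illustrative sketch only, not the project's actual test helper. The
# assertions in these tests read producer.messages and access _topic/_key/_value
# (or the topic()/key()/value()/headers() accessors), so FakeConfluentKafkaProducer
# presumably records each produce() call as a message-like object along these
# lines. The real implementation lives in the project's test utilities and may
# differ in detail.
class FakeConfluentKafkaMessage:
    def __init__(self, topic, key, value, headers=None):
        self._topic = topic
        self._key = key
        self._value = value
        self._headers = headers

    def topic(self):
        return self._topic

    def key(self):
        return self._key

    def value(self):
        return self._value

    def headers(self):
        return self._headers


class FakeConfluentKafkaProducer:
    def __init__(self):
        self.messages = []

    def produce(self, topic, value, key=None, headers=None, **kwargs):
        # Record the message instead of sending it, so tests can assert on
        # exactly what was produced.
        self.messages.append(FakeConfluentKafkaMessage(topic, key, value, headers))

    def poll(self, *args, **kwargs):
        return 0

    def flush(self, *args, **kwargs):
        # Nothing is buffered; report zero messages remaining in the queue.
        return 0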
def test_offsets(self):
    event = self.event

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 456),
        123,
        KafkaPayload(
            None, json.dumps((0, "insert", event)).encode("utf-8")
        ),  # event doesn't really matter
        datetime.now(),
    )

    test_worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    assert self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event["project_id"], self.event["event_id"], 123, 456)]
def test_skip_too_old(self):
    test_worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event["datetime"] = old_timestamp_str
    event["data"]["datetime"] = old_timestamp_str
    event["data"]["received"] = int(calendar.timegm(old_timestamp.timetuple()))

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 1),
        42,
        KafkaPayload(None, json.dumps((0, "insert", event)).encode("utf-8")),
        datetime.now(),
    )

    assert test_worker.process_message(message) is None
def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(self.dataset, producer,
                                 replacement_topic.topic_name, self.metrics)

    test_worker.flush_batch([
        ProcessedMessage(
            action=ProcessorAction.REPLACE,
            data=[('1', {'project_id': 1})],
        ),
        ProcessedMessage(
            action=ProcessorAction.REPLACE,
            data=[('2', {'project_id': 2})],
        ),
    ])

    assert [(m._topic, m._key, m._value) for m in producer.messages] == \
        [('event-replacements', b'1', b'{"project_id": 1}'),
         ('event-replacements', b'2', b'{"project_id": 2}')]
def test_send_message(
    self,
    value: str,
    expected: Optional[ProcessedMessage],
) -> None:
    storage = get_storage("groupedmessages")
    snapshot_id = uuid1()
    transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130])

    worker = SnapshotAwareWorker(
        storage=storage,
        producer=FakeConfluentKafkaProducer(),
        snapshot_id=str(snapshot_id),
        transaction_data=transact_data,
        replacements_topic=None,
        metrics=DummyMetricsBackend(strict=True),
    )

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        1,
        KafkaPayload(
            None,
            value.encode("utf-8"),
            [("table", "sentry_groupedmessage".encode())],
        ),
        datetime.now(),
    )

    ret = worker.process_message(message)
    assert ret == expected
def test_send_message(
    self,
    message: str,
    expected: Optional[ProcessedMessage],
) -> None:
    dataset = get_dataset("groupedmessage")
    snapshot_id = uuid1()
    transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130])

    worker = SnapshotAwareWorker(
        dataset=dataset,
        producer=FakeConfluentKafkaProducer(),
        snapshot_id=str(snapshot_id),
        transaction_data=transact_data,
        replacements_topic=None,
        metrics=DummyMetricsBackend(strict=True),
    )

    ret = worker.process_message(
        KafkaMessage(
            TopicPartition('topic', 0),
            1,
            message.encode('utf-8'),
        ))

    assert ret == expected
def test_commit_log_consumer() -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a
    # mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    configuration = get_default_kafka_configuration()

    consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    producer = KafkaProducer(configuration)

    topic = Topic("topic")

    with closing(consumer) as consumer:
        with closing(producer) as producer:
            producer.produce(topic, next(get_payloads())).result(5.0)

        consumer.subscribe([topic])

        message = consumer.poll(10.0)  # XXX: getting the subscription is slow
        assert isinstance(message, Message)

        now = datetime.now()

        position = Position(message.next_offset, now)

        consumer.stage_positions({message.partition: position})

        assert consumer.commit_positions() == {Partition(topic, 0): position}

        assert len(commit_log_producer.messages) == 1
        commit_message = commit_log_producer.messages[0]
        assert commit_message.topic() == "commit-log"

        assert commit_codec.decode(
            KafkaPayload(
                commit_message.key(),
                commit_message.value(),
                commit_message.headers(),
            )
        ) == Commit("test", Partition(topic, 0), message.next_offset, now)
def test_commit_log_consumer(self) -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a
    # mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    consumer: KafkaConsumer[int] = KafkaConsumerWithCommitLog(
        {
            **self.configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        codec=self.codec,
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    with self.get_topic() as topic, closing(consumer) as consumer:
        consumer.subscribe([topic])

        with closing(self.get_producer()) as producer:
            producer.produce(topic, 0).result(5.0)

        message = consumer.poll(10.0)  # XXX: getting the subscription is slow
        assert isinstance(message, Message)

        consumer.stage_offsets({message.partition: message.get_next_offset()})

        assert consumer.commit_offsets() == {
            Partition(topic, 0): message.get_next_offset()
        }

        assert len(commit_log_producer.messages) == 1
        commit_message = commit_log_producer.messages[0]
        assert commit_message.topic() == "commit-log"

        assert CommitCodec().decode(
            KafkaPayload(commit_message.key(), commit_message.value())
        ) == Commit("test", Partition(topic, 0), message.get_next_offset())
def test_commit_log_consumer(topic: str) -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a
    # mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    consumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "true",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        commit_log_producer,
        'commit-log',
    )

    consumer.subscribe([topic])

    producer = ConfluentProducer(configuration)
    producer.produce(topic)
    assert producer.flush(5.0) == 0

    message = consumer.poll(10.0)  # XXX: getting the subscription is slow
    assert isinstance(message, Message)

    assert consumer.commit() == {TopicPartition(topic, 0): message.offset + 1}

    assert len(commit_log_producer.messages) == 1
    commit_message = commit_log_producer.messages[0]
    assert commit_message.topic() == 'commit-log'
    assert commit_message.key() == '{}:{}:{}'.format(topic, 0, 'test').encode('utf-8')
    assert commit_message.value() == '{}'.format(message.offset + 1).encode(
        'utf-8')  # committed offsets are last processed message offset + 1
def test_skip_too_old(self):
    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(self.dataset, FakeConfluentKafkaProducer(),
                                 replacement_topic.topic_name, self.metrics)

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event['datetime'] = old_timestamp_str
    event['data']['datetime'] = old_timestamp_str
    event['data']['received'] = int(
        calendar.timegm(old_timestamp.timetuple()))

    message = KafkaMessage(
        TopicPartition('events', 1),
        42,
        json.dumps((0, 'insert', event)).encode('utf-8'),
    )

    assert test_worker.process_message(message) is None
def test_offsets(self):
    event = self.event

    message = KafkaMessage(
        TopicPartition('events', 456),
        123,
        json.dumps((0, 'insert', event)).encode('utf-8')  # event doesn't really matter
    )

    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(self.dataset, FakeConfluentKafkaProducer(),
                                 replacement_topic.topic_name, self.metrics)
    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    assert self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event['project_id'], self.event['event_id'], 123, 456)]
def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    test_worker = ConsumerWorker(
        self.dataset.get_writable_storage(),
        producer=producer,
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    test_worker.flush_batch(
        [
            ReplacementBatch("1", [{"project_id": 1}]),
            ReplacementBatch("2", [{"project_id": 2}]),
        ]
    )

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ("event-replacements", b"1", b'{"project_id":1}'),
        ("event-replacements", b"2", b'{"project_id":2}'),
    ]
def test_streaming_consumer_strategy() -> None:
    messages = (
        Message(
            Partition(Topic("events"), 0),
            i,
            KafkaPayload(None, b"{}", None),
            datetime.now(),
        )
        for i in itertools.count()
    )

    replacements_producer = FakeConfluentKafkaProducer()

    processor = Mock()
    processor.process_message.side_effect = [
        None,
        InsertBatch([{}]),
        ReplacementBatch("key", [{}]),
    ]

    writer = Mock()

    metrics = TestingMetricsBackend()

    factory = StreamingConsumerStrategyFactory(
        None,
        processor,
        writer,
        metrics,
        max_batch_size=10,
        max_batch_time=60,
        processes=None,
        input_block_size=None,
        output_block_size=None,
        replacements_producer=replacements_producer,
        replacements_topic=Topic("replacements"),
    )

    commit_function = Mock()
    strategy = factory.create(commit_function)

    for i in range(3):
        strategy.poll()
        strategy.submit(next(messages))

    assert metrics.calls == []

    processor.process_message.side_effect = [{}]

    with pytest.raises(TypeError):
        strategy.poll()
        strategy.submit(next(messages))

    def get_number_of_insertion_metrics() -> int:
        count = 0
        for call in metrics.calls:
            if isinstance(call, Timing) and call.name == "insertions.latency_ms":
                count += 1
        return count

    expected_write_count = 1

    with assert_changes(
        get_number_of_insertion_metrics, 0, expected_write_count
    ), assert_changes(
        lambda: writer.write.call_count, 0, expected_write_count
    ), assert_changes(
        lambda: len(replacements_producer.messages), 0, 1
    ):
        strategy.close()
        strategy.join()