def test_table_name_filter() -> None:
    table_name = "table_name"
    message_filter = CdcTableNameMessageFilter(table_name)

    # Messages that match the table should not be dropped.
    assert not message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", table_name.encode("utf8"))]),
            datetime.now(),
        )
    )

    # Messages without a table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )

    # Messages from a different table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", b"other_table")]),
            datetime.now(),
        )
    )
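# For context, a minimal sketch of what CdcTableNameMessageFilter might look
# like, assuming it keys off the "table" Kafka header as the test above
# exercises. This is illustrative only, not the actual snuba implementation.
class CdcTableNameMessageFilterSketch:
    def __init__(self, postgres_table: str) -> None:
        self.__postgres_table = postgres_table.encode("utf8")

    def should_drop(self, message: Message[KafkaPayload]) -> bool:
        # Drop unless a "table" header is present and matches the
        # configured Postgres table name.
        table = next(
            (
                value
                for key, value in (message.payload.headers or [])
                if key == "table"
            ),
            None,
        )
        return table != self.__postgres_table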
def test_multistorage_strategy() -> None:
    from snuba.datasets.storages import groupassignees, groupedmessages
    from tests.datasets.cdc.test_groupassignee import TestGroupassignee
    from tests.datasets.cdc.test_groupedmessage import TestGroupedMessage

    commit = Mock()

    storages = [groupassignees.storage, groupedmessages.storage]

    strategy = MultistorageConsumerProcessingStrategyFactory(
        storages,
        10,
        10,
        1,
        int(32 * 1e6),
        int(64 * 1e6),
        TestingMetricsBackend(),
    ).create(commit)

    payloads = [
        KafkaPayload(None, b"{}", [("table", b"ignored")]),
        KafkaPayload(
            None,
            json.dumps(TestGroupassignee.INSERT_MSG).encode("utf8"),
            [("table", groupassignees.storage.get_postgres_table().encode("utf8"))],
        ),
        KafkaPayload(
            None,
            json.dumps(TestGroupedMessage.INSERT_MSG).encode("utf8"),
            [("table", groupedmessages.storage.get_postgres_table().encode("utf8"))],
        ),
    ]

    messages = [
        Message(
            Partition(Topic("topic"), 0), offset, payload, datetime.now(), offset + 1
        )
        for offset, payload in enumerate(payloads)
    ]

    with assert_changes(
        lambda: get_row_count(groupassignees.storage), 0, 1
    ), assert_changes(lambda: get_row_count(groupedmessages.storage), 0, 1):
        for message in messages:
            strategy.submit(message)

        with assert_changes(
            lambda: commit.call_args_list,
            [],
            [call({Partition(Topic("topic"), 0): 3})],
        ):
            strategy.close()
            strategy.join()
def test_offsets(self):
    event = self.event

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 456),
        123,
        KafkaPayload(
            None, json.dumps((2, "insert", event)).encode("utf-8"), []
        ),  # event doesn't really matter
        datetime.now(),
    )

    test_worker = ConsumerWorker(
        self.dataset.get_writable_storage(),
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )
    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    clickhouse = (
        get_storage(StorageKey.EVENTS)
        .get_cluster()
        .get_query_connection(ClickhouseClientSettings.QUERY)
    )

    assert clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event["project_id"], self.event["event_id"], 123, 456)]
def eventstream(*, dataset: Dataset):
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from snuba.consumer import ConsumerWorker

        worker = ConsumerWorker(storage, metrics=metrics)
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, metrics=metrics)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})
def _wrap(self, msg: str) -> Message[KafkaPayload]:
    return Message(
        Partition(Topic("replacements"), 0),
        0,
        KafkaPayload(None, json.dumps(msg).encode("utf-8"), []),
        datetime.now(),
    )
def test_skip_too_old(self):
    test_worker = ConsumerWorker(
        self.dataset.get_writable_storage(),
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event["datetime"] = old_timestamp_str
    event["data"]["datetime"] = old_timestamp_str
    event["data"]["received"] = int(calendar.timegm(old_timestamp.timetuple()))

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 1),
        42,
        KafkaPayload(None, json.dumps((2, "insert", event)).encode("utf-8"), []),
        datetime.now(),
    )

    assert test_worker.process_message(message) is None
def test_flattened_tags(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    # | and = are intentional to test the escaping logic when computing the
    # flattened_tags on tag deletions
    self.event["data"]["tags"] = []
    self.event["data"]["tags"].append(["browser|name", "foo=1"])
    self.event["data"]["tags"].append(["browser|to_delete", "foo=2"])
    self.event["data"]["tags"].append(["notbrowser", "foo\\3"])
    self.event["data"]["tags"].append(["notbrowser2", "foo4"])
    self.write_events([self.event])

    project_id = self.project_id

    def _fetch_flattened_tags():
        return json.loads(
            self.app.post(
                "/query",
                data=json.dumps(
                    {
                        "project": [project_id],
                        "selected_columns": [
                            "_tags_flattened",
                            "tags.key",
                            "tags.value",
                        ],
                    }
                ),
            ).data
        )["data"]

    timestamp = datetime.now(tz=pytz.utc)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_delete_tag",
                    {
                        "project_id": project_id,
                        "tag": "browser|to_delete",
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert _fetch_flattened_tags() == [
        {
            "tags.key": ["browser|name", "notbrowser", "notbrowser2"],
            "tags.value": ["foo=1", "foo\\3", "foo4"],
            "_tags_flattened": "|browser\\|name=foo\\=1||notbrowser=foo\\\\3||notbrowser2=foo4|",
        }
    ]
def __make_msg(
    self, partition: int, offset: int, payload: str, headers: Headers
) -> Message[KafkaPayload]:
    return Message(
        partition=Partition(Topic("topic"), partition),
        offset=offset,
        payload=KafkaPayload(b"key", payload.encode(), headers),
        timestamp=datetime(2019, 6, 19, 6, 46, 28),
    )
def test_delete_tag_promoted_insert(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["data"]["tags"].append(["browser.name", "foo"])
    self.event["data"]["tags"].append(["notbrowser", "foo"])
    self.write_unprocessed_events([self.event])

    project_id = self.project_id

    def _issue_count(total=False):
        return json.loads(
            self.app.post(
                "/query",
                data=json.dumps(
                    {
                        "project": [project_id],
                        "aggregations": [["count()", "", "count"]],
                        "conditions": [["tags[browser.name]", "=", "foo"]]
                        if not total
                        else [],
                        "groupby": ["group_id"],
                    }
                ),
            ).data
        )["data"]

    assert _issue_count() == [{"count": 1, "group_id": 1}]
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_delete_tag",
                    {
                        "project_id": project_id,
                        "tag": "browser.name",
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert _issue_count() == []
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
def test_synchronized_consumer_handles_end_of_partition(
    broker: Broker[KafkaPayload],
) -> None:
    topic = Topic("topic")
    commit_log_topic = Topic("commit-log")

    broker.create_topic(topic, partitions=1)
    broker.create_topic(commit_log_topic, partitions=1)

    consumer = broker.get_consumer("consumer", enable_end_of_partition=True)
    producer = broker.get_producer()
    commit_log_consumer = broker.get_consumer("commit-log-consumer")

    messages = [
        producer.produce(topic, KafkaPayload(None, f"{i}".encode("utf8"), [])).result(
            1.0
        )
        for i in range(2)
    ]

    synchronized_consumer: Consumer[KafkaPayload] = SynchronizedConsumer(
        consumer,
        commit_log_consumer,
        commit_log_topic=commit_log_topic,
        commit_log_groups={"leader"},
    )

    with closing(synchronized_consumer):
        synchronized_consumer.subscribe([topic])

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader", Partition(topic, 0), messages[0].next_offset),
                ),
            ).result(),
        )

        assert synchronized_consumer.poll(0) == messages[0]

        # If the commit log consumer does not handle EOF, it will have crashed
        # here and will never return the next message.
        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader", Partition(topic, 0), messages[1].next_offset),
                ),
            ).result(),
        )

        assert synchronized_consumer.poll(0) == messages[1]
def test_parallel_transform_worker_apply() -> None:
    messages = [
        Message(
            Partition(Topic("test"), 0),
            i,
            KafkaPayload(None, b"\x00" * size, None),
            datetime.now(),
        )
        for i, size in enumerate([1000, 1000, 2000, 4000])
    ]

    with SharedMemoryManager() as smm:
        input_block = smm.SharedMemory(8192)
        assert input_block.size == 8192

        input_batch = MessageBatch(input_block)
        for message in messages:
            input_batch.append(message)

        assert len(input_batch) == 4

        output_block = smm.SharedMemory(4096)
        assert output_block.size == 4096

        index, output_batch = parallel_transform_worker_apply(
            transform_payload_expand,
            input_batch,
            output_block,
        )

        # The first batch should be able to fit 2 messages.
        assert index == 2
        assert len(output_batch) == 2

        index, output_batch = parallel_transform_worker_apply(
            transform_payload_expand,
            input_batch,
            output_block,
            index,
        )

        # The second batch should be able to fit one message.
        assert index == 3
        assert len(output_batch) == 1

        # The last message is too large to fit in the batch.
        with pytest.raises(ValueTooLarge):
            parallel_transform_worker_apply(
                transform_payload_expand,
                input_batch,
                output_block,
                index,
            )
def get_messages(events_file) -> Sequence[Message[KafkaPayload]]:
    """Create a fake Kafka message for each JSON event in the file."""
    messages: MutableSequence[Message[KafkaPayload]] = []
    raw_events = open(events_file).readlines()
    for raw_event in raw_events:
        messages.append(
            Message(
                Partition(Topic("events"), 1),
                0,
                KafkaPayload(None, raw_event.encode("utf-8"), []),
                datetime.now(),
            ),
        )
    return messages
def eventstream(*, dataset: Dataset) -> RespTuple:
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_default_entity().get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from snuba.consumers.consumer import StreamingConsumerStrategyFactory

        table_writer = storage.get_table_writer()
        stream_loader = table_writer.get_stream_loader()
        strategy = StreamingConsumerStrategyFactory(
            stream_loader.get_pre_filter(),
            stream_loader.get_processor(),
            table_writer.get_batch_writer(metrics),
            metrics,
            max_batch_size=1,
            max_batch_time=1.0,
            processes=None,
            input_block_size=None,
            output_block_size=None,
        ).create(lambda offsets: None)
        strategy.submit(message)
        strategy.close()
        strategy.join()
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, metrics=metrics)
        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})
def test_commit_log_consumer(self) -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a
    # mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
        {
            **self.configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    with self.get_topic() as topic, closing(consumer) as consumer:
        with closing(self.get_producer()) as producer:
            producer.produce(topic, next(self.get_payloads())).result(5.0)

        consumer.subscribe([topic])

        message = consumer.poll(10.0)  # XXX: getting the subscription is slow
        assert isinstance(message, Message)

        consumer.stage_offsets({message.partition: message.next_offset})

        assert consumer.commit_offsets() == {
            Partition(topic, 0): message.next_offset
        }

        assert len(commit_log_producer.messages) == 1
        commit_message = commit_log_producer.messages[0]
        assert commit_message.topic() == "commit-log"

        assert commit_codec.decode(
            KafkaPayload(
                commit_message.key(),
                commit_message.value(),
                commit_message.headers(),
            )
        ) == Commit("test", Partition(topic, 0), message.next_offset)
def encode(self, value: SubscriptionTaskResult) -> KafkaPayload:
    subscription_id = str(value.task.task.identifier)
    request, result = value.result
    return KafkaPayload(
        subscription_id.encode("utf-8"),
        json.dumps(
            {
                "version": 2,
                "payload": {
                    "subscription_id": subscription_id,
                    "request": {**request.body},
                    "result": result,
                    "timestamp": value.task.timestamp.isoformat(),
                },
            }
        ).encode("utf-8"),
        [],
    )
def test_unmerge_insert(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    self.write_events([self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_unmerge",
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
def test_message_batch() -> None:
    partition = Partition(Topic("test"), 0)

    with SharedMemoryManager() as smm:
        block = smm.SharedMemory(4096)
        assert block.size == 4096

        message = Message(
            partition, 0, KafkaPayload(None, b"\x00" * 4000, None), datetime.now()
        )

        batch: MessageBatch[KafkaPayload] = MessageBatch(block)
        with assert_changes(lambda: len(batch), 0, 1):
            batch.append(message)

        assert batch[0] == message
        assert list(batch) == [message]

        with assert_does_not_change(lambda: len(batch), 1), pytest.raises(
            ValueTooLarge
        ):
            batch.append(message)
def test_delete_groups_insert(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_delete_groups",
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == []
def test_parallel_transform_step() -> None:
    next_step = Mock()

    messages = [
        Message(
            Partition(Topic("test"), 0),
            i,
            KafkaPayload(None, b"\x00" * size, None),
            datetime.now(),
        )
        for i, size in enumerate([1000, 1000, 2000, 2000])
    ]

    starting_processes = get_subprocess_count()
    worker_processes = 2
    manager_processes = 1
    metrics = TestingMetricsBackend()

    with assert_changes(
        get_subprocess_count,
        starting_processes,
        starting_processes + worker_processes + manager_processes,
    ), assert_changes(
        lambda: metrics.calls,
        [],
        [
            GaugeCall("batches_in_progress", value, tags=None)
            for value in [0.0, 1.0, 2.0]
        ],
    ):
        transform_step = ParallelTransformStep(
            transform_payload_expand,
            next_step,
            processes=worker_processes,
            max_batch_size=5,
            max_batch_time=60,
            input_block_size=4096,
            output_block_size=4096,
            metrics=metrics,
        )

        for message in messages:
            transform_step.poll()
            transform_step.submit(message)

        transform_step.close()

    metrics.calls.clear()

    with assert_changes(
        get_subprocess_count,
        starting_processes + worker_processes + manager_processes,
        starting_processes,
    ), assert_changes(
        lambda: metrics.calls,
        [],
        [GaugeCall("batches_in_progress", value, tags=None) for value in [1.0, 0.0]],
    ):
        transform_step.join()

    assert next_step.submit.call_count == len(messages)
def test_synchronized_consumer(broker: Broker[KafkaPayload]) -> None:
    topic = Topic("topic")
    commit_log_topic = Topic("commit-log")

    broker.create_topic(topic, partitions=1)
    broker.create_topic(commit_log_topic, partitions=1)

    consumer = broker.get_consumer("consumer")
    producer = broker.get_producer()
    commit_log_consumer = broker.get_consumer("commit-log-consumer")

    messages = [
        producer.produce(topic, KafkaPayload(None, f"{i}".encode("utf8"), [])).result(
            1.0
        )
        for i in range(6)
    ]

    synchronized_consumer: Consumer[KafkaPayload] = SynchronizedConsumer(
        consumer,
        commit_log_consumer,
        commit_log_topic=commit_log_topic,
        commit_log_groups={"leader-a", "leader-b"},
    )

    with closing(synchronized_consumer):
        synchronized_consumer.subscribe([topic])

        # The consumer should not consume any messages until it receives a
        # commit from both groups that are being followed.
        with assert_changes(consumer.paused, [], [Partition(topic, 0)]), assert_changes(
            consumer.tell, {}, {Partition(topic, 0): messages[0].offset}
        ):
            assert synchronized_consumer.poll(0.0) is None

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader-a", Partition(topic, 0), messages[0].next_offset)
                ),
            ).result(),
        )

        # The consumer should remain paused, since it needs both groups to
        # advance before it may continue.
        with assert_does_not_change(
            consumer.paused, [Partition(topic, 0)]
        ), assert_does_not_change(
            consumer.tell, {Partition(topic, 0): messages[0].offset}
        ):
            assert synchronized_consumer.poll(0.0) is None

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader-b", Partition(topic, 0), messages[0].next_offset)
                ),
            ).result(),
        )

        # The consumer should be able to resume consuming, since both consumers
        # have processed the first message.
        with assert_changes(consumer.paused, [Partition(topic, 0)], []), assert_changes(
            consumer.tell,
            {Partition(topic, 0): messages[0].offset},
            {Partition(topic, 0): messages[0].next_offset},
        ):
            assert synchronized_consumer.poll(0.0) == messages[0]

        # After consuming the one available message, the consumer should be
        # paused again until the remote offsets advance.
        with assert_changes(
            consumer.paused, [], [Partition(topic, 0)]
        ), assert_does_not_change(
            consumer.tell, {Partition(topic, 0): messages[1].offset}
        ):
            assert synchronized_consumer.poll(0.0) is None

        # Emulate the unlikely (but possible) scenario of the leader offsets
        # being within a series of compacted (deleted) messages by:
        # 1. moving the remote offsets forward, so that the partition is resumed
        # 2. seeking the consumer beyond the remote offsets
        producer.produce(
            commit_log_topic,
            commit_codec.encode(
                Commit("leader-a", Partition(topic, 0), messages[3].offset)
            ),
        ).result()

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader-b", Partition(topic, 0), messages[5].offset)
                ),
            ).result(),
        )

        # The consumer should be able to resume consuming, since both consumers
        # have processed the first message.
        with assert_changes(consumer.paused, [Partition(topic, 0)], []), assert_changes(
            consumer.tell,
            {Partition(topic, 0): messages[1].offset},
            {Partition(topic, 0): messages[1].next_offset},
        ):
            assert synchronized_consumer.poll(0.0) == messages[1]

        # At this point, we manually seek the consumer offset, to emulate
        # messages being skipped.
        with assert_changes(
            consumer.tell,
            {Partition(topic, 0): messages[2].offset},
            {Partition(topic, 0): messages[4].offset},
        ):
            consumer.seek({Partition(topic, 0): messages[4].offset})

        # Since the (effective) remote offset is the offset for message #3 (via
        # ``leader-a``), and the local offset is the offset of message #4, when
        # message #4 is consumed, it should be discarded and the offset should
        # be rolled back to wait for the commit log to advance.
        with assert_changes(
            consumer.paused, [], [Partition(topic, 0)]
        ), assert_does_not_change(
            consumer.tell, {Partition(topic, 0): messages[4].offset}
        ):
            assert synchronized_consumer.poll(0.0) is None

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader-a", Partition(topic, 0), messages[5].offset)
                ),
            ).result(),
        )

        # The consumer should be able to resume consuming.
        with assert_changes(consumer.paused, [Partition(topic, 0)], []), assert_changes(
            consumer.tell,
            {Partition(topic, 0): messages[4].offset},
            {Partition(topic, 0): messages[4].next_offset},
        ):
            assert synchronized_consumer.poll(0.0) == messages[4]
def test_payload_equality() -> None:
    assert KafkaPayload(None, b"", []) == KafkaPayload(None, b"", [])
    assert KafkaPayload(b"key", b"value", []) == KafkaPayload(b"key", b"value", [])
    assert KafkaPayload(None, b"", [("key", b"value")]) == KafkaPayload(
        None, b"", [("key", b"value")]
    )
    assert not KafkaPayload(None, b"a", []) == KafkaPayload(None, b"b", [])
    assert not KafkaPayload(b"this", b"", []) == KafkaPayload(b"that", b"", [])
    assert not KafkaPayload(None, b"", [("key", b"this")]) == KafkaPayload(
        None, b"", [("key", b"that")]
    )
def test_synchronized_consumer_pause_resume(
    broker: Broker[KafkaPayload],
) -> None:
    topic = Topic("topic")
    commit_log_topic = Topic("commit-log")

    broker.create_topic(topic, partitions=1)
    broker.create_topic(commit_log_topic, partitions=1)

    consumer = broker.get_consumer("consumer")
    producer = broker.get_producer()
    commit_log_consumer = broker.get_consumer("commit-log-consumer")

    messages = [
        producer.produce(topic, KafkaPayload(None, f"{i}".encode("utf8"), [])).result(
            1.0
        )
        for i in range(2)
    ]

    synchronized_consumer: Consumer[KafkaPayload] = SynchronizedConsumer(
        consumer,
        commit_log_consumer,
        commit_log_topic=commit_log_topic,
        commit_log_groups={"leader"},
    )

    with closing(synchronized_consumer):

        def assignment_callback(offsets: Mapping[Partition, int]) -> None:
            synchronized_consumer.pause([Partition(topic, 0)])

        synchronized_consumer.subscribe([topic], on_assign=assignment_callback)

        with assert_changes(
            synchronized_consumer.paused, [], [Partition(topic, 0)]
        ), assert_changes(consumer.paused, [], [Partition(topic, 0)]):
            assert synchronized_consumer.poll(0.0) is None

        # Advancing the commit log offset should not cause the consumer to
        # resume, since it has been explicitly paused.
        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader", Partition(topic, 0), messages[0].next_offset)
                ),
            ).result(),
        )

        with assert_does_not_change(consumer.paused, [Partition(topic, 0)]):
            assert synchronized_consumer.poll(0) is None

        # Resuming the partition does not immediately cause the partition to
        # resume, but it should look as if it is resumed to the caller.
        with assert_changes(
            synchronized_consumer.paused, [Partition(topic, 0)], []
        ), assert_does_not_change(consumer.paused, [Partition(topic, 0)]):
            synchronized_consumer.resume([Partition(topic, 0)])

        # The partition should be resumed on the next poll call, however.
        with assert_changes(consumer.paused, [Partition(topic, 0)], []):
            assert synchronized_consumer.poll(0) == messages[0]

        # Pausing due to hitting the offset fence should not appear as a paused
        # partition to the caller.
        with assert_does_not_change(synchronized_consumer.paused, []), assert_changes(
            consumer.paused, [], [Partition(topic, 0)]
        ):
            assert synchronized_consumer.poll(0) is None

        # Other pause and resume actions should not cause the inner consumer to
        # change its state while up against the fence.
        with assert_changes(
            synchronized_consumer.paused, [], [Partition(topic, 0)]
        ), assert_does_not_change(consumer.paused, [Partition(topic, 0)]):
            synchronized_consumer.pause([Partition(topic, 0)])

        with assert_changes(
            synchronized_consumer.paused, [Partition(topic, 0)], []
        ), assert_does_not_change(consumer.paused, [Partition(topic, 0)]):
            synchronized_consumer.resume([Partition(topic, 0)])
def test_streaming_consumer_strategy() -> None:
    messages = (
        Message(
            Partition(Topic("events"), 0),
            i,
            KafkaPayload(None, b"{}", None),
            datetime.now(),
        )
        for i in itertools.count()
    )

    replacements_producer = FakeConfluentKafkaProducer()

    processor = Mock()
    processor.process_message.side_effect = [
        None,
        InsertBatch([{}]),
        ReplacementBatch("key", [{}]),
    ]

    writer = Mock()

    metrics = TestingMetricsBackend()

    factory = StreamingConsumerStrategyFactory(
        None,
        processor,
        writer,
        metrics,
        max_batch_size=10,
        max_batch_time=60,
        processes=None,
        input_block_size=None,
        output_block_size=None,
        replacements_producer=replacements_producer,
        replacements_topic=Topic("replacements"),
    )

    commit_function = Mock()
    strategy = factory.create(commit_function)

    for i in range(3):
        strategy.poll()
        strategy.submit(next(messages))

    assert metrics.calls == []

    processor.process_message.side_effect = [{}]

    with pytest.raises(TypeError):
        strategy.poll()
        strategy.submit(next(messages))

    def get_number_of_insertion_metrics() -> int:
        count = 0
        for call in metrics.calls:
            if isinstance(call, Timing) and call.name == "insertions.latency_ms":
                count += 1
        return count

    expected_write_count = 1

    with assert_changes(
        get_number_of_insertion_metrics, 0, expected_write_count
    ), assert_changes(
        lambda: writer.write.call_count, 0, expected_write_count
    ), assert_changes(
        lambda: len(replacements_producer.messages), 0, 1
    ):
        strategy.close()
        strategy.join()
def test_payload_pickle_simple() -> None:
    payload = KafkaPayload(b"key", b"value", [])
    assert pickle.loads(pickle.dumps(payload)) == payload
def test_reprocessing_flow_insert(self) -> None:
    # We have a group that contains two events, 1 and 2.
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["event_id"] = event_id = "00e24a150d7f4ee4b142b61b4d893b6d"
    write_unprocessed_events(self.storage, [self.event])
    self.event["event_id"] = event_id2 = "00e24a150d7f4ee4b142b61b4d893b6e"
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "tombstone_events",
                    {"project_id": project_id, "event_ids": [event_id]},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    # The user chooses to reprocess a subset of the group and throw away
    # the other events. Event 1 gets manually tombstoned by Sentry while
    # Event 2 prevails.
    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # At this point the count doesn't make any sense but we don't care.
    assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

    # The reprocessed event is inserted with a guaranteed-new group ID but
    # the *same* event ID (this is why we need to skip tombstoning this
    # event ID).
    self.event["group_id"] = 2
    write_unprocessed_events(self.storage, [self.event])

    message = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "exclude_groups",
                    {"project_id": project_id, "group_ids": [1]},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    # Group 1 is excluded from queries. At this point we have almost a
    # regular group deletion, except only a subset of events have been
    # tombstoned (the ones that will *not* be reprocessed).
    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # Group 2 should contain the one event that the user chose to
    # reprocess, and Group 1 should be gone. (Note: In the product, Group 2
    # looks identical to Group 1, including short ID.)
    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
    assert self._get_group_id(project_id, event_id2) == 2
    assert not self._get_group_id(project_id, event_id)
def test_payload_pickle_out_of_band() -> None:
    payload = KafkaPayload(b"key", b"value", [])
    buffers: MutableSequence[PickleBuffer] = []
    data = pickle.dumps(payload, protocol=5, buffer_callback=buffers.append)
    assert pickle.loads(data, buffers=[b.raw() for b in buffers]) == payload
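# For context, a quick illustration (not part of the test suite) of why the
# out-of-band path matters: with protocol 5 and a buffer_callback, large
# buffers are handed to the callback instead of being copied into the pickle
# stream, which enables zero-copy transports. The 1 MB size is arbitrary.
def _demonstrate_out_of_band_pickling() -> None:
    large = bytes(1_000_000)
    collected: MutableSequence[PickleBuffer] = []
    data = pickle.dumps(
        PickleBuffer(large), protocol=5, buffer_callback=collected.append
    )
    # The pickle stream itself stays tiny; the payload travels out-of-band.
    assert len(data) < 100
    assert collected[0].raw().nbytes == len(large)
    # Reassembly requires passing the same buffers back to loads().
    restored = pickle.loads(data, buffers=[b.raw() for b in collected])
    assert bytes(restored) == large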
def get_payloads(self) -> Iterator[KafkaPayload]:
    for i in itertools.count():
        yield KafkaPayload(None, f"{i}".encode("utf8"), [])
def transform_payload_expand(message: Message[KafkaPayload]) -> KafkaPayload:
    # Double the payload value so that transformed batches outgrow a
    # fixed-size output block in the parallel transform tests above.
    return KafkaPayload(
        message.payload.key,
        message.payload.value * 2,
        message.payload.headers,
    )
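# A quick sanity check of the transform above (illustrative only; the Message
# construction mirrors the tests in this file):
def _check_transform_payload_expand() -> None:
    expanded = transform_payload_expand(
        Message(
            Partition(Topic("test"), 0),
            0,
            KafkaPayload(None, b"\x00" * 10, []),
            datetime.now(),
        )
    )
    # The key and headers pass through unchanged; only the value is doubled.
    assert expanded.key is None
    assert expanded.value == b"\x00" * 20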