def test_table_name_filter() -> None:
    table_name = "table_name"
    message_filter = CdcTableNameMessageFilter(table_name)

    # Messages that match the table should not be dropped.
    assert not message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", table_name.encode("utf8"))]),
            datetime.now(),
        )
    )

    # Messages without a table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )

    # Messages from a different table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", b"other_table")]),
            datetime.now(),
        )
    )
def test_multistorage_strategy(
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
) -> None:
    from snuba.datasets.storages import groupassignees, groupedmessages
    from tests.datasets.cdc.test_groupassignee import TestGroupassignee
    from tests.datasets.cdc.test_groupedmessage import TestGroupedMessage

    commit = Mock()

    storages = [groupassignees.storage, groupedmessages.storage]

    strategy = MultistorageConsumerProcessingStrategyFactory(
        storages,
        10,
        10,
        processes,
        input_block_size,
        output_block_size,
        TestingMetricsBackend(),
    ).create(commit)

    payloads = [
        KafkaPayload(None, b"{}", [("table", b"ignored")]),
        KafkaPayload(
            None,
            json.dumps(TestGroupassignee.INSERT_MSG).encode("utf8"),
            [("table", groupassignees.storage.get_postgres_table().encode("utf8"))],
        ),
        KafkaPayload(
            None,
            json.dumps(TestGroupedMessage.INSERT_MSG).encode("utf8"),
            [("table", groupedmessages.storage.get_postgres_table().encode("utf8"))],
        ),
    ]

    messages = [
        Message(
            Partition(Topic("topic"), 0), offset, payload, datetime.now(), offset + 1
        )
        for offset, payload in enumerate(payloads)
    ]

    with assert_changes(
        lambda: get_row_count(groupassignees.storage), 0, 1
    ), assert_changes(lambda: get_row_count(groupedmessages.storage), 0, 1):
        for message in messages:
            strategy.submit(message)

        with assert_changes(
            lambda: commit.call_args_list, [], [call({Partition(Topic("topic"), 0): 3})]
        ):
            strategy.close()
            strategy.join()
def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
    assert consumer.tell() == {
        Partition(topic, 0): 0,
        Partition(topic, 1): 0,
    }
    assert inner_consumer.tell() == {
        Partition(topic, 0): 0,
        Partition(topic, 1): 0,
    }
def test_commit_log_consumer() -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    configuration = get_default_kafka_configuration()

    consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    producer = KafkaProducer(configuration)

    topic = Topic("topic")

    with closing(consumer) as consumer:
        with closing(producer) as producer:
            producer.produce(topic, next(get_payloads())).result(5.0)

        consumer.subscribe([topic])

        message = consumer.poll(10.0)  # XXX: getting the subscription is slow
        assert isinstance(message, Message)

        now = datetime.now()

        position = Position(message.next_offset, now)

        consumer.stage_positions({message.partition: position})

        assert consumer.commit_positions() == {Partition(topic, 0): position}

        assert len(commit_log_producer.messages) == 1
        commit_message = commit_log_producer.messages[0]
        assert commit_message.topic() == "commit-log"

        assert commit_codec.decode(
            KafkaPayload(
                commit_message.key(),
                commit_message.value(),
                commit_message.headers(),
            )
        ) == Commit("test", Partition(topic, 0), message.next_offset, now)
def test_multiple_partitions(self) -> None:
    """
    Different partitions should have independent offset checks.
    """
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    payload = KafkaPayload(
        None,
        json.dumps(
            (
                2,
                ReplacementType.END_UNMERGE,
                {
                    "project_id": self.project_id,
                    "previous_group_id": 1,
                    "new_group_id": 2,
                    "hashes": ["a" * 32],
                    "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                },
            )
        ).encode("utf-8"),
        [],
    )
    offset = 42
    timestamp = datetime.now()

    partition_one: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        offset,
        payload,
        timestamp,
    )
    partition_two: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 2),
        offset,
        payload,
        timestamp,
    )

    processed = self.replacer.process_message(partition_one)
    self.replacer.flush_batch([processed])

    # different partition should be unaffected even if it's the same offset
    assert self.replacer.process_message(partition_two) is not None
def test_tick_consumer_min_interval() -> None:
    clock = TestingClock()
    broker: Broker[int] = Broker(MemoryMessageStorage(), clock)

    topic = Topic("messages")

    broker.create_topic(topic, partitions=2)

    producer = broker.get_producer()
    for payload in range(3):
        producer.produce(Partition(topic, 0), payload).result()
        clock.sleep(1.0)

    inner_consumer = broker.get_consumer("group")

    consumer = TickConsumer(inner_consumer, min_interval=timedelta(seconds=2))

    consumer.subscribe([topic])

    assert consumer.poll() is None
    assert consumer.poll() is None

    message = consumer.poll()
    assert message is not None
    tick = message.payload
    assert tick.offsets.upper - tick.offsets.lower == 2
    assert tick.timestamps.upper - tick.timestamps.lower == timedelta(seconds=2)
def test_delete_tag_promoted_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["data"]["tags"].append(["browser.name", "foo"])
    self.event["data"]["tags"].append(["notbrowser", "foo"])
    write_unprocessed_events(self.storage, [self.event])

    project_id = self.project_id

    def _issue_count(total: bool = False) -> Sequence[Mapping[str, Any]]:
        clickhouse = self.storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.QUERY
        )

        total_cond = (
            "AND has(_tags_hash_map, cityHash64('browser.name=foo'))"
            if not total
            else ""
        )

        data = clickhouse.execute(
            f"""
            SELECT group_id, count()
            FROM errors_local FINAL
            WHERE deleted = 0
            AND project_id = {project_id}
            {total_cond}
            GROUP BY group_id
            """
        ).results

        return [{"group_id": row[0], "count": row[1]} for row in data]

    assert _issue_count() == [{"count": 1, "group_id": 1}]
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_DELETE_TAG,
                    {
                        "project_id": project_id,
                        "tag": "browser.name",
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    assert processed is not None
    self.replacer.flush_batch([processed])

    assert _issue_count() == []
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
def test_execute_and_produce_result() -> None:
    state.set_config("subscription_mode_events", "new")
    dataset = get_dataset("events")
    entity_names = ["events"]
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()

    commit = mock.Mock()

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        max_concurrent_queries,
        total_concurrent_queries,
        None,
        metrics,
        ProduceResult(producer, result_topic.name, commit),
        commit,
    )

    subscription_identifier = SubscriptionIdentifier(PartitionId(0), uuid.uuid1())

    make_message = generate_message(EntityKey.EVENTS, subscription_identifier)
    message = next(make_message)
    strategy.submit(message)

    # Eventually a message should be produced and offsets committed
    while (
        broker_storage.consume(Partition(result_topic, 0), 0) is None
        or commit.call_count == 0
    ):
        strategy.poll()

    produced_message = broker_storage.consume(Partition(result_topic, 0), 0)
    assert produced_message is not None
    assert produced_message.payload.key == str(subscription_identifier).encode("utf-8")
    assert commit.call_count == 1
def _wrap(self, msg: Tuple[Any, ...]) -> Message[KafkaPayload]:
    return Message(
        Partition(Topic("replacements"), 0),
        0,
        KafkaPayload(None, json.dumps(msg).encode("utf-8"), []),
        datetime.now(),
    )
def test_offset_already_processed(self) -> None:
    """
    Don't process an offset that already exists in Redis.
    """
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    key = f"replacement:{CONSUMER_GROUP}:errors:1"
    redis_client.set(key, 42)

    old_offset: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        41,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    same_offset: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    assert self.replacer.process_message(old_offset) is None
    assert self.replacer.process_message(same_offset) is None
def test_subscription_worker_consistent(subscription_data: SubscriptionData) -> None:
    state.set_config("event_subscription_non_consistent_sample_rate", 1)
    broker: Broker[SubscriptionTaskResult] = Broker(
        MemoryMessageStorage(), TestingClock()
    )

    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 1

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        subscription_data,
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = TestingMetricsBackend()

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {
            0: SubscriptionScheduler(
                store, PartitionId(0), timedelta(), DummyMetricsBackend(strict=True)
            )
        },
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    worker.process_message(Message(Partition(Topic("events"), 0), 0, tick, now))

    time.sleep(0.1)

    assert (
        len(
            [
                m
                for m in metrics.calls
                if isinstance(m, Increment) and m.name == "consistent"
            ]
        )
        == 1
    )
def eventstream(*, dataset: Dataset) -> RespTuple:
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_default_entity().get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from arroyo.processing.strategies.streaming import (
            KafkaConsumerStrategyFactory,
        )

        from snuba.consumers.consumer import build_batch_writer, process_message

        table_writer = storage.get_table_writer()
        stream_loader = table_writer.get_stream_loader()
        strategy = KafkaConsumerStrategyFactory(
            stream_loader.get_pre_filter(),
            functools.partial(
                process_message, stream_loader.get_processor(), "consumer_group"
            ),
            build_batch_writer(table_writer, metrics=metrics),
            max_batch_size=1,
            max_batch_time=1.0,
            processes=None,
            input_block_size=None,
            output_block_size=None,
        ).create(lambda offsets: None)
        strategy.submit(message)
        strategy.close()
        strategy.join()
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, "consumer_group", metrics=metrics)
        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})
def test_kafka_filter_header_with_bypass() -> None:
    header_filter = KafkaHeaderFilterWithBypass("should_drop", "1", 5)
    message = Message(
        Partition(Topic("random"), 1),
        1,
        KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
        datetime.now(),
    )

    for _ in range(3):
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is False
def test_skip_kafka_message(self) -> None:
    state.set_config(
        "kafka_messages_to_skip", "[snuba-test-lol:1:2,snuba-test-yeet:0:1]"
    )
    assert skip_kafka_message(
        Message(
            Partition(Topic("snuba-test-lol"), 1),
            2,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )
    assert skip_kafka_message(
        Message(
            Partition(Topic("snuba-test-yeet"), 0),
            1,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )
    assert not skip_kafka_message(
        Message(
            Partition(Topic("snuba-test-lol"), 2),
            1,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )
def test_unmerge_hierarchical_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "b" * 32
    self.event["data"]["hierarchical_hashes"] = ["a" * 32]
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE_HIERARCHICAL,
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hierarchical_hash": "a" * 32,
                        "primary_hash": "b" * 32,
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    assert processed is not None
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
def test_unmerge_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_unmerge",
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
def test_skip_stale_message() -> None:
    dataset = get_dataset("events")
    entity_names = ["events"]
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()

    commit = mock.Mock()

    stale_threshold_seconds = 60

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        max_concurrent_queries,
        total_concurrent_queries,
        stale_threshold_seconds,
        metrics,
        ProduceResult(producer, result_topic.name, commit),
        commit,
    )

    subscription_identifier = SubscriptionIdentifier(PartitionId(0), uuid.uuid1())

    make_message = generate_message(EntityKey.EVENTS, subscription_identifier)
    message = next(make_message)
    strategy.submit(message)

    # No message will be produced
    strategy.poll()
    assert broker_storage.consume(Partition(result_topic, 0), 0) is None

    assert Increment("skipped_execution", 1, {"entity": "events"}) in metrics.calls
def test_delete_groups_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.utcnow()

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_DELETE_GROUPS,
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == []

    # Count is still zero after Redis flushed and parts merged
    self._clear_redis_and_force_merge()
    assert self._issue_count(self.project_id) == []
def generate_message(
    entity_key: EntityKey,
    subscription_identifier: Optional[SubscriptionIdentifier] = None,
) -> Iterator[Message[KafkaPayload]]:
    codec = SubscriptionScheduledTaskEncoder()
    epoch = datetime(1970, 1, 1)
    i = 0

    if subscription_identifier is None:
        subscription_identifier = SubscriptionIdentifier(PartitionId(1), uuid.uuid1())

    data_dict = {}
    if entity_key in (EntityKey.METRICS_SETS, EntityKey.METRICS_COUNTERS):
        data_dict = {"organization": 1}

    entity_subscription = ENTITY_KEY_TO_SUBSCRIPTION_MAPPER[entity_key](
        data_dict=data_dict
    )

    while True:
        payload = codec.encode(
            ScheduledSubscriptionTask(
                epoch + timedelta(minutes=i),
                SubscriptionWithMetadata(
                    entity_key,
                    Subscription(
                        subscription_identifier,
                        SubscriptionData(
                            project_id=1,
                            time_window_sec=60,
                            resolution_sec=60,
                            query=f"MATCH ({entity_key.value}) SELECT count()",
                            entity_subscription=entity_subscription,
                        ),
                    ),
                    i + 1,
                ),
            )
        )

        yield Message(Partition(Topic("test"), 0), i, payload, epoch)
        i += 1
def test_reset_consumer_group_offset_check(self) -> None:
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id": self.project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    self.replacer.flush_batch([self.replacer.process_message(message)])

    set_config(replacer.RESET_CHECK_CONFIG, f"[{CONSUMER_GROUP}]")

    # Offset to check against should be reset so this message shouldn't be skipped
    assert self.replacer.process_message(message) is not None
def test_process_offset_twice(self) -> None:
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id": self.project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # should be None since the offset should be in Redis, indicating it should be skipped
    assert self.replacer.process_message(message) is None
def test_invalid_commit_log_message(caplog: Any) -> None:
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    topic = Topic("messages")
    followed_consumer_group = "events"
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(inner_consumer, followed_consumer_group)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    # produce invalid payload to commit log topic (key should not be None)
    producer.produce(
        partition,
        KafkaPayload(None, b"some-value", []),
    ).result()

    clock.sleep(1)

    with caplog.at_level(logging.ERROR):
        assert consumer.poll() is None

    assert followed_consumer_group in caplog.text
def test_produce_result() -> None:
    state.set_config("subscription_mode_events", "new")
    epoch = datetime(1970, 1, 1)
    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()

    commit = mock.Mock()

    strategy = ProduceResult(producer, result_topic.name, commit)

    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count() AS count",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid.uuid1()), subscription_data
    )

    request = subscription_data.build_request(
        get_dataset("events"), epoch, None, Timer("timer")
    )
    result: Result = {
        "meta": [{"type": "UInt64", "name": "count"}],
        "data": [{"count": 1}],
    }

    message = Message(
        Partition(scheduled_topic, 0),
        1,
        SubscriptionTaskResult(
            ScheduledSubscriptionTask(
                epoch,
                SubscriptionWithMetadata(EntityKey.EVENTS, subscription, 1),
            ),
            (request, result),
        ),
        epoch,
    )

    strategy.submit(message)

    produced_message = broker_storage.consume(Partition(result_topic, 0), 0)
    assert produced_message is not None
    assert produced_message.payload.key == str(subscription.identifier).encode("utf-8")
    assert broker_storage.consume(Partition(result_topic, 0), 1) is None
    assert commit.call_count == 0
    strategy.poll()
    assert commit.call_count == 1

    # Commit is throttled so if we immediately submit another message, the
    # commit count will not change
    strategy.submit(message)
    strategy.poll()
    assert commit.call_count == 1

    # Commit count immediately increases once we call join()
    strategy.join()
    assert commit.call_count == 2
def test_scheduler_consumer() -> None:
    settings.TOPIC_PARTITION_COUNTS = {"events": 2}
    importlib.reload(scheduler_consumer)

    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.COMMIT_LOG])

    metrics_backend = TestingMetricsBackend()
    entity_name = "events"
    entity = get_entity(EntityKey(entity_name))
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    commit_log_topic = Topic("snuba-commit-log")

    mock_scheduler_producer = mock.Mock()

    from snuba.redis import redis_client
    from snuba.subscriptions.data import PartitionId, SubscriptionData
    from snuba.subscriptions.entity_subscription import EventsSubscription
    from snuba.subscriptions.store import RedisSubscriptionDataStore

    entity_key = EntityKey(entity_name)
    partition_index = 0

    store = RedisSubscriptionDataStore(
        redis_client, entity_key, PartitionId(partition_index)
    )
    store.create(
        uuid.uuid4(),
        SubscriptionData(
            project_id=1,
            time_window_sec=60,
            resolution_sec=60,
            query="MATCH events SELECT count()",
            entity_subscription=EventsSubscription(data_dict={}),
        ),
    )

    builder = scheduler_consumer.SchedulerBuilder(
        entity_name,
        str(uuid.uuid1().hex),
        "events",
        mock_scheduler_producer,
        "latest",
        False,
        60 * 5,
        None,
        None,
        metrics_backend,
    )
    scheduler = builder.build_consumer()
    time.sleep(2)
    scheduler._run_once()
    scheduler._run_once()
    scheduler._run_once()

    epoch = datetime(1970, 1, 1)

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
        )
    )

    for (partition, offset, orig_message_ts) in [
        (0, 0, epoch),
        (1, 0, epoch + timedelta(minutes=1)),
        (0, 1, epoch + timedelta(minutes=2)),
        (1, 1, epoch + timedelta(minutes=3)),
    ]:
        fut = producer.produce(
            commit_log_topic,
            payload=commit_codec.encode(
                Commit(
                    "events",
                    Partition(commit_log_topic, partition),
                    offset,
                    orig_message_ts,
                )
            ),
        )
        fut.result()

    producer.close()

    for _ in range(5):
        scheduler._run_once()

    scheduler._shutdown()

    assert mock_scheduler_producer.produce.call_count == 2

    settings.TOPIC_PARTITION_COUNTS = {}
def test_tick_consumer_non_monotonic() -> None:
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    epoch = datetime.fromtimestamp(clock.time())

    topic = Topic("messages")
    followed_consumer_group = "events"
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(inner_consumer, followed_consumer_group)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    producer.produce(
        partition,
        commit_codec.encode(Commit(followed_consumer_group, partition, 0, epoch)),
    ).result()

    clock.sleep(1)

    producer.produce(
        partition,
        commit_codec.encode(
            Commit(followed_consumer_group, partition, 1, epoch + timedelta(seconds=1))
        ),
    ).result()

    with assert_changes(lambda: assignment_callback.called, False, True):
        assert consumer.poll() is None

    assert consumer.tell() == {partition: 1}

    with assert_changes(consumer.tell, {partition: 1}, {partition: 2}):
        assert consumer.poll() == Message(
            partition,
            1,
            Tick(
                0,
                offsets=Interval(0, 1),
                timestamps=Interval(epoch, epoch + timedelta(seconds=1)),
            ),
            epoch + timedelta(seconds=1),
        )

    clock.sleep(-1)

    producer.produce(
        partition,
        commit_codec.encode(Commit(followed_consumer_group, partition, 2, epoch)),
    ).result()

    with assert_changes(consumer.tell, {partition: 2}, {partition: 3}):
        assert consumer.poll() is None

    clock.sleep(2)

    producer.produce(
        partition,
        commit_codec.encode(
            Commit(followed_consumer_group, partition, 3, epoch + timedelta(seconds=2))
        ),
    ).result()

    with assert_changes(consumer.tell, {partition: 3}, {partition: 4}):
        assert consumer.poll() == Message(
            partition,
            3,
            Tick(
                0,
                offsets=Interval(1, 3),
                timestamps=Interval(
                    epoch + timedelta(seconds=1), epoch + timedelta(seconds=2)
                ),
            ),
            epoch + timedelta(seconds=2),
        )
def test_tick_consumer(time_shift: Optional[timedelta]) -> None:
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    epoch = datetime.fromtimestamp(clock.time())

    topic = Topic("messages")
    followed_consumer_group = "events"

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()
    for partition, offsets in enumerate([[0, 1, 2], [0]]):
        for offset in offsets:
            payload = commit_codec.encode(
                Commit(
                    followed_consumer_group, Partition(topic, partition), offset, epoch
                )
            )
            producer.produce(Partition(topic, 0), payload).result()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(
        inner_consumer, followed_consumer_group, time_shift=time_shift
    )

    if time_shift is None:
        time_shift = timedelta()

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert consumer.tell() == {
            Partition(topic, 0): 0,
        }

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    with assert_changes(lambda: assignment_callback.called, False, True):
        # consume 0, 0
        assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 1,
    }

    # consume 0, 1
    assert consumer.poll() == Message(
        Partition(topic, 0),
        1,
        Tick(
            0, offsets=Interval(0, 1), timestamps=Interval(epoch, epoch)
        ).time_shift(time_shift),
        epoch,
    )

    assert consumer.tell() == {
        Partition(topic, 0): 2,
    }

    # consume 0, 2
    assert consumer.poll() == Message(
        Partition(topic, 0),
        2,
        Tick(
            0, offsets=Interval(1, 2), timestamps=Interval(epoch, epoch)
        ).time_shift(time_shift),
        epoch,
    )

    assert consumer.tell() == {
        Partition(topic, 0): 3,
    }

    # consume 1, 0
    assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 4,
    }

    # consume no message
    assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 4,
    }

    consumer.seek({Partition(topic, 0): 1})

    assert consumer.tell() == {
        Partition(topic, 0): 1,
    }

    # consume 0, 1
    assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 2,
    }

    # consume 0, 2
    assert consumer.poll() == Message(
        Partition(topic, 0),
        2,
        Tick(
            0, offsets=Interval(1, 2), timestamps=Interval(epoch, epoch)
        ).time_shift(time_shift),
        epoch,
    )

    assert consumer.tell() == {
        Partition(topic, 0): 3,
    }

    with pytest.raises(ConsumerError):
        consumer.seek({Partition(topic, -1): 0})
def test_subscription_worker(subscription_data: SubscriptionData) -> None:
    broker: Broker[SubscriptionTaskResult] = Broker(
        MemoryMessageStorage(), TestingClock()
    )

    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 3

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        subscription_data,
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = DummyMetricsBackend(strict=True)

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {0: SubscriptionScheduler(store, PartitionId(0), timedelta(), metrics)},
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    result_futures = worker.process_message(
        Message(Partition(Topic("events"), 0), 0, tick, now)
    )

    assert result_futures is not None and len(result_futures) == evaluations

    # Publish the results.
    worker.flush_batch([result_futures])

    # Check to make sure the results were published.
    # NOTE: This does not cover the ``SubscriptionTaskResultCodec``!
    consumer = broker.get_consumer("group")
    consumer.subscribe([result_topic])

    for i in range(evaluations):
        timestamp = now - frequency * (evaluations - i)

        message = consumer.poll()
        assert message is not None
        assert message.partition.topic == result_topic

        task, future = result_futures[i]
        future_result = request, result = future.result()
        assert message.payload.task.timestamp == timestamp
        assert message.payload == SubscriptionTaskResult(task, future_result)

        # NOTE: The time series extension is folded back into the request
        # body, ideally this would reference the timeseries options in
        # isolation.
        from_pattern = FunctionCall(
            String(ConditionFunctions.GTE),
            (
                Column(None, String("timestamp")),
                Literal(Datetime(timestamp - subscription.data.time_window)),
            ),
        )
        to_pattern = FunctionCall(
            String(ConditionFunctions.LT),
            (Column(None, String("timestamp")), Literal(Datetime(timestamp))),
        )

        condition = request.query.get_condition()
        assert condition is not None

        conditions = get_first_level_and_conditions(condition)

        assert any([from_pattern.match(e) for e in conditions])
        assert any([to_pattern.match(e) for e in conditions])

        assert result == {
            "meta": [{"name": "count", "type": "UInt64"}],
            "data": [{"count": 0}],
        }
def test_tick_consumer_non_monotonic() -> None:
    clock = TestingClock()
    broker: Broker[int] = Broker(MemoryMessageStorage(), clock)

    epoch = datetime.fromtimestamp(clock.time())

    topic = Topic("messages")
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = TickConsumer(inner_consumer)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    producer.produce(partition, 0)

    clock.sleep(1)

    producer.produce(partition, 1)

    with assert_changes(lambda: assignment_callback.called, False, True):
        assert consumer.poll() is None

    assert inner_consumer.tell() == {partition: 1}
    assert consumer.tell() == {partition: 0}

    with assert_changes(
        inner_consumer.tell, {partition: 1}, {partition: 2}
    ), assert_changes(consumer.tell, {partition: 0}, {partition: 1}):
        assert consumer.poll() == Message(
            partition,
            0,
            Tick(
                offsets=Interval(0, 1),
                timestamps=Interval(epoch, epoch + timedelta(seconds=1)),
            ),
            epoch + timedelta(seconds=1),
        )

    clock.sleep(-1)

    producer.produce(partition, 2)

    with assert_changes(
        inner_consumer.tell, {partition: 2}, {partition: 3}
    ), assert_does_not_change(consumer.tell, {partition: 1}):
        assert consumer.poll() is None

    clock.sleep(2)

    producer.produce(partition, 3)

    with assert_changes(
        inner_consumer.tell, {partition: 3}, {partition: 4}
    ), assert_changes(consumer.tell, {partition: 1}, {partition: 3}):
        assert consumer.poll() == Message(
            partition,
            1,
            Tick(
                offsets=Interval(1, 3),
                timestamps=Interval(
                    epoch + timedelta(seconds=1), epoch + timedelta(seconds=2)
                ),
            ),
            epoch + timedelta(seconds=2),
        )
def test_reprocessing_flow_insert(self) -> None:
    # We have a group that contains two events, 1 and 2.
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["event_id"] = event_id = "00e24a150d7f4ee4b142b61b4d893b6d"
    write_unprocessed_events(self.storage, [self.event])
    self.event["event_id"] = event_id2 = "00e24a150d7f4ee4b142b61b4d893b6e"
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        41,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.TOMBSTONE_EVENTS,
                    {"project_id": project_id, "event_ids": [event_id]},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    # The user chooses to reprocess a subset of the group and throw away
    # the other events. Event 1 gets manually tombstoned by Sentry while
    # Event 2 prevails.
    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # At this point the count doesn't make any sense but we don't care.
    assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

    # The reprocessed event is inserted with a guaranteed-new group ID but
    # the *same* event ID (this is why we need to skip tombstoning this
    # event ID)
    self.event["group_id"] = 2
    write_unprocessed_events(self.storage, [self.event])

    message = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.EXCLUDE_GROUPS,
                    {"project_id": project_id, "group_ids": [1]},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    # Group 1 is excluded from queries. At this point we have almost a
    # regular group deletion, except only a subset of events have been
    # tombstoned (the ones that will *not* be reprocessed).
    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # Group 2 should contain the one event that the user chose to
    # reprocess, and Group 1 should be gone. (Note: In the product Group 2
    # looks identical to Group 1, including short ID).
    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
    assert self._get_group_id(project_id, event_id2) == 2
    assert not self._get_group_id(project_id, event_id)
def test_combined_scheduler_and_executor() -> None:
    state.set_config("subscription_mode_events", "new")
    create_subscription()
    epoch = datetime(1970, 1, 1)

    dataset = get_dataset("events")
    entity_names = ["events"]
    num_partitions = 2
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    commit = mock.Mock()
    partitions = mock.Mock()

    topic = Topic("snuba-commit-log")
    partition = Partition(topic, 0)
    stale_threshold_seconds = None
    result_topic = "events-subscription-results"
    schedule_ttl = 60

    producer = KafkaProducer(
        build_kafka_producer_configuration(SnubaTopic.SUBSCRIPTION_RESULTS_EVENTS)
    )

    with closing(producer):
        factory = CombinedSchedulerExecutorFactory(
            dataset,
            entity_names,
            num_partitions,
            max_concurrent_queries,
            total_concurrent_queries,
            producer,
            metrics,
            stale_threshold_seconds,
            result_topic,
            schedule_ttl,
        )

        strategy = factory.create_with_partitions(commit, partitions)

        message = Message(
            partition,
            4,
            Tick(
                0,
                offsets=Interval(1, 3),
                timestamps=Interval(epoch, epoch + timedelta(seconds=60)),
            ),
            epoch,
        )
        strategy.submit(message)

        # Wait for the query to be executed and the result message produced
        for i in range(10):
            time.sleep(0.5)
            strategy.poll()
            if commit.call_count == 1:
                break

        assert commit.call_count == 1

        strategy.close()
        strategy.join()