def test_table_name_filter() -> None:
    table_name = "table_name"
    message_filter = CdcTableNameMessageFilter(table_name)

    # Messages that match the table should not be dropped.
    assert not message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", table_name.encode("utf8"))]),
            datetime.now(),
        )
    )

    # Messages without a table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )

    # Messages from a different table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", b"other_table")]),
            datetime.now(),
        )
    )
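# A minimal sketch (an assumption, not the actual CdcTableNameMessageFilter
# implementation) of the drop logic the test above exercises: keep only
# messages whose "table" header matches the configured postgres table and
# drop everything else, including messages with no "table" header at all.
class TableNameFilterSketch:
    def __init__(self, postgres_table: str) -> None:
        self.__postgres_table = postgres_table.encode("utf8")

    def should_drop(self, message: Message[KafkaPayload]) -> bool:
        headers = dict(message.payload.headers)
        # Drop when the header is missing or names a different table.
        return headers.get("table") != self.__postgres_table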
def test_multistorage_strategy(
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
) -> None:
    from snuba.datasets.storages import groupassignees, groupedmessages

    from tests.datasets.cdc.test_groupassignee import TestGroupassignee
    from tests.datasets.cdc.test_groupedmessage import TestGroupedMessage

    commit = Mock()

    storages = [groupassignees.storage, groupedmessages.storage]

    strategy = MultistorageConsumerProcessingStrategyFactory(
        storages,
        10,
        10,
        processes,
        input_block_size,
        output_block_size,
        TestingMetricsBackend(),
    ).create(commit)

    payloads = [
        KafkaPayload(None, b"{}", [("table", b"ignored")]),
        KafkaPayload(
            None,
            json.dumps(TestGroupassignee.INSERT_MSG).encode("utf8"),
            [("table", groupassignees.storage.get_postgres_table().encode("utf8"))],
        ),
        KafkaPayload(
            None,
            json.dumps(TestGroupedMessage.INSERT_MSG).encode("utf8"),
            [("table", groupedmessages.storage.get_postgres_table().encode("utf8"))],
        ),
    ]

    messages = [
        Message(
            Partition(Topic("topic"), 0), offset, payload, datetime.now(), offset + 1
        )
        for offset, payload in enumerate(payloads)
    ]

    with assert_changes(
        lambda: get_row_count(groupassignees.storage), 0, 1
    ), assert_changes(lambda: get_row_count(groupedmessages.storage), 0, 1):
        for message in messages:
            strategy.submit(message)

        with assert_changes(
            lambda: commit.call_args_list,
            [],
            [call({Partition(Topic("topic"), 0): 3})],
        ):
            strategy.close()
            strategy.join()
def test_commit_log_consumer() -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    configuration = get_default_kafka_configuration()

    consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    producer = KafkaProducer(configuration)

    topic = Topic("topic")

    with closing(consumer) as consumer:
        with closing(producer) as producer:
            producer.produce(topic, next(get_payloads())).result(5.0)

        consumer.subscribe([topic])

        message = consumer.poll(10.0)  # XXX: getting the subscription is slow
        assert isinstance(message, Message)

        now = datetime.now()

        position = Position(message.next_offset, now)

        consumer.stage_positions({message.partition: position})

        assert consumer.commit_positions() == {Partition(topic, 0): position}

        assert len(commit_log_producer.messages) == 1
        commit_message = commit_log_producer.messages[0]
        assert commit_message.topic() == "commit-log"

        assert commit_codec.decode(
            KafkaPayload(
                commit_message.key(),
                commit_message.value(),
                commit_message.headers(),
            )
        ) == Commit("test", Partition(topic, 0), message.next_offset, now)
def test_subscription_worker_consistent(
    subscription_data: SubscriptionData,
) -> None:
    state.set_config("event_subscription_non_consistent_sample_rate", 1)
    broker: Broker[SubscriptionTaskResult] = Broker(
        MemoryMessageStorage(), TestingClock()
    )

    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 1

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        subscription_data,
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = TestingMetricsBackend()

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {
            0: SubscriptionScheduler(
                store, PartitionId(0), timedelta(), DummyMetricsBackend(strict=True)
            )
        },
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    worker.process_message(Message(Partition(Topic("events"), 0), 0, tick, now))

    time.sleep(0.1)

    assert (
        len(
            [
                m
                for m in metrics.calls
                if isinstance(m, Increment) and m.name == "consistent"
            ]
        )
        == 1
    )
def test_multiple_partitions(self) -> None:
    """
    Different partitions should have independent offset checks.
    """
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    payload = KafkaPayload(
        None,
        json.dumps(
            (
                2,
                ReplacementType.END_UNMERGE,
                {
                    "project_id": self.project_id,
                    "previous_group_id": 1,
                    "new_group_id": 2,
                    "hashes": ["a" * 32],
                    "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                },
            )
        ).encode("utf-8"),
        [],
    )
    offset = 42
    timestamp = datetime.now()

    partition_one: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        offset,
        payload,
        timestamp,
    )
    partition_two: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 2),
        offset,
        payload,
        timestamp,
    )

    processed = self.replacer.process_message(partition_one)
    self.replacer.flush_batch([processed])

    # A different partition should be unaffected even if it's the same offset.
    assert self.replacer.process_message(partition_two) is not None
def test_delete_tag_promoted_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["data"]["tags"].append(["browser.name", "foo"])
    self.event["data"]["tags"].append(["notbrowser", "foo"])
    write_unprocessed_events(self.storage, [self.event])

    project_id = self.project_id

    def _issue_count(total: bool = False) -> Sequence[Mapping[str, Any]]:
        clickhouse = self.storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.QUERY
        )

        total_cond = (
            "AND has(_tags_hash_map, cityHash64('browser.name=foo'))"
            if not total
            else ""
        )

        data = clickhouse.execute(
            f"""
            SELECT group_id, count()
            FROM errors_local FINAL
            WHERE deleted = 0
            AND project_id = {project_id}
            {total_cond}
            GROUP BY group_id
            """
        ).results

        return [{"group_id": row[0], "count": row[1]} for row in data]

    assert _issue_count() == [{"count": 1, "group_id": 1}]
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_DELETE_TAG,
                    {
                        "project_id": project_id,
                        "tag": "browser.name",
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    assert processed is not None
    self.replacer.flush_batch([processed])

    assert _issue_count() == []
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
def test_encoding_producer() -> None:
    broker: Broker[str] = Broker(MemoryMessageStorage(), TestingClock())

    topic = Topic("test")

    broker.create_topic(topic, 1)

    class ReverseEncoder(Encoder[str, str]):
        def encode(self, value: str) -> str:
            return "".join(value[::-1])

    producer = ProducerEncodingWrapper(broker.get_producer(), ReverseEncoder())
    decoded_message = producer.produce(topic, "hello").result()
    assert decoded_message.payload == "hello"

    consumer = broker.get_consumer("group")
    consumer.subscribe([topic])

    encoded_message = consumer.poll()
    assert encoded_message is not None

    # The payload returned by the consumer should not be decoded.
    assert encoded_message.payload == "olleh"

    # All other attributes should be the same.
    for attribute in set(Message.__slots__) - {"payload"}:
        assert getattr(encoded_message, attribute) == getattr(
            decoded_message, attribute
        )
def test_tick_consumer_min_interval() -> None:
    clock = TestingClock()
    broker: Broker[int] = Broker(MemoryMessageStorage(), clock)

    topic = Topic("messages")

    broker.create_topic(topic, partitions=2)

    producer = broker.get_producer()
    for payload in range(3):
        producer.produce(Partition(topic, 0), payload).result()
        clock.sleep(1.0)

    inner_consumer = broker.get_consumer("group")

    consumer = TickConsumer(inner_consumer, min_interval=timedelta(seconds=2))

    consumer.subscribe([topic])

    assert consumer.poll() is None
    assert consumer.poll() is None

    message = consumer.poll()
    assert message is not None
    tick = message.payload
    assert tick.offsets.upper - tick.offsets.lower == 2
    assert tick.timestamps.upper - tick.timestamps.lower == timedelta(seconds=2)
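# A rough sketch (an assumption, not the actual TickConsumer logic) of the
# min_interval gate the test above depends on: offsets read from the inner
# consumer are folded into one pending tick until the timestamps it covers
# span at least min_interval, which is why the first two polls return None
# and the third returns a single tick covering two offsets and two seconds.
def _ready_to_emit_sketch(pending_tick: Tick, min_interval: timedelta) -> bool:
    covered = pending_tick.timestamps.upper - pending_tick.timestamps.lower
    return covered >= min_interval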
def test_execute_and_produce_result() -> None:
    state.set_config("subscription_mode_events", "new")
    dataset = get_dataset("events")
    entity_names = ["events"]
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()

    commit = mock.Mock()

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        max_concurrent_queries,
        total_concurrent_queries,
        None,
        metrics,
        ProduceResult(producer, result_topic.name, commit),
        commit,
    )

    subscription_identifier = SubscriptionIdentifier(PartitionId(0), uuid.uuid1())

    make_message = generate_message(EntityKey.EVENTS, subscription_identifier)
    message = next(make_message)
    strategy.submit(message)

    # Eventually a message should be produced and offsets committed
    while (
        broker_storage.consume(Partition(result_topic, 0), 0) is None
        or commit.call_count == 0
    ):
        strategy.poll()

    produced_message = broker_storage.consume(Partition(result_topic, 0), 0)
    assert produced_message is not None
    assert produced_message.payload.key == str(subscription_identifier).encode("utf-8")
    assert commit.call_count == 1
def _wrap(self, msg: Tuple[Any, ...]) -> Message[KafkaPayload]:
    return Message(
        Partition(Topic("replacements"), 0),
        0,
        KafkaPayload(None, json.dumps(msg).encode("utf-8"), []),
        datetime.now(),
    )
def test_offset_already_processed(self) -> None:
    """
    Don't process an offset that already exists in Redis.
    """
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    key = f"replacement:{CONSUMER_GROUP}:errors:1"
    redis_client.set(key, 42)

    old_offset: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        41,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    same_offset: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    assert self.replacer.process_message(old_offset) is None
    assert self.replacer.process_message(same_offset) is None
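# A minimal sketch (an assumption, not the actual replacer code) of the
# skip-seen-offsets check the surrounding tests rely on: the replacer
# presumably remembers the highest processed offset per consumer group and
# partition under a Redis key shaped like "replacement:{group}:errors:{partition}",
# and drops any message at or below that offset unless the check has been
# reset for the consumer group.
def _should_skip_sketch(message: Message[KafkaPayload], group: str) -> bool:
    key = f"replacement:{group}:errors:{message.partition.index}"
    seen = redis_client.get(key)
    return seen is not None and message.offset <= int(seen)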
def write_step() -> ProcessedMessageBatchWriter:
    return ProcessedMessageBatchWriter(
        insert_batch_writer=InsertBatchWriter(
            writer, MetricsWrapper(metrics, "insertions")
        ),
        replacement_batch_writer=ReplacementBatchWriter(
            replacements_producer, Topic("replacements")
        ),
    )
def test_skip_stale_message() -> None:
    dataset = get_dataset("events")
    entity_names = ["events"]
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()

    commit = mock.Mock()

    stale_threshold_seconds = 60

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        max_concurrent_queries,
        total_concurrent_queries,
        stale_threshold_seconds,
        metrics,
        ProduceResult(producer, result_topic.name, commit),
        commit,
    )

    subscription_identifier = SubscriptionIdentifier(PartitionId(0), uuid.uuid1())

    make_message = generate_message(EntityKey.EVENTS, subscription_identifier)
    message = next(make_message)
    strategy.submit(message)

    # No message will be produced
    strategy.poll()
    assert broker_storage.consume(Partition(result_topic, 0), 0) is None

    assert Increment("skipped_execution", 1, {"entity": "events"}) in metrics.calls
def eventstream(*, dataset: Dataset) -> RespTuple:
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_default_entity().get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from arroyo.processing.strategies.streaming import (
            KafkaConsumerStrategyFactory,
        )

        from snuba.consumers.consumer import build_batch_writer, process_message

        table_writer = storage.get_table_writer()
        stream_loader = table_writer.get_stream_loader()
        strategy = KafkaConsumerStrategyFactory(
            stream_loader.get_pre_filter(),
            functools.partial(
                process_message, stream_loader.get_processor(), "consumer_group"
            ),
            build_batch_writer(table_writer, metrics=metrics),
            max_batch_size=1,
            max_batch_time=1.0,
            processes=None,
            input_block_size=None,
            output_block_size=None,
        ).create(lambda offsets: None)
        strategy.submit(message)
        strategy.close()
        strategy.join()
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, "consumer_group", metrics=metrics)
        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})
def test_kafka_filter_header_with_bypass() -> None:
    header_filter = KafkaHeaderFilterWithBypass("should_drop", "1", 5)
    message = Message(
        Partition(Topic("random"), 1),
        1,
        KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
        datetime.now(),
    )

    for _ in range(3):
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is False
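# A rough sketch (an assumption, not the actual KafkaHeaderFilterWithBypass
# implementation) of the behaviour the test above exercises: messages carrying
# the configured header value are normally dropped, but a counter of
# consecutive drops forces every N-th matching message through so a stream of
# droppable messages still makes progress (here N = 5: four drops, one pass).
class HeaderFilterWithBypassSketch:
    def __init__(
        self, header_key: str, header_value: str, passthrough_limit: int
    ) -> None:
        self.__header = (header_key, header_value.encode("utf-8"))
        self.__limit = passthrough_limit
        self.__consecutive_drops = 0

    def should_drop(self, message: Message[KafkaPayload]) -> bool:
        if self.__header not in message.payload.headers:
            self.__consecutive_drops = 0
            return False

        self.__consecutive_drops += 1
        if self.__consecutive_drops == self.__limit:
            # Bypass: let this one through and start counting again.
            self.__consecutive_drops = 0
            return False

        return True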
def test_skip_kafka_message(self) -> None:
    state.set_config(
        "kafka_messages_to_skip", "[snuba-test-lol:1:2,snuba-test-yeet:0:1]"
    )
    assert skip_kafka_message(
        Message(
            Partition(Topic("snuba-test-lol"), 1),
            2,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )
    assert skip_kafka_message(
        Message(
            Partition(Topic("snuba-test-yeet"), 0),
            1,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )
    assert not skip_kafka_message(
        Message(
            Partition(Topic("snuba-test-lol"), 2),
            1,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )
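# A minimal sketch (an assumption, not the actual skip_kafka_message helper)
# of the lookup the test above implies: the "kafka_messages_to_skip" setting
# is treated as a list of "topic:partition:offset" entries, and a message is
# skipped only when all three fields match exactly.
def _skip_kafka_message_sketch(message: Message[KafkaPayload]) -> bool:
    raw = state.get_config("kafka_messages_to_skip", "[]") or "[]"
    entries = [entry.strip() for entry in str(raw).strip("[]").split(",") if entry]
    key = f"{message.partition.topic.name}:{message.partition.index}:{message.offset}"
    return key in entries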
def test_unmerge_hierarchical_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "b" * 32
    self.event["data"]["hierarchical_hashes"] = ["a" * 32]
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE_HIERARCHICAL,
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hierarchical_hash": "a" * 32,
                        "primary_hash": "b" * 32,
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    assert processed is not None
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
def test_unmerge_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_unmerge",
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
def __build_batch_writer(
    self, storage: WritableTableStorage
) -> ProcessedMessageBatchWriter:
    replacement_batch_writer: Optional[ReplacementBatchWriter]
    stream_loader = storage.get_table_writer().get_stream_loader()
    replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    default_topic_spec = stream_loader.get_default_topic_spec()
    if replacement_topic_spec is not None:
        # XXX: The producer is flushed when closed on strategy teardown
        # after an assignment is revoked, but never explicitly closed.
        # XXX: This assumes that the Kafka cluster used for the input topic
        # to the storage is the same as the replacement topic.
        replacement_batch_writer = ReplacementBatchWriter(
            ConfluentKafkaProducer(
                build_kafka_producer_configuration(
                    default_topic_spec.topic,
                    override_params={
                        "partitioner": "consistent",
                        "message.max.bytes": 50000000,  # 50MB, default is 1MB
                    },
                )
            ),
            Topic(replacement_topic_spec.topic_name),
        )
    else:
        replacement_batch_writer = None

    return ProcessedMessageBatchWriter(
        InsertBatchWriter(
            storage.get_table_writer().get_batch_writer(
                self.__metrics,
                {"load_balancing": "in_order", "insert_distributed_sync": 1},
            ),
            MetricsWrapper(
                self.__metrics,
                "insertions",
                {"storage": storage.get_storage_key().value},
            ),
        ),
        replacement_batch_writer,
    )
def test_delete_groups_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.utcnow()

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_DELETE_GROUPS,
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == []

    # Count is still zero after Redis is flushed and parts are merged.
    self._clear_redis_and_force_merge()
    assert self._issue_count(self.project_id) == []
def generate_message(
    entity_key: EntityKey,
    subscription_identifier: Optional[SubscriptionIdentifier] = None,
) -> Iterator[Message[KafkaPayload]]:
    codec = SubscriptionScheduledTaskEncoder()
    epoch = datetime(1970, 1, 1)
    i = 0

    if subscription_identifier is None:
        subscription_identifier = SubscriptionIdentifier(PartitionId(1), uuid.uuid1())

    data_dict = {}
    if entity_key in (EntityKey.METRICS_SETS, EntityKey.METRICS_COUNTERS):
        data_dict = {"organization": 1}

    entity_subscription = ENTITY_KEY_TO_SUBSCRIPTION_MAPPER[entity_key](
        data_dict=data_dict
    )

    while True:
        payload = codec.encode(
            ScheduledSubscriptionTask(
                epoch + timedelta(minutes=i),
                SubscriptionWithMetadata(
                    entity_key,
                    Subscription(
                        subscription_identifier,
                        SubscriptionData(
                            project_id=1,
                            time_window_sec=60,
                            resolution_sec=60,
                            query=f"MATCH ({entity_key.value}) SELECT count()",
                            entity_subscription=entity_subscription,
                        ),
                    ),
                    i + 1,
                ),
            )
        )

        yield Message(Partition(Topic("test"), 0), i, payload, epoch)
        i += 1
def __init__(
    self,
    producer: Producer[KafkaPayload],
    result_topic: str,
    commit: Callable[[Mapping[Partition, Position]], None],
):
    self.__producer = producer
    self.__result_topic = Topic(result_topic)
    self.__commit = commit
    self.__commit_data: MutableMapping[Partition, Position] = {}

    # Time we last called commit
    self.__last_committed: Optional[float] = None

    self.__encoder = SubscriptionTaskResultEncoder()

    self.__queue: Deque[
        Tuple[Message[SubscriptionTaskResult], Future[Message[KafkaPayload]]]
    ] = deque()

    self.__max_buffer_size = 10000
    self.__closed = False
def test_reset_consumer_group_offset_check(self) -> None:
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id": self.project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": datetime.utcnow().strftime(
                            PAYLOAD_DATETIME_FORMAT
                        ),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    self.replacer.flush_batch([self.replacer.process_message(message)])

    set_config(replacer.RESET_CHECK_CONFIG, f"[{CONSUMER_GROUP}]")

    # Offset to check against should be reset so this message shouldn't be skipped
    assert self.replacer.process_message(message) is not None
def __init__(
    self,
    schedulers: Mapping[int, SubscriptionScheduler],
    producer: Producer[KafkaPayload],
    scheduled_topic_spec: KafkaTopicSpec,
    commit: Callable[[Mapping[Partition, Position]], None],
    stale_threshold_seconds: Optional[int],
    metrics: MetricsBackend,
) -> None:
    self.__schedulers = schedulers
    self.__encoder = SubscriptionScheduledTaskEncoder()
    self.__producer = producer
    self.__scheduled_topic = Topic(scheduled_topic_spec.topic_name)
    self.__commit = commit
    self.__stale_threshold_seconds = stale_threshold_seconds
    self.__metrics = metrics
    self.__closed = False

    # Stores each tick with its futures
    self.__queue = ScheduledSubscriptionQueue()

    # Not a hard max
    self.__max_buffer_size = 10000
def test_process_offset_twice(self) -> None:
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id": self.project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": datetime.utcnow().strftime(
                            PAYLOAD_DATETIME_FORMAT
                        ),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # Should be None since the offset is now stored in Redis, indicating the
    # message should be skipped.
    assert self.replacer.process_message(message) is None
def test_invalid_commit_log_message(caplog: Any) -> None:
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    topic = Topic("messages")
    followed_consumer_group = "events"
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(inner_consumer, followed_consumer_group)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    # Produce an invalid payload to the commit log topic (key should not be None).
    producer.produce(
        partition,
        KafkaPayload(None, b"some-value", []),
    ).result()

    clock.sleep(1)

    with caplog.at_level(logging.ERROR):
        assert consumer.poll() is None

    assert followed_consumer_group in caplog.text
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
) -> None:

    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper()) for name in storage_names)
    }

    topics = {
        storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (
            storage.get_table_writer().get_stream_loader().get_commit_log_topic_spec()
            for storage in storages.values()
        )
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (
        storages[storage_keys[0]]
        .get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for storage_key in storage_keys[1:]:
        if (
            build_kafka_consumer_configuration(
                storages[storage_key]
                .get_table_writer()
                .get_stream_loader()
                .get_default_topic_spec()
                .topic,
                consumer_group,
            )["bootstrap.servers"]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above.)
        commit_log_topic_spec = (
            storages[storage_keys[0]]
            .get_table_writer()
            .get_stream_loader()
            .get_commit_log_topic_spec()
        )
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic)
        )

        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={"group": consumer_group, "dataset": dataset_name},
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(commit_log_topic_spec.topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug(
        "Starting %r with %s workers...",
        executor,
        getattr(executor, "_max_workers", 0),
    )
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic)
                    if result_topic is not None
                    else Topic(result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
def test_scheduler_consumer() -> None:
    settings.TOPIC_PARTITION_COUNTS = {"events": 2}
    importlib.reload(scheduler_consumer)

    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.COMMIT_LOG])

    metrics_backend = TestingMetricsBackend()
    entity_name = "events"
    entity = get_entity(EntityKey(entity_name))
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    commit_log_topic = Topic("snuba-commit-log")

    mock_scheduler_producer = mock.Mock()

    from snuba.redis import redis_client
    from snuba.subscriptions.data import PartitionId, SubscriptionData
    from snuba.subscriptions.entity_subscription import EventsSubscription
    from snuba.subscriptions.store import RedisSubscriptionDataStore

    entity_key = EntityKey(entity_name)
    partition_index = 0

    store = RedisSubscriptionDataStore(
        redis_client, entity_key, PartitionId(partition_index)
    )
    store.create(
        uuid.uuid4(),
        SubscriptionData(
            project_id=1,
            time_window_sec=60,
            resolution_sec=60,
            query="MATCH events SELECT count()",
            entity_subscription=EventsSubscription(data_dict={}),
        ),
    )

    builder = scheduler_consumer.SchedulerBuilder(
        entity_name,
        str(uuid.uuid1().hex),
        "events",
        mock_scheduler_producer,
        "latest",
        False,
        60 * 5,
        None,
        None,
        metrics_backend,
    )
    scheduler = builder.build_consumer()
    time.sleep(2)
    scheduler._run_once()
    scheduler._run_once()
    scheduler._run_once()

    epoch = datetime(1970, 1, 1)

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
        )
    )

    for (partition, offset, orig_message_ts) in [
        (0, 0, epoch),
        (1, 0, epoch + timedelta(minutes=1)),
        (0, 1, epoch + timedelta(minutes=2)),
        (1, 1, epoch + timedelta(minutes=3)),
    ]:
        fut = producer.produce(
            commit_log_topic,
            payload=commit_codec.encode(
                Commit(
                    "events",
                    Partition(commit_log_topic, partition),
                    offset,
                    orig_message_ts,
                )
            ),
        )
        fut.result()

    producer.close()

    for _ in range(5):
        scheduler._run_once()

    scheduler._shutdown()

    assert mock_scheduler_producer.produce.call_count == 2

    settings.TOPIC_PARTITION_COUNTS = {}
def test_tick_consumer_non_monotonic() -> None:
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    epoch = datetime.fromtimestamp(clock.time())

    topic = Topic("messages")
    followed_consumer_group = "events"
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(inner_consumer, followed_consumer_group)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    producer.produce(
        partition,
        commit_codec.encode(Commit(followed_consumer_group, partition, 0, epoch)),
    ).result()

    clock.sleep(1)

    producer.produce(
        partition,
        commit_codec.encode(
            Commit(followed_consumer_group, partition, 1, epoch + timedelta(seconds=1))
        ),
    ).result()

    with assert_changes(lambda: assignment_callback.called, False, True):
        assert consumer.poll() is None

    assert consumer.tell() == {partition: 1}

    with assert_changes(consumer.tell, {partition: 1}, {partition: 2}):
        assert consumer.poll() == Message(
            partition,
            1,
            Tick(
                0,
                offsets=Interval(0, 1),
                timestamps=Interval(epoch, epoch + timedelta(seconds=1)),
            ),
            epoch + timedelta(seconds=1),
        )

    clock.sleep(-1)

    producer.produce(
        partition,
        commit_codec.encode(Commit(followed_consumer_group, partition, 2, epoch)),
    ).result()

    with assert_changes(consumer.tell, {partition: 2}, {partition: 3}):
        assert consumer.poll() is None

    clock.sleep(2)

    producer.produce(
        partition,
        commit_codec.encode(
            Commit(followed_consumer_group, partition, 3, epoch + timedelta(seconds=2))
        ),
    ).result()

    with assert_changes(consumer.tell, {partition: 3}, {partition: 4}):
        assert consumer.poll() == Message(
            partition,
            3,
            Tick(
                0,
                offsets=Interval(1, 3),
                timestamps=Interval(
                    epoch + timedelta(seconds=1), epoch + timedelta(seconds=2)
                ),
            ),
            epoch + timedelta(seconds=2),
        )