def test_batch_size(self, broker: Broker[int]) -> None:
    topic = Topic("topic")
    broker.create_topic(topic, partitions=1)
    producer = broker.get_producer()
    for i in [1, 2, 3]:
        producer.produce(topic, i).result()

    consumer = broker.get_consumer("group")

    worker = FakeWorker()
    batching_consumer = StreamProcessor(
        consumer,
        topic,
        BatchProcessingStrategyFactory(
            worker=worker,
            max_batch_size=2,
            max_batch_time=100,
            metrics=DummyMetricsBackend(strict=True),
        ),
    )

    for _ in range(3):
        batching_consumer._run_once()

    batching_consumer._shutdown()

    assert worker.processed == [1, 2, 3]
    assert worker.flushed == [[1, 2]]
    assert consumer.commit_offsets_calls == 1
    assert consumer.close_calls == 1
def test_batch_time(self, mock_time: Any, broker: Broker[int]) -> None:
    topic = Topic("topic")
    broker.create_topic(topic, partitions=1)
    producer = broker.get_producer()
    consumer = broker.get_consumer("group")

    worker = FakeWorker()
    metrics = DummyMetricsBackend(strict=True)
    batching_consumer = StreamProcessor(
        consumer,
        topic,
        BatchProcessingStrategyFactory(
            worker=worker,
            max_batch_size=100,
            max_batch_time=2000,
            metrics=metrics,
        ),
        metrics=metrics,
    )

    mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 0).timetuple())

    for i in [1, 2, 3]:
        producer.produce(topic, i).result()

    for _ in range(3):
        batching_consumer._run_once()

    mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 1).timetuple())

    for i in [4, 5, 6]:
        producer.produce(topic, i).result()

    for _ in range(3):
        batching_consumer._run_once()

    mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 5).timetuple())

    for i in [7, 8, 9]:
        producer.produce(topic, i).result()

    for _ in range(3):
        batching_consumer._run_once()

    batching_consumer._shutdown()

    assert worker.processed == [1, 2, 3, 4, 5, 6, 7, 8, 9]
    assert worker.flushed == [[1, 2, 3, 4, 5, 6]]
    assert consumer.commit_offsets_calls == 1
    assert consumer.close_calls == 1
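# A minimal sketch of the FakeWorker test double assumed by the two tests
# above: it records every payload it processes and every batch it is asked to
# flush, which is what the `worker.processed` / `worker.flushed` assertions
# check. The `AbstractBatchWorker` base class, its method names, and the
# import paths are assumptions for illustration, not taken from the code above.
from typing import MutableSequence, Optional, Sequence

from snuba.utils.streams import Message  # assumed import path
from snuba.utils.streams.processing.strategies.batching import (  # assumed import path
    AbstractBatchWorker,
)


class FakeWorker(AbstractBatchWorker[int, int]):
    def __init__(self) -> None:
        self.processed: MutableSequence[Optional[int]] = []
        self.flushed: MutableSequence[Sequence[int]] = []

    def process_message(self, message: Message[int]) -> Optional[int]:
        # Record the raw payload so tests can assert processing order.
        self.processed.append(message.payload)
        return message.payload

    def flush_batch(self, batch: Sequence[int]) -> None:
        # Copy to a list so the equality assertions against [[1, 2]] etc. hold.
        self.flushed.append(list(batch))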
def __build_batching_strategy_factory(
    self,
) -> BatchProcessingStrategyFactory[KafkaPayload]:
    return BatchProcessingStrategyFactory(
        worker=ConsumerWorker(
            storage=self.storage,
            producer=self.producer,
            replacements_topic=self.replacements_topic,
            metrics=self.metrics,
        ),
        max_batch_size=self.max_batch_size,
        max_batch_time=self.max_batch_time_ms,
        metrics=self.metrics,
    )
def build_snapshot_aware_consumer(
    self,
    snapshot_id: SnapshotId,
    transaction_data: TransactionData,
) -> StreamProcessor[KafkaPayload]:
    """
    Builds the consumer with a ConsumerWorker able to handle snapshots.
    """
    return self.__build_consumer(
        BatchProcessingStrategyFactory(
            worker=SnapshotAwareWorker(
                storage=self.storage,
                producer=self.producer,
                snapshot_id=snapshot_id,
                transaction_data=transaction_data,
                metrics=self.metrics,
                replacements_topic=self.replacements_topic,
            ),
            max_batch_size=self.max_batch_size,
            max_batch_time=self.max_batch_time_ms,
            metrics=self.metrics,
        )
    )
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None
        storage_key = storage.get_storage_key().value
        bootstrap_servers = settings.DEFAULT_STORAGE_BROKERS.get(
            storage_key, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={"group": consumer_group, "dataset": dataset_name},
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            {
                "bootstrap.servers": ",".join(bootstrap_servers),
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            }
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor, executor._max_workers)
    metrics.gauge("executor.workers", executor._max_workers)

    with closing(consumer), executor, closing(producer):
        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
                metrics,
            ),
            metrics=metrics,
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()