def test_default_config_cli_bootstrap_servers() -> None:
    broker_config = get_default_kafka_configuration(
        bootstrap_servers=["cli.server:9092"]
    )
    assert broker_config["bootstrap.servers"] == "cli.server:9092"

    broker_config = get_default_kafka_configuration(
        bootstrap_servers=["cli.server:9092", "cli2.server:9092"]
    )
    assert broker_config["bootstrap.servers"] == "cli.server:9092,cli2.server:9092"
def test_kafka_broker_config() -> None:
    default_broker = "my.broker:9092"
    events_broker = "my.other.broker:9092"

    settings.BROKER_CONFIG = {
        "bootstrap.servers": default_broker,
    }
    settings.KAFKA_BROKER_CONFIG = {
        Topic.EVENTS.value: {"bootstrap.servers": events_broker}
    }

    events_broker_config = get_default_kafka_configuration(Topic.EVENTS)
    assert events_broker_config["bootstrap.servers"] == events_broker

    other_broker_config = get_default_kafka_configuration(Topic.EVENT_REPLACEMENTS)
    assert other_broker_config["bootstrap.servers"] == default_broker
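# Hypothetical sketch (not the actual get_default_kafka_configuration
# implementation) of the per-topic lookup the test above exercises: an entry
# keyed by the topic's value in settings.KAFKA_BROKER_CONFIG takes precedence
# over settings.BROKER_CONFIG. The function name and the merge behaviour are
# assumptions for illustration only; `settings` is snuba.settings as used in
# the surrounding tests.
from typing import Any, Mapping, Optional


def resolve_broker_config_sketch(topic_value: Optional[str]) -> Mapping[str, Any]:
    per_topic = getattr(settings, "KAFKA_BROKER_CONFIG", {})
    if topic_value is not None and topic_value in per_topic:
        # Assumed behaviour: topic-specific keys override the defaults.
        return {**settings.BROKER_CONFIG, **per_topic[topic_value]}
    return settings.BROKER_CONFIG


# Example call matching the test above (hypothetical helper):
# resolve_broker_config_sketch(Topic.EVENTS.value)["bootstrap.servers"]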
def test_default_config_override_new_config() -> None:
    default_broker = "my.broker:9092"
    default_broker_config = {
        "bootstrap.servers": default_broker,
    }
    settings.BROKER_CONFIG = default_broker_config

    broker_config = get_default_kafka_configuration()
    assert broker_config["bootstrap.servers"] == default_broker
def test_commit_log_consumer() -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    configuration = get_default_kafka_configuration()
    consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    producer = KafkaProducer(configuration)

    topic = Topic("topic")

    with closing(consumer) as consumer:
        with closing(producer) as producer:
            producer.produce(topic, next(get_payloads())).result(5.0)

            consumer.subscribe([topic])

            message = consumer.poll(10.0)  # XXX: getting the subscription is slow
            assert isinstance(message, Message)

            now = datetime.now()

            position = Position(message.next_offset, now)

            consumer.stage_positions({message.partition: position})

            assert consumer.commit_positions() == {Partition(topic, 0): position}

            assert len(commit_log_producer.messages) == 1
            commit_message = commit_log_producer.messages[0]
            assert commit_message.topic() == "commit-log"

            assert commit_codec.decode(
                KafkaPayload(
                    commit_message.key(),
                    commit_message.value(),
                    commit_message.headers(),
                )
            ) == Commit("test", Partition(topic, 0), message.next_offset, now)
class TestStrictConsumer:
    broker_config = get_default_kafka_configuration(bootstrap_servers=["somewhere"])

    def __consumer(self, on_message) -> StrictConsumer:
        return StrictConsumer(
            topic="my_topic",
            group_id="something",
            broker_config=self.broker_config,
            initial_auto_offset_reset="earliest",
            partition_assignment_timeout=1,
            on_partitions_assigned=None,
            on_partitions_revoked=None,
            on_message=on_message,
        )

    @patch("snuba.consumers.strict_consumer.StrictConsumer._create_consumer")
    def test_empty_topic(self, create_consumer) -> None:
        kafka_consumer = FakeConfluentKafkaConsumer()
        kafka_consumer.items = [
            build_confluent_kafka_message(0, 0, None, True),
        ]
        create_consumer.return_value = kafka_consumer

        on_message = MagicMock()
        consumer = self.__consumer(on_message)

        consumer.run()
        on_message.assert_not_called()

    @patch("snuba.consumers.strict_consumer.StrictConsumer._create_consumer")
    def test_failure(self, create_consumer) -> None:
        kafka_consumer = FakeConfluentKafkaConsumer()
        create_consumer.return_value = kafka_consumer

        on_message = MagicMock()
        consumer = self.__consumer(on_message)

        with pytest.raises(NoPartitionAssigned):
            consumer.run()

        on_message.assert_not_called()

    @patch("snuba.consumers.strict_consumer.StrictConsumer._create_consumer")
    def test_one_message(self, create_consumer) -> None:
        kafka_consumer = FakeConfluentKafkaConsumer()
        create_consumer.return_value = kafka_consumer

        msg = build_confluent_kafka_message(0, 0, b"ABCABC", False)
        kafka_consumer.items = [
            msg,
            build_confluent_kafka_message(0, 0, None, True),
        ]

        on_message = MagicMock()
        on_message.return_value = CommitDecision.DO_NOT_COMMIT
        consumer = self.__consumer(on_message)

        consumer.run()
        on_message.assert_called_once_with(msg)
        assert kafka_consumer.commit_calls == 0

    @patch("snuba.consumers.strict_consumer.StrictConsumer._create_consumer")
    def test_commits(self, create_consumer) -> None:
        kafka_consumer = FakeConfluentKafkaConsumer()
        create_consumer.return_value = kafka_consumer

        error = MagicMock()
        error.code.return_value = KafkaError._PARTITION_EOF
        kafka_consumer.items = [
            build_confluent_kafka_message(0, 0, b"ABCABC", False),
            build_confluent_kafka_message(1, 0, b"ABCABC", False),
            build_confluent_kafka_message(2, 0, b"ABCABC", False),
            build_confluent_kafka_message(0, 0, None, True),
        ]

        on_message = MagicMock()
        on_message.return_value = CommitDecision.COMMIT_PREV
        consumer = self.__consumer(on_message)

        consumer.run()
        on_message.assert_called()
        assert kafka_consumer.commit_calls == 2
def test_scheduler_consumer() -> None:
    settings.TOPIC_PARTITION_COUNTS = {"events": 2}
    importlib.reload(scheduler_consumer)

    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.COMMIT_LOG])

    metrics_backend = TestingMetricsBackend()
    entity_name = "events"
    entity = get_entity(EntityKey(entity_name))
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    commit_log_topic = Topic("snuba-commit-log")

    mock_scheduler_producer = mock.Mock()

    from snuba.redis import redis_client
    from snuba.subscriptions.data import PartitionId, SubscriptionData
    from snuba.subscriptions.entity_subscription import EventsSubscription
    from snuba.subscriptions.store import RedisSubscriptionDataStore

    entity_key = EntityKey(entity_name)
    partition_index = 0

    store = RedisSubscriptionDataStore(
        redis_client, entity_key, PartitionId(partition_index)
    )
    store.create(
        uuid.uuid4(),
        SubscriptionData(
            project_id=1,
            time_window_sec=60,
            resolution_sec=60,
            query="MATCH events SELECT count()",
            entity_subscription=EventsSubscription(data_dict={}),
        ),
    )

    builder = scheduler_consumer.SchedulerBuilder(
        entity_name,
        str(uuid.uuid1().hex),
        "events",
        mock_scheduler_producer,
        "latest",
        False,
        60 * 5,
        None,
        None,
        metrics_backend,
    )
    scheduler = builder.build_consumer()
    time.sleep(2)
    scheduler._run_once()
    scheduler._run_once()
    scheduler._run_once()

    epoch = datetime(1970, 1, 1)

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
        )
    )

    for (partition, offset, orig_message_ts) in [
        (0, 0, epoch),
        (1, 0, epoch + timedelta(minutes=1)),
        (0, 1, epoch + timedelta(minutes=2)),
        (1, 1, epoch + timedelta(minutes=3)),
    ]:
        fut = producer.produce(
            commit_log_topic,
            payload=commit_codec.encode(
                Commit(
                    "events",
                    Partition(commit_log_topic, partition),
                    offset,
                    orig_message_ts,
                )
            ),
        )
        fut.result()

    producer.close()

    for _ in range(5):
        scheduler._run_once()

    scheduler._shutdown()

    assert mock_scheduler_producer.produce.call_count == 2

    settings.TOPIC_PARTITION_COUNTS = {}
def __init__(
    self,
    storage_key: StorageKey,
    raw_topic: Optional[str],
    replacements_topic: Optional[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    bootstrap_servers: Sequence[str],
    group_id: str,
    commit_log_topic: Optional[str],
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    metrics: MetricsBackend,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    commit_retry_policy: Optional[RetryPolicy] = None,
    profile_path: Optional[str] = None,
) -> None:
    self.storage = get_writable_storage(storage_key)
    self.bootstrap_servers = bootstrap_servers
    topic = (
        self.storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    self.broker_config = get_default_kafka_configuration(
        topic, bootstrap_servers=bootstrap_servers
    )

    self.producer_broker_config = build_kafka_producer_configuration(
        topic,
        bootstrap_servers=bootstrap_servers,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )

    stream_loader = self.storage.get_table_writer().get_stream_loader()

    self.raw_topic: Topic
    if raw_topic is not None:
        self.raw_topic = Topic(raw_topic)
    else:
        self.raw_topic = Topic(stream_loader.get_default_topic_spec().topic_name)

    self.replacements_topic: Optional[Topic]
    if replacements_topic is not None:
        self.replacements_topic = Topic(replacements_topic)
    else:
        replacement_topic_spec = stream_loader.get_replacement_topic_spec()
        if replacement_topic_spec is not None:
            self.replacements_topic = Topic(replacement_topic_spec.topic_name)
        else:
            self.replacements_topic = None

    self.commit_log_topic: Optional[Topic]
    if commit_log_topic is not None:
        self.commit_log_topic = Topic(commit_log_topic)
    else:
        commit_log_topic_spec = stream_loader.get_commit_log_topic_spec()
        if commit_log_topic_spec is not None:
            self.commit_log_topic = Topic(commit_log_topic_spec.topic_name)
        else:
            self.commit_log_topic = None

    # XXX: This can result in a producer being built in cases where it's
    # not actually required.
    self.producer = Producer(self.producer_broker_config)

    self.metrics = metrics
    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms
    self.group_id = group_id
    self.auto_offset_reset = auto_offset_reset
    self.queued_max_messages_kbytes = queued_max_messages_kbytes
    self.queued_min_messages = queued_min_messages
    self.processes = processes
    self.input_block_size = input_block_size
    self.output_block_size = output_block_size
    self.__profile_path = profile_path

    if commit_retry_policy is None:
        commit_retry_policy = BasicRetryPolicy(
            3,
            1,
            lambda e: isinstance(e, KafkaException)
            and e.args[0].code()
            in (
                KafkaError.REQUEST_TIMED_OUT,
                KafkaError.NOT_COORDINATOR,
                KafkaError._WAIT_COORD,
            ),
        )

    self.__commit_retry_policy = commit_retry_policy
def test_default_config() -> None:
    broker_config = get_default_kafka_configuration()
    assert (
        broker_config["bootstrap.servers"]
        == settings.BROKER_CONFIG["bootstrap.servers"]
    )
def bootstrap(
    *,
    bootstrap_server: Sequence[str],
    kafka: bool,
    migrate: bool,
    force: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Warning: Not intended to be used in production yet.
    """
    if not force:
        raise click.ClickException("Must use --force to run")

    setup_logging(log_level)

    logger = logging.getLogger("snuba.bootstrap")

    import time

    if kafka:
        logger.debug("Using Kafka with %r", bootstrap_server)
        from confluent_kafka.admin import AdminClient

        override_params = {
            # Same as above: override the socket timeout as we expect Kafka
            # not to be ready for a while
            "socket.timeout.ms": 1000,
        }
        if logger.getEffectiveLevel() != logging.DEBUG:
            # Override rdkafka loglevel to be critical unless we are
            # debugging, as we expect failures when trying to connect
            # (Kafka may not be up yet)
            override_params["log_level"] = LOG_CRIT

        attempts = 0
        while True:
            try:
                logger.info("Attempting to connect to Kafka (attempt %d)...", attempts)
                client = AdminClient(
                    get_default_kafka_configuration(
                        bootstrap_servers=bootstrap_server,
                        override_params=override_params,
                    )
                )
                client.list_topics(timeout=1)
                break
            except KafkaException as err:
                logger.debug(
                    "Connection to Kafka failed (attempt %d)", attempts, exc_info=err
                )
                attempts += 1
                if attempts == 60:
                    raise
                time.sleep(1)

        logger.info("Connected to Kafka on attempt %d", attempts)

        create_topics(client, [t for t in Topic])

    if migrate:
        check_clickhouse_connections()
        Runner().run_all(force=True)
class TestBootstrapState:
    broker_config = get_default_kafka_configuration(bootstrap_servers=["somewhere"])

    @patch("snuba.consumers.strict_consumer.StrictConsumer._create_consumer")
    def test_empty_topic(self, create_consumer: Mock) -> None:
        kafka_consumer = FakeConfluentKafkaConsumer()
        kafka_consumer.items = [
            build_confluent_kafka_message(0, 0, None, True),
        ]
        create_consumer.return_value = kafka_consumer

        bootstrap = BootstrapState(
            "cdc_control",
            self.broker_config,
            "something",
            get_cdc_storage(StorageKey.GROUPEDMESSAGES),
        )

        ret = bootstrap.handle(None)
        assert ret[0] == ConsumerStateCompletionEvent.NO_SNAPSHOT
        assert kafka_consumer.commit_calls == 0

    @patch("snuba.consumers.strict_consumer.StrictConsumer._create_consumer")
    def test_snapshot_for_other_table(self, create_consumer: Mock) -> None:
        kafka_consumer = FakeConfluentKafkaConsumer()
        kafka_consumer.items = [
            build_confluent_kafka_message(
                0,
                0,
                b'{"snapshot-id":"abc123", "tables": ["someone_else"], "product":"snuba", "event":"snapshot-init"}',
                False,
            ),
            build_confluent_kafka_message(0, 0, None, True),
        ]
        create_consumer.return_value = kafka_consumer

        bootstrap = BootstrapState(
            "cdc_control",
            self.broker_config,
            "something",
            get_cdc_storage(StorageKey.GROUPEDMESSAGES),
        )

        ret = bootstrap.handle(None)
        assert ret[0] == ConsumerStateCompletionEvent.NO_SNAPSHOT
        assert kafka_consumer.commit_calls == 1

    @patch("snuba.consumers.strict_consumer.StrictConsumer._create_consumer")
    def test_init_snapshot(self, create_consumer: Mock) -> None:
        kafka_consumer = FakeConfluentKafkaConsumer()
        kafka_consumer.items = [
            build_confluent_kafka_message(
                0,
                0,
                b'{"snapshot-id":"abc123", "tables": ["sentry_groupedmessage"], "product":"snuba", "event":"snapshot-init"}',
                False,
            ),
            build_confluent_kafka_message(0, 0, None, True),
        ]
        create_consumer.return_value = kafka_consumer

        bootstrap = BootstrapState(
            "cdc_control",
            self.broker_config,
            "something",
            get_cdc_storage(StorageKey.GROUPEDMESSAGES),
        )

        ret = bootstrap.handle(None)
        assert ret[0] == ConsumerStateCompletionEvent.SNAPSHOT_INIT_RECEIVED
        assert kafka_consumer.commit_calls == 0

    @patch("snuba.consumers.strict_consumer.StrictConsumer._create_consumer")
    def test_snapshot_loaded(self, create_consumer: Mock) -> None:
        kafka_consumer = FakeConfluentKafkaConsumer()
        kafka_consumer.items = [
            build_confluent_kafka_message(
                0,
                0,
                b'{"snapshot-id":"abc123", "product":"somewhere-else", "tables": [], "event":"snapshot-init"}',
                False,
            ),
            build_confluent_kafka_message(
                1,
                0,
                b'{"snapshot-id":"abc123", "product":"snuba", "tables": ["sentry_groupedmessage"], "event":"snapshot-init"}',
                False,
            ),
            build_confluent_kafka_message(
                2,
                0,
                (
                    b'{"snapshot-id":"abc123", "event":"snapshot-loaded",'
                    b'"transaction-info": {"xmin":123, "xmax":124, "xip-list": []}'
                    b"}"
                ),
                False,
            ),
            build_confluent_kafka_message(0, 0, None, True),
        ]
        create_consumer.return_value = kafka_consumer

        bootstrap = BootstrapState(
            "cdc_control",
            self.broker_config,
            "something",
            get_cdc_storage(StorageKey.GROUPEDMESSAGES),
        )

        ret = bootstrap.handle(None)
        assert ret[0] == ConsumerStateCompletionEvent.SNAPSHOT_READY_RECEIVED
        assert kafka_consumer.commit_calls == 2
def __init__(
    self,
    storage_key: StorageKey,
    kafka_params: KafkaParameters,
    processing_params: ProcessingParameters,
    max_batch_size: int,
    max_batch_time_ms: int,
    metrics: MetricsBackend,
    parallel_collect: bool,
    stats_callback: Optional[Callable[[str], None]] = None,
    commit_retry_policy: Optional[RetryPolicy] = None,
    profile_path: Optional[str] = None,
    mock_parameters: Optional[MockParameters] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    self.storage = get_writable_storage(storage_key)
    self.bootstrap_servers = kafka_params.bootstrap_servers
    self.consumer_group = kafka_params.group_id
    topic = (
        self.storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    self.broker_config = get_default_kafka_configuration(
        topic, bootstrap_servers=kafka_params.bootstrap_servers
    )

    logger.info(f"librdkafka log level: {self.broker_config.get('log_level', 6)}")

    self.producer_broker_config = build_kafka_producer_configuration(
        topic,
        bootstrap_servers=kafka_params.bootstrap_servers,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )

    stream_loader = self.storage.get_table_writer().get_stream_loader()

    self.raw_topic: Topic
    if kafka_params.raw_topic is not None:
        self.raw_topic = Topic(kafka_params.raw_topic)
    else:
        self.raw_topic = Topic(stream_loader.get_default_topic_spec().topic_name)

    self.replacements_topic: Optional[Topic]
    if kafka_params.replacements_topic is not None:
        self.replacements_topic = Topic(kafka_params.replacements_topic)
    else:
        replacement_topic_spec = stream_loader.get_replacement_topic_spec()
        if replacement_topic_spec is not None:
            self.replacements_topic = Topic(replacement_topic_spec.topic_name)
        else:
            self.replacements_topic = None

    self.commit_log_topic: Optional[Topic]
    if kafka_params.commit_log_topic is not None:
        self.commit_log_topic = Topic(kafka_params.commit_log_topic)
    else:
        commit_log_topic_spec = stream_loader.get_commit_log_topic_spec()
        if commit_log_topic_spec is not None:
            self.commit_log_topic = Topic(commit_log_topic_spec.topic_name)
        else:
            self.commit_log_topic = None

    self.stats_callback = stats_callback

    # XXX: This can result in a producer being built in cases where it's
    # not actually required.
    self.producer = Producer(self.producer_broker_config)

    self.metrics = metrics
    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms
    self.group_id = kafka_params.group_id
    self.auto_offset_reset = kafka_params.auto_offset_reset
    self.strict_offset_reset = kafka_params.strict_offset_reset
    self.queued_max_messages_kbytes = kafka_params.queued_max_messages_kbytes
    self.queued_min_messages = kafka_params.queued_min_messages
    self.processes = processing_params.processes
    self.input_block_size = processing_params.input_block_size
    self.output_block_size = processing_params.output_block_size
    self.__profile_path = profile_path
    self.__mock_parameters = mock_parameters
    self.__parallel_collect = parallel_collect
    self.__cooperative_rebalancing = cooperative_rebalancing

    if commit_retry_policy is None:
        commit_retry_policy = BasicRetryPolicy(
            3,
            1,
            lambda e: isinstance(e, KafkaException)
            and e.args[0].code()
            in (
                KafkaError.REQUEST_TIMED_OUT,
                KafkaError.NOT_COORDINATOR,
                KafkaError._WAIT_COORD,
            ),
        )

    self.__commit_retry_policy = commit_retry_policy
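# Hypothetical usage sketch of the constructor above, assuming the enclosing
# class is the consumer builder (referred to here as ConsumerBuilder) and that
# KafkaParameters / ProcessingParameters accept keyword arguments matching the
# attributes read in __init__. StorageKey.ERRORS and the numeric values are
# illustrative assumptions, not values taken from the source.
example_builder = ConsumerBuilder(
    storage_key=StorageKey.ERRORS,
    kafka_params=KafkaParameters(
        raw_topic=None,
        replacements_topic=None,
        bootstrap_servers=["localhost:9092"],
        group_id="snuba-consumers",
        commit_log_topic=None,
        auto_offset_reset="earliest",
        strict_offset_reset=False,
        queued_max_messages_kbytes=10000,
        queued_min_messages=1000,
    ),
    processing_params=ProcessingParameters(
        processes=None,
        input_block_size=None,
        output_block_size=None,
    ),
    max_batch_size=500,
    max_batch_time_ms=1000,
    metrics=TestingMetricsBackend(),
    parallel_collect=False,
)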
def test_executor_consumer() -> None:
    """
    End to end integration test
    """
    state.set_config("subscription_mode_events", "new")
    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_SCHEDULED_EVENTS])
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_RESULTS_EVENTS])

    dataset_name = "events"
    entity_name = "events"
    entity_key = EntityKey(entity_name)
    entity = get_entity(entity_key)
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    scheduled_result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert scheduled_result_topic_spec is not None
    result_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_result_topic_spec.topic)
    )

    result_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            scheduled_result_topic_spec.topic,
            str(uuid.uuid1().hex),
            auto_offset_reset="latest",
            strict_offset_reset=False,
        )
    )
    assigned = False

    def on_partitions_assigned(partitions: Mapping[Partition, int]) -> None:
        nonlocal assigned
        assigned = True

    result_consumer.subscribe(
        [Topic(scheduled_result_topic_spec.topic_name)],
        on_assign=on_partitions_assigned,
    )

    attempts = 10
    while attempts > 0 and not assigned:
        result_consumer.poll(1.0)
        attempts -= 1

    # We need to wait for the consumer to receive partitions; otherwise, when we
    # try to consume messages, we will not find anything. Subscription is an
    # async process.
    assert assigned, "Did not receive assignment within 10 attempts"

    consumer_group = str(uuid.uuid1().hex)
    auto_offset_reset = "latest"
    strict_offset_reset = False
    executor = build_executor_consumer(
        dataset_name,
        [entity_name],
        consumer_group,
        result_producer,
        2,
        2,
        auto_offset_reset,
        strict_offset_reset,
        TestingMetricsBackend(),
        None,
    )
    for i in range(1, 5):
        # Give the executor time to subscribe
        time.sleep(1)
        executor._run_once()

    # Produce a scheduled task to the scheduled subscriptions topic
    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count()",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    task = ScheduledSubscriptionTask(
        timestamp=datetime(1970, 1, 1),
        task=SubscriptionWithMetadata(
            entity_key,
            Subscription(
                SubscriptionIdentifier(
                    PartitionId(1), uuid.UUID("91b46cb6224f11ecb2ddacde48001122")
                ),
                subscription_data,
            ),
            1,
        ),
    )

    encoder = SubscriptionScheduledTaskEncoder()
    encoded_task = encoder.encode(task)

    scheduled_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert scheduled_topic_spec is not None
    tasks_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_topic_spec.topic)
    )

    scheduled_topic = Topic(scheduled_topic_spec.topic_name)
    tasks_producer.produce(scheduled_topic, payload=encoded_task).result()
    tasks_producer.close()

    executor._run_once()
    executor.signal_shutdown()
    # Call run here so that the executor shuts itself down cleanly.
    executor.run()

    result = result_consumer.poll(5)
    assert result is not None, "Did not receive a result message"
    data = json.loads(result.payload.value)
    assert (
        data["payload"]["subscription_id"] == "1/91b46cb6224f11ecb2ddacde48001122"
    ), "Invalid subscription id"

    result_producer.close()
def bootstrap(
    *,
    bootstrap_server: Sequence[str],
    kafka: bool,
    migrate: bool,
    force: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Warning: Not intended to be used in production yet.
    """
    if not force:
        raise click.ClickException("Must use --force to run")

    setup_logging(log_level)

    logger = logging.getLogger("snuba.bootstrap")

    import time

    if kafka:
        logger.debug("Using Kafka with %r", bootstrap_server)
        from confluent_kafka.admin import AdminClient, NewTopic

        override_params = {
            # Same as above: override the socket timeout as we expect Kafka
            # not to be ready for a while
            "socket.timeout.ms": 1000,
        }
        if logger.getEffectiveLevel() != logging.DEBUG:
            # Override rdkafka loglevel to be critical unless we are
            # debugging, as we expect failures when trying to connect
            # (Kafka may not be up yet)
            override_params["log_level"] = LOG_CRIT

        attempts = 0
        while True:
            try:
                logger.info("Attempting to connect to Kafka (attempt %d)...", attempts)
                client = AdminClient(
                    get_default_kafka_configuration(
                        bootstrap_servers=bootstrap_server,
                        override_params=override_params,
                    )
                )
                client.list_topics(timeout=1)
                break
            except KafkaException as err:
                logger.debug(
                    "Connection to Kafka failed (attempt %d)", attempts, exc_info=err
                )
                attempts += 1
                if attempts == 60:
                    raise
                time.sleep(1)

        logger.info("Connected to Kafka on attempt %d", attempts)

        topics = {}
        for topic in Topic:
            topic_spec = KafkaTopicSpec(topic)
            logger.debug("Adding topic %s to creation list", topic_spec.topic_name)
            topics[topic_spec.topic_name] = NewTopic(
                topic_spec.topic_name,
                num_partitions=topic_spec.partitions_number,
                replication_factor=topic_spec.replication_factor,
                config=topic_spec.topic_creation_config,
            )

        logger.info("Creating Kafka topics...")
        for topic, future in client.create_topics(
            list(topics.values()), operation_timeout=1
        ).items():
            try:
                future.result()
                logger.info("Topic %s created", topic)
            except KafkaException as err:
                if err.args[0].code() != KafkaError.TOPIC_ALREADY_EXISTS:
                    logger.error("Failed to create topic %s", topic, exc_info=err)

    if migrate:
        check_clickhouse_connections()
        Runner().run_all(force=True)