def __build_batch_writer(
    self, storage: WritableTableStorage
) -> ProcessedMessageBatchWriter:
    replacement_batch_writer: Optional[ReplacementBatchWriter]
    replacement_topic_spec = (
        storage.get_table_writer().get_stream_loader().get_replacement_topic_spec()
    )
    if replacement_topic_spec is not None:
        # XXX: The producer is flushed when closed on strategy teardown
        # after an assignment is revoked, but never explicitly closed.
        # XXX: This assumes that the Kafka cluster used for the input topic
        # to the storage is the same as the replacement topic.
        replacement_batch_writer = ReplacementBatchWriter(
            ConfluentKafkaProducer(
                build_kafka_producer_configuration(
                    storage.get_storage_key(),
                    override_params={
                        "partitioner": "consistent",
                        "message.max.bytes": 50000000,  # 50MB, default is 1MB
                    },
                )
            ),
            Topic(replacement_topic_spec.topic_name),
        )
    else:
        replacement_batch_writer = None

    return ProcessedMessageBatchWriter(
        InsertBatchWriter(
            storage.get_table_writer().get_batch_writer(
                self.__metrics,
                {"load_balancing": "in_order", "insert_distributed_sync": 1},
            ),
            MetricsWrapper(
                self.__metrics,
                "insertions",
                {"storage": storage.get_storage_key().value},
            ),
        ),
        replacement_batch_writer,
    )
def __init__(
    self,
    storage_key: StorageKey,
    raw_topic: Optional[str],
    replacements_topic: Optional[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    bootstrap_servers: Sequence[str],
    group_id: str,
    commit_log_topic: Optional[str],
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    commit_retry_policy: Optional[RetryPolicy] = None,
    profile_path: Optional[str] = None,
) -> None:
    self.storage = get_writable_storage(storage_key)
    self.bootstrap_servers = bootstrap_servers
    self.broker_config = get_default_kafka_configuration(
        storage_key, bootstrap_servers=bootstrap_servers
    )
    self.producer_broker_config = build_kafka_producer_configuration(
        storage_key,
        bootstrap_servers=bootstrap_servers,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )

    stream_loader = self.storage.get_table_writer().get_stream_loader()

    self.raw_topic: Topic
    if raw_topic is not None:
        self.raw_topic = Topic(raw_topic)
    else:
        self.raw_topic = Topic(stream_loader.get_default_topic_spec().topic_name)

    self.replacements_topic: Optional[Topic]
    if replacements_topic is not None:
        self.replacements_topic = Topic(replacements_topic)
    else:
        replacement_topic_spec = stream_loader.get_replacement_topic_spec()
        if replacement_topic_spec is not None:
            self.replacements_topic = Topic(replacement_topic_spec.topic_name)
        else:
            self.replacements_topic = None

    self.commit_log_topic: Optional[Topic]
    if commit_log_topic is not None:
        self.commit_log_topic = Topic(commit_log_topic)
    else:
        commit_log_topic_spec = stream_loader.get_commit_log_topic_spec()
        if commit_log_topic_spec is not None:
            self.commit_log_topic = Topic(commit_log_topic_spec.topic_name)
        else:
            self.commit_log_topic = None

    # XXX: This can result in a producer being built in cases where it's
    # not actually required.
    self.producer = Producer(self.producer_broker_config)

    self.metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={"group": group_id, "storage": storage_key.value},
    )

    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms
    self.group_id = group_id
    self.auto_offset_reset = auto_offset_reset
    self.queued_max_messages_kbytes = queued_max_messages_kbytes
    self.queued_min_messages = queued_min_messages
    self.processes = processes
    self.input_block_size = input_block_size
    self.output_block_size = output_block_size
    self.__profile_path = profile_path

    if commit_retry_policy is None:
        commit_retry_policy = BasicRetryPolicy(
            3,
            constant_delay(1),
            lambda e: isinstance(e, KafkaException)
            and e.args[0].code()
            in (
                KafkaError.REQUEST_TIMED_OUT,
                KafkaError.NOT_COORDINATOR,
                KafkaError._WAIT_COORD,
            ),
        )

    self.__commit_retry_policy = commit_retry_policy
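# Illustrative sketch only (not part of the original source): one plausible way
# this initializer could be invoked, assuming it belongs to a consumer-builder
# class exported as ``ConsumerBuilder``. The import paths, the class name, the
# ``StorageKey.ERRORS`` member, and every literal value below are assumptions
# made for the example, not facts taken from this excerpt.
#
# from snuba.consumers.consumer_builder import ConsumerBuilder  # assumed path
# from snuba.datasets.storages import StorageKey  # assumed path
#
# builder = ConsumerBuilder(
#     storage_key=StorageKey.ERRORS,
#     raw_topic=None,  # fall back to the stream loader's default topic spec
#     replacements_topic=None,
#     max_batch_size=500,
#     max_batch_time_ms=1000,
#     bootstrap_servers=["127.0.0.1:9092"],
#     group_id="snuba-consumers",
#     commit_log_topic=None,
#     auto_offset_reset="earliest",
#     queued_max_messages_kbytes=10000,
#     queued_min_messages=10000,
#     processes=None,
#     input_block_size=None,
#     output_block_size=None,
# )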
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    storage_key = storage.get_storage_key()

    loader = enforce_table_writer(dataset).get_stream_loader()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={"group": consumer_group, "dataset": dataset_name},
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    storage_key,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    storage_key,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                storage_key,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor, executor._max_workers)
    metrics.gauge("executor.workers", executor._max_workers)

    with closing(consumer), executor, closing(producer):
        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
                metrics,
            ),
            metrics=metrics,
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: int,
    input_block_size: int,
    output_block_size: int,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper()) for name in storage_names)
    }

    topics = {
        storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is
    # a bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (
            storage.get_table_writer().get_stream_loader().get_commit_log_topic_spec()
            for storage in storages.values()
        )
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    consumer_configuration = build_kafka_consumer_configuration(
        storage_keys[0],
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for storage_key in storage_keys[1:]:
        if (
            build_kafka_consumer_configuration(storage_key, consumer_group)[
                "bootstrap.servers"
            ]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumptions that a.) the Kafka cluster where
        # the commit log topic is located is the same as the input topic
        # (there is no way to specify otherwise, at writing) and b.) all
        # storages are located on the same Kafka cluster (validated above.)
        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(storage_keys[0])
        )
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
        metrics=metrics,
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()
def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    storage_name: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the snapshot-loaded
    message on the control topic.
    """
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for storage %s, from source %s",
        storage_name,
        source,
    )

    storage_key = StorageKey(storage_name)
    storage = get_cdc_storage(storage_key)

    control_topic = control_topic or storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    descriptor = snapshot_source.get_descriptor()

    producer = Producer(
        build_kafka_producer_configuration(
            storage_key,
            bootstrap_servers=bootstrap_server,
            override_params={
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            },
        )
    )

    msg = SnapshotLoaded(
        id=descriptor.id,
        transaction_info=TransactionData(
            xmin=descriptor.xmin,
            xmax=descriptor.xmax,
            xip_list=descriptor.xip_list,
        ),
    )
    json_string = json.dumps(msg.to_dict())

    def delivery_callback(error: KafkaError, message: Message) -> None:
        if error is not None:
            raise error
        else:
            logger.info("Message sent %r", message.value())

    producer.produce(
        control_topic,
        value=json_string,
        on_delivery=delivery_callback,
    )

    producer.flush()
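# Illustrative sketch only (not part of the original source): confirm_load
# relies on the standard confluent-kafka pattern of an asynchronous produce()
# with a delivery callback, followed by flush() to block until all delivery
# reports have been handled. This minimal, self-contained version assumes the
# ``Producer`` used above is confluent_kafka.Producer; the broker address and
# topic name below are placeholders.
from typing import Optional

from confluent_kafka import KafkaError, Message, Producer


def _on_delivery(error: Optional[KafkaError], message: Message) -> None:
    # Invoked from flush()/poll() once the broker acknowledges (or rejects)
    # the message.
    if error is not None:
        raise RuntimeError(f"delivery failed: {error}")
    print("delivered to", message.topic(), message.partition())


producer = Producer({"bootstrap.servers": "127.0.0.1:9092"})
producer.produce("example-control-topic", value=b"{}", on_delivery=_on_delivery)
producer.flush()  # blocks until outstanding delivery callbacks have run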