def api(*, debug: bool, log_level: Optional[str], processes: int, threads: int) -> None:
    from snuba import settings

    if debug:
        if processes > 1 or threads > 1:
            raise click.ClickException("processes/threads can only be 1 in debug")

        from snuba.web.views import application
        from werkzeug.serving import WSGIRequestHandler

        setup_logging(log_level)
        WSGIRequestHandler.protocol_version = "HTTP/1.1"
        application.run(port=settings.PORT, threaded=True, debug=debug)
    else:
        import mywsgi

        if log_level:
            os.environ["LOG_LEVEL"] = log_level

        mywsgi.run(
            "snuba.web.wsgi:application",
            f"0.0.0.0:{settings.PORT}",
            processes=processes,
            threads=threads,
        )

def perf(
    *,
    events_file: Optional[str],
    repeat: int,
    profile_process: bool,
    profile_write: bool,
    dataset_name: str,
    log_level: Optional[str] = None,
) -> None:
    from snuba.perf import run, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    if not local_dataset_mode():
        logger.error("The perf tool is only intended for the local dataset environment.")
        sys.exit(1)

    run(
        events_file,
        dataset,
        repeat=repeat,
        profile_process=profile_process,
        profile_write=profile_write,
    )

def optimize(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    database: str,
    dataset_name: str,
    timeout: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import run_optimize, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    clickhouse = ClickhousePool(
        clickhouse_host, clickhouse_port, send_receive_timeout=timeout
    )
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    logger.info("Optimized %s partitions on %s", num_dropped, clickhouse_host)

def bulk_load(
    *,
    dataset_name: Optional[str],
    dest_table: Optional[str],
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Starting bulk load process for dataset %s, from source %s",
        dataset_name,
        source,
    )
    dataset = get_dataset(dataset_name)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(snapshot_source, dest_table)

    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)

def perf(
    *,
    events_file: Optional[str],
    repeat: int,
    profile_process: bool,
    profile_write: bool,
    dataset_name: str,
    log_level: Optional[str] = None,
) -> None:
    from snuba.perf import run, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    if not all(
        storage.get_cluster().is_single_node()
        for storage in dataset.get_all_storages()
    ):
        logger.error("The perf tool is only intended for a single-node environment.")
        sys.exit(1)

    run(
        events_file,
        dataset,
        repeat=repeat,
        profile_process=profile_process,
        profile_write=profile_write,
    )

def migrate(*, log_level: Optional[str] = None) -> None:
    click.echo("Warning: The migrate command is deprecated and will be removed soon\n")
    setup_logging(log_level)
    check_clickhouse_connections()
    run()

def optimize(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    storage_name: str,
    parallel: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import logger, run_optimize

    setup_logging(log_level)
    setup_sentry()

    storage: ReadableTableStorage

    storage_key = StorageKey(storage_name)
    storage = get_storage(storage_key)

    (clickhouse_user, clickhouse_password) = storage.get_cluster().get_credentials()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    database = storage.get_cluster().get_database()

    # TODO: In distributed mode, optimize currently must be run once for each node
    # with the host and port of that node provided via the CLI. In the future,
    # passing this information won't be necessary, and running this command once
    # will ensure that optimize is performed on all of the individual nodes for
    # that cluster.
    if clickhouse_host and clickhouse_port:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
            send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout,
        )
    elif not storage.get_cluster().is_single_node():
        raise click.ClickException("Provide ClickHouse host and port for optimize")
    else:
        connection = storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.OPTIMIZE
        )

    num_dropped = run_optimize(
        connection,
        storage,
        database,
        before=today,
        parallel=parallel,
        clickhouse_host=clickhouse_host,
    )
    logger.info("Optimized %s partitions on %s", num_dropped, clickhouse_host)

def api(*, debug: bool, log_level: Optional[str] = None) -> None:
    from snuba import settings
    from snuba.web.views import application
    from werkzeug.serving import WSGIRequestHandler

    setup_logging(log_level)

    WSGIRequestHandler.protocol_version = "HTTP/1.1"
    host = "0.0.0.0"
    application.run(host=host, port=settings.PORT, threaded=True, debug=debug)

def migrate(force: bool, log_level: Optional[str] = None) -> None:
    """
    Runs all migrations. Blocking migrations will not be run unless --force is passed.
    """
    setup_logging(log_level)
    check_clickhouse_connections()
    runner = Runner()

    try:
        runner.run_all(force=force)
    except MigrationError as e:
        raise click.ClickException(str(e))

    click.echo("Finished running migrations")

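# Illustrative usage sketch for the command above. It assumes `migrate` is
# registered as a click command elsewhere (the decorators are not shown in
# this excerpt), so treat it as a hedged example rather than the real CLI
# wiring.
def _example_invoke_migrate() -> None:
    from click.testing import CliRunner

    runner = CliRunner()
    # Without --force, blocking migrations are skipped; --force runs them too.
    result = runner.invoke(migrate, ["--force"])
    print(result.exit_code, result.output)
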
def cleanup(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    dry_run: bool,
    database: str,
    storage_name: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Deletes stale partitions for ClickHouse tables
    """
    setup_logging(log_level)

    from snuba.cleanup import run_cleanup, logger
    from snuba.clickhouse.native import ClickhousePool

    writable_storage = get_writable_storage(StorageKey(storage_name))

    (
        clickhouse_user,
        clickhouse_password,
    ) = writable_storage.get_cluster().get_credentials()

    table = writable_storage.get_table_writer().get_schema().get_local_table_name()

    if clickhouse_host and clickhouse_port and database:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
        )
    elif not writable_storage.get_cluster().is_single_node():
        raise click.ClickException("Provide ClickHouse host and port for cleanup")
    else:
        connection = writable_storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.CLEANUP
        )

    num_dropped = run_cleanup(connection, database, table, dry_run=dry_run)
    logger.info("Dropped %s partitions on %s", num_dropped, clickhouse_host)

def api(
    *,
    bind: Optional[str],
    debug: bool,
    log_level: Optional[str],
    processes: int,
    threads: int,
) -> None:
    from snuba import settings

    port: Union[int, str]
    if bind:
        if ":" in bind:
            host, port = bind.split(":", 1)
            port = int(port)
        else:
            raise click.ClickException("bind can only be in the format <host>:<port>")
    else:
        host, port = settings.HOST, settings.PORT

    if debug:
        if processes > 1 or threads > 1:
            raise click.ClickException("processes/threads can only be 1 in debug")

        from werkzeug.serving import WSGIRequestHandler

        from snuba.web.views import application

        setup_logging(log_level)
        WSGIRequestHandler.protocol_version = "HTTP/1.1"
        application.run(host=host, port=port, threaded=True, debug=debug)
    else:
        import mywsgi

        if log_level:
            os.environ["LOG_LEVEL"] = log_level

        mywsgi.run(
            "snuba.web.wsgi:application",
            f"{host}:{port}",
            processes=processes,
            threads=threads,
        )

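# A hedged sketch of the --bind parsing rule used above, extracted into a
# standalone helper for clarity. `_parse_bind` is a hypothetical name, not
# part of the original module.
from typing import Tuple


def _parse_bind(bind: str) -> Tuple[str, int]:
    # "<host>:<port>" is split once from the left; the port must be numeric.
    if ":" not in bind:
        raise ValueError("bind can only be in the format <host>:<port>")
    host, port = bind.split(":", 1)
    return host, int(port)


# _parse_bind("0.0.0.0:1218") -> ("0.0.0.0", 1218)
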
def bulk_load(
    *,
    storage_name: str,
    dest_table: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Starting bulk load process for storage %s, from source %s",
        storage_name,
        source,
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        dest_table,
        storage.get_row_processor(),
    )

    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        table_writer.get_batch_writer(
            environment.metrics,
            table_name=dest_table,
            chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
        ),
        settings.BULK_CLICKHOUSE_BUFFER,
        JSONRowEncoder(),
    )

    loader.load(writer)

def migrate(
    *, log_level: Optional[str] = None, dataset_name: Optional[str] = None
) -> None:
    from snuba.migrate import logger, run

    setup_logging(log_level)
    if not local_dataset_mode():
        logger.error("The migration tool can only work in local dataset mode.")
        sys.exit(1)

    dataset_names = [dataset_name] if dataset_name else DATASET_NAMES
    for name in dataset_names:
        dataset = get_dataset(name)
        logger.info("Migrating dataset %s", name)
        clickhouse = Client(
            host=settings.CLICKHOUSE_HOST,
            port=settings.CLICKHOUSE_PORT,
        )
        run(clickhouse, dataset)

def run(
    group: str,
    migration_id: str,
    force: bool,
    fake: bool,
    dry_run: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Runs a single migration.
    --force must be passed in order to run blocking migrations.
    --fake marks a migration as completed without running anything.

    Migrations that are already in an in-progress or completed status will not be run.
    """
    setup_logging(log_level)
    if not dry_run:
        check_clickhouse_connections()
    runner = Runner()
    migration_group = MigrationGroup(group)
    migration_key = MigrationKey(migration_group, migration_id)

    if dry_run:
        runner.run_migration(migration_key, dry_run=True)
        return

    try:
        if fake:
            click.confirm(
                "This will mark the migration as completed without actually running it. "
                "Your database may be in an invalid state. Are you sure?",
                abort=True,
            )
        runner.run_migration(migration_key, force=force, fake=fake)
    except MigrationError as e:
        raise click.ClickException(str(e))

    click.echo(f"Finished running migration {migration_key}")

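# Illustrative usage sketch (assumptions: `run` is exposed as a click command
# with positional GROUP and MIGRATION_ID arguments; the group/ID values below
# are hypothetical). With --dry-run the migration is only printed, so no
# ClickHouse connection check is performed.
def _example_dry_run_migration() -> None:
    from click.testing import CliRunner

    runner = CliRunner()
    result = runner.invoke(run, ["system", "0001_migrations", "--dry-run"])
    print(result.exit_code, result.output)
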
def admin(
    *,
    debug: bool,
    log_level: Optional[str],
    processes: int,
    threads: int,
) -> None:
    from snuba import settings

    host, port = settings.ADMIN_HOST, settings.ADMIN_PORT

    if debug:
        if processes > 1 or threads > 1:
            raise click.ClickException("processes/threads can only be 1 in debug")

        from werkzeug.serving import WSGIRequestHandler

        from snuba.admin.views import application

        setup_logging(log_level)
        WSGIRequestHandler.protocol_version = "HTTP/1.1"
        application.run(host=host, port=port, threaded=True, debug=debug)
    else:
        import mywsgi

        if log_level:
            os.environ["LOG_LEVEL"] = log_level

        mywsgi.run(
            "snuba.admin.wsgi:application",
            f"{host}:{port}",
            processes=processes,
            threads=threads,
        )

def cleanup(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    dry_run: bool,
    database: str,
    dataset_name: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Deletes stale partitions for ClickHouse tables
    """
    setup_logging(log_level)

    from snuba.cleanup import run_cleanup, logger
    from snuba.clickhouse.native import ClickhousePool

    dataset = get_dataset(dataset_name)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    clickhouse = ClickhousePool(clickhouse_host, clickhouse_port)
    num_dropped = run_cleanup(clickhouse, database, table, dry_run=dry_run)
    logger.info("Dropped %s partitions on %s", num_dropped, clickhouse_host)

def reverse(
    group: str,
    migration_id: str,
    force: bool,
    fake: bool,
    dry_run: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Reverses a single migration.

    --force is required to reverse an already completed migration.
    --fake marks a migration as reversed without doing anything.
    """
    setup_logging(log_level)
    if not dry_run:
        check_clickhouse_connections()
    runner = Runner()
    migration_group = MigrationGroup(group)
    migration_key = MigrationKey(migration_group, migration_id)

    if dry_run:
        runner.reverse_migration(migration_key, dry_run=True)
        return

    try:
        if fake:
            click.confirm(
                "This will mark the migration as not started without actually reversing it. "
                "Your database may be in an invalid state. Are you sure?",
                abort=True,
            )
        runner.reverse_migration(migration_key, force=force, fake=fake)
    except MigrationError as e:
        raise click.ClickException(str(e))

    click.echo(f"Finished reversing migration {migration_key}")

def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    commit_log_topic: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    dead_letter_topic: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper()) for name in storage_names)
    }

    topics = {
        storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    commit_log: Optional[Topic]
    if commit_log_topic:
        commit_log = Topic(commit_log_topic)
    else:
        # XXX: The ``CommitLogConsumer`` also only supports a single topic at
        # this time. (It is less easily modified.) This also assumes the commit
        # log topic is on the same Kafka cluster as the input topic.
        commit_log_topics = {
            spec.topic_name
            for spec in (
                storage.get_table_writer()
                .get_stream_loader()
                .get_commit_log_topic_spec()
                for storage in storages.values()
            )
            if spec is not None
        }

        if commit_log_topics:
            commit_log = Topic(commit_log_topics.pop())
        else:
            commit_log = None

        if commit_log_topics:
            raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (
        storages[storage_keys[0]]
        .get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        strict_offset_reset=not no_strict_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    if cooperative_rebalancing is True:
        consumer_configuration["partition.assignment.strategy"] = "cooperative-sticky"

    for storage_key in storage_keys[1:]:
        if (
            build_kafka_consumer_configuration(
                storages[storage_key]
                .get_table_writer()
                .get_stream_loader()
                .get_default_topic_spec()
                .topic,
                consumer_group,
            )["bootstrap.servers"]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={
            "group": consumer_group,
            "storage": "_".join([storage_keys[0].value, "m"]),
        },
    )

    # Collect metrics from librdkafka if we have stats_collection_freq_ms set
    # for the consumer group, or use the default.
    stats_collection_frequency_ms = get_config(
        f"stats_collection_freq_ms_{consumer_group}",
        get_config("stats_collection_freq_ms", 0),
    )

    if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:

        def stats_callback(stats_json: str) -> None:
            stats = rapidjson.loads(stats_json)
            metrics.gauge("librdkafka.total_queue_size", stats.get("replyq", 0))

        consumer_configuration.update(
            {
                "statistics.interval.ms": stats_collection_frequency_ms,
                "stats_cb": stats_callback,
            }
        )

    if commit_log is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above.)
        commit_log_topic_spec = (
            storages[storage_keys[0]]
            .get_table_writer()
            .get_stream_loader()
            .get_commit_log_topic_spec()
        )
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic)
        )
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log,
        )

    dead_letter_producer: Optional[KafkaProducer] = None
    dead_letter_queue: Optional[Topic] = None
    if dead_letter_topic:
        dead_letter_queue = Topic(dead_letter_topic)

        dead_letter_producer = KafkaProducer(
            build_kafka_producer_configuration(StreamsTopic(dead_letter_topic))
        )

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            parallel_collect=parallel_collect,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
            producer=dead_letter_producer,
            topic=dead_letter_queue,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    if dead_letter_producer:
        with closing(dead_letter_producer):
            processor.run()
    else:
        processor.run()

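# A minimal, self-contained sketch of the librdkafka statistics hook wired up
# above, using the plain confluent_kafka API. "statistics.interval.ms" and
# "stats_cb" are standard confluent-kafka configuration keys; the broker
# address and group id are placeholders.
def _example_stats_consumer() -> Any:
    import rapidjson
    from confluent_kafka import Consumer as ConfluentConsumer

    def stats_cb(stats_json: str) -> None:
        stats = rapidjson.loads(stats_json)
        # "replyq" counts operations queued inside librdkafka, the same field
        # the consumer above reports as librdkafka.total_queue_size.
        print("librdkafka.total_queue_size", stats.get("replyq", 0))

    return ConfluentConsumer(
        {
            "bootstrap.servers": "127.0.0.1:9092",  # placeholder
            "group.id": "example-group",  # placeholder
            "statistics.interval.ms": 1000,
            "stats_cb": stats_cb,
        }
    )
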
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")

    storage_key = StorageKey(storage_name)

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={"group": consumer_group, "storage": storage_key.value},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    def stats_callback(stats_json: str) -> None:
        stats = rapidjson.loads(stats_json)
        metrics.gauge("librdkafka.total_queue_size", stats.get("replyq", 0))

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        kafka_params=KafkaParameters(
            raw_topic=raw_events_topic,
            replacements_topic=replacements_topic,
            bootstrap_servers=bootstrap_server,
            group_id=consumer_group,
            commit_log_topic=commit_log_topic,
            auto_offset_reset=auto_offset_reset,
            strict_offset_reset=not no_strict_offset_reset,
            queued_max_messages_kbytes=queued_max_messages_kbytes,
            queued_min_messages=queued_min_messages,
        ),
        processing_params=ProcessingParameters(
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
        ),
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        metrics=metrics,
        profile_path=profile_path,
        stats_callback=stats_callback,
        parallel_collect=parallel_collect,
        cooperative_rebalancing=cooperative_rebalancing,
    )

    consumer = consumer_builder.build_base_consumer()

    def handler(signum: int, frame: Any) -> None:
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    consumer.run()

def subscriptions_scheduler_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    followed_consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
    log_level: Optional[str],
    # TODO: Temporarily overrides the scheduling mode.
    # Required for single tenant since some partitions may be empty.
    # To be removed once transactions is no longer semantically partitioned.
    scheduling_mode: Optional[str],
) -> None:
    """
    Combined subscriptions scheduler and executor. Alternative to the separate
    scheduler and executor processes.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.scheduler_executor",
        tags={"dataset": dataset_name},
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    entity_key = EntityKey(entity_names[0])

    storage = get_entity(entity_key).get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()
    result_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert result_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            result_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        )
    )

    processor = build_scheduler_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        followed_consumer_group,
        producer,
        auto_offset_reset,
        not no_strict_offset_reset,
        schedule_ttl,
        delay_seconds,
        stale_threshold_seconds,
        max_concurrent_queries,
        total_concurrent_queries,
        metrics,
        SchedulingWatermarkMode(scheduling_mode)
        if scheduling_mode is not None
        else None,
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    with closing(producer), flush_querylog():
        processor.run()

def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""
    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={"group": consumer_group, "dataset": dataset_name},
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(commit_log_topic_spec.topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug(
        "Starting %r with %s workers...",
        executor,
        getattr(executor, "_max_workers", 0),
    )
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))

        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic)
                    if result_topic is not None
                    else Topic(result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()

def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()

def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    storage_name: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the
    snapshot-loaded message on the control topic.
    """
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for storage %s, from source %s",
        storage_name,
        source,
    )

    storage_key = StorageKey(storage_name)
    storage = get_cdc_storage(storage_key)
    stream_loader = storage.get_table_writer().get_stream_loader()

    control_topic = control_topic or storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    descriptor = snapshot_source.get_descriptor()

    producer = Producer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
            bootstrap_servers=bootstrap_server,
            override_params={
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            },
        )
    )

    msg = SnapshotLoaded(
        id=descriptor.id,
        transaction_info=TransactionData(
            xmin=descriptor.xmin,
            xmax=descriptor.xmax,
            xip_list=descriptor.xip_list,
        ),
    )
    json_string = json.dumps(msg.to_dict())

    def delivery_callback(error: KafkaError, message: Message) -> None:
        if error is not None:
            raise error
        else:
            logger.info("Message sent %r", message.value())

    producer.produce(
        control_topic,
        value=json_string,
        on_delivery=delivery_callback,
    )

    producer.flush()

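# A hedged, standalone sketch of the produce/flush + delivery-callback pattern
# used above, against the plain confluent_kafka API. The broker address and
# topic name are placeholders, not values from this module.
def _example_send_control_message(payload: str) -> None:
    from confluent_kafka import KafkaError, Message, Producer as RawProducer

    def on_delivery(error: Optional[KafkaError], message: Message) -> None:
        # Raising here surfaces delivery failures to the flush() caller below.
        if error is not None:
            raise error
        print("Message sent %r" % (message.value(),))

    producer = RawProducer({"bootstrap.servers": "127.0.0.1:9092"})  # placeholder
    producer.produce("example-control-topic", value=payload, on_delivery=on_delivery)
    # flush() blocks until every queued message is delivered or has failed.
    producer.flush()
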
def bulk_load(
    *,
    storage_name: str,
    dest_table: Optional[str],
    source: str,
    ignore_existing_data: bool,
    pre_processed: bool,
    show_progress: bool,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Starting bulk load process for storage %s, from source %s",
        storage_name,
        source,
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        storage.get_row_processor(),
        dest_table,
    )
    # TODO: see whether we need to pass options to the writer

    def progress_callback(bar: progressbar.ProgressBar, progress: int) -> None:
        bar.update(progress)

    if show_progress:
        progress = progressbar.ProgressBar(
            max_value=snapshot_source.get_table_file_size(storage.get_postgres_table())
        )
        progress_func: Optional[ProgressCallback] = partial(progress_callback, progress)
    else:
        progress_func = None

    table_descriptor = snapshot_source.get_descriptor().get_table(
        storage.get_postgres_table()
    )
    if pre_processed:
        writer = table_writer.get_bulk_writer(
            metrics=environment.metrics,
            encoding="gzip" if table_descriptor.zip else None,
            column_names=[c.name for c in table_descriptor.columns or []],
            table_name=dest_table,
        )
        loader.load_preprocessed(
            writer, ignore_existing_data, progress_callback=progress_func
        )
    else:
        buffer_writer = BufferedWriterWrapper(
            table_writer.get_batch_writer(
                environment.metrics,
                table_name=dest_table,
                chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
            ),
            settings.BULK_CLICKHOUSE_BUFFER,
            JSONRowEncoder(),
        )
        loader.load(
            buffer_writer, ignore_existing_data, progress_callback=progress_func
        )

def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    rapidjson_deserialize: bool,
    rapidjson_serialize: bool,
    log_level: Optional[str] = None,
) -> None:
    if not bootstrap_server:
        if dataset_name:
            bootstrap_server = settings.DEFAULT_DATASET_BROKERS.get(
                dataset_name,
                settings.DEFAULT_BROKERS,
            )
        else:
            bootstrap_server = settings.DEFAULT_STORAGE_BROKERS.get(
                storage_name,
                settings.DEFAULT_BROKERS,
            )

    setup_logging(log_level)
    setup_sentry()

    # TODO: Remove this once dataset_name is no longer being passed
    if dataset_name:
        dataset_writable_storage = get_dataset(dataset_name).get_writable_storage()
        if not dataset_writable_storage:
            raise click.ClickException(
                f"Dataset {dataset_name} has no writable storage"
            )
        storage_name = {v: k for k, v in WRITABLE_STORAGES.items()}[
            dataset_writable_storage
        ]

    consumer_builder = ConsumerBuilder(
        storage_name=storage_name,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        rapidjson_deserialize=rapidjson_deserialize,
        rapidjson_serialize=rapidjson_serialize,
    )

    if stateful_consumer:
        storage = get_cdc_storage(storage_name)
        assert (
            storage is not None
        ), "Only CDC storages have a control topic and are thus supported."
        context = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or storage.get_default_control_topic(),
            group_id=consumer_group,
            storage=storage,
        )

        def handler(signum, frame) -> None:
            context.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        context.run()
    else:
        consumer = consumer_builder.build_base_consumer()

        def handler(signum, frame) -> None:
            consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        consumer.run()

def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    dataset_name: str,
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the
    snapshot-loaded message on the control topic.
    """
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for dataset %s, from source %s",
        dataset_name,
        source,
    )

    dataset = get_dataset(dataset_name)

    storage = dataset.get_writable_storage()
    assert isinstance(
        storage, CdcStorage
    ), "Only CDC storages have a control topic and are thus supported."

    control_topic = control_topic or storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    descriptor = snapshot_source.get_descriptor()

    if not bootstrap_server:
        # DEFAULT_DATASET_BROKERS is keyed by dataset name.
        bootstrap_server = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name,
            settings.DEFAULT_BROKERS,
        )

    producer = Producer(
        {
            "bootstrap.servers": ",".join(bootstrap_server),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        }
    )

    msg = SnapshotLoaded(
        id=descriptor.id,
        transaction_info=TransactionData(
            xmin=descriptor.xmin,
            xmax=descriptor.xmax,
            xip_list=descriptor.xip_list,
        ),
    )
    json_string = json.dumps(msg.to_dict())

    def delivery_callback(error, message) -> None:
        if error is not None:
            raise error
        else:
            logger.info("Message sent %r", message.value())

    producer.produce(
        control_topic,
        value=json_string,
        on_delivery=delivery_callback,
    )

    producer.flush()

def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    from snuba.clickhouse.native import ClickhousePool
    from snuba.replacer import ReplacerWorker
    from snuba.utils.codecs import PassthroughCodec
    from snuba.utils.streams.batching import BatchingConsumer
    from snuba.utils.streams.kafka import (
        KafkaConsumer,
        KafkaPayload,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.types import Topic

    setup_logging(log_level)
    setup_sentry()

    storage = get_writable_storage(storage_name)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    # If dataset_name is provided, use the writable storage from that dataset.
    # This can be removed once we are passing storage_name instead of
    # dataset_name everywhere
    if dataset_name:
        dataset = get_dataset(dataset_name)
        storage = dataset.get_writable_storage()
        metrics_tags = {"group": consumer_group, "dataset": dataset_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {type(storage)} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for each
        # event (via a SELECT), which is a Hard Thing (TM) for columnstores to do. With
        # the default settings it's common for ClickHouse to go over the default
        # max_memory_usage of 10GB per query. Lowering the max_block_size reduces memory
        # usage, and increasing the max_memory_usage gives the query more breathing room.
        "max_block_size": settings.REPLACER_MAX_BLOCK_SIZE,
        "max_memory_usage": settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        "use_uncompressed_cache": 0,
    }

    clickhouse = ClickhousePool(
        settings.CLICKHOUSE_HOST,
        settings.CLICKHOUSE_PORT,
        client_settings=client_settings,
    )

    codec: PassthroughCodec[KafkaPayload] = PassthroughCodec()
    replacer = BatchingConsumer(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
            codec=codec,
        ),
        Topic(replacements_topic),
        worker=ReplacerWorker(clickhouse, storage, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum, frame) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()

def bootstrap(
    *,
    bootstrap_server: Sequence[str],
    kafka: bool,
    migrate: bool,
    force: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Warning: Not intended to be used in production yet.
    """
    if not force:
        raise click.ClickException("Must use --force to run")

    setup_logging(log_level)

    logger = logging.getLogger("snuba.bootstrap")

    import time

    if kafka:
        logger.debug("Using Kafka with %r", bootstrap_server)
        from confluent_kafka.admin import AdminClient, NewTopic

        attempts = 0
        while True:
            try:
                logger.debug("Attempting to connect to Kafka (attempt %d)", attempts)
                client = AdminClient(
                    {
                        "bootstrap.servers": ",".join(bootstrap_server),
                        "socket.timeout.ms": 1000,
                    }
                )
                client.list_topics(timeout=1)
                break
            except Exception as e:
                logger.error(
                    "Connection to Kafka failed (attempt %d)", attempts, exc_info=e
                )
                attempts += 1
                if attempts == 60:
                    raise
                time.sleep(1)

        topics = {}
        for name in ACTIVE_DATASET_NAMES:
            dataset = get_dataset(name)
            for entity in dataset.get_all_entities():
                writable_storage = entity.get_writable_storage()
                if writable_storage:
                    table_writer = writable_storage.get_table_writer()
                    stream_loader = table_writer.get_stream_loader()
                    for topic_spec in stream_loader.get_all_topic_specs():
                        if topic_spec.topic_name in topics:
                            continue
                        logger.debug(
                            "Adding topic %s to creation list", topic_spec.topic_name
                        )
                        topics[topic_spec.topic_name] = NewTopic(
                            topic_spec.topic_name,
                            num_partitions=topic_spec.partitions_number,
                            replication_factor=topic_spec.replication_factor,
                        )

        logger.debug("Initiating topic creation")
        for topic, future in client.create_topics(
            list(topics.values()), operation_timeout=1
        ).items():
            try:
                future.result()
                logger.info("Topic %s created", topic)
            except Exception as e:
                logger.error("Failed to create topic %s", topic, exc_info=e)

    if migrate:
        check_clickhouse_connections()
        Runner().run_all(force=True)

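# Illustrative follow-up sketch: verifying what `bootstrap` created by listing
# topic metadata through the same AdminClient API. The broker address is a
# placeholder.
def _example_list_topics() -> None:
    from confluent_kafka.admin import AdminClient

    client = AdminClient({"bootstrap.servers": "127.0.0.1:9092"})  # placeholder
    # list_topics() returns ClusterMetadata; .topics maps name -> TopicMetadata.
    metadata = client.list_topics(timeout=5)
    for name in sorted(metadata.topics):
        print(name)
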
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
) -> None:
    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper()) for name in storage_names)
    }

    topics = {
        storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (
            storage.get_table_writer().get_stream_loader().get_commit_log_topic_spec()
            for storage in storages.values()
        )
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (
        storages[storage_keys[0]]
        .get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for storage_key in storage_keys[1:]:
        if (
            build_kafka_consumer_configuration(
                storages[storage_key]
                .get_table_writer()
                .get_stream_loader()
                .get_default_topic_spec()
                .topic,
                consumer_group,
            )["bootstrap.servers"]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above.)
        commit_log_topic_spec = (
            storages[storage_keys[0]]
            .get_table_writer()
            .get_stream_loader()
            .get_commit_log_topic_spec()
        )
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic)
        )
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()

def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
) -> None:
    """Evaluates subscribed queries for a dataset."""
    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
                PassthroughCodec(),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
                CommitCodec(),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        )
    )

    producer = KafkaProducer(
        {
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
        SubscriptionResultCodec(),
    )

    with closing(consumer), closing(producer):
        batching_consumer = BatchingConsumer(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            SubscriptionWorker(
                SubscriptionExecutor(
                    dataset,
                    ThreadPoolExecutor(
                        max_workers=settings.SUBSCRIPTIONS_MAX_CONCURRENT_QUERIES
                    ),
                ),
                {
                    index: SubscriptionScheduler(
                        RedisSubscriptionDataStore(
                            redis_client, dataset, PartitionId(index)
                        ),
                        PartitionId(index),
                        cache_ttl=timedelta(seconds=schedule_ttl),
                    )
                    for index in range(
                        partitions
                        if partitions is not None
                        else loader.get_default_topic_spec().partitions_number
                    )
                },
                producer,
                Topic(result_topic),
            ),
            max_batch_size,
            max_batch_time_ms,
            create_metrics(
                "snuba.subscriptions",
                tags={"group": consumer_group, "dataset": dataset_name},
            ),
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()