def test_clusters() -> None:
    importlib.reload(cluster)
    assert (
        get_storage(StorageKey("events")).get_cluster()
        == get_storage(StorageKey("errors")).get_cluster()
    )

    assert (
        get_storage(StorageKey("events")).get_cluster()
        != get_storage(StorageKey("transactions")).get_cluster()
    )
def test_cache_partition() -> None:
    # The original comparisons were bare expressions, so the test could never
    # fail; wrap them in assert statements.
    assert (
        get_storage(StorageKey("transactions"))
        .get_cluster()
        .get_reader()
        .cache_partition_id
        == "host_2_cache"
    )

    assert (
        get_storage(StorageKey("errors")).get_cluster().get_reader().cache_partition_id
        is None
    )
def test_clusters() -> None:
    assert (
        get_storage(StorageKey("events")).get_cluster()
        == get_storage(StorageKey("errors")).get_cluster()
    )

    assert (
        get_storage(StorageKey("events")).get_cluster()
        != get_storage(StorageKey("transactions")).get_cluster()
    )
def test_get_local_nodes() -> None:
    with patch.object(ClickhousePool, "execute") as execute:
        execute.return_value = [
            ("host_1", 9000, 1, 1),
            ("host_2", 9000, 2, 1),
        ]

        local_cluster = get_storage(StorageKey("events")).get_cluster()
        assert len(local_cluster.get_local_nodes()) == 1
        assert local_cluster.get_local_nodes()[0].host_name == "host_1"
        assert local_cluster.get_local_nodes()[0].port == 9000
        assert local_cluster.get_local_nodes()[0].shard is None
        assert local_cluster.get_local_nodes()[0].replica is None

        distributed_cluster = get_storage(StorageKey("transactions")).get_cluster()
        assert len(distributed_cluster.get_local_nodes()) == 2
        assert distributed_cluster.get_local_nodes()[0].host_name == "host_1"
        assert distributed_cluster.get_local_nodes()[1].host_name == "host_2"
def optimize(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    storage_name: str,
    parallel: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import logger, run_optimize

    setup_logging(log_level)
    setup_sentry()

    storage: ReadableTableStorage

    storage_key = StorageKey(storage_name)
    storage = get_storage(storage_key)
    (clickhouse_user, clickhouse_password) = storage.get_cluster().get_credentials()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    database = storage.get_cluster().get_database()

    # TODO: In distributed mode, optimize currently must be run once for each node
    # with the host and port of that node provided via the CLI. In the future,
    # passing this information won't be necessary, and running this command once
    # will ensure that optimize is performed on all of the individual nodes for
    # that cluster.
    if clickhouse_host and clickhouse_port:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
            send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout,
        )
    elif not storage.get_cluster().is_single_node():
        raise click.ClickException("Provide Clickhouse host and port for optimize")
    else:
        connection = storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.OPTIMIZE
        )

    num_dropped = run_optimize(
        connection,
        storage,
        database,
        before=today,
        parallel=parallel,
        clickhouse_host=clickhouse_host,
    )
    logger.info("Optimized %s partitions on %s", num_dropped, clickhouse_host)
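# Hedged usage note, not taken from the original source: assuming the click
# options for this command mirror the keyword parameters above (for example
# --storage, --clickhouse-host, --clickhouse-port, --parallel), invocation
# would look roughly like the lines below. On a single-node cluster the
# host/port flags can be omitted; on a multi-node cluster the command must be
# run once per node, as the TODO above notes.
#
#   snuba optimize --storage errors --clickhouse-host host_1 --clickhouse-port 9000
#   snuba optimize --storage errors   # single-node cluster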
def test_default_config_legacy_override_storage_servers_fallback() -> None:
    default_broker = "my.other.broker:9092"
    default_brokers = ["my.broker:9092", "my.second.broker:9092"]
    settings.DEFAULT_BROKERS = [default_broker]
    settings.DEFAULT_STORAGE_BROKERS = {
        StorageKey.EVENTS.value: default_brokers,
    }

    storage_key = StorageKey(StorageKey.ERRORS)
    broker_config = get_default_kafka_configuration(storage_key=storage_key)
    assert broker_config["bootstrap.servers"] == default_broker
def get_ro_node_connection(
    clickhouse_host: str,
    clickhouse_port: int,
    storage_name: str,
    client_settings: ClickhouseClientSettings,
) -> ClickhousePool:
    storage_key = None
    try:
        storage_key = StorageKey(storage_name)
    except ValueError:
        raise InvalidStorageError(
            f"storage {storage_name} is not a valid storage name",
            extra_data={"storage_name": storage_name},
        )

    key = f"{storage_key}-{clickhouse_host}"
    if key in NODE_CONNECTIONS:
        return NODE_CONNECTIONS[key]

    storage = get_storage(storage_key)
    cluster = storage.get_cluster()

    if not is_valid_node(clickhouse_host, clickhouse_port, cluster):
        raise InvalidNodeError(
            f"host {clickhouse_host} and port {clickhouse_port} are not valid",
            extra_data={"host": clickhouse_host, "port": clickhouse_port},
        )

    database = cluster.get_database()

    assert client_settings in {
        ClickhouseClientSettings.QUERY,
        ClickhouseClientSettings.TRACING,
    }, "admin can only use QUERY or TRACING ClickhouseClientSettings"

    if client_settings == ClickhouseClientSettings.QUERY:
        username = settings.CLICKHOUSE_READONLY_USER
        password = settings.CLICKHOUSE_READONLY_PASSWORD
    else:
        username = settings.CLICKHOUSE_TRACE_USER
        password = settings.CLICKHOUSE_TRACE_PASSWORD

    connection = ClickhousePool(
        clickhouse_host,
        clickhouse_port,
        username,
        password,
        database,
        max_pool_size=2,
        client_settings=client_settings.value.settings,
    )
    NODE_CONNECTIONS[key] = connection
    return connection
def test_default_config_legacy_override_storage_servers() -> None:
    storage_name = StorageKey.EVENTS.value
    storage_key = StorageKey(storage_name)

    default_broker = "my.broker:9092"
    settings.DEFAULT_STORAGE_BROKERS = {storage_name: [default_broker]}
    broker_config = get_default_kafka_configuration(storage_key=storage_key)
    assert broker_config["bootstrap.servers"] == default_broker

    default_brokers = ["my.broker:9092", "my.second.broker:9092"]
    settings.DEFAULT_STORAGE_BROKERS = {storage_name: default_brokers}
    broker_config = get_default_kafka_configuration(storage_key=storage_key)
    assert broker_config["bootstrap.servers"] == ",".join(default_brokers)
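# Simplified sketch of the legacy broker resolution that the two
# DEFAULT_STORAGE_BROKERS tests above exercise. This is an assumption inferred
# from their assertions, not the actual body of get_default_kafka_configuration:
# a storage-specific entry in DEFAULT_STORAGE_BROKERS takes precedence,
# DEFAULT_BROKERS is the fallback, and the result is rendered as a
# comma-separated bootstrap.servers value.
def _legacy_bootstrap_servers_sketch(storage_key: StorageKey) -> str:
    brokers = settings.DEFAULT_STORAGE_BROKERS.get(
        storage_key.value, settings.DEFAULT_BROKERS
    )
    return ",".join(brokers)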
def cleanup(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    dry_run: bool,
    database: str,
    storage_name: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Deletes stale partitions for ClickHouse tables
    """
    setup_logging(log_level)

    from snuba.cleanup import run_cleanup, logger
    from snuba.clickhouse.native import ClickhousePool

    writable_storage = get_writable_storage(StorageKey(storage_name))

    (
        clickhouse_user,
        clickhouse_password,
    ) = writable_storage.get_cluster().get_credentials()

    table = writable_storage.get_table_writer().get_schema().get_local_table_name()

    if clickhouse_host and clickhouse_port and database:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
        )
    elif not writable_storage.get_cluster().is_single_node():
        raise click.ClickException("Provide ClickHouse host and port for cleanup")
    else:
        connection = writable_storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.CLEANUP
        )

    num_dropped = run_cleanup(connection, database, table, dry_run=dry_run)
    logger.info("Dropped %s partitions on %s", num_dropped, clickhouse_host)
def bulk_load(
    *,
    storage_name: str,
    dest_table: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s", storage_name, source
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        dest_table,
        storage.get_row_processor(),
    )

    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        table_writer.get_batch_writer(
            environment.metrics,
            table_name=dest_table,
            chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
        ),
        settings.BULK_CLICKHOUSE_BUFFER,
        JSONRowEncoder(),
    )

    loader.load(writer)
def get_ro_query_node_connection(
    storage_name: str, client_settings: ClickhouseClientSettings
) -> ClickhousePool:
    if storage_name in CLUSTER_CONNECTIONS:
        return CLUSTER_CONNECTIONS[storage_name]

    try:
        storage_key = StorageKey(storage_name)
    except ValueError:
        raise InvalidStorageError(
            f"storage {storage_name} is not a valid storage name",
            extra_data={"storage_name": storage_name},
        )

    storage = get_storage(storage_key)
    cluster = storage.get_cluster()
    connection_id = cluster.get_connection_id()
    connection = get_ro_node_connection(
        connection_id.hostname, connection_id.tcp_port, storage_name, client_settings
    )

    CLUSTER_CONNECTIONS[storage_name] = connection
    return connection
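# Hedged usage sketch, not part of the original module: how the two read-only
# admin helpers above might be called. The storage name and the SQL statement
# are illustrative assumptions; both helpers memoize their ClickhousePool in
# NODE_CONNECTIONS / CLUSTER_CONNECTIONS, so repeated calls reuse a connection.
def _example_admin_read() -> None:
    connection = get_ro_query_node_connection(
        "errors", ClickhouseClientSettings.QUERY
    )
    # ClickhousePool.execute issues the statement through the read-only user
    # configured in settings.CLICKHOUSE_READONLY_USER.
    connection.execute("SELECT count() FROM system.parts")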
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
def bulk_load(
    *,
    storage_name: str,
    dest_table: Optional[str],
    source: str,
    ignore_existing_data: bool,
    pre_processed: bool,
    show_progress: bool,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s", storage_name, source
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        storage.get_row_processor(),
        dest_table,
    )

    # TODO: see whether we need to pass options to the writer
    def progress_callback(bar: progressbar.ProgressBar, progress: int) -> None:
        bar.update(progress)

    if show_progress:
        progress = progressbar.ProgressBar(
            max_value=snapshot_source.get_table_file_size(storage.get_postgres_table())
        )
        progress_func: Optional[ProgressCallback] = partial(progress_callback, progress)
    else:
        progress_func = None

    table_descriptor = snapshot_source.get_descriptor().get_table(
        storage.get_postgres_table()
    )
    if pre_processed:
        writer = table_writer.get_bulk_writer(
            metrics=environment.metrics,
            encoding="gzip" if table_descriptor.zip else None,
            column_names=[c.name for c in table_descriptor.columns or []],
            table_name=dest_table,
        )
        loader.load_preprocessed(
            writer, ignore_existing_data, progress_callback=progress_func
        )
    else:
        buffer_writer = BufferedWriterWrapper(
            table_writer.get_batch_writer(
                environment.metrics,
                table_name=dest_table,
                chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
            ),
            settings.BULK_CLICKHOUSE_BUFFER,
            JSONRowEncoder(),
        )
        loader.load(
            buffer_writer, ignore_existing_data, progress_callback=progress_func
        )
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        processes=processes,
        input_block_size=input_block_size,
        output_block_size=output_block_size,
        profile_path=profile_path,
    )

    if stateful_consumer:
        storage = get_cdc_storage(storage_key)
        assert (
            storage is not None
        ), "Only CDC storages have a control topic and are therefore supported."
        context = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or storage.get_default_control_topic(),
            group_id=consumer_group,
            storage=storage,
        )

        def handler(signum: int, frame: Any) -> None:
            context.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        context.run()
    else:
        consumer = consumer_builder.build_base_consumer()

        def handler(signum: int, frame: Any) -> None:
            consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        consumer.run()
def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    storage_name: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the
    snapshot-loaded message on the control topic.
    """
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for storage %s, from source %s",
        storage_name,
        source,
    )

    storage_key = StorageKey(storage_name)
    storage = get_cdc_storage(storage_key)
    stream_loader = storage.get_table_writer().get_stream_loader()

    control_topic = control_topic or storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    )

    descriptor = snapshot_source.get_descriptor()

    producer = Producer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
            bootstrap_servers=bootstrap_server,
            override_params={
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            },
        )
    )

    msg = SnapshotLoaded(
        id=descriptor.id,
        transaction_info=TransactionData(
            xmin=descriptor.xmin, xmax=descriptor.xmax, xip_list=descriptor.xip_list,
        ),
    )
    json_string = json.dumps(msg.to_dict())

    def delivery_callback(error: KafkaError, message: Message) -> None:
        if error is not None:
            raise error
        else:
            logger.info("Message sent %r", message.value())

    producer.produce(
        control_topic, value=json_string, on_delivery=delivery_callback,
    )

    producer.flush()
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")

    storage_key = StorageKey(storage_name)

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={"group": consumer_group, "storage": storage_key.value},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    def stats_callback(stats_json: str) -> None:
        stats = rapidjson.loads(stats_json)
        metrics.gauge("librdkafka.total_queue_size", stats.get("replyq", 0))

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        kafka_params=KafkaParameters(
            raw_topic=raw_events_topic,
            replacements_topic=replacements_topic,
            bootstrap_servers=bootstrap_server,
            group_id=consumer_group,
            commit_log_topic=commit_log_topic,
            auto_offset_reset=auto_offset_reset,
            strict_offset_reset=not no_strict_offset_reset,
            queued_max_messages_kbytes=queued_max_messages_kbytes,
            queued_min_messages=queued_min_messages,
        ),
        processing_params=ProcessingParameters(
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
        ),
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        metrics=metrics,
        profile_path=profile_path,
        stats_callback=stats_callback,
        parallel_collect=parallel_collect,
        cooperative_rebalancing=cooperative_rebalancing,
    )

    consumer = consumer_builder.build_base_consumer()

    def handler(signum: int, frame: Any) -> None:
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    consumer.run()
def test_consumer(
    *,
    commit_log_topic: Optional[str],
    consumer_group: str,
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    avg_latency_ms: int,
    latency_std_deviation_ms: int,
    parallel_collect: bool,
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)

    metrics = MetricsWrapper(
        environment.metrics,
        "test_consumer",
        tags={"group": consumer_group, "storage": storage_key.value},
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        kafka_params=KafkaParameters(
            raw_topic=None,
            replacements_topic=None,
            bootstrap_servers=None,
            group_id=consumer_group,
            commit_log_topic=commit_log_topic,
            auto_offset_reset=auto_offset_reset,
            strict_offset_reset=not no_strict_offset_reset,
            queued_max_messages_kbytes=queued_max_messages_kbytes,
            queued_min_messages=queued_min_messages,
        ),
        processing_params=ProcessingParameters(
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
        ),
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        metrics=metrics,
        parallel_collect=parallel_collect,
        profile_path=profile_path,
        mock_parameters=MockParameters(
            avg_write_latency=avg_latency_ms,
            std_deviation=latency_std_deviation_ms,
        ),
    )

    consumer = consumer_builder.build_base_consumer()

    def handler(signum: int, frame: Any) -> None:
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    consumer.run()