def create(
    self, commit: Callable[[Mapping[Partition, int]], None]
) -> ProcessingStrategy[KafkaPayload]:
    collect = CollectStep(
        self.__build_write_step,
        commit,
        self.__max_batch_size,
        self.__max_batch_time,
    )

    transform_function = functools.partial(process_message, self.__processor)

    strategy: ProcessingStrategy[KafkaPayload]
    if self.__processes is None:
        strategy = TransformStep(transform_function, collect)
    else:
        assert self.__input_block_size is not None
        assert self.__output_block_size is not None
        strategy = ParallelTransformStep(
            transform_function,
            collect,
            self.__processes,
            max_batch_size=self.__max_batch_size,
            max_batch_time=self.__max_batch_time,
            input_block_size=self.__input_block_size,
            output_block_size=self.__output_block_size,
            metrics=MetricsWrapper(self.__metrics, "process"),
        )

    if self.__prefilter is not None:
        strategy = FilterStep(self.__should_accept, strategy)

    return strategy
def write_step() -> ProcessedMessageBatchWriter:
    return ProcessedMessageBatchWriter(
        insert_batch_writer=InsertBatchWriter(
            writer, MetricsWrapper(metrics, "insertions")
        ),
        replacement_batch_writer=ReplacementBatchWriter(
            replacements_producer, Topic("replacements")
        ),
    )
def build_writer() -> ProcessedMessageBatchWriter:
    return ProcessedMessageBatchWriter(
        InsertBatchWriter(
            MockBatchWriter(
                storage.get_storage_key(), avg_write_latency, std_deviation
            ),
            MetricsWrapper(
                metrics,
                "mock_insertions",
                {"storage": storage.get_storage_key().value},
            ),
        ),
        # Only attach a replacement writer when replacements are enabled.
        MockReplacementBatchWriter() if with_replacements else None,
    )
def build_writer() -> ProcessedMessageBatchWriter:
    insert_batch_writer = InsertBatchWriter(
        writer, MetricsWrapper(metrics, "insertions")
    )

    replacement_batch_writer: Optional[ReplacementBatchWriter]
    if supports_replacements:
        assert replacements_producer is not None
        assert replacements_topic is not None
        replacement_batch_writer = ReplacementBatchWriter(
            replacements_producer, replacements_topic
        )
    else:
        replacement_batch_writer = None

    return ProcessedMessageBatchWriter(insert_batch_writer, replacement_batch_writer)
def __build_batch_writer(
    self, storage: WritableTableStorage
) -> ProcessedMessageBatchWriter:
    replacement_batch_writer: Optional[ReplacementBatchWriter]
    stream_loader = storage.get_table_writer().get_stream_loader()
    replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    default_topic_spec = stream_loader.get_default_topic_spec()
    if replacement_topic_spec is not None:
        # XXX: The producer is flushed when closed on strategy teardown
        # after an assignment is revoked, but never explicitly closed.
        # XXX: This assumes that the Kafka cluster used for the input topic
        # to the storage is the same as the replacement topic.
        replacement_batch_writer = ReplacementBatchWriter(
            ConfluentKafkaProducer(
                build_kafka_producer_configuration(
                    default_topic_spec.topic,
                    override_params={
                        "partitioner": "consistent",
                        "message.max.bytes": 50000000,  # 50MB, default is 1MB
                    },
                )
            ),
            Topic(replacement_topic_spec.topic_name),
        )
    else:
        replacement_batch_writer = None

    return ProcessedMessageBatchWriter(
        InsertBatchWriter(
            storage.get_table_writer().get_batch_writer(
                self.__metrics,
                {"load_balancing": "in_order", "insert_distributed_sync": 1},
            ),
            MetricsWrapper(
                self.__metrics,
                "insertions",
                {"storage": storage.get_storage_key().value},
            ),
        ),
        replacement_batch_writer,
    )
from snuba.query.data_source.visitor import DataSourceVisitor
from snuba.querylog import record_query
from snuba.querylog.query_metadata import SnubaQueryMetadata
from snuba.reader import Reader
from snuba.request import Request
from snuba.request.request_settings import RequestSettings
from snuba.util import with_span
from snuba.utils.metrics.gauge import Gauge
from snuba.utils.metrics.timer import Timer
from snuba.utils.metrics.wrapper import MetricsWrapper
from snuba.web import QueryException, QueryResult, transform_column_names
from snuba.web.db_query import raw_query

logger = logging.getLogger("snuba.query")

metrics = MetricsWrapper(environment.metrics, "api")

MAX_QUERY_SIZE_BYTES = 256 * 1024  # 256 KiB by default


class SampleClauseFinder(DataSourceVisitor[bool, Entity], JoinVisitor[bool, Entity]):
    """
    Traverses a query to find FROM clauses that have a sampling rate set,
    so we can check whether turbo is set as well.
    """

    def _visit_simple_source(self, data_source: Entity) -> bool:
        return data_source.sample is not None and data_source.sample != 1.0

    def _visit_join(self, data_source: JoinClause[Entity]) -> bool:
        return self.visit_join_clause(data_source)
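# A minimal sketch of how this visitor might be applied. It assumes the
# DataSourceVisitor base class exposes a `visit` entry point, that the query
# object has `get_from_clause()`, and that request settings expose
# `get_turbo()`; none of those are shown above, so treat them as assumptions.
if SampleClauseFinder().visit(query.get_from_clause()) and not request_settings.get_turbo():
    metrics.increment("sample_without_turbo")  # hypothetical metric name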
def subscriptions_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    log_level: Optional[str],
    stale_threshold_seconds: Optional[int],
    cooperative_rebalancing: bool,
) -> None:
    """
    The subscriptions executor consumes scheduled subscriptions from the
    scheduled subscription topic for that entity, executes the queries on
    ClickHouse and publishes results on the results topic.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.executor",
        tags={"dataset": dataset_name},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    entity_key = EntityKey(entity_names[0])

    storage = get_entity(entity_key).get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()
    result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            result_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        )
    )

    # TODO: Consider removing and always passing via CLI.
    # If a value is provided via config, it overrides the one provided via CLI.
    # This is so we can quickly change this in an emergency.
    stale_threshold_seconds = state.get_config(
        f"subscriptions_stale_threshold_sec_{dataset_name}", stale_threshold_seconds
    )

    processor = build_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        producer,
        max_concurrent_queries,
        total_concurrent_queries,
        auto_offset_reset,
        not no_strict_offset_reset,
        metrics,
        stale_threshold_seconds,
        cooperative_rebalancing,
    )

    def handler(signum: int, frame: Any) -> None:
        # TODO: Temporary code for debugging executor shutdown
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    with closing(producer), flush_querylog():
        processor.run()
from dateutil.tz import tz

from snuba import environment, settings, state
from snuba.clickhouse.errors import ClickhouseError
from snuba.clickhouse.formatter.nodes import FormattedQuery
from snuba.reader import Reader, Result, build_result_transformer
from snuba.utils.metrics.gauge import ThreadSafeGauge
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger("snuba.clickhouse")
trace_logger = logging.getLogger("clickhouse_driver.log")
trace_logger.setLevel("INFO")

Params = Optional[Union[Sequence[Any], Mapping[str, Any]]]

metrics = MetricsWrapper(environment.metrics, "clickhouse.native")


class ClickhouseProfile(TypedDict):
    bytes: int
    blocks: int
    rows: int
    elapsed: float


@dataclass(frozen=True)
class ClickhouseResult:
    results: Sequence[Any] = field(default_factory=list)
    meta: Sequence[Any] | None = None
    profile: ClickhouseProfile | None = None
    trace_output: str = ""
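# For illustration only: how the profile and result containers above compose.
# All values here are made up.
example_profile: ClickhouseProfile = {
    "bytes": 4096,
    "blocks": 1,
    "rows": 2,
    "elapsed": 0.013,
}

example_result = ClickhouseResult(
    results=[("error", 10), ("transaction", 25)],
    meta=[{"name": "type", "type": "String"}, {"name": "count", "type": "UInt64"}],
    profile=example_profile,
)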
    flatten_nested_field,
)
from snuba.processor import (
    InsertBatch,
    MessageProcessor,
    ProcessedMessage,
    _as_dict_safe,
    _ensure_valid_date,
    _ensure_valid_ip,
    _unicodify,
)
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger(__name__)
metrics = MetricsWrapper(environment.metrics, "transactions.processor")

UNKNOWN_SPAN_STATUS = 2


class TransactionsMessageProcessor(MessageProcessor):
    PROMOTED_TAGS = {
        "environment",
        "sentry:release",
        "sentry:user",
        "sentry:dist",
    }

    def __extract_timestamp(self, field):
        timestamp = _ensure_valid_date(datetime.fromtimestamp(field))
        if timestamp is None:
from typing import Any, Mapping, Optional

from sentry_relay import DataCategory

from snuba import environment, settings
from snuba.consumers.types import KafkaMessageMetadata
from snuba.processor import (
    InsertBatch,
    MessageProcessor,
    ProcessedMessage,
    _ensure_valid_date,
    _unicodify,
)
from snuba.utils.metrics.wrapper import MetricsWrapper

metrics = MetricsWrapper(environment.metrics, "outcomes.processor")


class OutcomesProcessor(MessageProcessor):
    def process_message(
        self, value: Mapping[str, Any], metadata: KafkaMessageMetadata
    ) -> Optional[ProcessedMessage]:
        assert isinstance(value, dict)
        v_uuid = value.get("event_id")

        # We don't care about abuse outcomes for these metrics.
        if value["outcome"] != 4:
            if "category" not in value:
                metrics.increment("missing_category")
            if "quantity" not in value:
                metrics.increment("missing_quantity")
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={"group": consumer_group, "dataset": dataset_name},
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(commit_log_topic_spec.topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug(
        "Starting %r with %s workers...",
        executor,
        getattr(executor, "_max_workers", 0),
    )
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic)
                    if result_topic is not None
                    else Topic(result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
"contexts", "geo.country_code", nullable=True, ), ColumnToMapping( None, "geo_region", None, "contexts", "geo.region", nullable=True ), ColumnToMapping(None, "geo_city", None, "contexts", "geo.city", nullable=True), ], subscriptables=[ SubscriptableMapper(None, "tags", None, "tags"), SubscriptableMapper(None, "contexts", None, "contexts"), ], ) metrics = MetricsWrapper(environment.metrics, "snuplicator") def callback_func( storage: str, query: Query, request_settings: RequestSettings, referrer: str, results: List[Result[QueryResult]], ) -> None: cache_hit = False is_duplicate = False # Captures if any of the queries involved was a cache hit or duplicate, as cache # hits may a cause of inconsistency between results. # Doesn't attempt to distinguish between all of the specific scenarios (one or both
from snuba import environment, state
from snuba.query.conditions import (
    BooleanFunctions,
    ConditionFunctions,
    binary_condition,
)
from snuba.query.expressions import Column, Literal
from snuba.query.extensions import QueryExtension
from snuba.query.logical import Query
from snuba.query.processors import ExtensionData, ExtensionQueryProcessor
from snuba.request.request_settings import RequestSettings
from snuba.util import parse_datetime
from snuba.utils.metrics.decorators import track_calls
from snuba.utils.metrics.wrapper import MetricsWrapper

timeseries_metrics = MetricsWrapper(environment.metrics, "extensions.timeseries")


def get_time_series_extension_properties(
    default_granularity: int, default_window: timedelta
):
    return {
        "type": "object",
        "properties": {
            "from_date": {
                "type": "string",
                "format": "date-time",
                "default": track_calls(
                    timeseries_metrics,
import logging
from typing import Sequence

from snuba import environment, state
from snuba.query.data_source import DataSource
from snuba.query.expressions import Expression
from snuba.query.functions import is_valid_global_function
from snuba.query.validation import FunctionCallValidator, InvalidFunctionCall
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger(__name__)

metrics = MetricsWrapper(environment.metrics, "validation.functions")


class AllowedFunctionValidator(FunctionCallValidator):
    """
    Validates that the function itself is allowed. This does not validate
    that the function is correctly formed.
    """

    def validate(
        self, func_name: str, parameters: Sequence[Expression], data_source: DataSource
    ) -> None:
        if is_valid_global_function(func_name):
            return

        if state.get_config("function-validator.enabled", False):
            raise InvalidFunctionCall(f"Invalid function name: {func_name}")
        else:
            metrics.increment("invalid_funcs", tags={"func_name": func_name})
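# A minimal sketch of exercising the validator; `some_data_source` is a
# placeholder for a real DataSource instance and is not defined above.
validator = AllowedFunctionValidator()
try:
    # Unknown function names either raise (when the "function-validator.enabled"
    # runtime config is set) or only bump the "invalid_funcs" metric.
    validator.validate("not_a_real_function", [], some_data_source)
except InvalidFunctionCall as exc:
    logger.warning("rejected function call: %s", exc)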
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    commit_log_topic: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    dead_letter_topic: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper()) for name in storage_names)
    }

    topics = {
        storage.get_table_writer().get_stream_loader().get_default_topic_spec().topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    commit_log: Optional[Topic]
    if commit_log_topic:
        commit_log = Topic(commit_log_topic)
    else:
        # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
        # time. (It is less easily modified.) This also assumes the commit log
        # topic is on the same Kafka cluster as the input topic.
        commit_log_topics = {
            spec.topic_name
            for spec in (
                storage.get_table_writer().get_stream_loader().get_commit_log_topic_spec()
                for storage in storages.values()
            )
            if spec is not None
        }

        if commit_log_topics:
            commit_log = Topic(commit_log_topics.pop())
        else:
            commit_log = None

        if commit_log_topics:
            raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (
        storages[storage_keys[0]]
        .get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        strict_offset_reset=not no_strict_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    if cooperative_rebalancing is True:
        consumer_configuration["partition.assignment.strategy"] = "cooperative-sticky"

    for storage_key in storage_keys[1:]:
        if (
            build_kafka_consumer_configuration(
                storages[storage_key]
                .get_table_writer()
                .get_stream_loader()
                .get_default_topic_spec()
                .topic,
                consumer_group,
            )["bootstrap.servers"]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={
            "group": consumer_group,
            "storage": "_".join([storage_keys[0].value, "m"]),
        },
    )

    # Collect metrics from librdkafka if we have stats_collection_freq_ms set
    # for the consumer group, or use the default.
    stats_collection_frequency_ms = get_config(
        f"stats_collection_freq_ms_{consumer_group}",
        get_config("stats_collection_freq_ms", 0),
    )

    if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:

        def stats_callback(stats_json: str) -> None:
            stats = rapidjson.loads(stats_json)
            metrics.gauge("librdkafka.total_queue_size", stats.get("replyq", 0))

        consumer_configuration.update(
            {
                "statistics.interval.ms": stats_collection_frequency_ms,
                "stats_cb": stats_callback,
            }
        )

    if commit_log is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above).
        commit_log_topic_spec = (
            storages[storage_keys[0]]
            .get_table_writer()
            .get_stream_loader()
            .get_commit_log_topic_spec()
        )
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic)
        )
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log,
        )

    dead_letter_producer: Optional[KafkaProducer] = None
    dead_letter_queue: Optional[Topic] = None
    if dead_letter_topic:
        dead_letter_queue = Topic(dead_letter_topic)

        dead_letter_producer = KafkaProducer(
            build_kafka_producer_configuration(StreamsTopic(dead_letter_topic))
        )

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            parallel_collect=parallel_collect,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
            producer=dead_letter_producer,
            topic=dead_letter_queue,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    if dead_letter_producer:
        with closing(dead_letter_producer):
            processor.run()
    else:
        processor.run()
from datetime import datetime
from typing import Any, Mapping, Optional
from uuid import UUID

from snuba import environment
from snuba.consumers.types import KafkaMessageMetadata
from snuba.processor import InsertBatch, MessageProcessor, ProcessedMessage
from snuba.utils.metrics.wrapper import MetricsWrapper

metrics = MetricsWrapper(environment.metrics, "profiles.processor")

RETENTION_DAYS_ALLOWED = frozenset([30, 90])


class ProfilesMessageProcessor(MessageProcessor):
    def process_message(
        self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
    ) -> Optional[ProcessedMessage]:
        try:
            retention_days = message["retention_days"]
            if retention_days not in RETENTION_DAYS_ALLOWED:
                retention_days = 30
            processed = {
                "organization_id": message["organization_id"],
                "project_id": message["project_id"],
                "transaction_id": str(UUID(message["transaction_id"])),
                "profile_id": str(UUID(message["profile_id"])),
                "received": datetime.utcfromtimestamp(message["received"]),
                "profile": message["profile"],
                "android_api_level": message.get("android_api_level"),
                "device_classification": message["device_classification"],
    def __init__(self) -> None:
        super().__init__(default_entity=EntityKey.DISCOVER)

    # XXX: This is temporary code that will eventually need to be ported to Sentry
    # since SnQL will require an entity to always be specified by the user.
    def select_entity(self, query: Query) -> EntityKey:
        selected_entity = match_query_to_entity(
            query, EVENTS_COLUMNS, TRANSACTIONS_COLUMNS
        )

        track_bad_query(query, selected_entity, EVENTS_COLUMNS, TRANSACTIONS_COLUMNS)

        return selected_entity


metrics = MetricsWrapper(environment.metrics, "api.discover")
logger = logging.getLogger(__name__)

EVENT_CONDITION = FunctionCallMatch(
    Param("function", Or([StringMatch(op) for op in BINARY_OPERATORS])),
    (
        Or([ColumnMatch(None, StringMatch("type")), LiteralMatch(None)]),
        Param("event_type", Or([ColumnMatch(), LiteralMatch()])),
    ),
)

TRANSACTION_FUNCTIONS = FunctionCallMatch(
    Or([StringMatch("apdex"), StringMatch("failure_rate")]), None
)
from snuba.request.request_settings import RequestSettings
from snuba.state.cache.abstract import Cache, ExecutionTimeoutError
from snuba.state.cache.redis.backend import RESULT_VALUE, RESULT_WAIT, RedisCache
from snuba.state.rate_limit import (
    GLOBAL_RATE_LIMIT_NAME,
    PROJECT_RATE_LIMIT_NAME,
    RateLimitAggregator,
    RateLimitExceeded,
)
from snuba.util import force_bytes, with_span
from snuba.utils.codecs import Codec
from snuba.utils.metrics.timer import Timer
from snuba.utils.metrics.wrapper import MetricsWrapper
from snuba.web import QueryException, QueryResult

metrics = MetricsWrapper(environment.metrics, "db_query")


class ResultCacheCodec(Codec[bytes, Result]):
    def encode(self, value: Result) -> bytes:
        return cast(str, rapidjson.dumps(value)).encode("utf-8")

    def decode(self, value: bytes) -> Result:
        ret = rapidjson.loads(value)
        if not isinstance(ret, Mapping) or "meta" not in ret or "data" not in ret:
            raise ValueError("Invalid value type in result cache")
        return cast(Result, ret)


cache: Cache[Result] = RedisCache(
    redis_client, "snuba-query-cache:",
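# A minimal round-trip sketch for the codec above. The payload shape is
# illustrative only and assumes a Result is a plain mapping carrying "meta"
# and "data" keys.
codec = ResultCacheCodec()

original = {"meta": [{"name": "count"}], "data": [{"count": 42}]}
encoded = codec.encode(original)          # bytes suitable for the Redis cache
assert codec.decode(encoded) == original  # decode() rejects malformed payloads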
    String as StringMatch,
)
from snuba.query.parser.conditions import parse_conditions_to_expr
from snuba.query.parser.exceptions import (
    AliasShadowingException,
    CyclicAliasException,
    ParsingException,
)
from snuba.query.parser.expressions import parse_aggregation, parse_expression
from snuba.query.parser.validation import validate_query
from snuba.util import is_function, to_list, tuplify
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger(__name__)

metrics = MetricsWrapper(environment.metrics, "parser")


def parse_query(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    """
    Parses the query body generating the AST. This only takes into account
    the initial query body. Extensions are parsed by extension processors
    and are supposed to update the AST.

    Parsing includes two phases. The first transforms the json body into a
    minimal query Object resolving expressions, conditions, etc. The second
    phase performs some query processing to provide a sane query to the
    dataset specific section.

    - It prevents alias shadowing.
    - It transforms columns from the tags[asd] form into SubscriptableReference.
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")

    storage_key = StorageKey(storage_name)

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={"group": consumer_group, "storage": storage_key.value},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    def stats_callback(stats_json: str) -> None:
        stats = rapidjson.loads(stats_json)
        metrics.gauge("librdkafka.total_queue_size", stats.get("replyq", 0))

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        kafka_params=KafkaParameters(
            raw_topic=raw_events_topic,
            replacements_topic=replacements_topic,
            bootstrap_servers=bootstrap_server,
            group_id=consumer_group,
            commit_log_topic=commit_log_topic,
            auto_offset_reset=auto_offset_reset,
            strict_offset_reset=not no_strict_offset_reset,
            queued_max_messages_kbytes=queued_max_messages_kbytes,
            queued_min_messages=queued_min_messages,
        ),
        processing_params=ProcessingParameters(
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
        ),
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        metrics=metrics,
        profile_path=profile_path,
        stats_callback=stats_callback,
        parallel_collect=parallel_collect,
        cooperative_rebalancing=cooperative_rebalancing,
    )

    consumer = consumer_builder.build_base_consumer()

    def handler(signum: int, frame: Any) -> None:
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    consumer.run()
from snuba.clickhouse.processors import QueryProcessor
from snuba.clickhouse.query import Query
from snuba.clickhouse.query_dsl.accessors import (
    get_object_ids_in_query_ast,
    get_time_range,
)
from snuba.datasets.errors_replacer import ProjectsQueryFlags
from snuba.query.conditions import not_in_condition
from snuba.query.expressions import Column, FunctionCall, Literal
from snuba.query.query_settings import QuerySettings
from snuba.replacers.replacer_processor import ReplacerState
from snuba.state import get_config
from snuba.utils.metrics.wrapper import MetricsWrapper

logger = logging.getLogger(__name__)
metrics = MetricsWrapper(environment.metrics, "processors.replaced_groups")

FINAL_METRIC = "final"


class PostReplacementConsistencyEnforcer(QueryProcessor):
    """
    This processor tweaks the query to ensure that groups that have been
    manipulated by a replacer (like after a deletion) are excluded if they
    need to be.

    There is a period of time between the replacement executing its query and
    Clickhouse merging the rows to achieve consistency. During this period of
    time we either have to remove those rows manually or run the query in
    FINAL mode.
    """

    def __init__(
        self, project_column: str, replacer_state_name: Optional[ReplacerState]
    ReplacementMessage,
    ReplacementMessageMetadata,
    ReplacerProcessor,
    ReplacerState,
)
from snuba.state import get_config
from snuba.utils.metrics.wrapper import MetricsWrapper

"""
Disambiguate the dataset/storage when there are multiple tables representing
errors that perform event replacements.
In theory this will be needed only during the events to errors migration.
"""

logger = logging.getLogger(__name__)
metrics = MetricsWrapper(environment.metrics, "errors.replacer")


@dataclass(frozen=True)
class NeedsFinal:
    pass


@dataclass(frozen=True)
class ExcludeGroups:
    group_ids: Sequence[int]


QueryTimeFlags = Union[NeedsFinal, ExcludeGroups]
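# A minimal sketch of how callers might branch on QueryTimeFlags; the helper
# itself is hypothetical and not part of the module above.
def describe_flags(flags: QueryTimeFlags) -> str:
    if isinstance(flags, NeedsFinal):
        return "query must run in FINAL mode"
    elif isinstance(flags, ExcludeGroups):
        return f"exclude group ids: {flags.group_ids}"
    return "no replacement-related adjustment needed"


assert describe_flags(ExcludeGroups(group_ids=[1, 2, 3])).startswith("exclude")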
    ProjectReferrerRateLimiter,
    ReferrerRateLimiterProcessor,
)
from snuba.query.processors.quota_processor import ResourceQuotaProcessor
from snuba.query.processors.timeseries_processor import (
    TimeSeriesProcessor,
    extract_granularity_from_query,
)
from snuba.query.query_settings import QuerySettings, SubscriptionQuerySettings
from snuba.query.validation.validators import (
    ColumnValidationMode,
    EntityRequiredColumnValidator,
)
from snuba.utils.metrics.wrapper import MetricsWrapper

metrics = MetricsWrapper(environment.metrics, "api.sessions")


def function_column(col_name: str, function_name: str) -> ColumnToFunction:
    return ColumnToFunction(
        None,
        col_name,
        function_name,
        (Column(None, None, col_name),),
    )


def function_call(col_name: str, function_name: str) -> FunctionCall:
    return FunctionCall(None, function_name, (Column(None, None, col_name),))
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
def __init__(
    self,
    storage_key: StorageKey,
    raw_topic: Optional[str],
    replacements_topic: Optional[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    bootstrap_servers: Sequence[str],
    group_id: str,
    commit_log_topic: Optional[str],
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    commit_retry_policy: Optional[RetryPolicy] = None,
    profile_path: Optional[str] = None,
) -> None:
    self.storage = get_writable_storage(storage_key)
    self.bootstrap_servers = bootstrap_servers
    self.broker_config = get_default_kafka_configuration(
        storage_key, bootstrap_servers=bootstrap_servers
    )
    self.producer_broker_config = build_kafka_producer_configuration(
        storage_key,
        bootstrap_servers=bootstrap_servers,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )

    stream_loader = self.storage.get_table_writer().get_stream_loader()

    self.raw_topic: Topic
    if raw_topic is not None:
        self.raw_topic = Topic(raw_topic)
    else:
        self.raw_topic = Topic(stream_loader.get_default_topic_spec().topic_name)

    self.replacements_topic: Optional[Topic]
    if replacements_topic is not None:
        self.replacements_topic = Topic(replacements_topic)
    else:
        replacement_topic_spec = stream_loader.get_replacement_topic_spec()
        if replacement_topic_spec is not None:
            self.replacements_topic = Topic(replacement_topic_spec.topic_name)
        else:
            self.replacements_topic = None

    self.commit_log_topic: Optional[Topic]
    if commit_log_topic is not None:
        self.commit_log_topic = Topic(commit_log_topic)
    else:
        commit_log_topic_spec = stream_loader.get_commit_log_topic_spec()
        if commit_log_topic_spec is not None:
            self.commit_log_topic = Topic(commit_log_topic_spec.topic_name)
        else:
            self.commit_log_topic = None

    # XXX: This can result in a producer being built in cases where it's
    # not actually required.
    self.producer = Producer(self.producer_broker_config)

    self.metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={"group": group_id, "storage": storage_key.value},
    )

    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms
    self.group_id = group_id
    self.auto_offset_reset = auto_offset_reset
    self.queued_max_messages_kbytes = queued_max_messages_kbytes
    self.queued_min_messages = queued_min_messages
    self.processes = processes
    self.input_block_size = input_block_size
    self.output_block_size = output_block_size
    self.__profile_path = profile_path

    if commit_retry_policy is None:
        commit_retry_policy = BasicRetryPolicy(
            3,
            constant_delay(1),
            lambda e: isinstance(e, KafkaException)
            and e.args[0].code()
            in (
                KafkaError.REQUEST_TIMED_OUT,
                KafkaError.NOT_COORDINATOR,
                KafkaError._WAIT_COORD,
            ),
        )

    self.__commit_retry_policy = commit_retry_policy
def subscriptions_scheduler_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    followed_consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
    log_level: Optional[str],
    # TODO: Temporarily overrides the scheduling mode.
    # Required for single tenant since some partitions may be empty.
    # To be removed once transactions is no longer semantically partitioned.
    scheduling_mode: Optional[str],
) -> None:
    """
    Combined subscriptions scheduler and executor. Alternative to the separate
    scheduler and executor processes.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.scheduler_executor",
        tags={"dataset": dataset_name},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    entity_key = EntityKey(entity_names[0])

    storage = get_entity(entity_key).get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()
    result_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert result_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            result_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        )
    )

    processor = build_scheduler_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        followed_consumer_group,
        producer,
        auto_offset_reset,
        not no_strict_offset_reset,
        schedule_ttl,
        delay_seconds,
        stale_threshold_seconds,
        max_concurrent_queries,
        total_concurrent_queries,
        metrics,
        SchedulingWatermarkMode(scheduling_mode)
        if scheduling_mode is not None
        else None,
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    with closing(producer), flush_querylog():
        processor.run()
    mapping_pattern,
)
from snuba.query.conditions import (
    BooleanFunctions,
    get_first_level_and_conditions,
    get_first_level_or_conditions,
)
from snuba.query.expressions import Column, Expression
from snuba.query.expressions import FunctionCall as FunctionExpr
from snuba.query.expressions import Literal as LiteralExpr
from snuba.query.matchers import Any, FunctionCall, Literal, Or, Param, String
from snuba.request.request_settings import RequestSettings
from snuba.state import get_config
from snuba.utils.metrics.wrapper import MetricsWrapper

metrics = MetricsWrapper(environment.metrics, "processors.tags_hash_map")

ESCAPE_TRANSLATION = str.maketrans({"\\": "\\\\", "=": "\="})


class ConditionClass(Enum):
    IRRELEVANT = 1
    OPTIMIZABLE = 2
    NOT_OPTIMIZABLE = 3


class MappingOptimizer(QueryProcessor):
    """
    Optimize tags conditions by relying on the tags_hash_map column.
    Such column is an array of hashes of `key=value` strings.
    This processor transforms tags conditions that are in the form of
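# A rough illustration of the `key=value` encoding that the tags_hash_map
# optimization relies on. The helper below is hypothetical; it only shows how
# ESCAPE_TRANSLATION escapes "=" and "\" in a pair before hashing, not the
# exact ClickHouse expression Snuba generates.
def tag_pair(key: str, value: str) -> str:
    return f"{key.translate(ESCAPE_TRANSLATION)}={value.translate(ESCAPE_TRANSLATION)}"


assert tag_pair("environment", "prod") == "environment=prod"
assert tag_pair("a=b", "c\\d") == "a\\=b=c\\\\d"  # "=" and "\" get escaped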
from snuba import environment
from snuba.datasets.events_format import enforce_retention, extract_extra_tags
from snuba.processor import (
    InsertBatch,
    MessageProcessor,
    ProcessedMessage,
    _as_dict_safe,
    _ensure_valid_date,
    _unicodify,
)
from snuba.utils.metrics.wrapper import MetricsWrapper

UNKNOWN_SPAN_STATUS = 2

metrics = MetricsWrapper(environment.metrics, "spans.processor")


class SpansMessageProcessor(MessageProcessor):
    def __extract_timestamp(self, field: float) -> Tuple[datetime, int]:
        timestamp = _ensure_valid_date(datetime.fromtimestamp(field))
        if timestamp is None:
            timestamp = datetime.utcnow()
        nanoseconds = int(timestamp.microsecond * 1000)
        return (timestamp, nanoseconds)

    def __init_span(self, event: Mapping[str, Any]) -> MutableMapping[str, Any]:
        """
        Initializes the fields that are the same for all spans within a
        transaction.
        """
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
) -> None:
    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper()) for name in storage_names)
    }

    topics = {
        storage.get_table_writer().get_stream_loader().get_default_topic_spec().topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (
            storage.get_table_writer().get_stream_loader().get_commit_log_topic_spec()
            for storage in storages.values()
        )
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (
        storages[storage_keys[0]]
        .get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for storage_key in storage_keys[1:]:
        if (
            build_kafka_consumer_configuration(
                storages[storage_key]
                .get_table_writer()
                .get_stream_loader()
                .get_default_topic_spec()
                .topic,
                consumer_group,
            )["bootstrap.servers"]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above).
        commit_log_topic_spec = (
            storages[storage_keys[0]]
            .get_table_writer()
            .get_stream_loader()
            .get_commit_log_topic_spec()
        )
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic)
        )
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()
)

import simplejson as json
from confluent_kafka import KafkaError
from confluent_kafka import Message as KafkaMessage
from confluent_kafka import Producer

from snuba import environment, settings
from snuba.redis import redis_client
from snuba.utils.metrics.wrapper import MetricsWrapper
from snuba.utils.streams.configuration_builder import (
    build_default_kafka_producer_configuration,
)
from snuba.utils.streams.topics import Topic

metrics = MetricsWrapper(environment.metrics, "snuba.state")
logger = logging.getLogger("snuba.state")

kfk = None
rds = redis_client

ratelimit_prefix = "snuba-ratelimit:"
query_lock_prefix = "snuba-query-lock:"
config_hash = "snuba-config"
config_history_hash = "snuba-config-history"
config_changes_list = "snuba-config-changes"
config_changes_list_limit = 25
queries_list = "snuba-queries"

# Rate Limiting and Deduplication