def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """Attach this processor's object-scoped rate limit to the query settings.

    Does nothing when a limiter with the same name is already registered,
    or when no object id can be derived from the query. Per-object config
    overrides (``<config-name>_<obj_id>``) take precedence over the generic
    per-second/concurrent defaults.
    """
    # If the settings don't already have an object rate limit, add one.
    if self._is_already_applied(query_settings):
        return

    per_second_key = self.get_per_second_name(query, query_settings)
    concurrent_key = self.get_concurrent_name(query, query_settings)

    # Generic defaults for this limiter, before any per-object override.
    default_per_second, default_concurrent = get_configs(
        [
            (per_second_key, self.default_limit),
            (concurrent_key, self.default_limit),
        ]
    )

    obj_id = self.get_object_id(query, query_settings)
    if obj_id is None:
        return

    # Specific objects can have their rate limits overridden.
    per_second_limit, concurrent_limit = get_configs(
        [
            (f"{per_second_key}_{obj_id}", default_per_second),
            (f"{concurrent_key}_{obj_id}", default_concurrent),
        ]
    )

    query_settings.add_rate_limit(
        RateLimitParameters(
            rate_limit_name=self.rate_limit_name,
            bucket=str(obj_id),
            per_second_limit=per_second_limit,
            concurrent_limit=concurrent_limit,
        )
    )
def execute_query_with_rate_limits(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    clickhouse_query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    """
    Run the query under the full rate-limit chain and return its result.

    Appends the global rate limiter to the settings' limiter chain, acquires
    every limiter via ``RateLimitAggregator``, records limiter stats into
    ``stats``/``timer``, possibly lowers ClickHouse ``max_threads`` based on
    project-level concurrency, then delegates to ``execute_query``.

    NOTE: the query is executed *inside* the ``with`` block so the limiter
    slots stay held for the duration of the ClickHouse query.
    """
    # Global rate limiter is added at the end of the chain to be
    # the last for evaluation.
    # This allows us not to borrow capacity from the global quota
    # during the evaluation if one of the more specific limiters
    # (like the project rate limiter) rejects the query first.
    query_settings.add_rate_limit(get_global_rate_limit_params())
    # XXX: We should consider moving this that it applies to the logical query,
    # not the physical query.
    with RateLimitAggregator(
        query_settings.get_rate_limit_params()
    ) as rate_limit_stats_container:
        stats.update(rate_limit_stats_container.to_dict())
        timer.mark("rate_limit")

        project_rate_limit_stats = rate_limit_stats_container.get_stats(
            PROJECT_RATE_LIMIT_NAME
        )

        thread_quota = query_settings.get_resource_quota()
        # Reduce the per-query thread budget when the same project already
        # has other queries in flight: with N concurrent project queries the
        # budget becomes max(1, max_threads - (N - 1)).
        if (
            ("max_threads" in clickhouse_query_settings or thread_quota is not None)
            and project_rate_limit_stats is not None
            and project_rate_limit_stats.concurrent > 1
        ):
            # An explicit resource quota takes precedence over whatever
            # max_threads is already present in the ClickHouse settings.
            maxt = (
                clickhouse_query_settings["max_threads"]
                if thread_quota is None
                else thread_quota.max_threads
            )
            clickhouse_query_settings["max_threads"] = max(
                1, maxt - project_rate_limit_stats.concurrent + 1
            )

        _record_rate_limit_metrics(rate_limit_stats_container, reader, stats)

        # Executed inside the `with` so limiter capacity is held until the
        # ClickHouse query completes.
        return execute_query(
            clickhouse_query,
            query_settings,
            formatted_query,
            reader,
            timer,
            stats,
            clickhouse_query_settings,
            robust=robust,
        )
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """Register a per-table rate limit for the query's source table.

    Per-second and concurrent limits come from runtime config keyed by the
    table name (plus this processor's suffix), defaulting to 5000/s and
    1000 concurrent.
    """
    table = query.get_from_clause().table_name

    per_second_limit, concurrent_limit = get_configs(
        [
            (f"table_per_second_limit_{table}{self.__suffix}", 5000),
            (f"table_concurrent_limit_{table}{self.__suffix}", 1000),
        ]
    )

    query_settings.add_rate_limit(
        RateLimitParameters(
            rate_limit_name=TABLE_RATE_LIMIT_NAME,
            bucket=table,
            per_second_limit=per_second_limit,
            concurrent_limit=concurrent_limit,
        )
    )
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """Apply a per-(referrer, project) thread-count quota, if one is configured.

    No-op when the feature is disabled, when no project id is found in the
    query AST, or when no quota config exists for this referrer/project pair.
    """
    if not get_config(ENABLED_CONFIG, 1):
        return

    project_ids = get_object_ids_in_query_ast(query, self.__project_field)
    if not project_ids:
        return

    # TODO: Like for the rate limiter Add logic for multiple IDs
    project_id = str(project_ids.pop())

    config_key = f"{REFERRER_PROJECT_CONFIG}_{query_settings.referrer}_{project_id}"
    thread_quota = get_config(config_key)
    if not thread_quota:
        return

    assert isinstance(thread_quota, int)
    query_settings.set_resource_quota(ResourceQuota(max_threads=thread_quota))
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    clickhouse_query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    """
    Execute a query and return a result.

    Tweaks the ClickHouse settings mapping in place (uncompressed cache,
    consistency-related options) before handing the formatted query to the
    reader, and records row/column counts into ``stats``.
    """
    # Experiment: skip ClickHouse's uncompressed cache when the query
    # references more than `uncompressed_cache_max_cols` columns.
    max_cols_for_cache = state.get_config("uncompressed_cache_max_cols", 5)
    assert isinstance(max_cols_for_cache, int)
    counter = ReferencedColumnsCounter()
    counter.visit(clickhouse_query.get_from_clause())
    if counter.count_columns() > max_cols_for_cache:
        clickhouse_query_settings["use_uncompressed_cache"] = 0

    # Force query to use the first shard replica, which should have
    # synchronously received any cluster writes before this query is run.
    is_consistent = query_settings.get_consistent()
    stats["consistent"] = is_consistent
    if is_consistent:
        clickhouse_query_settings["load_balancing"] = "in_order"
        clickhouse_query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        clickhouse_query_settings,
        with_totals=clickhouse_query.has_totals(),
        robust=robust,
    )
    timer.mark("execute")

    stats.update(
        {"result_rows": len(result["data"]), "result_cols": len(result["meta"])}
    )
    return result
def _apply_turbo_sampling_if_needed(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
) -> None:
    """
    Force the configured turbo sample rate onto a simple query's FROM clause
    when turbo mode is on and the query does not already sample.

    TODO: Remove this method entirely and move the sampling logic into a
    query processor.
    """
    # Composite queries are left untouched.
    if not isinstance(clickhouse_query, Query):
        return
    if not query_settings.get_turbo():
        return
    from_clause = clickhouse_query.get_from_clause()
    # An explicit sampling rate on the query wins over turbo sampling.
    if from_clause.sampling_rate:
        return
    clickhouse_query.set_from_clause(
        replace(from_clause, sampling_rate=snuba_settings.TURBO_SAMPLE_RATE)
    )
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """
    Decide how the query should account for pending replacements: either
    mark it FINAL, exclude the replaced group ids via an AST condition, or
    leave it untouched.

    Turbo queries skip replacement handling entirely. FINAL is chosen when
    the replacement flags demand it or when the exclusion set would exceed
    the configured maximum; each FINAL decision is counted in metrics with
    its cause.
    """
    # Turbo mode trades replacement correctness for speed.
    if query_settings.get_turbo():
        return
    project_ids = get_object_ids_in_query_ast(query, self.__project_column)
    # No project ids found in the query: nothing to replace against.
    if project_ids is None:
        self._set_query_final(query, False)
        return
    # Replacement state for these projects, as recorded in Redis.
    flags: ProjectsQueryFlags = ProjectsQueryFlags.load_from_redis(
        list(project_ids), self.__replacer_state_name
    )
    query_overlaps_replacement = self._query_overlaps_replacements(
        query, flags.latest_replacement_time
    )
    # Queries over a time range that predates the latest replacement
    # cannot observe replaced rows; skip FINAL/exclusion entirely.
    if not query_overlaps_replacement:
        self._set_query_final(query, False)
        return
    tags = self._initialize_tags(query_settings, flags)
    set_final = False
    if flags.needs_final:
        tags["cause"] = "final_flag"
        metrics.increment(
            name=FINAL_METRIC,
            tags=tags,
        )
        set_final = True
    elif flags.group_ids_to_exclude:
        # If the number of groups to exclude exceeds our limit, the query
        # should just use final instead of the exclusion set.
        max_group_ids_exclude = get_config(
            "max_group_ids_exclude",
            settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE,
        )
        assert isinstance(max_group_ids_exclude, int)
        groups_to_exclude = self._groups_to_exclude(
            query, flags.group_ids_to_exclude
        )
        if len(groups_to_exclude) > max_group_ids_exclude:
            tags["cause"] = "max_groups"
            metrics.increment(
                name=FINAL_METRIC,
                tags=tags,
            )
            set_final = True
        elif groups_to_exclude:
            # Small enough exclusion set: filter the replaced groups out
            # with `group NOT IN (...)` instead of paying for FINAL.
            query.add_condition_to_ast(
                not_in_condition(
                    FunctionCall(
                        None,
                        "assumeNotNull",
                        (Column(None, None, self.__groups_column),),
                    ),
                    [Literal(None, p) for p in groups_to_exclude],
                )
            )
    self._set_query_final(query, set_final)
def _is_already_applied(self, query_settings: QuerySettings) -> bool: existing = query_settings.get_rate_limit_params() for ex in existing: if ex.rate_limit_name == self.rate_limit_name: return True return False