Example #1
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        # If the settings don't already have a project rate limit, add one
        existing = request_settings.get_rate_limit_params()
        for ex in existing:
            if ex.rate_limit_name == PROJECT_RATE_LIMIT_NAME:
                return

        project_ids = get_project_ids_in_query_ast(query, self.project_column)
        if not project_ids:
            return

        # TODO: Use all the projects, not just one
        project_id = project_ids.pop()

        prl, pcl = get_configs([("project_per_second_limit", 1000),
                                ("project_concurrent_limit", 1000)])

        # Specific projects can have their rate limits overridden
        (per_second, concurr) = get_configs([
            ("project_per_second_limit_{}".format(project_id), prl),
            ("project_concurrent_limit_{}".format(project_id), pcl),
        ])

        rate_limit = RateLimitParameters(
            rate_limit_name=PROJECT_RATE_LIMIT_NAME,
            bucket=str(project_id),
            per_second_limit=per_second,
            concurrent_limit=concurr,
        )

        request_settings.add_rate_limit(rate_limit)
Example #2
    def process_query(self, query: Query, query_settings: QuerySettings) -> None:
        # If the settings don't already have an object rate limit, add one
        if self._is_already_applied(query_settings):
            return
        per_second_name = self.get_per_second_name(query, query_settings)
        concurrent_name = self.get_concurrent_name(query, query_settings)
        object_rate_limit, object_concurrent_limit = get_configs(
            [
                (per_second_name, self.default_limit),
                (concurrent_name, self.default_limit),
            ]
        )
        obj_id = self.get_object_id(query, query_settings)
        if obj_id is None:
            return
        # Specific objects can have their rate limits overridden
        (per_second, concurr) = get_configs(
            [
                (f"{per_second_name}_{obj_id}", object_rate_limit),
                (f"{concurrent_name}_{obj_id}", object_concurrent_limit),
            ]
        )

        rate_limit = RateLimitParameters(
            rate_limit_name=self.rate_limit_name,
            bucket=str(obj_id),
            per_second_limit=per_second,
            concurrent_limit=concurr,
        )

        query_settings.add_rate_limit(rate_limit)
Example #3
    def wrapper(dataset, request: Request, *args, **kwargs):
        (use_split,) = state.get_configs([
            ('use_split', 0),
        ])
        limit = request.query.get_limit()
        remaining_offset = request.query.get_offset()
        orderby = util.to_list(request.query.get_orderby())

        common_conditions = use_split and limit and not request.query.get_groupby()

        if common_conditions:
            # TODO: Move all_referenced_columns into query and remove this dependency.
            # In order to do this we need to break a circular dependency first
            total_col_count = len(
                util.all_referenced_columns(request.query.get_body()))
            min_col_count = len(
                util.all_referenced_columns({
                    **request.query.get_body(), 'selected_columns':
                    MIN_COLS
                }))

            if (request.query.get_selected_columns()
                    and not request.query.get_aggregations()
                    and total_col_count > min_col_count):
                return col_split(dataset, request, *args, **kwargs)
            elif orderby[:1] == ['-timestamp'] and remaining_offset < 1000:
                return time_split(dataset, request, *args, **kwargs)

        return query_func(dataset, request, *args, **kwargs)
Example #4
    def wrapper(dataset, request: Request, *args, **kwargs):
        (use_split,) = state.get_configs([
            ('use_split', 0),
        ])
        query_limit = request.query.get_limit()
        limit = query_limit if query_limit is not None else 0
        remaining_offset = request.query.get_offset()
        orderby = util.to_list(request.query.get_orderby())

        common_conditions = use_split and limit and not request.query.get_groupby()

        if common_conditions:
            # TODO: Move all_referenced_columns into query and remove this dependency.
            # In order to do this we need to break a circular dependency first
            total_col_count = len(all_referenced_columns(request.query))
            column_split_spec = dataset.get_split_query_spec()
            if column_split_spec:
                copied_query = copy.deepcopy(request.query)
                copied_query.set_selected_columns(column_split_spec.get_min_columns())
                min_col_count = len(all_referenced_columns(copied_query))
            else:
                min_col_count = None

            if (
                column_split_spec
                and request.query.get_selected_columns()
                and not request.query.get_aggregations()
                and total_col_count > min_col_count
            ):
                return col_split(dataset, request, column_split_spec, *args, **kwargs)
            elif orderby[:1] == ['-timestamp'] and remaining_offset < 1000:
                return time_split(dataset, request, *args, **kwargs)

        return query_func(dataset, request, *args, **kwargs)
Example #5
def _replace_time_condition(
    query: Union[CompositeQuery[QueryEntity], LogicalQuery]
) -> None:
    condition = query.get_condition()
    top_level = (
        get_first_level_and_conditions(condition) if condition is not None else []
    )
    max_days, date_align = state.get_configs(
        [("max_days", None), ("date_align_seconds", 1)]
    )
    assert isinstance(date_align, int)
    if max_days is not None:
        max_days = int(max_days)

    if isinstance(query, LogicalQuery):
        new_top_level = _align_max_days_date_align(
            query.get_from_clause().key, top_level, max_days, date_align
        )
        query.set_ast_condition(combine_and_conditions(new_top_level))
    else:
        from_clause = query.get_from_clause()
        if not isinstance(from_clause, JoinClause):
            return

        alias_map = from_clause.get_alias_node_map()
        for alias, node in alias_map.items():
            assert isinstance(node.data_source, QueryEntity)  # mypy
            new_top_level = _align_max_days_date_align(
                node.data_source.key, top_level, max_days, date_align, alias
            )
            top_level = new_top_level
            query.set_ast_condition(combine_and_conditions(new_top_level))
Example #6
def disable_query_cache() -> Generator[None, None, None]:
    cache, readthrough = state.get_configs(
        [("use_cache", settings.USE_RESULT_CACHE), ("use_readthrough_query_cache", 1)]
    )
    state.set_configs({"use_cache": 0, "use_readthrough_query_cache": 0})
    yield
    state.set_configs({"use_cache": cache, "use_readthrough_query_cache": readthrough})
Example #7
    def wrapper(dataset, request: Request, *args, **kwargs):
        (use_split, ) = state.get_configs([("use_split", 0)])
        query_limit = request.query.get_limit()
        limit = query_limit if query_limit is not None else 0
        remaining_offset = request.query.get_offset()
        orderby = util.to_list(request.query.get_orderby())

        common_conditions = use_split and limit and not request.query.get_groupby()

        if common_conditions:
            total_col_count = len(request.query.get_all_referenced_columns())
            column_split_spec = dataset.get_split_query_spec()
            if column_split_spec:
                copied_query = copy.deepcopy(request.query)
                copied_query.set_selected_columns(
                    column_split_spec.get_min_columns())
                min_col_count = len(copied_query.get_all_referenced_columns())
            else:
                min_col_count = None

            if (column_split_spec and request.query.get_selected_columns()
                    and not request.query.get_aggregations()
                    and total_col_count > min_col_count):
                return col_split(dataset, request, column_split_spec, *args,
                                 **kwargs)
            elif orderby[:1] == ["-timestamp"] and remaining_offset < 1000:
                return time_split(dataset, request, *args, **kwargs)

        return query_func(dataset, request, *args, **kwargs)
Example #8
    def _get_rate_limit_params(
            self, project_ids: Sequence[int]) -> RateLimitParameters:
        # TODO rate limit on every project in the list?
        project_id = project_ids[0] if project_ids else 0

        prl, pcl = get_configs([("project_per_second_limit", 1000),
                                ("project_concurrent_limit", 1000)])

        # Specific projects can have their rate limits overridden
        (per_second, concurr) = get_configs([
            ("project_per_second_limit_{}".format(project_id), prl),
            ("project_concurrent_limit_{}".format(project_id), pcl),
        ])

        return RateLimitParameters(
            rate_limit_name=PROJECT_RATE_LIMIT_NAME,
            bucket=str(project_id),
            per_second_limit=per_second,
            concurrent_limit=concurr,
        )
Example #9
def execute_query_with_caching(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    clickhouse_query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    # XXX: ``uncompressed_cache_max_cols`` is used to control both the result
    # cache, as well as the uncompressed cache. These should be independent.
    use_cache, uc_max = state.get_configs(
        [("use_cache", settings.USE_RESULT_CACHE), ("uncompressed_cache_max_cols", 5)]
    )

    column_counter = ReferencedColumnsCounter()
    column_counter.visit(clickhouse_query.get_from_clause())
    assert isinstance(uc_max, int)
    if column_counter.count_columns() > uc_max:
        use_cache = False

    execute = partial(
        execute_query_with_rate_limits,
        clickhouse_query,
        query_settings,
        formatted_query,
        reader,
        timer,
        stats,
        clickhouse_query_settings,
        robust=robust,
    )

    with sentry_sdk.start_span(description="execute", op="db") as span:
        key = get_query_cache_key(formatted_query)
        clickhouse_query_settings["query_id"] = key
        if use_cache:
            cache_partition = _get_cache_partition(reader)
            result = cache_partition.get(key)
            timer.mark("cache_get")
            stats["cache_hit"] = result is not None
            if result is not None:
                span.set_tag("cache", "hit")
                return result

            span.set_tag("cache", "miss")
            result = execute()
            cache_partition.set(key, result)
            timer.mark("cache_set")
            return result
        else:
            return execute()
Example #10
def execute_query_with_caching(
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    # XXX: ``uncompressed_cache_max_cols`` is used to control both the result
    # cache, as well as the uncompressed cache. These should be independent.
    use_cache, uc_max = state.get_configs([("use_cache",
                                            settings.USE_RESULT_CACHE),
                                           ("uncompressed_cache_max_cols", 5)])

    if (len(
            set((
                # Skip aliases when counting columns
                (c.table_name, c.column_name)
                for c in clickhouse_query.get_all_ast_referenced_columns()))) >
            uc_max):
        use_cache = False

    execute = partial(
        execute_query_with_rate_limits,
        clickhouse_query,
        request_settings,
        formatted_query,
        reader,
        timer,
        stats,
        query_settings,
    )

    with sentry_sdk.start_span(description="execute", op="db") as span:
        if use_cache:
            key = get_query_cache_key(formatted_query)
            result = cache.get(key)
            timer.mark("cache_get")
            stats["cache_hit"] = result is not None
            if result is not None:
                span.set_tag("cache", "hit")
                return result

            span.set_tag("cache", "miss")
            result = execute()
            cache.set(key, result)
            timer.mark("cache_set")
            return result
        else:
            return execute()
Example #11
def get_global_rate_limit_params() -> RateLimitParameters:
    """
    Returns the configuration object for the global rate limit
    """

    per_second, concurr = state.get_configs(
        [("global_per_second_limit", None), ("global_concurrent_limit", 1000)]
    )

    return RateLimitParameters(
        rate_limit_name=GLOBAL_RATE_LIMIT_NAME,
        bucket="global",
        per_second_limit=per_second,
        concurrent_limit=concurr,
    )
Example #12
    def get_time_limit(
        cls, timeseries_extension: Mapping[str, Any]
    ) -> Tuple[datetime, datetime]:
        max_days, date_align = state.get_configs(
            [("max_days", None), ("date_align_seconds", 1)]
        )

        to_date = parse_datetime(timeseries_extension["to_date"], date_align)
        from_date = parse_datetime(timeseries_extension["from_date"], date_align)
        assert from_date <= to_date

        if max_days is not None and (to_date - from_date).days > max_days:
            from_date = to_date - timedelta(days=max_days)

        return (from_date, to_date)
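
Note: a small worked example of the max_days clamp above, using hypothetical dates, shows how an oversized range is pulled forward so only the most recent max_days are kept (illustrative only):

from datetime import datetime, timedelta

max_days = 90
to_date = datetime(2020, 6, 1)
from_date = datetime(2020, 2, 2)  # 120 days before to_date

# Same clamp as in get_time_limit: keep only the most recent max_days.
if max_days is not None and (to_date - from_date).days > max_days:
    from_date = to_date - timedelta(days=max_days)

assert from_date == datetime(2020, 3, 3)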
Example #13
    def test_config(self):
        state.set_config('foo', 1)
        state.set_configs({'bar': 2, 'baz': 3})
        assert state.get_config('foo') == 1
        assert state.get_config('bar') == 2
        assert state.get_config('noexist', 4) == 4
        all_configs = state.get_all_configs()
        assert all(all_configs[k] == v
                   for k, v in [('foo', 1), ('bar', 2), ('baz', 3)])
        assert state.get_configs([('foo', 100), ('bar', 200), ('noexist', 300),
                                  ('noexist-2', None)]) == [1, 2, 300, None]

        state.set_configs({'bar': 'quux'})
        all_configs = state.get_all_configs()
        assert all(all_configs[k] == v
                   for k, v in [('foo', 1), ('bar', 'quux'), ('baz', 3)])
Example #14
    def test_config(self):
        state.set_config("foo", 1)
        state.set_configs({"bar": 2, "baz": 3})
        assert state.get_config("foo") == 1
        assert state.get_config("bar") == 2
        assert state.get_config("noexist", 4) == 4
        all_configs = state.get_all_configs()
        assert all(all_configs[k] == v
                   for k, v in [("foo", 1), ("bar", 2), ("baz", 3)])
        assert state.get_configs([("foo", 100), ("bar", 200), ("noexist", 300),
                                  ("noexist-2", None)]) == [1, 2, 300, None]

        state.set_configs({"bar": "quux"})
        all_configs = state.get_all_configs()
        assert all(all_configs[k] == v
                   for k, v in [("foo", 1), ("bar", "quux"), ("baz", 3)])
Example #15
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        table_name = query.get_from_clause().table_name
        (per_second, concurr) = get_configs([
            (f"table_per_second_limit_{table_name}{self.__suffix}", 5000),
            (f"table_concurrent_limit_{table_name}{self.__suffix}", 1000),
        ])

        rate_limit = RateLimitParameters(
            rate_limit_name=TABLE_RATE_LIMIT_NAME,
            bucket=table_name,
            per_second_limit=per_second,
            concurrent_limit=concurr,
        )

        query_settings.add_rate_limit(rate_limit)
Example #16
    def _replace_time_condition(
        self,
        query: Query,
        from_date: datetime,
        from_exp: FunctionCall,
        to_date: datetime,
        to_exp: FunctionCall,
    ) -> None:
        max_days, date_align = state.get_configs(
            [("max_days", None), ("date_align_seconds", 1)]
        )

        def align_fn(dt: datetime) -> datetime:
            assert isinstance(date_align, int)
            return dt - timedelta(seconds=(dt - dt.min).seconds % date_align)

        from_date, to_date = align_fn(from_date), align_fn(to_date)
        assert from_date <= to_date

        if max_days is not None and (to_date - from_date).days > max_days:
            from_date = to_date - timedelta(days=max_days)

        def replace_cond(exp: Expression) -> Expression:
            if not isinstance(exp, FunctionCall):
                return exp
            elif exp == from_exp:
                return replace(
                    exp, parameters=(from_exp.parameters[0], Literal(None, from_date)),
                )
            elif exp == to_exp:
                return replace(
                    exp, parameters=(to_exp.parameters[0], Literal(None, to_date))
                )

            return exp

        condition = query.get_condition_from_ast()
        top_level = get_first_level_and_conditions(condition) if condition else []
        new_top_level = list(map(replace_cond, top_level))
        query.set_ast_condition(combine_and_conditions(new_top_level))
Example #17
def parse_and_run_query(validated_body, timer):
    body = deepcopy(validated_body)
    turbo = body.get('turbo', False)
    max_days, table, date_align, config_sample, force_final, max_group_ids_exclude = state.get_configs([
        ('max_days', None),
        ('clickhouse_table', settings.CLICKHOUSE_TABLE),
        ('date_align_seconds', 1),
        ('sample', 1),
        # 1: always use FINAL, 0: never use final, undefined/None: use project setting.
        ('force_final', 0 if turbo else None),
        ('max_group_ids_exclude', settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE),
    ])
    stats = {}
    to_date = util.parse_datetime(body['to_date'], date_align)
    from_date = util.parse_datetime(body['from_date'], date_align)
    assert from_date <= to_date

    if max_days is not None and (to_date - from_date).days > max_days:
        from_date = to_date - timedelta(days=max_days)

    where_conditions = body.get('conditions', [])
    where_conditions.extend([
        ('timestamp', '>=', from_date),
        ('timestamp', '<', to_date),
        ('deleted', '=', 0),
    ])
    # NOTE: we rely entirely on the schema to make sure that regular snuba
    # queries are required to send a project_id filter. Some other special
    # internal query types do not require a project_id filter.
    project_ids = util.to_list(body['project'])
    if project_ids:
        where_conditions.append(('project_id', 'IN', project_ids))

    having_conditions = body.get('having', [])

    aggregate_exprs = [
        util.column_expr(col, body, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    groupby = util.to_list(body['groupby'])
    group_exprs = [util.column_expr(gb, body) for gb in groupby]

    selected_cols = [util.column_expr(util.tuplify(colname), body)
                     for colname in body.get('selected_columns', [])]

    select_exprs = group_exprs + aggregate_exprs + selected_cols
    select_clause = u'SELECT {}'.format(', '.join(select_exprs))

    from_clause = u'FROM {}'.format(table)

    # For now, we only need FINAL if:
    #    1. The project has been marked as needing FINAL (in redis) because of recent
    #       replacements (and it affects too many groups for us just to exclude
    #       those groups from the query)
    #    OR
    #    2. the force_final setting = 1
    needs_final, exclude_group_ids = get_projects_query_flags(project_ids)
    if len(exclude_group_ids) > max_group_ids_exclude:
        # Cap the number of groups to exclude by query and flip to using FINAL if necessary
        needs_final = True
        exclude_group_ids = []

    used_final = False
    if force_final == 1 or (force_final is None and needs_final):
        from_clause = u'{} FINAL'.format(from_clause)
        used_final = True
    elif exclude_group_ids:
        where_conditions.append(('group_id', 'NOT IN', exclude_group_ids))

    sample = body.get('sample', settings.TURBO_SAMPLE_RATE if turbo else config_sample)
    if sample != 1:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample)

    joins = []

    if 'arrayjoin' in body:
        joins.append(u'ARRAY JOIN {}'.format(body['arrayjoin']))
    join_clause = ' '.join(joins)

    where_clause = ''
    if where_conditions:
        where_conditions = list(set(util.tuplify(where_conditions)))
        where_clause = u'WHERE {}'.format(util.conditions_expr(where_conditions, body))

    prewhere_conditions = []
    if settings.PREWHERE_KEYS:
        # Add any condition to PREWHERE if:
        # - It is a single top-level condition (not OR-nested), and
        # - Any of its referenced columns are in PREWHERE_KEYS
        prewhere_candidates = [
            (util.columns_in_expr(cond[0]), cond)
            for cond in where_conditions if util.is_condition(cond) and
            any(col in settings.PREWHERE_KEYS for col in util.columns_in_expr(cond[0]))
        ]
        # Use the condition that has the highest priority (based on the
        # position of its columns in the PREWHERE_KEYS list)
        prewhere_candidates = sorted([
            (min(settings.PREWHERE_KEYS.index(col) for col in cols if col in settings.PREWHERE_KEYS), cond)
            for cols, cond in prewhere_candidates
        ])
        if prewhere_candidates:
            prewhere_conditions = [cond for _, cond in prewhere_candidates][:settings.MAX_PREWHERE_CONDITIONS]

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(util.conditions_expr(prewhere_conditions, body))

    having_clause = ''
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(util.conditions_expr(having_conditions, body))

    group_clause = ', '.join(util.column_expr(gb, body) for gb in groupby)
    if group_clause:
        if body.get('totals', False):
            group_clause = 'GROUP BY ({}) WITH TOTALS'.format(group_clause)
        else:
            group_clause = 'GROUP BY ({})'.format(group_clause)

    order_clause = ''
    if body.get('orderby'):
        orderby = [util.column_expr(util.tuplify(ob), body) for ob in util.to_list(body['orderby'])]
        orderby = [u'{} {}'.format(
            ob.lstrip('-'),
            'DESC' if ob.startswith('-') else 'ASC'
        ) for ob in orderby]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(body.get('offset', 0), body['limit'])

    sql = ' '.join([c for c in [
        select_clause,
        from_clause,
        join_clause,
        prewhere_clause,
        where_clause,
        group_clause,
        having_clause,
        order_clause,
        limitby_clause,
        limit_clause
    ] if c])

    timer.mark('prepare_query')

    stats.update({
        'clickhouse_table': table,
        'final': used_final,
        'referrer': request.referrer,
        'num_days': (to_date - from_date).days,
        'num_projects': len(project_ids),
        'sample': sample,
    })

    return util.raw_query(
        validated_body, sql, clickhouse_ro, timer, stats
    )
Example #18
def raw_query(request: Request, sql, client, timer, stats=None):
    """
    Submit a raw SQL query to clickhouse and do some post-processing on it to
    fix some of the formatting issues in the result JSON
    """
    from snuba.clickhouse.native import NativeDriverReader

    project_ids = to_list(request.extensions['project']['project'])
    # TODO rate limit on every project in the list?
    project_id = project_ids[0] if project_ids else 0
    stats = stats or {}
    grl, gcl, prl, pcl, use_cache, use_deduper, uc_max = state.get_configs([
        ('global_per_second_limit', None),
        ('global_concurrent_limit', 1000),
        ('project_per_second_limit', 1000),
        ('project_concurrent_limit', 1000),
        ('use_cache', 0),
        ('use_deduper', 1),
        ('uncompressed_cache_max_cols', 5),
    ])

    # Specific projects can have their rate limits overridden
    prl, pcl = state.get_configs([
        ('project_per_second_limit_{}'.format(project_id), prl),
        ('project_concurrent_limit_{}'.format(project_id), pcl),
    ])

    all_confs = state.get_all_configs()
    query_settings = {
        k.split('/', 1)[1]: v
        for k, v in all_confs.items() if k.startswith('query_settings/')
    }

    # Experiment, if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in clickhouse, or result cache in snuba.
    if len(all_referenced_columns(request.query.get_body())) > uc_max:
        query_settings['use_uncompressed_cache'] = 0
        use_cache = 0

    timer.mark('get_configs')

    query_id = md5(force_bytes(sql)).hexdigest()
    with state.deduper(query_id if use_deduper else None) as is_dupe:
        timer.mark('dedupe_wait')

        result = state.get_result(query_id) if use_cache else None
        timer.mark('cache_get')

        stats.update({
            'is_duplicate': is_dupe,
            'query_id': query_id,
            'use_cache': bool(use_cache),
            'cache_hit': bool(result)
        }),

        if result:
            status = 200
        else:
            with state.rate_limit('global', grl,
                                  gcl) as (g_allowed, g_rate, g_concurr):
                metrics.gauge('query.global_concurrent', g_concurr)
                stats.update({
                    'global_rate': g_rate,
                    'global_concurrent': g_concurr
                })

                with state.rate_limit(project_id, prl,
                                      pcl) as (p_allowed, p_rate, p_concurr):
                    stats.update({
                        'project_rate': p_rate,
                        'project_concurrent': p_concurr
                    })
                    timer.mark('rate_limit')

                    if g_allowed and p_allowed:

                        # Experiment, reduce max threads by 1 for each extra concurrent query
                        # that a project has running beyond the first one
                        if 'max_threads' in query_settings and p_concurr > 1:
                            maxt = query_settings['max_threads']
                            query_settings['max_threads'] = max(
                                1, maxt - p_concurr + 1)

                        # Force query to use the first shard replica, which
                        # should have synchronously received any cluster writes
                        # before this query is run.
                        consistent = request.extensions['performance'].get(
                            'consistent', False)
                        stats['consistent'] = consistent
                        if consistent:
                            query_settings['load_balancing'] = 'in_order'
                            query_settings['max_threads'] = 1

                        try:
                            result = NativeDriverReader(client).execute(
                                sql,
                                query_settings,
                                # All queries should already be deduplicated at this point
                                # But the query_id will let us know if they aren't
                                query_id=query_id if use_deduper else None,
                                with_totals=request.query.get_body().get(
                                    'totals', False),
                            )
                            status = 200

                            logger.debug(sql)
                            timer.mark('execute')
                            stats.update({
                                'result_rows': len(result['data']),
                                'result_cols': len(result['meta']),
                            })

                            if use_cache:
                                state.set_result(query_id, result)
                                timer.mark('cache_set')

                        except BaseException as ex:
                            error = str(ex)
                            status = 500
                            logger.exception("Error running query: %s\n%s",
                                             sql, error)
                            if isinstance(ex, ClickHouseError):
                                result = {
                                    'error': {
                                        'type': 'clickhouse',
                                        'code': ex.code,
                                        'message': error,
                                    }
                                }
                            else:
                                result = {
                                    'error': {
                                        'type': 'unknown',
                                        'message': error,
                                    }
                                }

                    else:
                        status = 429
                        Reason = namedtuple('reason', 'scope name val limit')
                        reasons = [
                            Reason('global', 'concurrent', g_concurr, gcl),
                            Reason('global', 'per-second', g_rate, grl),
                            Reason('project', 'concurrent', p_concurr, pcl),
                            Reason('project', 'per-second', p_rate, prl)
                        ]
                        reason = next(
                            (r for r in reasons
                             if r.limit is not None and r.val > r.limit), None)
                        result = {
                            'error': {
                                'type': 'ratelimit',
                                'message': 'rate limit exceeded',
                                'detail': reason and
                                '{r.scope} {r.name} of {r.val:.0f} exceeds limit of {r.limit:.0f}'.format(
                                    r=reason
                                ),
                            }
                        }

    stats.update(query_settings)

    if settings.RECORD_QUERIES:
        # send to redis
        state.record_query({
            'request': request.body,
            'sql': sql,
            'timing': timer,
            'stats': stats,
            'status': status,
        })

        # send to datadog
        tags = [
            'status:{}'.format(status),
            'referrer:{}'.format(stats.get('referrer', 'none')),
            'final:{}'.format(stats.get('final', False))
        ]
        mark_tags = ['final:{}'.format(stats.get('final', False))]
        timer.send_metrics_to(metrics, tags=tags, mark_tags=mark_tags)

    result['timing'] = timer

    if settings.STATS_IN_RESPONSE or request.extensions['performance'].get(
            'debug', False):
        result['stats'] = stats
        result['sql'] = sql

    return (result, status)
Example #19
def raw_query(
    request: Request,
    query: ClickhouseQuery,
    reader: Reader[ClickhouseQuery],  # used by reader.execute() below
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    stats: MutableMapping[str, Any],
    trace_id: Optional[str] = None,
) -> RawQueryResult:
    """
    Submits a raw SQL query to the DB and does some post-processing on it to
    fix some of the formatting issues in the result JSON.
    This function is not supposed to depend on anything higher level than the storage
    query (ClickhouseQuery as of now). If this function ends up depending on the
    dataset, something is wrong.

    TODO: As soon as we have a StorageQuery abstraction remove all the references
    to the original query from the request.
    """

    use_cache, use_deduper, uc_max = state.get_configs([
        ("use_cache", settings.USE_RESULT_CACHE),
        ("use_deduper", 1),
        ("uncompressed_cache_max_cols", 5),
    ])

    all_confs = state.get_all_configs()
    query_settings: MutableMapping[str, Any] = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items() if k.startswith("query_settings/")
    }

    # Experiment, if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in clickhouse, or result cache in snuba.
    if len(request.query.get_all_referenced_columns()) > uc_max:
        query_settings["use_uncompressed_cache"] = 0
        use_cache = 0

    timer.mark("get_configs")

    sql = query.format_sql()
    query_id = md5(force_bytes(sql)).hexdigest()
    with state.deduper(query_id if use_deduper else None) as is_dupe:
        timer.mark("dedupe_wait")

        result = cache.get(query_id) if use_cache else None
        timer.mark("cache_get")

        stats.update({
            "is_duplicate": is_dupe,
            "query_id": query_id,
            "use_cache": bool(use_cache),
            "cache_hit": bool(result),
        }),

        update_with_status = partial(
            update_query_metadata_and_stats,
            request,
            sql,
            timer,
            stats,
            query_metadata,
            query_settings,
            trace_id,
        )

        if not result:
            try:
                with RateLimitAggregator(
                    request.settings.get_rate_limit_params()
                ) as rate_limit_stats_container:
                    stats.update(rate_limit_stats_container.to_dict())
                    timer.mark("rate_limit")

                    project_rate_limit_stats = rate_limit_stats_container.get_stats(
                        PROJECT_RATE_LIMIT_NAME)

                    if ("max_threads" in query_settings
                            and project_rate_limit_stats is not None
                            and project_rate_limit_stats.concurrent > 1):
                        maxt = query_settings["max_threads"]
                        query_settings["max_threads"] = max(
                            1, maxt - project_rate_limit_stats.concurrent + 1)

                    # Force query to use the first shard replica, which
                    # should have synchronously received any cluster writes
                    # before this query is run.
                    consistent = request.settings.get_consistent()
                    stats["consistent"] = consistent
                    if consistent:
                        query_settings["load_balancing"] = "in_order"
                        query_settings["max_threads"] = 1

                    try:
                        result = reader.execute(
                            query,
                            query_settings,
                            # All queries should already be deduplicated at this point
                            # But the query_id will let us know if they aren't
                            query_id=query_id if use_deduper else None,
                            with_totals=request.query.has_totals(),
                        )

                        timer.mark("execute")
                        stats.update({
                            "result_rows": len(result["data"]),
                            "result_cols": len(result["meta"]),
                        })

                        if use_cache:
                            cache.set(query_id, result)
                            timer.mark("cache_set")

                    except BaseException as ex:
                        error = str(ex)
                        logger.exception("Error running query: %s\n%s", sql,
                                         error)
                        stats = update_with_status("error")
                        meta = {}
                        if isinstance(ex, ClickhouseError):
                            err_type = "clickhouse"
                            meta["code"] = ex.code
                        else:
                            err_type = "unknown"
                        raise RawQueryException(
                            err_type=err_type,
                            message=error,
                            stats=stats,
                            sql=sql,
                            **meta,
                        )
            except RateLimitExceeded as ex:
                stats = update_with_status("rate-limited")
                raise RawQueryException(
                    err_type="rate-limited",
                    message="rate limit exceeded",
                    stats=stats,
                    sql=sql,
                    detail=str(ex),
                )

    stats = update_with_status("success")

    return RawQueryResult(result, {"stats": stats, "sql": sql})
Example #20
    def wrapper(*args, **kwargs):
        body = args[0]
        use_split, date_align, split_step = state.get_configs([
            ('use_split', 0),
            ('date_align_seconds', 1),
            ('split_step', 3600),  # default 1 hour
        ])
        to_date = util.parse_datetime(body['to_date'], date_align)
        from_date = util.parse_datetime(body['from_date'], date_align)
        limit = body.get('limit', 0)
        remaining_offset = body.get('offset', 0)

        if (use_split and limit and not body.get('groupby')
                and body.get('orderby') == '-timestamp'):
            overall_result = None
            split_end = to_date
            split_start = max(split_end - timedelta(seconds=split_step),
                              from_date)
            total_results = 0
            status = 0
            while split_start < split_end and total_results < limit:
                body['from_date'] = split_start.isoformat()
                body['to_date'] = split_end.isoformat()
                # Because its paged, we have to ask for (limit+offset) results
                # and set offset=0 so we can then trim them ourselves.
                body['offset'] = 0
                body['limit'] = limit - total_results + remaining_offset
                result, status = query_func(*args, **kwargs)

                # If something failed, discard all progress and just return that
                if status != 200:
                    overall_result = result
                    break

                if overall_result is None:
                    overall_result = result
                else:
                    overall_result['data'].extend(result['data'])

                if remaining_offset > 0 and len(overall_result['data']) > 0:
                    to_trim = min(remaining_offset,
                                  len(overall_result['data']))
                    overall_result['data'] = overall_result['data'][to_trim:]
                    remaining_offset -= to_trim

                total_results = len(overall_result['data'])

                if total_results < limit:
                    if len(result['data']) == 0:
                        # If we got nothing from the last query, jump straight to the max time range
                        split_end = split_start
                        split_start = from_date
                    else:
                        # Estimate how big the time range should be for the next query based on
                        # how many results we got for our last query and its time range, and how
                        # many we have left to fetch
                        remaining = limit - total_results
                        split_step = split_step * math.ceil(
                            remaining / float(len(result['data'])))
                        split_end = split_start
                        try:
                            split_start = max(
                                split_end - timedelta(seconds=split_step),
                                from_date)
                        except OverflowError:
                            split_start = from_date
            return overall_result, status
        else:
            return query_func(*args, **kwargs)
Example #21
def rate_limit(
    rate_limit_params: RateLimitParameters,
) -> Iterator[Optional[RateLimitStats]]:
    """
    A context manager for rate limiting that allows for limiting based on
    on a rolling-window per-second rate as well as the number of requests
    concurrently running.

    Uses a single redis sorted set per rate-limiting bucket to track both the
    concurrency and rate, the score is the query timestamp. Queries are thrown
    ahead in time when they start so we can count them as concurrent, and
    thrown back to their start time once they finish so we can count them
    towards the historical rate.

               time >>----->
    +-----------------------------+--------------------------------+
    | historical query window     | currently executing queries    |
    +-----------------------------+--------------------------------+
                                  ^
                                 now
    """

    bucket = "{}{}".format(state.ratelimit_prefix, rate_limit_params.bucket)
    query_id = uuid.uuid4()

    now = time.time()
    bypass_rate_limit, rate_history_s = state.get_configs([
        ("bypass_rate_limit", 0), ("rate_history_sec", 3600)
    ])
    assert isinstance(rate_history_s, (int, float))

    if bypass_rate_limit == 1:
        yield None
        return

    pipe = rds.pipeline(transaction=False)
    pipe.zremrangebyscore(bucket, "-inf",
                          "({:f}".format(now - rate_history_s))  # cleanup
    pipe.zadd(bucket, now + state.max_query_duration_s,
              query_id)  # type: ignore
    if rate_limit_params.per_second_limit is None:
        pipe.exists("nosuchkey")  # no-op if we don't need per-second
    else:
        pipe.zcount(bucket, now - state.rate_lookback_s, now)  # get historical
    if rate_limit_params.concurrent_limit is None:
        pipe.exists("nosuchkey")  # no-op if we don't need concurrent
    else:
        pipe.zcount(bucket, "({:f}".format(now), "+inf")  # get concurrent

    try:
        _, _, historical, concurrent = pipe.execute()
        historical = int(historical)
        concurrent = int(concurrent)
    except Exception as ex:
        logger.exception(ex)
        yield None  # fail open if redis is having issues
        return

    per_second = historical / float(state.rate_lookback_s)

    stats = RateLimitStats(rate=per_second, concurrent=concurrent)

    rate_limit_name = rate_limit_params.rate_limit_name

    Reason = namedtuple("Reason", "scope name val limit")
    reasons = [
        Reason(
            rate_limit_name,
            "concurrent",
            concurrent,
            rate_limit_params.concurrent_limit,
        ),
        Reason(
            rate_limit_name,
            "per-second",
            per_second,
            rate_limit_params.per_second_limit,
        ),
    ]

    reason = next(
        (r for r in reasons if r.limit is not None and r.val > r.limit), None)

    if reason:
        try:
            rds.zrem(bucket, query_id)  # not allowed / not counted
        except Exception as ex:
            logger.exception(ex)

        raise RateLimitExceeded(
            "{r.scope} {r.name} of {r.val:.0f} exceeds limit of {r.limit:.0f}".format(
                r=reason
            )
        )

    try:
        yield stats
    finally:
        try:
            # return the query to its start time
            rds.zincrby(bucket, query_id, -float(state.max_query_duration_s))
        except Exception as ex:
            logger.exception(ex)
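
Note: the docstring above is the key to this accounting: a running query is scored ahead of now so it counts as concurrent, then moved back to its start time on completion so it counts toward the historical rate. A rough standalone sketch of that bookkeeping, substituting a plain dict for the Redis sorted set and hypothetical constants for state.max_query_duration_s and state.rate_lookback_s:

import time
import uuid

MAX_QUERY_DURATION_S = 30  # stand-in for state.max_query_duration_s
RATE_LOOKBACK_S = 60       # stand-in for state.rate_lookback_s

bucket = {}  # query_id -> timestamp score, mimicking the sorted set

def start_query():
    now = time.time()
    query_id = uuid.uuid4().hex
    # Throw the query ahead in time so it is counted as currently executing.
    bucket[query_id] = now + MAX_QUERY_DURATION_S
    concurrent = sum(1 for score in bucket.values() if score > now)
    historical = sum(
        1 for score in bucket.values() if now - RATE_LOOKBACK_S <= score <= now
    )
    return query_id, concurrent, historical / float(RATE_LOOKBACK_S)

def finish_query(query_id):
    # Return the entry to its start time so it counts toward the rate instead.
    bucket[query_id] -= MAX_QUERY_DURATION_S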
Example #22
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        limit = query.get_limit()
        if limit is None or query.get_groupby_from_ast():
            return None

        if query.get_offset() >= 1000:
            return None

        orderby = query.get_orderby_from_ast()
        if (not orderby or orderby[0].direction != OrderByDirection.DESC
                or not isinstance(orderby[0].expression, ColumnExpr) or
                not orderby[0].expression.column_name == self.__timestamp_col):
            return None

        from_date_ast, to_date_ast = get_time_range(query,
                                                    self.__timestamp_col)

        if from_date_ast is None or to_date_ast is None:
            return None

        date_align, split_step = state.get_configs(
            [("date_align_seconds", 1), ("split_step", 3600)]  # default 1 hour
        )
        assert isinstance(split_step, int)
        remaining_offset = query.get_offset()

        overall_result: Optional[QueryResult] = None
        split_end = to_date_ast
        split_start = max(split_end - timedelta(seconds=split_step),
                          from_date_ast)
        total_results = 0
        while split_start < split_end and total_results < limit:
            # We need to make a copy to use during the query execution because we replace
            # the start-end conditions on the query at each iteration of this loop.
            split_query = copy.deepcopy(query)

            _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                                   LiteralExpr(None, split_start))
            _replace_ast_condition(split_query, self.__timestamp_col, "<",
                                   LiteralExpr(None, split_end))

            # Because its paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            split_query.set_offset(0)
            split_query.set_limit(limit - total_results + remaining_offset)

            # At every iteration we only append the "data" key from the results returned by
            # the runner. The "extra" key is only populated at the first iteration of the
            # loop and never changed.
            result = runner(split_query, request_settings)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step),
                        from_date_ast)
                except OverflowError:
                    split_start = from_date_ast

        return overall_result
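
Note: the window-growth estimate near the end of execute can be made concrete with hypothetical numbers: if the limit is 1000 and the last 1-hour window produced 100 rows, 900 rows remain, so the next window is scaled by ceil(900 / 100) = 9, i.e. 9 hours:

import math

limit, total_results = 1000, 100
split_step = 3600                  # seconds in the last window
remaining = limit - total_results  # 900 rows still needed
last_batch = 100                   # rows the last window produced

split_step = split_step * math.ceil(remaining / float(last_batch))
assert split_step == 3600 * 9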
Example #23
def raw_query(body, sql, client, timer, stats=None):
    """
    Submit a raw SQL query to clickhouse and do some post-processing on it to
    fix some of the formatting issues in the result JSON
    """
    project_ids = to_list(body['project'])
    project_id = project_ids[0] if project_ids else 0  # TODO rate limit on every project in the list?
    stats = stats or {}
    grl, gcl, prl, pcl, use_cache = state.get_configs([
        ('global_per_second_limit', 1000),
        ('global_concurrent_limit', 1000),
        ('project_per_second_limit', 1000),
        ('project_concurrent_limit', 1000),
        ('use_cache', 0),
    ])

    # Specific projects can have their rate limits overridden
    prl, pcl = state.get_configs([
        ('project_per_second_limit_{}'.format(project_id), prl),
        ('project_concurrent_limit_{}'.format(project_id), pcl),
    ])

    all_confs = state.get_all_configs()
    query_settings = {
        k.split('/', 1)[1]: v
        for k, v in six.iteritems(all_confs)
        if k.startswith('query_settings/')
    }

    timer.mark('get_configs')

    query_id = md5(force_bytes(sql)).hexdigest()
    with state.deduper(query_id) as is_dupe:
        timer.mark('dedupe_wait')

        result = state.get_result(query_id) if use_cache else None
        timer.mark('cache_get')

        stats.update({
            'is_duplicate': is_dupe,
            'query_id': query_id,
            'use_cache': bool(use_cache),
            'cache_hit': bool(result)}
        ),

        if result:
            status = 200
        else:
            with state.rate_limit('global', grl, gcl) as (g_allowed, g_rate, g_concurr):
                metrics.gauge('query.global_concurrent', g_concurr)
                stats.update({'global_rate': g_rate, 'global_concurrent': g_concurr})

                with state.rate_limit(project_id, prl, pcl) as (p_allowed, p_rate, p_concurr):
                    stats.update({'project_rate': p_rate, 'project_concurrent': p_concurr})
                    timer.mark('rate_limit')

                    if g_allowed and p_allowed:

                        # Experiment, reduce max threads by 1 for each extra concurrent query
                        # that a project has running beyond the first one
                        if 'max_threads' in query_settings and p_concurr > 1:
                            maxt = query_settings['max_threads']
                            query_settings['max_threads'] = max(1, maxt - p_concurr + 1)

                        # Force query to use the first shard replica, which
                        # should have synchronously received any cluster writes
                        # before this query is run.
                        consistent = body.get('consistent', False)
                        stats['consistent'] = consistent
                        if consistent:
                            query_settings['load_balancing'] = 'in_order'
                            query_settings['max_threads'] = 1

                        try:
                            data, meta = client.execute(
                                sql,
                                with_column_types=True,
                                settings=query_settings,
                                # All queries should already be deduplicated at this point
                                # But the query_id will let us know if they aren't
                                query_id=query_id
                            )
                            data, meta = scrub_ch_data(data, meta)
                            status = 200
                            if body.get('totals', False):
                                assert len(data) > 0
                                data, totals = data[:-1], data[-1]
                                result = {'data': data, 'meta': meta, 'totals': totals}
                            else:
                                result = {'data': data, 'meta': meta}

                            logger.debug(sql)
                            timer.mark('execute')
                            stats.update({
                                'result_rows': len(data),
                                'result_cols': len(meta),
                            })

                            if use_cache:
                                state.set_result(query_id, result)
                                timer.mark('cache_set')

                        except BaseException as ex:
                            error = six.text_type(ex)
                            status = 500
                            logger.exception("Error running query: %s\n%s", sql, error)
                            if isinstance(ex, ClickHouseError):
                                result = {'error': {
                                    'type': 'clickhouse',
                                    'code': ex.code,
                                    'message': error,
                                }}
                            else:
                                result = {'error': {
                                    'type': 'unknown',
                                    'message': error,
                                }}

                    else:
                        status = 429
                        reasons = [
                            ('global', 'concurrent', g_concurr, gcl),
                            ('global', 'per-second', g_rate, grl),
                            ('project', 'concurrent', p_concurr, pcl),
                            ('project', 'per-second', p_rate, prl)
                        ]
                        reason = next((r for r in reasons if r[2] > r[3]), None)
                        result = {'error': {
                            'type': 'ratelimit',
                            'message': 'rate limit exceeded',
                            'detail': reason and '{} {} of {:.0f} exceeds limit of {:.0f}'.format(*reason)
                        }}

    stats.update(query_settings)

    if settings.RECORD_QUERIES:
        # send to redis
        state.record_query({
            'request': body,
            'sql': sql,
            'timing': timer,
            'stats': stats,
            'status': status,
        })

        # send to datadog
        tags = [
            'status:{}'.format(status),
            'referrer:{}'.format(stats.get('referrer', 'none')),
            'final:{}'.format(stats.get('final', False))
        ]
        mark_tags = [
            'final:{}'.format(stats.get('final', False))
        ]
        timer.send_metrics_to(metrics, tags=tags, mark_tags=mark_tags)

    result['timing'] = timer

    if settings.STATS_IN_RESPONSE or body.get('debug', False):
        result['stats'] = stats
        result['sql'] = sql

    return (result, status)
Example #24
def raw_query(
    request: Request,
    query: DictClickhouseQuery,
    reader: Reader[ClickhouseQuery],
    timer: Timer,
    stats: Optional[MutableMapping[str, Any]] = None,
) -> ClickhouseQueryResult:
    """
    Submit a raw SQL query to clickhouse and do some post-processing on it to
    fix some of the formatting issues in the result JSON
    """

    stats = stats or {}
    use_cache, use_deduper, uc_max = state.get_configs(
        [("use_cache", 0), ("use_deduper", 1), ("uncompressed_cache_max_cols", 5)]
    )

    all_confs = state.get_all_configs()
    query_settings = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items()
        if k.startswith("query_settings/")
    }

    # Experiment: if we are going to grab more than X columns' worth of data,
    # don't use the uncompressed cache in clickhouse or the result cache in snuba.
    if len(request.query.get_all_referenced_columns()) > uc_max:
        query_settings["use_uncompressed_cache"] = 0
        use_cache = 0

    timer.mark("get_configs")

    sql = query.format_sql()
    query_id = md5(force_bytes(sql)).hexdigest()
    with state.deduper(query_id if use_deduper else None) as is_dupe:
        timer.mark("dedupe_wait")

        result = cache.get(query_id) if use_cache else None
        timer.mark("cache_get")

        stats.update(
            {
                "is_duplicate": is_dupe,
                "query_id": query_id,
                "use_cache": bool(use_cache),
                "cache_hit": bool(result),
            }
        )

        if not result:
            try:
                with RateLimitAggregator(
                    request.settings.get_rate_limit_params()
                ) as rate_limit_stats_container:
                    stats.update(rate_limit_stats_container.to_dict())
                    timer.mark("rate_limit")

                    project_rate_limit_stats = rate_limit_stats_container.get_stats(
                        PROJECT_RATE_LIMIT_NAME
                    )

                    if (
                        "max_threads" in query_settings
                        and project_rate_limit_stats is not None
                        and project_rate_limit_stats.concurrent > 1
                    ):
                        maxt = query_settings["max_threads"]
                        query_settings["max_threads"] = max(
                            1, maxt - project_rate_limit_stats.concurrent + 1
                        )
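                        # For illustration: with max_threads = 8 and 3 concurrent
                        # queries already running for this project, this query runs
                        # with max(1, 8 - 3 + 1) = 6 threads.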

                    # Force query to use the first shard replica, which
                    # should have synchronously received any cluster writes
                    # before this query is run.
                    consistent = request.settings.get_consistent()
                    stats["consistent"] = consistent
                    if consistent:
                        query_settings["load_balancing"] = "in_order"
                        query_settings["max_threads"] = 1

                    try:
                        result = reader.execute(
                            query,
                            query_settings,
                            # All queries should already be deduplicated at this point
                            # But the query_id will let us know if they aren't
                            query_id=query_id if use_deduper else None,
                            with_totals=request.query.has_totals(),
                        )

                        timer.mark("execute")
                        stats.update(
                            {
                                "result_rows": len(result["data"]),
                                "result_cols": len(result["meta"]),
                            }
                        )

                        if use_cache:
                            cache.set(query_id, result)
                            timer.mark("cache_set")

                    except BaseException as ex:
                        error = str(ex)
                        logger.exception("Error running query: %s\n%s", sql, error)
                        stats = log_query_and_update_stats(
                            request, sql, timer, stats, "error", query_settings
                        )
                        meta = {}
                        if isinstance(ex, ClickHouseError):
                            err_type = "clickhouse"
                            meta["code"] = ex.code
                        else:
                            err_type = "unknown"
                        raise RawQueryException(
                            err_type=err_type,
                            message=error,
                            stats=stats,
                            sql=sql,
                            **meta,
                        )
            except RateLimitExceeded as ex:
                stats = log_query_and_update_stats(
                    request, sql, timer, stats, "rate-limited", query_settings
                )
                raise RawQueryException(
                    err_type="rate-limited",
                    message="rate limit exceeded",
                    stats=stats,
                    sql=sql,
                    detail=str(ex),
                )

    stats = log_query_and_update_stats(
        request, sql, timer, stats, "success", query_settings
    )

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        result["stats"] = stats
        result["sql"] = sql

    return result
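
A minimal sketch of the dedupe/cache keying used above, assuming force_bytes amounts to a UTF-8 encode: requests that format to identical SQL share a query_id, which is what lets the deduper collapse concurrent duplicates and the cache serve repeats.

from hashlib import md5


def cache_key(sql: str) -> str:
    # Identical SQL -> identical key; both the deduper and the result cache rely
    # on this property. The UTF-8 encode stands in for force_bytes here.
    return md5(sql.encode("utf-8")).hexdigest()


assert cache_key("SELECT count() FROM events") == cache_key("SELECT count() FROM events")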
Exemple #25
0
def rate_limit(
    rate_limit_params: RateLimitParameters,
) -> Iterator[Optional[RateLimitStats]]:
    """
    A context manager for rate limiting that allows for limiting based on:
        * a rolling-window per-second rate
        * the number of queries concurrently running.

    It uses one redis sorted set to keep track of both of these limits.
    The following mapping is kept in redis:

        bucket: SortedSet([(timestamp1, query_id1), (timestamp2, query_id2) ...])


    Queries are thrown ahead in time when they start so we can count them
    as concurrent, and thrown back to their start time once they finish so
    we can count them towards the historical rate. See the comments for
    an example.

               time >>----->
    +-----------------------------+--------------------------------+
    | historical query window     | currently executing queries    |
    +-----------------------------+--------------------------------+
                                  ^
                                 now
    """

    bucket = "{}{}".format(state.ratelimit_prefix, rate_limit_params.bucket)
    query_id = str(uuid.uuid4())

    now = time.time()
    bypass_rate_limit, rate_history_s = state.get_configs(
        [("bypass_rate_limit", 0), ("rate_history_sec", 3600)]
        #                               ^ number of seconds the timestamps are kept
    )
    assert isinstance(rate_history_s, (int, float))

    if bypass_rate_limit == 1:
        yield None
        return

    pipe = rds.pipeline(transaction=False)
    # cleanup old query timestamps past our retention window; the number of
    # removed entries is only known once the pipeline executes below
    pipe.zremrangebyscore(bucket, "-inf",
                          "({:f}".format(now - rate_history_s))

    # Now for the tricky bit:
    # ======================
    # The query's *deadline* is added to the sorted set of timestamps, therefore
    # labeling its execution as in the future.

    # All queries with timestamps in the future are considered to be executing *right now*
    # Example:

    # now = 100
    # max_query_duration_s = 30
    # rate_lookback_s = 10
    # sorted_set (timestamps only for clarity) = [91, 94, 97, 103, 105, 130]

    # EXPLANATION:
    # ===========

    # queries that have finished running
    # (in this example there are 3 queries in the last 10 seconds
    #  thus the per second rate is 3/10 = 0.3)
    #      |
    #      v
    #  -----------              v--- the current query, vaulted into the future
    #  [91, 94, 97, 103, 105, 130]
    #               -------------- <-- queries currently running
    #                                (how many queries are
    #                                   running concurrently; in this case 3)
    #              ^
    #              | current time
    pipe.zadd(bucket, {query_id: now + state.max_query_duration_s})
    if rate_limit_params.per_second_limit is None:
        pipe.exists("nosuchkey")  # no-op if we don't need per-second
    else:
        # count queries that have finished for the per-second rate
        pipe.zcount(bucket, now - state.rate_lookback_s, now)
    if rate_limit_params.concurrent_limit is None:
        pipe.exists("nosuchkey")  # no-op if we don't need concurrent
    else:
        # count the queries in the "future", which tells us how many are
        # running concurrently right now
        pipe.zcount(bucket, "({:f}".format(now), "+inf")
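    # pipe.execute() below returns one result per queued command, in order:
    # [stale entries removed, zadd result, per-second count, concurrent count].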

    try:
        stale_queries, _, historical, concurrent = pipe.execute()
        historical = int(historical)
        concurrent = int(concurrent)
    except Exception as ex:
        logger.exception(ex)
        yield None  # fail open if redis is having issues
        return

    metrics.increment("rate_limit.stale",
                      stale_queries,
                      tags={"bucket": bucket})

    per_second = historical / float(state.rate_lookback_s)

    stats = RateLimitStats(rate=per_second, concurrent=concurrent)

    rate_limit_name = rate_limit_params.rate_limit_name

    Reason = namedtuple("Reason", "scope name val limit")
    reasons = [
        Reason(
            rate_limit_name,
            "concurrent",
            concurrent,
            rate_limit_params.concurrent_limit,
        ),
        Reason(
            rate_limit_name,
            "per-second",
            per_second,
            rate_limit_params.per_second_limit,
        ),
    ]
    reason = next(
        (r for r in reasons if r.limit is not None and r.val > r.limit), None)
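    # Example: with concurrent = 12 against a concurrent_limit of 10, the first
    # Reason matches and the error raised below reads
    # "<rate_limit_name> concurrent of 12 exceeds limit of 10".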
    if reason:
        try:
            # Remove the query from the sorted set
            # because we rate limited it. It shouldn't count towards
            # rate limiting future queries in this bucket.
            rds.zrem(bucket, query_id)
        except Exception as ex:
            logger.exception(ex)

        raise RateLimitExceeded(
            "{r.scope} {r.name} of {r.val:.0f} exceeds limit of {r.limit:.0f}".
            format(r=reason),
            scope=reason.scope,
            name=reason.name,
        )

    rate_limited = False
    try:
        yield stats
        _, err, _ = sys.exc_info()
        if isinstance(err, RateLimitExceeded):
            # If another rate limiter throws an exception, it won't be propagated
            # through this context. So check for the exception explicitly.
            # If another rate limit was hit, we don't want to count this query
            # against this limit.
            try:
                rds.zrem(bucket, query_id)  # not allowed / not counted
                rate_limited = True
            except Exception as ex:
                logger.exception(ex)
    finally:
        try:
            # return the query to its start time, if the query_id was actually added.
            if not rate_limited:
                rds.zincrby(bucket, -float(state.max_query_duration_s),
                            query_id)
        except Exception as ex:
            logger.exception(ex)
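
A usage sketch for the context manager above, assuming rate_limit is decorated with contextlib.contextmanager (its yield-based body suggests as much); the limit values and the run_query/handle_rate_limited helpers are illustrative placeholders rather than names from the codebase.

params = RateLimitParameters(
    rate_limit_name="project",
    bucket="1234",
    per_second_limit=10.0,
    concurrent_limit=5,
)

try:
    with rate_limit(params) as stats:
        if stats is not None:
            # stats.rate is the rolling per-second rate and stats.concurrent is
            # the number of queries currently in flight for this bucket.
            run_query()  # placeholder for the actual query execution
except RateLimitExceeded:
    handle_rate_limited()  # placeholder: e.g. turn this into a 429 response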
Exemple #26
0
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        limit = query.get_limit()
        if limit is None or query.get_groupby():
            return None

        if query.get_offset() >= 1000:
            return None

        orderby = query.get_orderby()
        if not orderby or orderby[0] != f"-{self.__timestamp_col}":
            return None

        conditions = query.get_conditions() or []
        from_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, ">=")),
            None,
        )

        to_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, "<")),
            None,
        )
        from_date_ast, to_date_ast = get_time_range(query,
                                                    self.__timestamp_col)

        if not from_date_str or not to_date_str:
            return None

        date_align, split_step = state.get_configs(
            [("date_align_seconds", 1), ("split_step", 3600)]  # split_step default: 1 hour
        )
        to_date = util.parse_datetime(to_date_str, date_align)
        from_date = util.parse_datetime(from_date_str, date_align)

        if from_date != from_date_ast:
            logger.warning(
                "Mismatch in start date on time splitter.",
                extra={
                    "ast": str(from_date_ast),
                    "legacy": str(from_date)
                },
                exc_info=True,
            )
            metrics.increment("mismatch.ast_from_date")

        remaining_offset = query.get_offset()

        overall_result = None
        split_end = to_date
        split_start = max(split_end - timedelta(seconds=split_step), from_date)
        total_results = 0
        while split_start < split_end and total_results < limit:
            # We need to make a copy to use during the query execution because we replace
            # the start-end conditions on the query at each iteration of this loop.
            split_query = copy.deepcopy(query)

            _replace_condition(split_query, self.__timestamp_col, ">=",
                               split_start.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                                   LiteralExpr(None, split_start))
            _replace_condition(split_query, self.__timestamp_col, "<",
                               split_end.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, "<",
                                   LiteralExpr(None, split_end))

            # Because it's paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            split_query.set_offset(0)
            split_query.set_limit(limit - total_results + remaining_offset)

            # At every iteration we only append the "data" key from the results returned by
            # the runner. The "extra" key is only populated at the first iteration of the
            # loop and never changed.
            result = runner(split_query, request_settings)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim
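            # At this point the rows consumed by the offset have been dropped from
            # the front of the merged data: e.g. with offset = 20 and 15 rows in the
            # first window, all 15 are trimmed and remaining_offset drops to 5.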

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step), from_date)
                except OverflowError:
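                    # split_step can grow large enough that the timedelta/datetime
                    # arithmetic overflows; fall back to the start of the range.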
                    split_start = from_date

        return overall_result
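
The adaptive window sizing in the loop above can be isolated into a small sketch. It only illustrates the growth rule (static growth when a window comes back empty, proportional growth otherwise); the 10x factor stands in for STEP_GROWTH, whose actual value is not shown here.

import math


def next_split_step(split_step: float, rows_in_window: int,
                    remaining_rows: int, step_growth: float = 10.0) -> float:
    """Estimate the next window size, mirroring the loop above."""
    if rows_in_window == 0:
        # Empty window: grow by a static factor so sparse ranges converge quickly.
        return split_step * step_growth
    # Partial window: scale by how many rows are still needed relative to what
    # the last window produced.
    return split_step * math.ceil(remaining_rows / float(rows_in_window))


# e.g. a 3600s window that returned 20 rows with 80 still to fetch becomes 14400s.
assert next_split_step(3600, 20, 80) == 14400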
Exemple #27
0
    def time_split(dataset, request: Request, *args, **kwargs):
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        date_align, split_step = state.get_configs(
            [("date_align_seconds", 1), ("split_step", 3600)]  # split_step default: 1 hour
        )

        query_limit = request.query.get_limit()
        limit = query_limit if query_limit is not None else 0
        remaining_offset = request.query.get_offset()

        to_date = util.parse_datetime(
            request.extensions["timeseries"]["to_date"], date_align)
        from_date = util.parse_datetime(
            request.extensions["timeseries"]["from_date"], date_align)

        overall_result = None
        split_end = to_date
        split_start = max(split_end - timedelta(seconds=split_step), from_date)
        total_results = 0
        while split_start < split_end and total_results < limit:
            request.extensions["timeseries"][
                "from_date"] = split_start.isoformat()
            request.extensions["timeseries"]["to_date"] = split_end.isoformat()
            # Because it's paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            request.query.set_offset(0)
            request.query.set_limit(limit - total_results + remaining_offset)

            # The query function may mutate the request body during query
            # evaluation, so we need to copy the body to ensure that the query
            # has not been modified in between this call and the next loop
            # iteration, if needed.
            # XXX: The extra data is carried across from the initial response
            # and never updated.
            result = query_func(dataset, copy.deepcopy(request), *args,
                                **kwargs)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step), from_date)
                except OverflowError:
                    split_start = from_date

        return overall_result
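
time_split above closes over query_func, which suggests it is produced by a decorator in the style of the wrapper shown earlier. A minimal sketch of that wiring, with the decorator name and the wrapped query function assumed:

def split(query_func):
    # Wrap a query function so a large time range is executed newest-first in
    # growing windows; the real body is the loop shown in the example above.
    def time_split(dataset, request, *args, **kwargs):
        overall_result = None
        # ... windowing loop: rewrite request.extensions["timeseries"] for each
        # window, call query_func on a deep copy of the request, and merge each
        # window's rows into overall_result ...
        return overall_result

    return time_split


# Presumed usage (the underlying query function name is illustrative):
# run_query = split(parse_and_run_query)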
Exemple #28
0
def raw_query(
    request: Request,
    query: ClickhouseQuery,
    client: ClickhousePool,
    timer: Timer,
    stats=None,
) -> QueryResult:
    """
    Submit a raw SQL query to clickhouse and do some post-processing on it to
    fix some of the formatting issues in the result JSON
    """
    from snuba.clickhouse.native import NativeDriverReader

    stats = stats or {}
    use_cache, use_deduper, uc_max = state.get_configs([
        ('use_cache', 0),
        ('use_deduper', 1),
        ('uncompressed_cache_max_cols', 5),
    ])

    all_confs = state.get_all_configs()
    query_settings = {
        k.split('/', 1)[1]: v
        for k, v in all_confs.items() if k.startswith('query_settings/')
    }

    # Experiment: if we are going to grab more than X columns' worth of data,
    # don't use the uncompressed cache in clickhouse, or result cache in snuba.
    if len(all_referenced_columns(request.query)) > uc_max:
        query_settings['use_uncompressed_cache'] = 0
        use_cache = 0

    timer.mark('get_configs')

    sql = query.format_sql()
    query_id = md5(force_bytes(sql)).hexdigest()
    with state.deduper(query_id if use_deduper else None) as is_dupe:
        timer.mark('dedupe_wait')

        result = state.get_result(query_id) if use_cache else None
        timer.mark('cache_get')

        stats.update({
            'is_duplicate': is_dupe,
            'query_id': query_id,
            'use_cache': bool(use_cache),
            'cache_hit': bool(result)
        })

        if result:
            status = 200
        else:
            try:
                with RateLimitAggregator(
                    request.settings.get_rate_limit_params()
                ) as rate_limit_stats_container:
                    stats.update(rate_limit_stats_container.to_dict())
                    timer.mark('rate_limit')

                    project_rate_limit_stats = rate_limit_stats_container.get_stats(
                        PROJECT_RATE_LIMIT_NAME)

                    if 'max_threads' in query_settings and \
                            project_rate_limit_stats is not None and \
                            project_rate_limit_stats.concurrent > 1:
                        maxt = query_settings['max_threads']
                        query_settings['max_threads'] = max(
                            1, maxt - project_rate_limit_stats.concurrent + 1)

                    # Force query to use the first shard replica, which
                    # should have synchronously received any cluster writes
                    # before this query is run.
                    consistent = request.settings.get_consistent()
                    stats['consistent'] = consistent
                    if consistent:
                        query_settings['load_balancing'] = 'in_order'
                        query_settings['max_threads'] = 1

                    try:
                        result = NativeDriverReader(client).execute(
                            query,
                            query_settings,
                            # All queries should already be deduplicated at this point
                            # But the query_id will let us know if they aren't
                            query_id=query_id if use_deduper else None,
                            with_totals=request.query.has_totals(),
                        )
                        status = 200

                        logger.debug(sql)
                        timer.mark('execute')
                        stats.update({
                            'result_rows': len(result['data']),
                            'result_cols': len(result['meta']),
                        })

                        if use_cache:
                            state.set_result(query_id, result)
                            timer.mark('cache_set')

                    except BaseException as ex:
                        error = str(ex)
                        status = 500
                        logger.exception("Error running query: %s\n%s", sql,
                                         error)
                        if isinstance(ex, ClickHouseError):
                            result = {
                                'error': {
                                    'type': 'clickhouse',
                                    'code': ex.code,
                                    'message': error,
                                }
                            }
                        else:
                            result = {
                                'error': {
                                    'type': 'unknown',
                                    'message': error,
                                }
                            }

            except RateLimitExceeded as ex:
                error = str(ex)
                status = 429
                result = {
                    'error': {
                        'type': 'ratelimit',
                        'message': 'rate limit exceeded',
                        'detail': error
                    }
                }

    stats.update(query_settings)

    if settings.RECORD_QUERIES:
        # send to redis
        state.record_query({
            'request': request.body,
            'sql': sql,
            'timing': timer,
            'stats': stats,
            'status': status,
        })

        timer.send_metrics_to(metrics,
                              tags={
                                  'status': str(status),
                                  'referrer': stats.get('referrer', 'none'),
                                  'final': str(stats.get('final', False)),
                              },
                              mark_tags={
                                  'final': str(stats.get('final', False)),
                              })

    result['timing'] = timer

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        result['stats'] = stats
        result['sql'] = sql

    return QueryResult(result, status)