Example #1
def record_query(request: Request, timer: Timer,
                 query_metadata: SnubaQueryMetadata) -> None:
    """
    Records a request after it has been parsed and validated, whether
    we actually ran a query or not.
    """
    if settings.RECORD_QUERIES:
        # Send to Redis.
        # We convert this to a dict before passing it to state in order to avoid a
        # circular dependency, where state would depend on the higher-level
        # QueryMetadata class.
        state.record_query(query_metadata.to_dict())

        final = str(request.query.get_final())
        referrer = request.referrer or "none"
        timer.send_metrics_to(
            metrics,
            tags={
                "status": query_metadata.status.value,
                "referrer": referrer,
                "final": final,
            },
            mark_tags={
                "final": final,
                "referrer": referrer
            },
        )

        _add_tags(timer, request)
Example #2
def execute_query_with_rate_limits(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    # XXX: We should consider moving this so that it applies to the logical
    # query, not the physical query.
    with RateLimitAggregator(request_settings.get_rate_limit_params()
                             ) as rate_limit_stats_container:
        stats.update(rate_limit_stats_container.to_dict())
        timer.mark("rate_limit")

        project_rate_limit_stats = rate_limit_stats_container.get_stats(
            PROJECT_RATE_LIMIT_NAME)

        if ("max_threads" in query_settings
                and project_rate_limit_stats is not None
                and project_rate_limit_stats.concurrent > 1):
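            # Reduce this query's max_threads when other queries for the same
            # project are already running concurrently. Illustration
            # (hypothetical numbers): with max_threads == 8 and 3 concurrent
            # project queries, this query runs with max(1, 8 - 3 + 1) == 6
            # threads.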
            maxt = query_settings["max_threads"]
            query_settings["max_threads"] = max(
                1, maxt - project_rate_limit_stats.concurrent + 1)

        return execute_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            stats,
            query_settings,
        )
Example #3
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    referrer: str,
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    reader: Reader,
    robust: bool,
    concurrent_queries_gauge: Optional[Gauge] = None,
) -> QueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    """
    from_clause = clickhouse_query.get_from_clause()
    visitor = TablesCollector()
    visitor.visit(from_clause)
    table_names = ",".join(sorted(visitor.get_tables()))
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        _apply_turbo_sampling_if_needed(clickhouse_query, request_settings)

        formatted_query = format_query(clickhouse_query)
        span.set_data("query", formatted_query.structured())
        span.set_data("query_size_bytes",
                      _string_size_in_bytes(formatted_query.get_sql()))
        sentry_sdk.set_tag("query_size_group",
                           get_query_size_group(formatted_query.get_sql()))
        metrics.increment("execute")

    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": table_names,
        "final": visitor.any_final(),
        "referrer": referrer,
        "sample": visitor.get_sample_rate(),
    }

    with sentry_sdk.start_span(description=formatted_query.get_sql(),
                               op="db") as span:
        span.set_tag("table", table_names)

        def execute() -> QueryResult:
            return raw_query(
                clickhouse_query,
                request_settings,
                formatted_query,
                reader,
                timer,
                query_metadata,
                stats,
                span.trace_id,
                robust=robust,
            )

        if concurrent_queries_gauge is not None:
            with concurrent_queries_gauge:
                return execute()
        else:
            return execute()
Example #4
def execute_query_with_rate_limits(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    clickhouse_query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    # The global rate limiter is added at the end of the chain so that it is
    # evaluated last.
    # This way we do not borrow capacity from the global quota during
    # evaluation if one of the more specific limiters (like the project rate
    # limiter) rejects the query first.
    query_settings.add_rate_limit(get_global_rate_limit_params())
    # XXX: We should consider moving this so that it applies to the logical
    # query, not the physical query.
    with RateLimitAggregator(
        query_settings.get_rate_limit_params()
    ) as rate_limit_stats_container:
        stats.update(rate_limit_stats_container.to_dict())
        timer.mark("rate_limit")

        project_rate_limit_stats = rate_limit_stats_container.get_stats(
            PROJECT_RATE_LIMIT_NAME
        )

        thread_quota = query_settings.get_resource_quota()
        if (
            ("max_threads" in clickhouse_query_settings or thread_quota is not None)
            and project_rate_limit_stats is not None
            and project_rate_limit_stats.concurrent > 1
        ):
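            # As in the variant above, reduce max_threads based on the number
            # of concurrent project queries; when a resource quota is set, its
            # max_threads is used as the starting value instead of the
            # per-query ClickHouse setting.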
            maxt = (
                clickhouse_query_settings["max_threads"]
                if thread_quota is None
                else thread_quota.max_threads
            )
            clickhouse_query_settings["max_threads"] = max(
                1, maxt - project_rate_limit_stats.concurrent + 1
            )

        _record_rate_limit_metrics(rate_limit_stats_container, reader, stats)

        return execute_query(
            clickhouse_query,
            query_settings,
            formatted_query,
            reader,
            timer,
            stats,
            clickhouse_query_settings,
            robust=robust,
        )
Example #5
def _record_failure_building_request(
    status: QueryStatus, timer: Timer, referrer: Optional[str]
) -> None:
    # TODO: Revisit whether recording some data for these queries in the
    # querylog table would be useful.
    if settings.RECORD_QUERIES:
        timer.send_metrics_to(
            metrics,
            tags={"status": status.value, "referrer": referrer or "none"},
        )
        _add_tags(timer)
Example #6
def execute_query_with_caching(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    clickhouse_query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    # XXX: ``uncompressed_cache_max_cols`` is used to control both the result
    # cache and the uncompressed cache. These should be independent.
    use_cache, uc_max = state.get_configs(
        [("use_cache", settings.USE_RESULT_CACHE), ("uncompressed_cache_max_cols", 5)]
    )

    column_counter = ReferencedColumnsCounter()
    column_counter.visit(clickhouse_query.get_from_clause())
    assert isinstance(uc_max, int)
    if column_counter.count_columns() > uc_max:
        use_cache = False

    execute = partial(
        execute_query_with_rate_limits,
        clickhouse_query,
        query_settings,
        formatted_query,
        reader,
        timer,
        stats,
        clickhouse_query_settings,
        robust=robust,
    )

    with sentry_sdk.start_span(description="execute", op="db") as span:
        key = get_query_cache_key(formatted_query)
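        # The cache key doubles as the ClickHouse query_id, so the same logical
        # query can be traced on the ClickHouse side (e.g. in its query log).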
        clickhouse_query_settings["query_id"] = key
        if use_cache:
            cache_partition = _get_cache_partition(reader)
            result = cache_partition.get(key)
            timer.mark("cache_get")
            stats["cache_hit"] = result is not None
            if result is not None:
                span.set_tag("cache", "hit")
                return result

            span.set_tag("cache", "miss")
            result = execute()
            cache_partition.set(key, result)
            timer.mark("cache_set")
            return result
        else:
            return execute()
Example #7
def validate_request_content(body, schema: RequestSchema, timer: Timer,
                             dataset: Dataset, referrer: str) -> Request:
    with sentry_sdk.start_span(description="validate_request_content",
                               op="validate") as span:
        try:
            request = schema.validate(body, dataset, referrer)
            span.set_data("snuba_query", request.body)
        except jsonschema.ValidationError as error:
            raise BadRequest(str(error)) from error

        timer.mark("validate_schema")

    return request
Example #8
def dataset_query(dataset: Dataset, body, timer: Timer) -> Response:
    assert http_request.method == "POST"

    with sentry_sdk.start_span(description="build_schema", op="validate"):
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), HTTPRequestSettings
        )

    request = build_request(body, schema, timer, dataset, http_request.referrer)

    try:
        result = parse_and_run_query(dataset, request, timer)
    except QueryException as exception:
        status = 500
        details: Mapping[str, Any]

        cause = exception.__cause__
        if isinstance(cause, RateLimitExceeded):
            status = 429
            details = {
                "type": "rate-limited",
                "message": "rate limit exceeded",
            }
        elif isinstance(cause, ClickhouseError):
            details = {
                "type": "clickhouse",
                "message": str(cause),
                "code": cause.code,
            }
        elif isinstance(cause, Exception):
            details = {
                "type": "unknown",
                "message": str(cause),
            }
        else:
            raise  # exception should have been chained

        return Response(
            json.dumps(
                {"error": details, "timing": timer.for_json(), **exception.extra}
            ),
            status,
            {"Content-Type": "application/json"},
        )

    payload: MutableMapping[str, Any] = {**result.result, "timing": timer.for_json()}

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        payload.update(result.extra)

    return Response(json.dumps(payload), 200, {"Content-Type": "application/json"})
Example #9
def execute_query_with_caching(
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    # XXX: ``uncompressed_cache_max_cols`` is used to control both the result
    # cache and the uncompressed cache. These should be independent.
    use_cache, uc_max = state.get_configs([("use_cache",
                                            settings.USE_RESULT_CACHE),
                                           ("uncompressed_cache_max_cols", 5)])

    # Skip aliases when counting columns.
    referenced_columns = {
        (c.table_name, c.column_name)
        for c in clickhouse_query.get_all_ast_referenced_columns()
    }
    if len(referenced_columns) > uc_max:
        use_cache = False

    execute = partial(
        execute_query_with_rate_limits,
        clickhouse_query,
        request_settings,
        formatted_query,
        reader,
        timer,
        stats,
        query_settings,
    )

    with sentry_sdk.start_span(description="execute", op="db") as span:
        if use_cache:
            key = get_query_cache_key(formatted_query)
            result = cache.get(key)
            timer.mark("cache_get")
            stats["cache_hit"] = result is not None
            if result is not None:
                span.set_tag("cache", "hit")
                return result

            span.set_tag("cache", "miss")
            result = execute()
            cache.set(key, result)
            timer.mark("cache_set")
            return result
        else:
            return execute()
Example #10
def build_request(
    body: MutableMapping[str, Any],
    parser: Parser,
    settings_class: Union[Type[HTTPRequestSettings], Type[SubscriptionRequestSettings]],
    schema: RequestSchema,
    dataset: Dataset,
    timer: Timer,
    referrer: str,
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request_parts = schema.validate(body)
            if settings_class == HTTPRequestSettings:
                settings = {
                    **request_parts.settings,
                    "consistent": _consistent_override(
                        request_parts.settings.get("consistent", False), referrer
                    ),
                }
                settings_obj: Union[
                    HTTPRequestSettings, SubscriptionRequestSettings
                ] = settings_class(**settings)
            elif settings_class == SubscriptionRequestSettings:
                settings_obj = settings_class(
                    consistent=_consistent_override(True, referrer)
                )

            query = parser(request_parts, settings_obj, dataset)

            request_id = uuid.uuid4().hex
            request = Request(
                request_id,
                # TODO: Replace this with the actual raw body of the query.
                # This can have an impact on subscriptions, so we need
                # to be careful with the change.
                ChainMap(request_parts.query, *request_parts.extensions.values()),
                query,
                settings_obj,
                referrer,
            )
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data("snuba_query", request.body)

        timer.mark("validate_schema")
        return request
Example #11
def test_timer_send_metrics() -> None:
    backend = TestingMetricsBackend()

    time = TestingClock()
    set_tags = {"foo": "bar", "blue": "car"}
    t = Timer("timer", clock=time, tags=set_tags)
    time.sleep(10)
    t.mark("thing1")
    time.sleep(10)
    t.mark("thing2")
    t.send_metrics_to(
        backend,
        tags={"key": "value"},
        mark_tags={"mark-key": "mark-value", "blue": "dog"},
    )

    overridden_tags = {"foo": "bar", "blue": "dog"}
    assert backend.calls == [
        Timing("timer", (10.0 + 10.0) * 1000, {"key": "value", **set_tags}),
        Timing(
            "timer.thing1", 10.0 * 1000, {"mark-key": "mark-value", **overridden_tags}
        ),
        Timing(
            "timer.thing2", 10.0 * 1000, {"mark-key": "mark-value", **overridden_tags}
        ),
    ]
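# A minimal usage sketch of the Timer API exercised by the test above
# (``do_work`` and ``metrics_backend`` are hypothetical placeholders):
#
#     timer = Timer("query")
#     do_work()
#     timer.mark("parse")    # time elapsed since the previous mark (or creation)
#     do_work()
#     timer.mark("execute")
#     timer.send_metrics_to(metrics_backend, tags={"status": "success"})
#
# As asserted above, the overall timing is tagged with ``tags`` merged with the
# constructor tags, while each per-mark timing is tagged with ``mark_tags``
# merged with the constructor tags (``mark_tags`` win on conflict).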
Example #12
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    """

    # TODO: This is the function (well, it will be a wrapper of this function)
    # where we will transform the result according to the SelectedExpression
    # objects in the query, to ensure the fields in the QueryResult have
    # the names the user expects.

    source = clickhouse_query.get_data_source().format_from()
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        formatted_query = AstSqlQuery(clickhouse_query, request_settings)
        span.set_data("query", formatted_query.sql_data())
        metrics.increment("execute")

    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": source,
        "final": clickhouse_query.get_final(),
        "referrer": referrer,
        "num_days": (to_date - from_date).days,
        "sample": clickhouse_query.get_sample(),
    }

    with sentry_sdk.start_span(
        description=formatted_query.format_sql(), op="db"
    ) as span:
        span.set_tag("table", source)

        return raw_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            query_metadata,
            stats,
            span.trace_id,
        )
Example #13
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns' worth of data,
    # don't use uncompressed_cache in ClickHouse.
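    # For example, with the default threshold of 5, a query referencing six or
    # more distinct columns runs with use_uncompressed_cache set to 0.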
    uc_max = state.get_config("uncompressed_cache_max_cols", 5)
    assert isinstance(uc_max, int)
    column_counter = ReferencedColumnsCounter()
    column_counter.visit(clickhouse_query.get_from_clause())
    if column_counter.count_columns() > uc_max:
        query_settings["use_uncompressed_cache"] = 0

    # Force query to use the first shard replica, which
    # should have synchronously received any cluster writes
    # before this query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
        robust=robust,
    )

    timer.mark("execute")
    stats.update({
        "result_rows": len(result["data"]),
        "result_cols": len(result["meta"])
    })

    return result
Example #14
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns' worth of data,
    # don't use uncompressed_cache in ClickHouse.
    uc_max = state.get_config("uncompressed_cache_max_cols", 5)
    # Skip aliases when counting columns.
    referenced_columns = {
        (c.table_name, c.column_name)
        for c in clickhouse_query.get_all_ast_referenced_columns()
    }
    if len(referenced_columns) > uc_max:
        query_settings["use_uncompressed_cache"] = 0

    # Force query to use the first shard replica, which
    # should have synchronously received any cluster writes
    # before this query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
    )

    timer.mark("execute")
    stats.update({
        "result_rows": len(result["data"]),
        "result_cols": len(result["meta"])
    })

    return result
Example #15
    def test(self) -> None:
        creator = SubscriptionCreator(self.dataset, EntityKey.EVENTS)
        subscription = SubscriptionData(
            project_id=1,
            query="MATCH (events) SELECT count() AS count",
            time_window_sec=10 * 60,
            resolution_sec=60,
            entity_subscription=create_entity_subscription(),
        )
        identifier = creator.create(subscription, Timer("test"))
        assert (cast(
            List[Tuple[UUID, SubscriptionData]],
            RedisSubscriptionDataStore(
                redis_client,
                self.entity_key,
                identifier.partition,
            ).all(),
        )[0][1] == subscription)

        SubscriptionDeleter(self.entity_key,
                            identifier.partition).delete(identifier.uuid)
        assert (RedisSubscriptionDataStore(
            redis_client,
            self.entity_key,
            identifier.partition,
        ).all() == [])
Example #16
class TestMetricsCountersSubscriptionCreator:
    timer = Timer("test")

    def setup_method(self) -> None:
        self.dataset = get_dataset("metrics")

    @pytest.mark.parametrize("subscription, entity_key", TESTS_CREATE_METRICS)
    def test(self, subscription: SubscriptionData,
             entity_key: EntityKey) -> None:
        creator = SubscriptionCreator(self.dataset, entity_key)
        identifier = creator.create(subscription, self.timer)
        assert (cast(
            List[Tuple[UUID, SubscriptionData]],
            RedisSubscriptionDataStore(
                redis_client,
                entity_key,
                identifier.partition,
            ).all(),
        )[0][1] == subscription)

    @pytest.mark.parametrize("subscription", TESTS_INVALID_METRICS)
    def test_missing_conditions_for_groupby_clause(
            self, subscription: SubscriptionData) -> None:
        creator = SubscriptionCreator(self.dataset, EntityKey.METRICS_COUNTERS)
        with raises(InvalidQueryException):
            creator.create(
                subscription,
                self.timer,
            )
Example #17
    def test(self) -> None:
        creator = SubscriptionCreator(self.dataset)
        subscription = LegacySubscriptionData(
            project_id=1,
            conditions=[],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=10),
            resolution=timedelta(minutes=1),
        )
        identifier = creator.create(subscription, Timer("test"))
        assert (
            cast(
                List[Tuple[UUID, SubscriptionData]],
                RedisSubscriptionDataStore(
                    redis_client, self.dataset, identifier.partition,
                ).all(),
            )[0][1]
            == subscription
        )

        SubscriptionDeleter(self.dataset, identifier.partition).delete(identifier.uuid)
        assert (
            RedisSubscriptionDataStore(
                redis_client, self.dataset, identifier.partition,
            ).all()
            == []
        )
Example #18
    def __execute_query(self, task: ScheduledSubscriptionTask,
                        tick_upper_offset: int) -> Tuple[Request, Result]:
        # Measure the amount of time that elapsed between the task's scheduled
        # time and the start of its execution.
        self.__metrics.timing("executor.latency",
                              (time.time() - task.timestamp.timestamp()) *
                              1000)

        timer = Timer("query")

        with self.__concurrent_gauge:
            request = task.task.subscription.data.build_request(
                self.__dataset,
                task.timestamp,
                tick_upper_offset,
                timer,
                self.__metrics,
                "subscriptions_executor",
            )

            result = parse_and_run_query(
                self.__dataset,
                request,
                timer,
                robust=True,
                concurrent_queries_gauge=self.__concurrent_clickhouse_gauge,
            ).result

            return (request, result)
Example #19
    def compare_conditions(
        self,
        subscription: SubscriptionData,
        exception: Optional[Type[Exception]],
        aggregate: str,
        value: Union[int, float],
    ) -> None:
        timer = Timer("test")
        if exception is not None:
            with pytest.raises(exception):
                request = subscription.build_request(
                    self.dataset,
                    datetime.utcnow(),
                    100,
                    timer,
                )
                parse_and_run_query(self.dataset, request, timer)
            return

        request = subscription.build_request(
            self.dataset,
            datetime.utcnow(),
            100,
            timer,
        )
        result = parse_and_run_query(self.dataset, request, timer)

        assert result.result["data"][0][aggregate] == value
Example #20
    def __execute(self, task: ScheduledTask[Subscription],
                  tick: Tick) -> Tuple[Request, Result]:
        # Measure the amount of time that elapsed between this task being
        # scheduled and the start of its execution.
        self.__metrics.timing("executor.latency",
                              (time.time() - task.timestamp.timestamp()) *
                              1000)

        # XXX: The ``query`` name is taken from the web views so that all query
        # performance metrics are reported to the same spot, regardless of
        # execution environment.
        timer = Timer("query")

        request = task.task.data.build_request(
            self.__dataset,
            task.timestamp,
            tick.offsets.upper,
            timer,
        )

        with self.__concurrent_gauge:
            # XXX: The ``extra`` is discarded from ``QueryResult`` since it is
            # not particularly useful in this context and duplicates data that
            # is already being published to the query log.
            # XXX: The ``request`` instance is copied when passed to
            # ``parse_and_run_query`` since it can/will be mutated during
            # processing.
            return (
                request,
                parse_and_run_query(self.__dataset, copy.deepcopy(request),
                                    timer).result,
            )
Example #21
def run_query(dataset: Dataset, request: Request, timer: Timer) -> WebQueryResult:
    try:
        result = parse_and_run_query(dataset, request, timer)
        payload = {**result.result, "timing": timer.for_json()}
        if settings.STATS_IN_RESPONSE or request.settings.get_debug():
            payload.update(result.extra)
        return WebQueryResult(payload, 200)
    except RawQueryException as e:
        return WebQueryResult(
            {
                "error": {"type": e.err_type, "message": e.message, **e.meta},
                "sql": e.sql,
                "stats": e.stats,
                "timing": timer.for_json(),
            },
            429 if e.err_type == "rate-limited" else 500,
        )
Example #22
def _format_storage_query_and_run(
    # TODO: remove dependency on Dataset. This is only for formatting the legacy ClickhouseQuery
    # with the AST this won't be needed.
    dataset: Dataset,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    request: Request,
) -> RawQueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    TODO: Once we have the AST in production and the StorageQuery abstraction,
    this function will probably collapse and disappear.
    """

    source = request.query.get_data_source().format_from()
    with sentry_sdk.start_span(description="create_query", op="db"):
        # TODO: Move the performance logic and the pre_where generation into
        # ClickhouseQuery since they are Clickhouse-specific.
        query = DictClickhouseQuery(dataset, request.query, request.settings)
    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": source,
        "final": request.query.get_final(),
        "referrer": request.referrer,
        "num_days": (to_date - from_date).days,
        "sample": request.query.get_sample(),
    }

    with sentry_sdk.start_span(description=query.format_sql(),
                               op="db") as span:
        span.set_tag("table", source)
        try:
            span.set_tag(
                "ast_query",
                AstClickhouseQuery(request.query,
                                   request.settings).format_sql(),
            )
        except Exception:
            logger.warning("Failed to format ast query", exc_info=True)

        return raw_query(request, query, timer, query_metadata, stats,
                         span.trace_id)
Example #23
def build_request(body, schema: RequestSchema, timer: Timer, dataset: Dataset,
                  referrer: str) -> Request:
    with sentry_sdk.start_span(description="build_request",
                               op="validate") as span:
        try:
            request = schema.validate(body, dataset, referrer)
        except (InvalidJsonRequestException,
                InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data("snuba_query", request.body)

        timer.mark("validate_schema")
        return request
Example #24
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    referrer: str,
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    reader: Reader,
) -> QueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    """
    from_clause = clickhouse_query.get_from_clause()
    visitor = TablesCollector()
    visitor.visit(from_clause)
    table_names = ",".join(sorted(visitor.get_tables()))
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        formatted_query = format_query(clickhouse_query, request_settings)
        span.set_data("query", formatted_query.structured())
        metrics.increment("execute")

    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": table_names,
        "final": visitor.any_final(),
        "referrer": referrer,
        "sample": visitor.get_sample_rate(),
    }

    with sentry_sdk.start_span(description=formatted_query.get_sql(),
                               op="db") as span:
        span.set_tag("table", table_names)

        return raw_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            query_metadata,
            stats,
            span.trace_id,
        )
Example #25
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    """
    source = clickhouse_query.get_from_clause().format_from()
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        formatted_query = AstSqlQuery(clickhouse_query, request_settings)
        span.set_data("query", formatted_query.sql_data())
        metrics.increment("execute")

    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": source,
        "final": clickhouse_query.get_final(),
        "referrer": referrer,
        "num_days": (to_date - from_date).days,
        "sample": clickhouse_query.get_sample(),
    }

    with sentry_sdk.start_span(description=formatted_query.format_sql(),
                               op="db") as span:
        span.set_tag("table", source)

        return raw_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            query_metadata,
            stats,
            span.trace_id,
        )
Example #26
    def __execute(self, task: ScheduledTask[Subscription],
                  tick: Tick) -> Tuple[Request, Result]:
        # Measure the amount of time that elapsed between this task being
        # scheduled and the start of its execution.
        self.__metrics.timing("executor.latency",
                              (time.time() - task.timestamp.timestamp()) *
                              1000)

        # XXX: The ``query`` name is taken from the web views so that all query
        # performance metrics are reported to the same spot, regardless of
        # execution environment.
        timer = Timer("query")

        data_type = "legacy"
        if isinstance(task.task.data, DelegateSubscriptionData):
            data_type = "delegate"
        elif isinstance(task.task.data, SnQLSubscriptionData):
            data_type = "snql"

        self.__metrics.increment("incoming.task", tags={"type": data_type})

        if isinstance(task.task.data, DelegateSubscriptionData):
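            # Delegate subscriptions are tried as SnQL first; if building or
            # executing the SnQL request fails, the error is logged and the
            # same subscription is retried via its legacy representation.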
            try:
                request = task.task.data.build_request(
                    self.__dataset,
                    task.timestamp,
                    tick.offsets.upper,
                    timer,
                    self.__metrics,
                )
                return self.__execute_query(request, timer, task)
            except Exception as e:
                self.__metrics.increment(
                    "snql.subscription.delegate.error.execution")
                logger.warning(
                    f"failed snql subscription query: {e}",
                    exc_info=e,
                    extra={
                        "error": str(e),
                        "data": task.task.data.to_dict()
                    },
                )
                request = task.task.data.to_legacy().build_request(
                    self.__dataset,
                    task.timestamp,
                    tick.offsets.upper,
                    timer,
                    self.__metrics,
                )
                return self.__execute_query(request, timer, task)

        request = task.task.data.build_request(self.__dataset, task.timestamp,
                                               tick.offsets.upper, timer,
                                               self.__metrics)
        return self.__execute_query(request, timer, task)
Example #27
def record_query(request: Request, timer: Timer,
                 query_metadata: SnubaQueryMetadata) -> None:
    if settings.RECORD_QUERIES:
        # Send to Redis.
        # We convert this to a dict before passing it to state in order to avoid a
        # circular dependency, where state would depend on the higher-level
        # QueryMetadata class.
        state.record_query(query_metadata.to_dict())

        final = str(request.query.get_final())
        referrer = request.referrer or "none"
        timer.send_metrics_to(
            metrics,
            tags={
                "status": query_metadata.status,
                "referrer": referrer,
                "final": final,
            },
            mark_tags={"final": final},
        )
Example #28
def _add_tags(timer: Timer, request: Optional[Request] = None) -> None:
    if Hub.current.scope.span:
        duration_group = timer.get_duration_group()
        sentry_sdk.set_tag("duration_group", duration_group)
        if duration_group == ">30s":
            sentry_sdk.set_tag("timeout", "too_long")

        if request is not None:
            experiments: MutableMapping[str,
                                        Any] = request.query.get_experiments()
            for name, value in experiments.items():
                sentry_sdk.set_tag(f"exp-{name}", str(value))
Example #29
def test_subscription_task_result_encoder() -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    entity_subscription = EventsSubscription(data_dict={})
    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count() AS count",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=entity_subscription,
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(get_dataset("events"), timestamp,
                                              None, Timer("timer"))
    result: Result = {
        "meta": [{
            "type": "UInt64",
            "name": "count"
        }],
        "data": [{
            "count": 1
        }],
    }

    task_result = SubscriptionTaskResult(
        ScheduledSubscriptionTask(
            timestamp,
            SubscriptionWithMetadata(
                EntityKey.EVENTS,
                Subscription(
                    SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                    subscription_data,
                ),
                5,
            ),
        ),
        (request, result),
    )

    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 3
    payload = data["payload"]

    assert payload["subscription_id"] == str(
        task_result.task.task.subscription.identifier)
    assert payload["request"] == request.original_body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()
    assert payload["entity"] == EntityKey.EVENTS.value
Example #30
def _record_timer_metrics(
    request: Request,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
) -> None:
    final = str(request.query.get_final())
    referrer = request.referrer or "none"
    timer.send_metrics_to(
        metrics,
        tags={
            "status": query_metadata.status.value,
            "referrer": referrer,
            "final": final,
            "dataset": query_metadata.dataset,
        },
        mark_tags={
            "final": final,
            "referrer": referrer,
            "dataset": query_metadata.dataset,
        },
    )