Ejemplo n.º 1
0
    def execute(
        self,
        query: FormattedQuery,
        # TODO: move Clickhouse specific arguments into clickhouse.query.Query
        settings: Optional[Mapping[str, str]] = None,
        with_totals: bool = False,
        robust: bool = False,
        capture_trace: bool = False,
    ) -> Result:
        """
        Run the formatted query through the underlying client and return
        the transformed result.

        A "query_id" entry in ``settings``, when present, is removed from
        the settings forwarded to the client and passed as the client's
        ``query_id`` argument instead (``None`` when absent). The mapping
        handed in by the caller is never mutated.

        When ``robust`` is True the client's ``execute_robust`` entry point
        is used, otherwise its ``execute`` entry point.
        """
        # Work on a private copy so that popping "query_id" stays local.
        client_settings = {**settings} if settings is not None else {}
        query_id = client_settings.pop("query_id", None)

        if robust is True:
            run_query = self.__client.execute_robust
        else:
            run_query = self.__client.execute

        raw_result = run_query(
            query.get_sql(),
            with_column_types=True,
            query_id=query_id,
            settings=client_settings,
            capture_trace=capture_trace,
        )
        return self.__transform_result(raw_result, with_totals=with_totals)
Ejemplo n.º 2
0
def test_mock_consumer() -> None:
    """
    Push one invalid message through a strategy built around the mock batch
    writer and check that the errors_local table still reports zero rows.
    """
    storage = get_writable_storage(StorageKey.ERRORS)

    mock_writer = build_mock_batch_writer(
        storage, True, TestingMetricsBackend(), 100, 50
    )
    factory = KafkaConsumerStrategyFactory(
        None,
        lambda message: None,
        mock_writer,
        max_batch_size=1,
        max_batch_time=1,
        processes=None,
        input_block_size=None,
        output_block_size=None,
        initialize_parallel_transform=None,
    )
    strategy = factory.create(lambda message: None)

    invalid_message = Message(
        Partition(Topic("events"), 0),
        1,
        KafkaPayload(None, b"INVALID MESSAGE", []),
        datetime.now(),
    )
    strategy.submit(invalid_message)
    strategy.close()
    strategy.join()

    # If the mock was not applied correctly we would have data in Clickhouse
    reader = storage.get_cluster().get_reader()
    count_query = FormattedQuery(
        [StringNode("SELECT count() as c from errors_local")]
    )
    result = reader.execute(count_query)
    assert result["data"] == [{"c": 0}]
Ejemplo n.º 3
0
def format_query(query: FormattableQuery) -> FormattedQuery:
    """
    Turn the AST representation of a Clickhouse query into a FormattedQuery.

    The returned intermediate structure can either be serialized into a
    string (for clickhouse) or extracted as a sequence (for logging and
    tracing).

    This is the entry point for any type of query, whether simple or
    composite.
    """
    formatted_content = _format_query_content(
        query, ClickhouseExpressionFormatter
    )
    return FormattedQuery(formatted_content)
Ejemplo n.º 4
0
    def execute(
        self,
        query: FormattedQuery,
        # TODO: move Clickhouse specific arguments into clickhouse.query.Query
        settings: Optional[Mapping[str, str]] = None,
        with_totals: bool = False,
    ) -> Result:
        """
        Run the formatted query through the underlying client and return
        the transformed result.

        A "query_id" entry in ``settings``, when present, is removed from
        the settings forwarded to the client and passed as a separate
        ``query_id`` keyword argument; when absent, no ``query_id`` argument
        is passed at all. The caller's mapping is never mutated.
        """
        # Work on a private copy so that popping "query_id" stays local.
        client_settings = {**settings} if settings is not None else {}

        if "query_id" in client_settings:
            extra_kwargs = {"query_id": client_settings.pop("query_id")}
        else:
            extra_kwargs = {}

        raw_result = self.__client.execute(
            query.get_sql(),
            with_column_types=True,
            settings=client_settings,
            **extra_kwargs,
        )
        return self.__transform_result(raw_result, with_totals=with_totals)
Ejemplo n.º 5
0
def format_query(query: FormattableQuery,
                 settings: RequestSettings) -> FormattedQuery:
    """
    Turn the AST representation of a Clickhouse query into a FormattedQuery.

    The returned intermediate structure can either be serialized into a
    string (for clickhouse) or extracted as a sequence (for logging and
    tracing).

    This is the entry point for any type of query, whether simple or
    composite.

    Side effect: when ``query`` is a ``Query``, turbo is enabled in
    ``settings`` and the FROM clause has no sampling rate yet, the FROM
    clause is replaced in place with a copy that uses
    ``snuba_settings.TURBO_SAMPLE_RATE``.

    TODO: Remove this method entirely and move the sampling logic
    into a query processor.
    """
    if (
        isinstance(query, Query)
        and settings.get_turbo()
        and not query.get_from_clause().sampling_rate
    ):
        sampled_from_clause = replace(
            query.get_from_clause(),
            sampling_rate=snuba_settings.TURBO_SAMPLE_RATE,
        )
        query.set_from_clause(sampled_from_clause)

    formatted_content = _format_query_content(query)
    return FormattedQuery(formatted_content)
Ejemplo n.º 6
0
def raw_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    stats: MutableMapping[str, Any],
    trace_id: Optional[str] = None,
) -> QueryResult:
    """
    Run the formatted SQL through the configured execution strategy and
    wrap the outcome together with the updated stats.

    Every runtime config whose key starts with "query_settings/" is
    collected (with that prefix stripped) and handed to the execution
    strategy. The strategy is the readthrough-caching one unless the
    "use_readthrough_query_cache" config is falsy.

    On success a QueryResult carrying the result, the stats and the SQL is
    returned. Any exception from the strategy is logged (except rate limit
    errors), recorded in the stats and re-raised as a QueryException chained
    to the original cause.

    This function is not supposed to depend on anything higher level than
    the clickhouse query. If this function ends up depending on the dataset,
    something is wrong.
    """
    settings_prefix = "query_settings/"
    all_confs = state.get_all_configs()
    query_settings: MutableMapping[str, Any] = {
        name[len(settings_prefix):]: value
        for name, value in all_confs.items()
        if name.startswith(settings_prefix)
    }

    timer.mark("get_configs")

    sql = formatted_query.get_sql()

    # Everything but the final status is known up front; bind it once.
    update_with_status = partial(
        update_query_metadata_and_stats,
        clickhouse_query,
        sql,
        timer,
        stats,
        query_metadata,
        query_settings,
        trace_id,
    )

    if state.get_config("use_readthrough_query_cache", 1):
        execute_query_strategy = execute_query_with_readthrough_caching
    else:
        execute_query_strategy = execute_query_with_caching

    try:
        result = execute_query_strategy(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            stats,
            query_settings,
        )
    except Exception as cause:
        if isinstance(cause, RateLimitExceeded):
            failure_status = QueryStatus.RATE_LIMITED
        else:
            with configure_scope() as scope:
                if isinstance(cause, ClickhouseError):
                    # Fingerprint by Clickhouse error code as well.
                    scope.fingerprint = ["{{default}}", str(cause.code)]
                logger.exception("Error running query: %s\n%s", sql, cause)
            failure_status = QueryStatus.ERROR
        stats = update_with_status(failure_status)
        raise QueryException({"stats": stats, "sql": sql}) from cause

    stats = update_with_status(QueryStatus.SUCCESS)
    return QueryResult(result, {"stats": stats, "sql": sql})
Ejemplo n.º 7
0
def get_query_cache_key(formatted_query: FormattedQuery) -> str:
    """Return the hexadecimal MD5 digest of the query's SQL text."""
    sql_bytes = force_bytes(formatted_query.get_sql())
    digest = md5(sql_bytes)
    return digest.hexdigest()
Ejemplo n.º 8
0
def format_query_anonymized(query: FormattableQuery) -> FormattedQuery:
    """
    Format the query with ClickHouseExpressionFormatterAnonymized and wrap
    the formatted content in a FormattedQuery.
    """
    anonymized_content = _format_query_content(
        query, ClickHouseExpressionFormatterAnonymized
    )
    return FormattedQuery(anonymized_content)
Ejemplo n.º 9
0
def format_snql_anonymized(
    query: Union[LogicalQuery, CompositeQuery[Entity]]
) -> FormattedQuery:
    """
    Format the logical (or composite) query with
    ExpressionFormatterAnonymized and wrap the formatted content in a
    FormattedQuery.
    """
    anonymized_content = _format_query_content(
        query, ExpressionFormatterAnonymized
    )
    return FormattedQuery(anonymized_content)
Ejemplo n.º 10
0
def test_composite_query() -> None:
    """
    Assemble a nested join query out of formatting nodes and verify both
    its SQL serialization and its structured representation.
    """
    # The two aliased subqueries on either side of the join.
    left_side = PaddingNode(
        None,
        FormattedSubQuery(
            [StringNode("SELECT a, b"), StringNode("FROM somewhere")]
        ),
        "t_a",
    )
    right_side = PaddingNode(
        None,
        FormattedSubQuery(
            [StringNode("SELECT a, b"), StringNode("FROM somewhere_else")]
        ),
        "t_b",
    )
    join_clause = SequenceNode(
        [
            left_side,
            StringNode("INNER SEMI JOIN"),
            right_side,
            StringNode("ON t_a.a = t_b.b"),
        ],
    )
    inner_query = FormattedSubQuery(
        [
            StringNode("SELECT t_a.a, t_b.b"),
            PaddingNode("FROM", join_clause),
        ],
    )
    query = FormattedQuery(
        [
            StringNode("SELECT avg(a)"),
            PaddingNode("FROM", inner_query),
            StringNode("WHERE something something"),
        ],
    )

    expected_sql = (
        "SELECT avg(a) FROM "
        "(SELECT t_a.a, t_b.b FROM "
        "(SELECT a, b FROM somewhere) t_a "
        "INNER SEMI JOIN "
        "(SELECT a, b FROM somewhere_else) t_b "
        "ON t_a.a = t_b.b) "
        "WHERE something something "
        "FORMAT JSON"
    )
    assert query.get_sql(format="JSON") == expected_sql

    expected_join = [
        [["SELECT a, b", "FROM somewhere"], "t_a"],
        "INNER SEMI JOIN",
        [["SELECT a, b", "FROM somewhere_else"], "t_b"],
        "ON t_a.a = t_b.b",
    ]
    expected_structure = [
        "SELECT avg(a)",
        ["FROM", ["SELECT t_a.a, t_b.b", ["FROM", expected_join]]],
        "WHERE something something",
    ]
    assert query.structured() == expected_structure
Ejemplo n.º 11
0
def raw_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    stats: MutableMapping[str, Any],
    trace_id: Optional[str] = None,
    robust: bool = False,
) -> QueryResult:
    """
    Run the formatted SQL through the configured execution strategy and
    wrap the outcome together with the updated stats.

    Every runtime config whose key starts with "query_settings/" is
    collected (with that prefix stripped) and handed to the execution
    strategy as the Clickhouse settings. The strategy is the
    readthrough-caching one unless the "use_readthrough_query_cache" config
    is falsy; ``robust`` is forwarded to it unchanged.

    On success a QueryResult carrying the result, the stats, the SQL and the
    query's experiments is returned. Any exception from the strategy is
    recorded in the stats and re-raised as a QueryException chained to the
    original cause. Errors other than rate limiting are also logged, and
    timeout-like failures get a "timeout" tag when the scope has a span.

    This function is not supposed to depend on anything higher level than
    the clickhouse query. If this function ends up depending on the dataset,
    something is wrong.
    """
    settings_prefix = "query_settings/"
    all_confs = state.get_all_configs()
    clickhouse_query_settings: MutableMapping[str, Any] = {
        name[len(settings_prefix):]: value
        for name, value in all_confs.items()
        if name.startswith(settings_prefix)
    }

    timer.mark("get_configs")

    sql = formatted_query.get_sql()

    # Everything but the final status is known up front; bind it once.
    update_with_status = partial(
        update_query_metadata_and_stats,
        clickhouse_query,
        sql,
        timer,
        stats,
        query_metadata,
        clickhouse_query_settings,
        trace_id,
    )

    if state.get_config("use_readthrough_query_cache", 1):
        execute_query_strategy = execute_query_with_readthrough_caching
    else:
        execute_query_strategy = execute_query_with_caching

    try:
        result = execute_query_strategy(
            clickhouse_query,
            query_settings,
            formatted_query,
            reader,
            timer,
            stats,
            clickhouse_query_settings,
            robust=robust,
        )
    except Exception as cause:
        if isinstance(cause, RateLimitExceeded):
            stats = update_with_status(QueryStatus.RATE_LIMITED)
        else:
            error_code = None
            timeout_tag: Optional[str] = None
            with configure_scope() as scope:
                if isinstance(cause, ClickhouseError):
                    error_code = cause.code
                    # Fingerprint by Clickhouse error code as well.
                    scope.fingerprint = ["{{default}}", str(cause.code)]
                    if cause.code == errors.ErrorCodes.TOO_SLOW:
                        timeout_tag = "predicted"
                    elif cause.code == errors.ErrorCodes.TIMEOUT_EXCEEDED:
                        timeout_tag = "query_timeout"
                    elif cause.code in (
                        errors.ErrorCodes.SOCKET_TIMEOUT,
                        errors.ErrorCodes.NETWORK_ERROR,
                    ):
                        timeout_tag = "network"
                elif isinstance(
                    cause,
                    (TimeoutError, ExecutionTimeoutError, TigerExecutionTimeoutError),
                ):
                    timeout_tag = "cache_timeout"

                # The tag is only set when the scope has a span.
                if timeout_tag is not None and scope.span:
                    sentry_sdk.set_tag("timeout", timeout_tag)

                logger.exception("Error running query: %s\n%s", sql, cause)
            stats = update_with_status(QueryStatus.ERROR, error_code=error_code)

        failure_extra = {
            "stats": stats,
            "sql": sql,
            "experiments": clickhouse_query.get_experiments(),
        }
        raise QueryException(failure_extra) from cause

    stats = update_with_status(QueryStatus.SUCCESS, result["profile"])
    success_extra = {
        "stats": stats,
        "sql": sql,
        "experiments": clickhouse_query.get_experiments(),
    }
    return QueryResult(result, success_extra)