Code example #1
    def validate(self, exp: Expression, dataset: Dataset) -> None:
        if not isinstance(exp, FunctionCall):
            return

        dataset_validators = dataset.get_function_call_validators()
        common_function_validators = (
            dataset_validators.keys() & default_validators.keys()
        )
        if common_function_validators:
            logger.warning(
                "Dataset validators are overlapping with default ones. Dataset: %s. Overlap %r",
                dataset,
                common_function_validators,
                exc_info=True,
            )

        validators = ChainMap(default_validators, dataset_validators)
        try:
            validator = validators.get(exp.function_name)
            if validator is not None:
                validator.validate(exp.parameters, dataset.get_abstract_columnset())
        except InvalidFunctionCall as exception:
            raise InvalidExpressionException(
                exp, f"Illegal call to function {exp.function_name}: {str(exception)}",
            ) from exception
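The ChainMap lookup above resolves a function name against default_validators first, which is why the overlap warning matters: a dataset validator registered under the same name is shadowed. A minimal stdlib-only sketch of that precedence, with placeholder validator values:

from collections import ChainMap

default_validators = {"uniq": "default uniq validator"}
dataset_validators = {"uniq": "dataset uniq validator", "apdex": "dataset apdex validator"}

validators = ChainMap(default_validators, dataset_validators)
# The first mapping wins on key collisions, so the default validator shadows
# the dataset-specific one registered for "uniq".
assert validators["uniq"] == "default uniq validator"
# Keys only present in the second mapping are still reachable.
assert validators["apdex"] == "dataset apdex validator"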
Code example #2
File: migrate.py  Project: ruezetle/snuba
def run(conn: Client, dataset: Dataset) -> None:
    schemas: MutableSequence[Schema] = []

    writable_storage = dataset.get_writable_storage()
    if writable_storage:
        writer = writable_storage.get_table_writer()
        schemas.append(writer.get_schema())
    for storage in dataset.get_all_storages():
        schemas.append(storage.get_schemas().get_read_schema())

    for schema in schemas:
        _run_schema(conn, schema)
Code example #3
File: __init__.py  Project: isabella232/snuba
def parse_query(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    """
    Parses the query body generating the AST. This only takes into
    account the initial query body. Extensions are parsed by extension
    processors and are supposed to update the AST.

    Parsing includes two phases. The first transforms the json body into
    a minimal query Object resolving expressions, conditions, etc.
    The second phase performs some query processing to provide a sane
    query to the dataset specific section.
    - It prevents alias shadowing.
    - It transforms columns from the tags[asd] form into
      SubscriptableReference.
    - Applies aliases to all columns that do not have one and that do not
      represent a reference to an existing alias.
      During query processing a column can be transformed into a different
      expression. It is essential to preserve the original column name so
      that the result set still has a column with the name provided by the
      user no matter on which transformation we applied.
      By applying aliases at this stage every processor just needs to
      preserve them to guarantee the correctness of the query.
    - Expands all the references to aliases by inlining the expression
      to make aliasing transparent to all query processing phases.
      References to aliases are reintroduced at the end of the query
      processing.
      Alias references are packaged back at the end of processing.
    """
    # TODO: Parse the entity out of the query body and select the correct one from the dataset
    entity = dataset.get_default_entity()

    query = _parse_query_impl(body, entity)
    # TODO: These should support composite queries.
    _validate_empty_table_names(query)
    _validate_aliases(query)
    _parse_subscriptables(query)
    _apply_column_aliases(query)
    _expand_aliases(query)
    # WARNING: The steps above assume table resolution has not happened
    # yet. If it is moved earlier than this point (unlikely), they need to be adapted.
    _deescape_aliases(query)
    _mangle_aliases(query)
    _validate_arrayjoin(query)

    # XXX: Select the entity to be used for the query. This step is temporary. Eventually
    # entity selection will be moved to Sentry and specified for all SnQL queries.
    selected_entity = dataset.select_entity(query)
    query_entity = QueryEntity(
        selected_entity, get_entity(selected_entity).get_data_model()
    )
    query.set_from_clause(query_entity)

    validate_query(query)
    return query
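One of the second-phase steps listed above (_parse_subscriptables) rewrites columns written as tags[asd] into a SubscriptableReference. A rough standalone sketch of the name-splitting idea, using a hypothetical regex and a plain tuple instead of Snuba's real AST classes:

import re
from typing import Optional, Tuple

# Hypothetical pattern: a column name followed by a bracketed key, e.g. tags[asd].
SUBSCRIPT_RE = re.compile(r"^(?P<column>[a-zA-Z_][a-zA-Z0-9_.]*)\[(?P<key>.+)\]$")

def split_subscriptable(name: str) -> Optional[Tuple[str, str]]:
    """Return (column, key) for tags[asd]-style names, or None for plain columns."""
    match = SUBSCRIPT_RE.match(name)
    if match is None:
        return None
    return match.group("column"), match.group("key")

assert split_subscriptable("tags[asd]") == ("tags", "asd")
assert split_subscriptable("timestamp") is None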
Code example #4
File: factory.py  Project: ruezetle/snuba
def enforce_table_writer(dataset: Dataset) -> TableWriter:
    writable_storage = dataset.get_writable_storage()

    assert (
        writable_storage is not None
    ), f"Dataset{dataset} does not have a writable storage."
    return writable_storage.get_table_writer()
Code example #5
def _run_query_pipeline(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
) -> QueryResult:
    """
    Runs the query processing and execution pipeline for a Snuba Query. This means it takes a Dataset
    and a Request and returns the results of the query.

    This process includes:
    - Applying dataset specific syntax extensions (QueryExtension)
    - Applying dataset query processors on the abstract Snuba query.
    - Using the dataset provided ClickhouseQueryPlanBuilder to build a ClickhouseQueryPlan. This step
      transforms the Snuba Query into the Storage Query (which is contextual to the storage(s)).
      From this point on nothing should depend on the dataset.
    - Executing the plan specific query processors.
    - Providing the newly built Query, processors to be run for each DB query and a QueryRunner
      to the QueryExecutionStrategy to actually run the DB Query.
    """
    if not request.settings.get_turbo() and SampleClauseFinder().visit(
        request.query.get_from_clause()
    ):
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    query_runner = partial(
        _run_and_apply_column_names, timer, query_metadata, request.referrer,
    )

    return (
        dataset.get_query_pipeline_builder()
        .build_execution_pipeline(request, query_runner)
        .execute()
    )
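The query_runner is built with functools.partial so that the timer, metadata and referrer are bound up front and the execution pipeline only supplies the per-query arguments later. A small stdlib-only illustration of that pattern; the runner signature below is invented for the example, not the real _run_and_apply_column_names:

from functools import partial

def run_and_apply_column_names(timer: str, referrer: str, query: str) -> str:
    # Stand-in for _run_and_apply_column_names: the real function runs the
    # ClickHouse query; here we only show how the arguments get bound.
    return f"running {query!r} (timer={timer}, referrer={referrer})"

# Bind the leading arguments up front, as _run_query_pipeline does.
query_runner = partial(run_and_apply_column_names, "timer-1", "subscriptions")

# The pipeline later calls the runner with only the remaining argument.
print(query_runner("SELECT count() FROM events"))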
Code example #6
File: views.py  Project: jiankunking/snuba
    def drop(*, dataset: Dataset):
        for statement in dataset.get_dataset_schemas().get_drop_statements():
            clickhouse_rw.execute(statement.statement)

        ensure_table_exists(dataset, force=True)
        redis_client.flushdb()
        return ("ok", 200, {"Content-Type": "text/plain"})
Code example #7
    def eventstream(*, dataset: Dataset):
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from snuba.consumer import ConsumerWorker

            worker = ConsumerWorker(storage, metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Code example #8
File: data.py  Project: ruezetle/snuba
    def build_request(
        self, dataset: Dataset, timestamp: datetime, offset: Optional[int], timer: Timer
    ) -> Request:
        """
        Returns a Request that can be used to run a query via `parse_and_run_query`.
        :param dataset: The Dataset to build the request for
        :param timestamp: Date that the query should run up until
        :param offset: Maximum offset we should query for
        """
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), SubscriptionRequestSettings,
        )
        extra_conditions: Sequence[Condition] = []
        if offset is not None:
            extra_conditions = [[["ifnull", ["offset", 0]], "<=", offset]]
        return validate_request_content(
            {
                "project": self.project_id,
                "conditions": [*self.conditions, *extra_conditions],
                "aggregations": self.aggregations,
                "from_date": (timestamp - self.time_window).isoformat(),
                "to_date": timestamp.isoformat(),
            },
            schema,
            timer,
            dataset,
            SUBSCRIPTION_REFERRER,
        )
Code example #9
File: views.py  Project: Appva/snuba
def validate_request_content(body, schema: RequestSchema, timer,
                             dataset: Dataset) -> Request:
    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    try:
        request = schema.validate(body, source)
    except jsonschema.ValidationError as error:
        raise BadRequest(str(error)) from error

    timer.mark('validate_schema')

    return request
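schema.validate here ultimately relies on jsonschema validation, and failures surface as jsonschema.ValidationError before being wrapped in BadRequest. A minimal sketch of that underlying behaviour, using an invented toy schema rather than the real RequestSchema:

import jsonschema

# A made-up, much simpler schema than the real RequestSchema.
toy_schema = {
    "type": "object",
    "properties": {"project": {"type": "integer"}},
    "required": ["project"],
}

try:
    jsonschema.validate({"project": "not-an-int"}, toy_schema)
except jsonschema.ValidationError as error:
    # In validate_request_content this is re-raised as BadRequest(str(error)).
    print(f"rejected request: {error.message}")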
Code example #10
File: views.py  Project: getsentry/snuba
def delete_subscription(
    *, dataset: Dataset, partition: int, key: str, entity: Entity
) -> RespTuple:
    if entity not in dataset.get_all_entities():
        raise InvalidSubscriptionError(
            "Invalid subscription dataset and entity combination"
        )
    entity_key = ENTITY_NAME_LOOKUP[entity]
    SubscriptionDeleter(entity_key, PartitionId(partition)).delete(UUID(key))
    metrics.increment("subscription_deleted", tags={"entity": entity_key.value})

    return "ok", 202, {"Content-Type": "text/plain"}
Code example #11
File: views.py  Project: getsentry/snuba
    def eventstream(*, dataset: Dataset) -> RespTuple:
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from arroyo.processing.strategies.streaming import (
                KafkaConsumerStrategyFactory,
            )

            from snuba.consumers.consumer import build_batch_writer, process_message

            table_writer = storage.get_table_writer()
            stream_loader = table_writer.get_stream_loader()
            strategy = KafkaConsumerStrategyFactory(
                stream_loader.get_pre_filter(),
                functools.partial(
                    process_message, stream_loader.get_processor(), "consumer_group"
                ),
                build_batch_writer(table_writer, metrics=metrics),
                max_batch_size=1,
                max_batch_time=1.0,
                processes=None,
                input_block_size=None,
                output_block_size=None,
            ).create(lambda offsets: None)
            strategy.submit(message)
            strategy.close()
            strategy.join()
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, "consumer_group", metrics=metrics)
            processed = worker.process_message(message)
            if processed is not None:
                batch = [processed]
                worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Code example #12
File: replacer.py  Project: Appva/snuba
    def __init__(
        self, clickhouse: ClickhousePool, dataset: Dataset, metrics: MetricsBackend
    ) -> None:
        self.clickhouse = clickhouse
        self.dataset = dataset
        self.metrics = metrics
        self.__all_column_names = [
            col.escaped
            for col in enforce_table_writer(dataset).get_schema().get_columns()
        ]
        self.__required_columns = [
            col.escaped for col in dataset.get_required_columns()
        ]
Code example #13
def dataset_query(dataset: Dataset, body, timer: Timer) -> Response:
    assert http_request.method == "POST"

    with sentry_sdk.start_span(description="build_schema", op="validate"):
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), HTTPRequestSettings
        )

    request = build_request(body, schema, timer, dataset, http_request.referrer)

    try:
        result = parse_and_run_query(dataset, request, timer)
    except QueryException as exception:
        status = 500
        details: Mapping[str, Any]

        cause = exception.__cause__
        if isinstance(cause, RateLimitExceeded):
            status = 429
            details = {
                "type": "rate-limited",
                "message": "rate limit exceeded",
            }
        elif isinstance(cause, ClickhouseError):
            details = {
                "type": "clickhouse",
                "message": str(cause),
                "code": cause.code,
            }
        elif isinstance(cause, Exception):
            details = {
                "type": "unknown",
                "message": str(cause),
            }
        else:
            raise  # exception should have been chained

        return Response(
            json.dumps(
                {"error": details, "timing": timer.for_json(), **exception.extra}
            ),
            status,
            {"Content-Type": "application/json"},
        )

    payload: MutableMapping[str, Any] = {**result.result, "timing": timer.for_json()}

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        payload.update(result.extra)

    return Response(json.dumps(payload), 200, {"Content-Type": "application/json"})
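The status mapping above works through exception chaining: the lower layers raise QueryException from the underlying cause, and this handler inspects exception.__cause__ to choose the HTTP status. A small self-contained sketch of that mechanism, using stand-in exception classes rather than the real snuba ones:

class QueryException(Exception):
    pass

class RateLimitExceeded(Exception):
    pass

def run() -> None:
    try:
        raise RateLimitExceeded("too many concurrent queries")
    except RateLimitExceeded as cause:
        # Chain the original error so callers can map it to an HTTP status.
        raise QueryException("query failed") from cause

try:
    run()
except QueryException as exception:
    status = 429 if isinstance(exception.__cause__, RateLimitExceeded) else 500
    print(status)  # 429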
Code example #14
def truncate_dataset(dataset: Dataset) -> None:
    for storage in dataset.get_all_storages():
        cluster = storage.get_cluster()
        clickhouse = cluster.get_query_connection(ClickhouseClientSettings.MIGRATE)
        database = cluster.get_database()

        schema = storage.get_schema()

        if not isinstance(schema, TableSchema):
            return

        table = schema.get_local_table_name()

        clickhouse.execute(f"TRUNCATE TABLE IF EXISTS {database}.{table}")
Code example #15
File: views.py  Project: ruezetle/snuba
def dataset_query_view(*, dataset: Dataset, timer: Timer):
    if http_request.method == "GET":
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), HTTPRequestSettings
        )
        return render_template(
            "query.html",
            query_template=json.dumps(schema.generate_template(), indent=4,),
        )
    elif http_request.method == "POST":
        body = parse_request_body(http_request)
        return dataset_query(dataset, body, timer)
    else:
        assert False, "unexpected fallthrough"
Code example #16
def test_no_schema_diffs(dataset: Dataset) -> None:
    from snuba.migrations.parse_schema import get_local_schema

    writable_storage = dataset.get_writable_storage()
    if not writable_storage:
        pytest.skip(f"{dataset!r} has no writable storage")

    clickhouse = writable_storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.MIGRATE)
    table_writer = writable_storage.get_table_writer()
    dataset_schema = table_writer.get_schema()
    local_table_name = dataset_schema.get_local_table_name()
    local_schema = get_local_schema(clickhouse, local_table_name)

    assert not dataset_schema.get_column_differences(local_schema)
Code example #17
File: views.py  Project: getsentry/snuba
def create_subscription(*, dataset: Dataset, timer: Timer, entity: Entity) -> RespTuple:
    if entity not in dataset.get_all_entities():
        raise InvalidSubscriptionError(
            "Invalid subscription dataset and entity combination"
        )
    entity_key = ENTITY_NAME_LOOKUP[entity]
    subscription = SubscriptionDataCodec(entity_key).decode(http_request.data)
    identifier = SubscriptionCreator(dataset, entity_key).create(subscription, timer)

    metrics.increment("subscription_created", tags={"entity": entity_key.value})
    return (
        json.dumps({"subscription_id": str(identifier)}),
        202,
        {"Content-Type": "application/json"},
    )
Code example #18
File: views.py  Project: jiankunking/snuba
def dataset_query(dataset: Dataset, body, timer: Timer) -> Response:
    assert http_request.method == "POST"
    ensure_table_exists(dataset)
    return format_result(
        run_query(
            dataset,
            validate_request_content(
                body,
                RequestSchema.build_with_extensions(dataset.get_extensions(),
                                                    HTTPRequestSettings),
                timer,
                dataset,
                http_request.referrer,
            ),
            timer,
        ))
Code example #19
File: views.py  Project: jiankunking/snuba
    def ensure_table_exists(dataset: Dataset, force: bool = False) -> None:
        if not force and _ensured.get(dataset, False):
            return

        assert local_dataset_mode(), "Cannot create table in distributed mode"

        from snuba import migrate

        # We cannot build distributed tables this way. So this only works in local
        # mode.
        for statement in dataset.get_dataset_schemas().get_create_statements():
            clickhouse_rw.execute(statement.statement)

        migrate.run(clickhouse_rw, dataset)

        _ensured[dataset] = True
Code example #20
File: views.py  Project: chhetripradeep/snuba
def dataset_query_view(*, dataset: Dataset, timer: Timer) -> Union[Response, str]:
    if http_request.method == "GET":
        schema = RequestSchema.build_with_extensions(
            dataset.get_default_entity().get_extensions(),
            HTTPRequestSettings,
            Language.LEGACY,
        )
        return render_template(
            "query.html",
            query_template=json.dumps(schema.generate_template(), indent=4,),
        )
    elif http_request.method == "POST":
        body = parse_request_body(http_request)
        _trace_transaction(dataset)
        return dataset_query(dataset, body, timer, Language.LEGACY)
    else:
        assert False, "unexpected fallthrough"
Code example #21
File: views.py  Project: getsentry/snuba
def truncate_dataset(dataset: Dataset) -> None:
    for entity in dataset.get_all_entities():
        for storage in entity.get_all_storages():
            cluster = storage.get_cluster()
            nodes = [*cluster.get_local_nodes(), *cluster.get_distributed_nodes()]
            for node in nodes:
                clickhouse = cluster.get_node_connection(
                    ClickhouseClientSettings.MIGRATE, node
                )

                database = cluster.get_database()

                schema = storage.get_schema()

                if not isinstance(schema, TableSchema):
                    return

                table = schema.get_local_table_name()

                clickhouse.execute(f"TRUNCATE TABLE IF EXISTS {database}.{table}")
Code example #22
def parse_query(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    """
    Parses the query body, generating the AST. This only takes into
    account the initial query body. Extensions are parsed by extension
    processors and are supposed to update the AST.
    """
    try:
        return _parse_query_impl(body, dataset)
    except Exception as e:
        # During development there is no need to fail Snuba queries if the parser
        # has an issue; the production query is still run based on the old query
        # representation.
        # Once we actually use the AST to build the Clickhouse query,
        # this try/except block will disappear.
        enforce_validity = state.get_config("query_parsing_enforce_validity",
                                            0)
        if enforce_validity:
            raise e
        else:
            logger.exception("Failed to parse query")
            source = dataset.get_dataset_schemas().get_read_schema(
            ).get_data_source()
            return Query(body, source)
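The query_parsing_enforce_validity flag acts as a runtime kill-switch: while it is off, parser failures are only logged and the old query representation is used. A rough sketch of that guard pattern, with a stand-in get_config instead of snuba.state:

import logging

logger = logging.getLogger(__name__)

# Stand-in for snuba.state.get_config: a runtime-tunable setting with a default.
_runtime_config = {"query_parsing_enforce_validity": 0}

def get_config(key, default):
    return _runtime_config.get(key, default)

def broken_parser(body):
    raise ValueError("parser bug")

def parse_or_fall_back(parse, body, fallback):
    try:
        return parse(body)
    except Exception:
        # With the flag off, failures are only logged and the fallback is used.
        if get_config("query_parsing_enforce_validity", 0):
            raise
        logger.exception("Failed to parse query")
        return fallback(body)

print(parse_or_fall_back(broken_parser, {"selected_columns": ["event_id"]}, lambda body: {"legacy": body}))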
Code example #23
File: query.py  Project: ruezetle/snuba
def _run_query_pipeline(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
) -> RawQueryResult:
    """
    Runs the query processing and execution pipeline for a Snuba Query. This means it takes a Dataset
    and a Request and returns the results of the query.

    This process includes:
    - Applying dataset specific syntax extensions (QueryExtension)
    - Applying dataset query processors on the abstract Snuba query.
    - Using the dataset provided StorageQueryPlanBuilder to build a StorageQueryPlan. This step
      transforms the Snuba Query into the Storage Query (which is contextual to the storage(s)).
      From this point on nothing should depend on the dataset.
    - Executing the storage specific query processors.
    - Providing the newly built Query and a QueryRunner to the QueryExecutionStrategy to actually
      run the DB Query.
    """

    # TODO: this will work perfectly with datasets that are not time series. Remove it.
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"])

    if (request.query.get_sample() is not None and request.query.get_sample()
            != 1.0) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo",
                          tags={"referrer": request.referrer})

    extensions = dataset.get_extensions()
    for name, extension in extensions.items():
        extension.get_processor().process_query(request.query,
                                                request.extensions[name],
                                                request.settings)

    # TODO: Fit this in a query processor. All query transformations should be driven by
    # datasets/storages and never hardcoded.
    if request.settings.get_turbo():
        request.query.set_final(False)

    for processor in dataset.get_query_processors():
        processor.process_query(request.query, request.settings)

    storage_query_plan = dataset.get_query_plan_builder().build_plan(request)

    # TODO: This below should be a storage specific query processor.
    relational_source = request.query.get_data_source()
    request.query.add_conditions(relational_source.get_mandatory_conditions())

    for processor in storage_query_plan.query_processors:
        processor.process_query(request.query, request.settings)

    query_runner = partial(
        _format_storage_query_and_run,
        dataset,
        timer,
        query_metadata,
        from_date,
        to_date,
    )

    return storage_query_plan.execution_strategy.execute(request, query_runner)
Code example #24
File: query.py  Project: jiankunking/snuba
def parse_and_run_query(
    dataset: Dataset, request: Request, timer: Timer
) -> ClickhouseQueryResult:
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    extensions = dataset.get_extensions()
    for name, extension in extensions.items():
        extension.get_processor().process_query(
            request.query, request.extensions[name], request.settings
        )

    request.query.add_conditions(dataset.default_conditions())

    if request.settings.get_turbo():
        request.query.set_final(False)

    for processor in dataset.get_query_processors():
        processor.process_query(request.query, request.settings)

    relational_source = request.query.get_data_source()
    request.query.add_conditions(relational_source.get_mandatory_conditions())

    source = relational_source.format_from()
    with sentry_sdk.start_span(description="create_query", op="db"):
        # TODO: consider moving the performance logic and the pre_where generation into
        # ClickhouseQuery since they are Clickhouse specific
        query = DictClickhouseQuery(dataset, request.query, request.settings)
    timer.mark("prepare_query")

    num_days = (to_date - from_date).days
    stats = {
        "clickhouse_table": source,
        "final": request.query.get_final(),
        "referrer": request.referrer,
        "num_days": num_days,
        "sample": request.query.get_sample(),
    }

    with sentry_sdk.configure_scope() as scope:
        if scope.span:
            scope.span.set_tag("dataset", type(dataset).__name__)
            scope.span.set_tag("referrer", http_request.referrer)
            scope.span.set_tag("timeframe_days", num_days)

    with sentry_sdk.start_span(description=query.format_sql(), op="db") as span:
        span.set_tag("dataset", type(dataset).__name__)
        span.set_tag("table", source)
        try:
            span.set_tag(
                "ast_query",
                AstClickhouseQuery(request.query, request.settings).format_sql(),
            )
        except Exception:
            logger.exception("Failed to format ast query")
        result = raw_query(
            request, query, NativeDriverReader(clickhouse_ro), timer, stats
        )

    with sentry_sdk.configure_scope() as scope:
        if scope.span:
            if "max_threads" in stats:
                scope.span.set_tag("max_threads", stats["max_threads"])

    return result
Code example #25
def _parse_query_impl(body: MutableMapping[str, Any],
                      dataset: Dataset) -> Query:
    aggregate_exprs = []
    for aggregation in body.get("aggregations", []):
        assert isinstance(aggregation, (list, tuple))
        aggregation_function = aggregation[0]
        column_expr = aggregation[1]
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None

        aggregate_exprs.append(
            parse_aggregation(aggregation_function, column_expr, alias))

    groupby_exprs = [
        parse_expression(tuplify(group_by))
        for group_by in to_list(body.get("groupby", []))
    ]
    select_exprs = [
        parse_expression(tuplify(select))
        for select in body.get("selected_columns", [])
    ]

    selected_cols = groupby_exprs + aggregate_exprs + select_exprs

    arrayjoin = body.get("arrayjoin")
    if arrayjoin:
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"])
    else:
        array_join_expr = None

    where_expr = parse_conditions_to_expr(body.get("conditions", []), dataset,
                                          arrayjoin)
    having_expr = parse_conditions_to_expr(body.get("having", []), dataset,
                                           arrayjoin)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            assert match is not None, f"Invalid Order By clause {orderby}"
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            assert match is not None, f"Invalid Order By clause {orderby}"
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ValueError(f"Invalid Order By clause {orderby}")
        orderby_parsed = parse_expression(tuplify(orderby))
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC
                if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            ))

    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    return Query(
        body,
        source,
        selected_columns=selected_cols,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=groupby_exprs,
        having=having_expr,
        order_by=orderby_exprs,
    )
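For context, the legacy body this function consumes encodes aggregations as [function, column, alias] triples, conditions as [lhs, op, rhs] triples (with a nested list forming an OR group), and descending order with a leading minus. A hypothetical body in that shape, with invented values:

legacy_body = {
    "selected_columns": ["event_id"],
    "aggregations": [["count()", "", "times_seen"]],
    "groupby": ["project_id"],
    "conditions": [
        ["timestamp", ">=", "2021-01-01T00:00:00"],
        [["platform", "=", "python"], ["platform", "=", "rust"]],  # nested list = OR group
    ],
    "having": [["times_seen", ">", 10]],
    "orderby": "-times_seen",  # leading "-" means descending
}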
Code example #26
File: views.py  Project: getsentry/snuba
    def write(*, dataset: Dataset) -> RespTuple:
        return _write_to_entity(entity=dataset.get_default_entity())
Code example #27
File: conditions.py  Project: jiankunking/snuba
def parse_conditions(
    operand_builder: Callable[[Any], TExpression],
    and_builder: Callable[[Sequence[TExpression]], Optional[TExpression]],
    or_builder: Callable[[Sequence[TExpression]], Optional[TExpression]],
    unpack_array_condition_builder: Callable[[TExpression, str, Any],
                                             TExpression],
    simple_condition_builder: Callable[[TExpression, str, Any], TExpression],
    dataset: Dataset,
    conditions: Any,
    array_join: Optional[str],
    depth: int = 0,
) -> Optional[TExpression]:
    """
    Return a boolean expression suitable for putting in the WHERE clause of the
    query.  The expression is constructed by ANDing groups of OR expressions.
    Expansion of columns is handled, as is replacement of columns with aliases,
    if the column has already been expanded and aliased elsewhere.

    operand_builder: Builds the TExpression representing the left hand side
      of a simple condition. This can be as nested as the user wants
    and_builder / or_builder: Combine a list of expressions in AND/OR
    unpack_array_condition_builder: Deals with a special case where we unpack conditions
      on array columns. More details in the code.
    simple_condition_builder: Generates a simple condition made by expression on the
      left hand side, an operator and a literal on the right hand side.
    """
    from snuba.clickhouse.columns import Array

    if not conditions:
        return None

    if depth == 0:
        # dedupe conditions at top level, but keep them in order
        sub = OrderedDict((
            parse_conditions(
                operand_builder,
                and_builder,
                or_builder,
                unpack_array_condition_builder,
                simple_condition_builder,
                dataset,
                cond,
                array_join,
                depth + 1,
            ),
            None,
        ) for cond in conditions)
        return and_builder([s for s in sub.keys() if s])
    elif is_condition(conditions):
        lhs, op, lit = dataset.process_condition(conditions)

        # facilitate deduping IN conditions by sorting them.
        if op in ("IN", "NOT IN") and isinstance(lit, tuple):
            lit = tuple(sorted(lit))

        # If the LHS is a simple column name that refers to an array column
        # (and we are not arrayJoining on that column, which would make it
        # scalar again) and the RHS is a scalar value, we assume that the user
        # actually means to check if any (or all) items in the array match the
        # predicate, so we return an `any(x == value for x in array_column)`
        # type expression. We assume that operators looking for a specific value
        # (IN, =, LIKE) are looking for rows where any array value matches, and
        # exclusionary operators (NOT IN, NOT LIKE, !=) are looking for rows
        # where all elements match (eg. all NOT LIKE 'foo').
        columns = dataset.get_dataset_schemas().get_read_schema().get_columns()
        if (isinstance(lhs, str) and lhs in columns
                and isinstance(columns[lhs].type, Array)
                and columns[lhs].base_name != array_join
                and not isinstance(lit, (list, tuple))):
            return unpack_array_condition_builder(operand_builder(lhs), op,
                                                  lit)
        else:
            return simple_condition_builder(operand_builder(lhs), op, lit)

    elif depth == 1:
        sub_expression = (parse_conditions(
            operand_builder,
            and_builder,
            or_builder,
            unpack_array_condition_builder,
            simple_condition_builder,
            dataset,
            cond,
            array_join,
            depth + 1,
        ) for cond in conditions)
        return or_builder([s for s in sub_expression if s])
    else:
        raise InvalidConditionException(str(conditions))
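parse_conditions is generic over TExpression; the caller decides how operands, AND/OR groups, and simple or array conditions are represented by passing in builder callbacks. A toy, string-producing set of builders (purely illustrative, not the builders Snuba actually passes in) shows the expected callback shapes:

from typing import Any, Optional, Sequence

def operand_builder(column: Any) -> str:
    return str(column)

def and_builder(expressions: Sequence[str]) -> Optional[str]:
    return " AND ".join(expressions) if expressions else None

def or_builder(expressions: Sequence[str]) -> Optional[str]:
    return "(" + " OR ".join(expressions) + ")" if expressions else None

def simple_condition_builder(lhs: str, op: str, literal: Any) -> str:
    return f"{lhs} {op} {literal!r}"

def unpack_array_condition_builder(lhs: str, op: str, literal: Any) -> str:
    # Conceptually: "any element of the array column matches the predicate".
    return f"arrayExists(x -> x {op} {literal!r}, {lhs})"

# Two simple conditions combined the way the depth == 0 branch ANDs its groups.
print(and_builder([
    simple_condition_builder(operand_builder("project_id"), "=", 1),
    unpack_array_condition_builder(operand_builder("exception_stacks.type"), "LIKE", "%Error%"),
]))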
Code example #28
File: factory.py  Project: jiankunking/snuba
def enforce_table_writer(dataset: Dataset) -> TableWriter:
    table_writer = dataset.get_table_writer()
    assert table_writer is not None, f"Dataset {dataset} is not writable"
    return table_writer
Code example #29
File: views.py  Project: chhetripradeep/snuba
def dataset_query(
    dataset: Dataset, body: MutableMapping[str, Any], timer: Timer, language: Language
) -> Response:
    assert http_request.method == "POST"
    referrer = http_request.referrer or "<unknown>"  # mypy

    if language == Language.SNQL:
        metrics.increment("snql.query.incoming", tags={"referrer": referrer})
        parser: Callable[
            [RequestParts, RequestSettings, Dataset],
            Union[Query, CompositeQuery[Entity]],
        ] = partial(parse_snql_query, [])
    else:
        parser = parse_legacy_query

    with sentry_sdk.start_span(description="build_schema", op="validate"):
        schema = RequestSchema.build_with_extensions(
            dataset.get_default_entity().get_extensions(), HTTPRequestSettings, language
        )

    request = build_request(
        body, parser, HTTPRequestSettings, schema, dataset, timer, referrer
    )

    try:
        result = parse_and_run_query(dataset, request, timer)

        # Some metrics to track the adoption of SnQL
        query_type = "simple"
        if language == Language.SNQL:
            if isinstance(request.query, CompositeQuery):
                if isinstance(request.query.get_from_clause(), JoinClause):
                    query_type = "join"
                else:
                    query_type = "subquery"

            metrics.increment(
                "snql.query.success", tags={"referrer": referrer, "type": query_type}
            )

    except QueryException as exception:
        status = 500
        details: Mapping[str, Any]

        cause = exception.__cause__
        if isinstance(cause, RateLimitExceeded):
            status = 429
            details = {
                "type": "rate-limited",
                "message": "rate limit exceeded",
            }
        elif isinstance(cause, ClickhouseError):
            details = {
                "type": "clickhouse",
                "message": str(cause),
                "code": cause.code,
            }
        elif isinstance(cause, Exception):
            details = {
                "type": "unknown",
                "message": str(cause),
            }
        else:
            raise  # exception should have been chained

        if language == Language.SNQL:
            metrics.increment(
                "snql.query.failed", tags={"referrer": referrer, "status": f"{status}"},
            )

        return Response(
            json.dumps(
                {"error": details, "timing": timer.for_json(), **exception.extra}
            ),
            status,
            {"Content-Type": "application/json"},
        )

    payload: MutableMapping[str, Any] = {**result.result, "timing": timer.for_json()}

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        payload.update(result.extra)

    return Response(json.dumps(payload), 200, {"Content-Type": "application/json"})
Code example #30
def _parse_query_impl(body: MutableMapping[str, Any],
                      dataset: Dataset) -> Query:
    def build_selected_expressions(
        raw_expressions: Sequence[Any], ) -> List[SelectedExpression]:
        output = []
        for raw_expression in raw_expressions:
            exp = parse_expression(tuplify(raw_expression),
                                   dataset.get_abstract_columnset(), set())
            output.append(
                SelectedExpression(
                    # An expression in the query can be a string or a
                    # complex list with an alias. In the second case
                    # we trust the parser to find the alias.
                    name=raw_expression
                    if isinstance(raw_expression, str) else exp.alias,
                    expression=exp,
                ))
        return output

    aggregations = []
    for aggregation in body.get("aggregations", []):
        if not isinstance(aggregation, Sequence):
            raise ParsingException((
                f"Invalid aggregation structure {aggregation}. "
                "It must be a sequence containing expression, column and alias."
            ))
        aggregation_function = aggregation[0]
        column_expr = aggregation[1]
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None

        aggregations.append(
            SelectedExpression(
                name=alias,
                expression=parse_aggregation(
                    aggregation_function,
                    column_expr,
                    alias,
                    dataset.get_abstract_columnset(),
                    set(),
                ),
            ))

    groupby_clause = build_selected_expressions(
        to_list(body.get("groupby", [])))

    select_clause = (
        groupby_clause + aggregations +
        build_selected_expressions(body.get("selected_columns", [])))

    array_join_cols = set()
    arrayjoin = body.get("arrayjoin")
    # TODO: Properly detect all array join columns in all clauses of the query.
    # This is missing an arrayJoin in condition with an alias that is then
    # used in the select.
    if arrayjoin:
        array_join_cols.add(arrayjoin)
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"], dataset.get_abstract_columnset(), {arrayjoin})
    else:
        array_join_expr = None
        for select_expr in select_clause:
            if isinstance(select_expr.expression, FunctionCall):
                if select_expr.expression.function_name == "arrayJoin":
                    parameters = select_expr.expression.parameters
                    if len(parameters) != 1:
                        raise ParsingException(
                            "arrayJoin(...) only accepts a single parameter.")
                    if isinstance(parameters[0], Column):
                        array_join_cols.add(parameters[0].column_name)
                    else:
                        # We only accept columns, or functions that do not
                        # reference columns. We could not tell whether we are
                        # actually arrayjoining on the values of the column
                        # if it is nested in an arbitrary function, but
                        # functions of literals are fine.
                        for e in parameters[0]:
                            if isinstance(e, Column):
                                raise ParsingException(
                                    "arrayJoin(...) cannot contain columns nested in functions."
                                )

    where_expr = parse_conditions_to_expr(body.get("conditions", []), dataset,
                                          array_join_cols)
    having_expr = parse_conditions_to_expr(body.get("having", []), dataset,
                                           array_join_cols)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            if match is None:
                raise ParsingException((
                    f"Invalid Order By clause {orderby}. If the Order By is a string, "
                    "it must respect the format `[-]column`"))
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            if match is None:
                raise ParsingException((
                    f"Invalid Order By clause {orderby}. If the Order By is an expression, "
                    "the function name must respect the format `[-]func_name`"
                ))
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ParsingException(
                (f"Invalid Order By clause {orderby}. The Clause was neither "
                 "a string nor a function call."))
        orderby_parsed = parse_expression(tuplify(orderby),
                                          dataset.get_abstract_columnset(),
                                          set())
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC
                if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            ))

    return Query(
        body,
        None,
        selected_columns=select_clause,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=[g.expression for g in groupby_clause],
        having=having_expr,
        order_by=orderby_exprs,
    )
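NEGATE_RE itself is not included in these snippets; based on how it is used, it presumably splits an optional leading minus from the column or function name, roughly like the assumed pattern below:

import re

# Assumed shape of NEGATE_RE: an optional leading "-" captured separately from the rest.
NEGATE_RE = re.compile(r"^(-?)(.*)$")

for raw in ("-timestamp", "timestamp"):
    match = NEGATE_RE.match(raw)
    assert match is not None
    direction, column = match.groups()
    print(column, "DESC" if direction == "-" else "ASC")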