Example #1
def detect_table(query: Query, events_only_columns: ColumnSet,
                 transactions_only_columns: ColumnSet) -> str:
    """
    Given a query, we attempt to guess whether it is better to fetch data from the
    "events" or "transactions" storage. This is going to be wrong in some cases.
    """
    # First check for a top level condition that matches either type = error
    # or type = transaction.
    conditions = query.get_conditions()
    if conditions:
        for condition in conditions:
            if is_condition(condition):
                if tuple(condition) == ("type", "=", "error"):
                    return EVENTS
                elif tuple(condition) == ("type", "=", "transaction"):
                    return TRANSACTIONS

    # Check for any conditions that reference a table specific field
    condition_columns = query.get_columns_referenced_in_conditions()
    if any(events_only_columns.get(col) for col in condition_columns):
        return EVENTS
    if any(transactions_only_columns.get(col) for col in condition_columns):
        return TRANSACTIONS

    # Check for any other references to a table specific field
    all_referenced_columns = query.get_all_referenced_columns()
    if any(events_only_columns.get(col) for col in all_referenced_columns):
        return EVENTS
    if any(
            transactions_only_columns.get(col)
            for col in all_referenced_columns):
        return TRANSACTIONS

    # Use events by default
    return EVENTS
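
The routing order matters: a top level type condition wins, then table specific columns decide, and events is the fallback. A minimal, self-contained sketch of the same idea using plain data structures (illustrative names, not the Snuba API; the two column checks are collapsed into one for brevity):

from typing import Iterable, Sequence, Set

EVENTS, TRANSACTIONS = "events", "transactions"

def detect_table_sketch(
    conditions: Sequence[Sequence],
    referenced_columns: Iterable[str],
    events_only: Set[str],
    transactions_only: Set[str],
) -> str:
    # A top level condition on the type column is the strongest signal.
    for lhs, op, rhs in conditions:
        if (lhs, op, rhs) == ("type", "=", "error"):
            return EVENTS
        if (lhs, op, rhs) == ("type", "=", "transaction"):
            return TRANSACTIONS
    # Otherwise any reference to a table specific column decides.
    cols = set(referenced_columns)
    if cols & events_only:
        return EVENTS
    if cols & transactions_only:
        return TRANSACTIONS
    # Events is the default.
    return EVENTS

assert detect_table_sketch([["type", "=", "transaction"]], [], set(), set()) == TRANSACTIONS
assert detect_table_sketch([], ["duration"], {"group_id"}, {"duration"}) == TRANSACTIONS
assert detect_table_sketch([], ["message"], {"group_id"}, {"duration"}) == EVENTS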
Example #2
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        def process_column(exp: Expression) -> Expression:
            if isinstance(exp, Column):
                if exp.column_name == "group_id":
                    return FunctionCall(
                        exp.alias,
                        "nullIf",
                        (
                            Column(None, exp.column_name, exp.table_name),
                            Literal(None, 0),
                        ),
                    )
                elif exp.column_name == "message":
                    # Because of the rename from message->search_message without backfill,
                    # records will have one or the other of these fields.
                    # TODO this can be removed once all data has search_message filled in.
                    return FunctionCall(
                        exp.alias,
                        "coalesce",
                        (
                            Column(None, exp.column_name, exp.table_name),
                            Column(None, "search_message", exp.table_name),
                        ),
                    )

            return exp

        query.transform_expressions(process_column)
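
query.transform_expressions applies the mapping function to every node of the query's expression trees. A stripped-down sketch of that pattern with toy classes (not the Snuba expression types) shows how the group_id rewrite above propagates through nested calls:

from dataclasses import dataclass
from typing import Callable, Tuple, Union

@dataclass(frozen=True)
class Col:
    name: str

@dataclass(frozen=True)
class Lit:
    value: object

@dataclass(frozen=True)
class Func:
    name: str
    args: Tuple["Expr", ...]

Expr = Union[Col, Lit, Func]

def transform(exp: Expr, fn: Callable[[Expr], Expr]) -> Expr:
    # Post-order: rewrite children first, then let fn replace the node itself.
    if isinstance(exp, Func):
        exp = Func(exp.name, tuple(transform(a, fn) for a in exp.args))
    return fn(exp)

def rewrite_group_id(exp: Expr) -> Expr:
    # Same idea as process_column above: wrap group_id in nullIf(group_id, 0).
    if isinstance(exp, Col) and exp.name == "group_id":
        return Func("nullIf", (exp, Lit(0)))
    return exp

assert transform(Func("count", (Col("group_id"),)), rewrite_group_id) == Func(
    "count", (Func("nullIf", (Col("group_id"), Lit(0))),)
)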
Example #3
    def test_nested_aggregate_legacy_format(self, dataset):
        source = (dataset.get_all_storages()
                  [0].get_schemas().get_read_schema().get_data_source())
        priority = [
            "toUInt64(plus(multiply(log(times_seen), 600), last_seen))",
            "",
            "priority",
        ]
        assert (
            column_expr(
                dataset,
                "",
                Query({"aggregations": [priority]}, source),
                ParsingContext(),
                priority[2],
                priority[0],
            ) ==
            "(toUInt64(plus(multiply(log(times_seen), 600), last_seen)) AS priority)"
        )

        top_k = ["topK(3)", "logger", "top_3"]
        assert (column_expr(
            dataset,
            top_k[1],
            Query({"aggregations": [top_k]}, source),
            ParsingContext(),
            top_k[2],
            top_k[0],
        ) == "(topK(3)(logger) AS top_3)")
Example #4
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        def process_functions(exp: Expression) -> Expression:
            if isinstance(exp, FunctionCall):
                if exp.function_name == "uniq":
                    return FunctionCall(
                        exp.alias,
                        "ifNull",
                        (
                            replace(exp, alias=None),
                            Literal(None, 0),
                        ),
                    )
                if exp.function_name == "emptyIfNull":
                    return FunctionCall(
                        exp.alias,
                        "ifNull",
                        (
                            replace(exp, alias=None),
                            Literal(None, ""),
                        ),
                    )
            if isinstance(exp, CurriedFunctionCall):
                if exp.internal_function.function_name == "top":
                    return replace(
                        exp,
                        internal_function=replace(exp.internal_function,
                                                  function_name="topK"),
                    )
            return exp

        query.transform_expressions(process_functions)
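
CurriedFunctionCall, used for the top -> topK rename, models ClickHouse parametric aggregates such as topK(3)(logger): an internal call carrying the parameters and an outer call carrying the aggregated arguments. A tiny rendering sketch with toy classes (not Snuba's):

from dataclasses import dataclass
from typing import Tuple

@dataclass(frozen=True)
class Fn:
    name: str
    args: Tuple[str, ...]

@dataclass(frozen=True)
class CurriedFn:
    internal: Fn           # e.g. topK(3)
    args: Tuple[str, ...]  # e.g. ("logger",)

def render(call: CurriedFn) -> str:
    inner = "{}({})".format(call.internal.name, ", ".join(call.internal.args))
    return "{}({})".format(inner, ", ".join(call.args))

assert render(CurriedFn(Fn("topK", ("3",)), ("logger",))) == "topK(3)(logger)"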
Example #5
def test_query_extension_processing(
    raw_data: dict,
    expected_conditions: Sequence[Condition],
    expected_granularity: int,
):
    state.set_config('max_days', 1)
    extension = TimeSeriesExtension(
        default_granularity=60,
        default_window=datetime.timedelta(days=5),
        timestamp_column='timestamp',
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query(
        {"conditions": []},
        TableSource("my_table", ColumnSet([])),
    )

    request_settings = RequestSettings(turbo=False,
                                       consistent=False,
                                       debug=False)

    extension.get_processor().process_query(query, valid_data,
                                            request_settings)
    assert query.get_conditions() == expected_conditions
    assert query.get_granularity() == expected_granularity
Example #6
    def process_query(
        self,
        query: Query,
        request_settings: RequestSettings,
    ) -> None:
        from_clause = query.get_data_source()
        if not isinstance(from_clause, JoinClause):
            return

        referenced_columns = query.get_all_referenced_columns()
        referenced_aliases = set()
        for qualified_column in referenced_columns:
            # This will be much better once we represent columns
            # with a more structured data type than strings.
            match = QUALIFIED_COLUMN_REGEX.match(qualified_column)
            if match:
                # match[1] is the first parenthesized group in the regex, thus
                # the table alias.
                table_alias = match[1]
                referenced_aliases.add(table_alias)

        assert (
            len(referenced_aliases) > 0
        ), "Trying to optimize a join query without aliases"
        if len(referenced_aliases) > 1:
            return

        from_tables = from_clause.get_tables()
        table = from_tables[referenced_aliases.pop()]

        query.set_data_source(table)
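
The optimization hinges on QUALIFIED_COLUMN_REGEX, whose definition is not shown here; a hypothetical stand-in that captures the table alias before the first dot illustrates the alias collection step:

import re

# Hypothetical pattern, assumed for illustration: group 1 is the table alias.
QUALIFIED_COLUMN_SKETCH = re.compile(r"^([A-Za-z_][A-Za-z0-9_]*)\.(.+)$")

referenced_columns = ["events.group_id", "events.timestamp", "count()"]
referenced_aliases = set()
for qualified_column in referenced_columns:
    match = QUALIFIED_COLUMN_SKETCH.match(qualified_column)
    if match:
        referenced_aliases.add(match[1])

# Only one alias is referenced, so the join can collapse to that single table.
assert referenced_aliases == {"events"}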
Example #7
def test_prewhere(initial_table, consistent, expected_table) -> None:
    state.set_config("enable_events_readonly_table", True)
    body = {
        "conditions": [
            ["d", "=", "1"],
            ["c", "=", "3"],
            ["a", "=", "1"],
            ["b", "=", "2"],
        ],
    }
    cols = ColumnSet([("col", String())])
    query = Query(
        body,
        TableSource(initial_table, cols, [["time", "=", "1"]], ["c1"]),
    )

    request_settings = HTTPRequestSettings(consistent=consistent)
    processor = ReadOnlyTableSelector("sentry_dist", "sentry_ro")
    processor.process_query(query, request_settings)

    source = query.get_data_source()
    assert isinstance(source, TableSource)
    assert source.format_from() == expected_table
    assert source.get_columns() == cols
    assert source.get_prewhere_candidates() == ["c1"]
    assert source.get_mandatory_conditions() == [["time", "=", "1"]]
Example #8
    def process_query(self, query: Query,
                      extension_data: ExtensionData) -> None:
        from_date, to_date = self.get_time_limit(extension_data)
        query.add_conditions([
            (self.__timestamp_column, '>=', from_date.isoformat()),
            (self.__timestamp_column, '<', to_date.isoformat()),
        ])
Example #9
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        def process_functions(exp: Expression) -> Expression:
            if isinstance(exp, FunctionCall) and exp.function_name == "impact":
                assert len(exp.parameters) == 3
                column = exp.parameters[0]
                satisfied = exp.parameters[1]
                user_column = exp.parameters[2]

                return plus(
                    minus(Literal(None, 1), apdex(column, satisfied)),
                    multiply(
                        minus(
                            Literal(None, 1),
                            div(
                                Literal(None, 1),
                                FunctionCall(
                                    None,
                                    "sqrt",
                                    (FunctionCall(None, "uniq", (user_column,)),),
                                ),
                            ),
                        ),
                        Literal(None, 3),
                    ),
                )

            return exp

        query.transform_expressions(process_functions)
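
Read as arithmetic, the expression tree built above computes (1 - apdex) + (1 - 1 / sqrt(uniq(user))) * 3. A plain-Python sketch of that formula (illustrative, not Snuba code):

import math

def impact_sketch(apdex_score: float, unique_users: int) -> float:
    # (1 - apdex) plus a user-count term that approaches 3 as uniq(user) grows.
    return (1 - apdex_score) + (1 - 1 / math.sqrt(unique_users)) * 3

assert impact_sketch(1.0, 1) == 0.0                # perfect apdex, a single user
assert abs(impact_sketch(0.5, 4) - 2.0) < 1e-9     # 0.5 + (1 - 1/2) * 3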
Example #10
def test_join_optimizer_two_tables(
    selected_cols: Sequence[Any],
    conditions: Sequence[Condition],
    groupby: Groupby,
    expected: str,
) -> None:
    query = Query(
        {
            "selected_columns": selected_cols,
            "conditions": conditions,
            "arrayjoin": None,
            "having": [],
            "groupby": groupby,
            "aggregations": [],
            "orderby": None,
            "limitby": None,
            "sample": 10,
            "limit": 100,
            "offset": 50,
            "totals": True,
            "granularity": 60,
        },
        simple_join_structure,
    )
    request_settings = HTTPRequestSettings()

    optimizer = SimpleJoinOptimizer()
    optimizer.process_query(query, request_settings)

    assert query.get_data_source().format_from() == expected
Example #11
    def process_query(
        self,
        query: Query,
        extension_data: ExtensionData,
        request_settings: RequestSettings,
    ) -> None:
        organization_id = extension_data["organization"]
        query.add_conditions([("org_id", "=", organization_id)])
Example #12
def test_format_expressions(pre_format: Query, expected_query: Query) -> None:
    copy = deepcopy(pre_format)
    BasicFunctionsProcessor().process_query(copy, HTTPRequestSettings())
    assert (copy.get_selected_columns_from_ast() ==
            expected_query.get_selected_columns_from_ast())
    assert copy.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert (copy.get_condition_from_ast() ==
            expected_query.get_condition_from_ast())
Example #13
    def column_expr(
        self,
        column_name,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ):
        detected_dataset = detect_table(query, self.__events_columns,
                                        self.__transactions_columns)

        if detected_dataset == TRANSACTIONS:
            if column_name == "time":
                return self.time_expr("finish_ts", query.get_granularity(),
                                      table_alias)
            if column_name == "type":
                return "'transaction'"
            if column_name == "timestamp":
                return "finish_ts"
            if column_name == "username":
                return "user_name"
            if column_name == "email":
                return "user_email"
            if column_name == "transaction":
                return "transaction_name"
            if column_name == "message":
                return "transaction_name"
            if column_name == "title":
                return "transaction_name"
            if column_name == "group_id":
                # TODO: We return 0 here instead of NULL so conditions like group_id
                # in (1, 2, 3) will work, since Clickhouse won't run a query like:
                # SELECT (NULL AS group_id) FROM transactions WHERE group_id IN (1, 2, 3)
                # When we have the query AST, we should solve this by transforming the
                # nonsensical conditions instead.
                return "0"
            if column_name == "geo_country_code":
                column_name = "contexts[geo.country_code]"
            if column_name == "geo_region":
                column_name = "contexts[geo.region]"
            if column_name == "geo_city":
                column_name = "contexts[geo.city]"
            if self.__events_columns.get(column_name):
                return "NULL"
        else:
            if column_name == "time":
                return self.time_expr("timestamp", query.get_granularity(),
                                      table_alias)
            if column_name == "release":
                column_name = "tags[sentry:release]"
            if column_name == "dist":
                column_name = "tags[sentry:dist]"
            if column_name == "user":
                column_name = "tags[sentry:user]"
            if self.__transactions_columns.get(column_name):
                return "NULL"

        return get_dataset(detected_dataset).column_expr(
            column_name, query, parsing_context)
Example #14
def test_col_replacement(
    initial_query: MutableMapping[str, Any],
    old_col: str,
    new_col: str,
    expected: Mapping[str, Any],
):
    query = Query(initial_query, TableSource("my_table", ColumnSet([])))
    query.replace_column(old_col, new_col)
    assert expected == query.get_body()
Example #15
    def setup_method(self, test_method):
        super().setup_method(test_method)
        raw_data = {"project": 2}

        self.extension = ProjectExtension(
            processor=ProjectWithGroupsProcessor(project_column="project_id")
        )
        self.valid_data = validate_jsonschema(raw_data, self.extension.get_schema())
        self.query = Query({"conditions": []}, TableSource("my_table", ColumnSet([])),)
Example #16
def test_prewhere(query_body, keys, new_conditions, prewhere_conditions) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    query = Query(query_body, TableSource("my_table", ColumnSet([]), None, keys),)

    request_settings = HTTPRequestSettings()
    processor = PrewhereProcessor()
    processor.process_query(query, request_settings)

    assert query.get_conditions() == new_conditions
    assert query.get_prewhere() == prewhere_conditions
Example #17
class TestProjectExtensionWithGroups(BaseTest):
    def setup_method(self, test_method):
        super().setup_method(test_method)
        raw_data = {'project': 2}

        self.extension = ProjectExtension(
            processor=ProjectWithGroupsProcessor()
        )
        self.valid_data = validate_jsonschema(raw_data, self.extension.get_schema())
        self.query = Query({
            "conditions": []
        })

    def test_with_turbo(self):
        request_settings = RequestSettings(turbo=True, consistent=False, debug=False)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]

    def test_without_turbo_with_projects_needing_final(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        replacer.set_project_needs_final(2)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert self.query.get_final()

    def test_without_turbo_without_projects_needing_final(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert not self.query.get_final()

    def test_when_there_are_not_many_groups_to_exclude(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        state.set_config('max_group_ids_exclude', 5)
        replacer.set_project_exclude_groups(2, [100, 101, 102])

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        expected = [
            ('project_id', 'IN', [2]),
            (['assumeNotNull', ['group_id']], 'NOT IN', [100, 101, 102])
        ]
        assert self.query.get_conditions() == expected
        assert not self.query.get_final()

    def test_when_there_are_too_many_groups_to_exclude(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        state.set_config('max_group_ids_exclude', 2)
        replacer.set_project_exclude_groups(2, [100, 101, 102])

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert self.query.get_final()
Example #18
def test_conditions_expr():
    dataset = get_dataset("groups")
    state.set_config('use_escape_alias', 1)
    conditions = [['events.a', '=', 1]]
    assert conditions_expr(dataset, conditions, Query({}),
                           ParsingContext()) == '(events.a AS `events.a`) = 1'

    conditions = [[['events.a', '=', 1], ['groups.b', '=', 2]],
                  [['events.c', '=', 3], ['groups.d', '=', 4]]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == ('((events.a AS `events.a`) = 1 OR (groups.b AS `groups.b`) = 2)'
        ' AND ((events.c AS `events.c`) = 3 OR (groups.d AS `groups.d`) = 4)'
        )

    # Test column expansion
    conditions = [[['events.tags[foo]', '=', 1], ['groups.b', '=', 2]]]
    expanded = column_expr(dataset, 'events.tags[foo]', Query({}),
                           ParsingContext())
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '({} = 1 OR (groups.b AS `groups.b`) = 2)'.format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_query = Query({})
    parsing_context = ParsingContext()
    conditions = [[['events.tags[foo]', '=', 1], ['groups.b', '=', 2]]]
    column_expr(dataset, 'events.tags[foo]', reuse_query,
                parsing_context)  # Expand it once so the next time is aliased
    assert conditions_expr(dataset, conditions, reuse_query, parsing_context) \
        == '(`events.tags[foo]` = 1 OR (groups.b AS `groups.b`) = 2)'

    # Test special output format of LIKE
    conditions = [['events.primary_hash', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '(events.primary_hash AS `events.primary_hash`) LIKE \'%foo%\''

    conditions = tuplify(
        [[['notEmpty', ['arrayElement', ['events.exception_stacks.type', 1]]],
          '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty(arrayElement((events.exception_stacks.type AS `events.exception_stacks.type`), 1)) = 1'

    conditions = tuplify([[['notEmpty', ['events.tags[sentry:user]']], '=',
                           1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty(`events.tags[sentry:user]`) = 1'

    conditions = tuplify([[['notEmpty', ['events.tags_key']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty((arrayJoin(events.tags.key) AS `events.tags_key`)) = 1'

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [['events.exception_frames.filename', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'arrayExists(x -> assumeNotNull(x LIKE \'%foo%\'), (events.exception_frames.filename AS `events.exception_frames.filename`))'
Example #19
def test_empty_query():
    query = Query({})

    assert query.get_selected_columns() is None
    assert query.get_aggregations() is None
    assert query.get_groupby() is None
    assert query.get_conditions() is None
    assert query.get_orderby() is None
    assert query.get_sample() is None
    assert query.get_limit() == 0
    assert query.get_offset() == 0
Example #20
    def setup_method(self, test_method):
        super().setup_method(test_method)
        raw_data = {'project': 2}

        self.extension = ProjectExtension(
            processor=ProjectWithGroupsProcessor()
        )
        self.valid_data = validate_jsonschema(raw_data, self.extension.get_schema())
        self.query = Query({
            "conditions": []
        })
Example #21
    def process_query(self, query: Query, request_settings: RequestSettings) -> None:
        def process_functions(exp: Expression) -> Expression:
            if isinstance(exp, FunctionCall) and exp.function_name == "apdex":
                assert len(exp.parameters) == 2
                column = exp.parameters[0]
                satisfied = exp.parameters[1]
                return apdex(column, satisfied)

            return exp

        query.transform_expressions(process_functions)
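
The apdex() helper the rewrite delegates to is presumed here to expand to the standard Apdex ratio: satisfied requests (duration <= T) plus half of the tolerating ones (T < duration <= 4T), divided by the total count. A plain-Python sketch of that assumed formula:

from typing import Sequence

def apdex_sketch(durations: Sequence[float], threshold: float) -> float:
    # Standard Apdex: satisfied (<= T) plus half of tolerating (<= 4T), over all samples.
    satisfied = sum(1 for d in durations if d <= threshold)
    tolerating = sum(1 for d in durations if threshold < d <= 4 * threshold)
    return (satisfied + tolerating / 2) / len(durations)

assert apdex_sketch([100, 200, 900, 5000], 300) == 0.625  # (2 + 0.5) / 4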
Example #22
    def process_query(
        self,
        query: Query,
        extension_data: ExtensionData,
        request_settings: RequestSettings,
    ) -> None:
        from_date, to_date = self.get_time_limit(extension_data)
        query.set_granularity(extension_data["granularity"])
        query.add_conditions([
            (self.__timestamp_column, '>=', from_date.isoformat()),
            (self.__timestamp_column, '<', to_date.isoformat()),
        ])
Example #23
def test_query_extension_processing(raw_data: dict,
                                    expected_conditions: Sequence[Condition]):
    state.set_config('max_days', 1)
    extension = TimeSeriesExtension(
        default_granularity=3600,
        default_window=datetime.timedelta(days=5),
        timestamp_column='timestamp',
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []})

    extension.get_processor().process_query(query, valid_data)
    assert query.get_conditions() == expected_conditions
Example #24
def test_project_extension_query_processing(raw_data: dict, expected_conditions: Sequence[Condition]):
    extension = ProjectExtension(
        processor=ProjectExtensionProcessor()
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({
        "conditions": []
    })
    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

    extension.get_processor().process_query(query, valid_data, request_settings)

    assert query.get_conditions() == expected_conditions
Example #25
    def test_data_source(
        self, query_body: MutableMapping[str, Any], expected_dataset: str
    ):
        query = Query(query_body, get_dataset_source("discover"))

        request_settings = HTTPRequestSettings()
        for processor in get_dataset("discover").get_query_processors():
            processor.process_query(query, request_settings)

        assert (
            query.get_data_source().format_from()
            == get_dataset_source(expected_dataset).format_from()
        )
Example #26
    def process_query(
            self,
            query: Query,
            extension_data: ExtensionData,
            request_settings: RequestSettings,
    ) -> None:
        project_ids = util.to_list(extension_data['project'])

        if project_ids:
            query.add_conditions([('project_id', 'IN', project_ids)])

        request_settings.add_rate_limit(self._get_rate_limit_params(project_ids))

        self.do_post_processing(project_ids, query, request_settings)
Example #27
    def do_post_processing(
        self,
        project_ids: Sequence[int],
        query: Query,
        request_settings: RequestSettings,
    ) -> None:
        if not request_settings.get_turbo():
            final, exclude_group_ids = get_projects_query_flags(
                project_ids, self.__replacer_state_name)
            if not final and exclude_group_ids:
                # If the number of groups to exclude exceeds our limit, the query
                # should just use final instead of the exclusion set.
                max_group_ids_exclude = get_config(
                    "max_group_ids_exclude",
                    settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE)
                if len(exclude_group_ids) > max_group_ids_exclude:
                    query.set_final(True)
                else:
                    query.add_conditions([(["assumeNotNull", ["group_id"]],
                                           "NOT IN", exclude_group_ids)])
                    query.add_condition_to_ast(
                        not_in_condition(
                            None,
                            FunctionCall(None, "assumeNotNull",
                                         (Column(None, "group_id", None),)),
                            [Literal(None, p) for p in exclude_group_ids],
                        ))
            else:
                query.set_final(final)
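
The branch above trades a FINAL scan against a NOT IN exclusion list once the list exceeds max_group_ids_exclude. The cutoff in isolation (illustrative helper, not the Snuba API):

from typing import List, Sequence, Tuple

def exclusion_strategy(
    exclude_group_ids: Sequence[int], max_group_ids_exclude: int
) -> Tuple[bool, List[int]]:
    # Past the configured limit, forcing FINAL is cheaper than a huge NOT IN condition.
    if len(exclude_group_ids) > max_group_ids_exclude:
        return True, []
    return False, list(exclude_group_ids)

assert exclusion_strategy([100, 101, 102], 5) == (False, [100, 101, 102])
assert exclusion_strategy([100, 101, 102], 2) == (True, [])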
Example #28
def test_organization_extension_query_processing_happy_path():
    extension = OrganizationExtension()
    raw_data = {"organization": 2}

    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []})
    request_settings = RequestSettings(turbo=False,
                                       consistent=False,
                                       debug=False)

    extension.get_processor().process_query(query, valid_data,
                                            request_settings)

    assert query.get_conditions() == [("org_id", "=", 2)]
Example #29
    def process_query(
        self,
        query: Query,
        extension_data: ExtensionData,
        request_settings: RequestSettings,
    ) -> None:
        organization_id = extension_data["organization"]
        query.add_conditions([("org_id", "=", organization_id)])
        query.add_condition_to_ast(
            binary_condition(
                None,
                ConditionFunctions.EQ,
                Column(None, "org_id", None),
                Literal(None, organization_id),
            ))
Example #30
def test_project_extension_query_adds_rate_limits():
    extension = ProjectExtension(processor=ProjectExtensionProcessor(
        project_column="project_id"))
    raw_data = {'project': [2, 3]}
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query(
        {"conditions": []},
        TableSource("my_table", ColumnSet([])),
    )
    request_settings = RequestSettings(turbo=False,
                                       consistent=False,
                                       debug=False)

    num_rate_limits_before_processing = len(
        request_settings.get_rate_limit_params())
    extension.get_processor().process_query(query, valid_data,
                                            request_settings)

    rate_limits = request_settings.get_rate_limit_params()
    # make sure a rate limit was added by the processing
    assert len(rate_limits) == num_rate_limits_before_processing + 1

    most_recent_rate_limit = rate_limits[-1]
    assert most_recent_rate_limit.bucket == '2'
    assert most_recent_rate_limit.per_second_limit == 1000
    assert most_recent_rate_limit.concurrent_limit == 1000