Example #1
def get_levels_overview(group):
    query = (Query("events", Entity("events")).set_select([
        Column("primary_hash"),
        Function("max", [Function("length", [Column("hierarchical_hashes")])],
                 "num_levels"),
        _current_level_expr(group),
    ]).set_where(_get_group_filters(group)).set_groupby(
        [Column("primary_hash")]))

    res = snuba.raw_snql_query(
        query, referrer="api.group_hashes_levels.get_levels_overview")

    if not res["data"]:
        raise NoEvents()

    if len(res["data"]) > 1:
        raise MergedIssues()

    assert len(res["data"]) == 1

    fields = res["data"][0]

    if fields["num_levels"] <= 0:
        raise NotHierarchical()

    # TODO: Cache this if it takes too long. This is called from multiple
    # places: the grouping overview and then again from the new-issues endpoint.

    return LevelsOverview(
        current_level=fields["current_level"] - 1,
        only_primary_hash=fields["primary_hash"],
        num_levels=fields["num_levels"],
    )
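
A minimal sketch (hypothetical response row, not the endpoint's code) of the conversion done above: the "current_level" column is a 1-based ClickHouse array index, while LevelsOverview.current_level is 0-based.

fields = {"primary_hash": "c" * 32, "num_levels": 3, "current_level": 2}
assert fields["current_level"] - 1 == 1  # second hierarchical hash, 0-based
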
Example #2
def _current_level_expr(group):
    materialized_hashes = {
        gh.hash
        for gh in GroupHash.objects.filter(project=group.project, group=group)
    }

    # Evaluates to the 1-based index of the deepest hierarchical hash that is
    # in materialized_hashes, or to 1 if none of them is present.
    find_hash_expr = _construct_arraymax([1] + [  # type: ignore
        Function("indexOf", [Column("hierarchical_hashes"), hash])
        for hash in materialized_hashes
    ])

    return Function("max", [find_hash_expr], "current_level")
def _get_hash_for_parent_level(group: Group, id: int,
                               levels_overview: LevelsOverview) -> str:
    # If this is violated, there cannot be a 1:1 mapping between level and hash.
    assert 0 <= id < levels_overview.current_level

    # This cache never needs explicit invalidation because during every level
    # change, the group ID changes.
    #
    # No idea if the query is slow, caching just because I can.
    cache_key = f"group-parent-level-hash:{group.id}:{id}"

    return_hash: str = cache.get(cache_key)

    if return_hash is None:
        query = (Query("events", Entity("events")).set_select([
            Function("arrayElement", [Column("hierarchical_hashes"), id + 1],
                     "hash")
        ]).set_where(_get_group_filters(group)).set_limit(1))

        return_hash = get_path(
            snuba.raw_snql_query(query), "data", 0, "hash")
        cache.set(cache_key, return_hash)

    assert return_hash
    return return_hash
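
A sketch of the indexing convention assumed above (hypothetical hash values): level ids handed to this function are 0-based, while ClickHouse's arrayElement() is 1-based, hence the "id + 1" in the query.

def _array_element(arr, i):
    # ClickHouse arrayElement(arr, i) with a 1-based index i
    return arr[i - 1]

hierarchical_hashes = ["p" * 32, "q" * 32, "r" * 32]
assert _array_element(hierarchical_hashes, 0 + 1) == "p" * 32  # level id 0
assert _array_element(hierarchical_hashes, 2 + 1) == "r" * 32  # level id 2
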
Example #4
def _get_full_hierarchical_hashes(group: Group, hash: str) -> Optional[Sequence[str]]:
    query = (
        Query("events", Entity("events"))
        .set_select(
            [
                Column("hierarchical_hashes"),
            ]
        )
        .set_where(
            _get_group_filters(group)
            + [
                Condition(
                    Function(
                        "has",
                        [Column("hierarchical_hashes"), hash],
                    ),
                    Op.EQ,
                    1,
                ),
            ]
        )
    )

    data = snuba.raw_snql_query(query, referrer="group_split.get_full_hierarchical_hashes")["data"]
    if not data:
        return None

    return data[0]["hierarchical_hashes"]
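
A pure-Python sketch of the filter above: has(hierarchical_hashes, hash) = 1 keeps exactly those events whose hash array contains the given hash (values below are hypothetical).

def _matches(hierarchical_hashes, hash):
    return int(hash in hierarchical_hashes) == 1

assert _matches(["a" * 32, "b" * 32], "b" * 32)
assert not _matches(["a" * 32], "b" * 32)
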
Example #5
def _construct_arraymax(elements):
    # XXX(markus): This is quite horrible but Snuba SDK does not allow us to do
    # arrayMax([<other function call>, ...]), i.e. it does not allow function
    # calls in array literals. So instead of arrayMax([1, 2, 3]) we do
    # greatest(1, greatest(2, 3)).
    assert elements

    if len(elements) == 1:
        return elements[0]

    # Attempt to build a well-balanced 'tree' of greatest() calls so that we
    # don't run into ClickHouse recursion limits.

    return Function(
        "greatest",
        [
            _construct_arraymax(elements[:len(elements) // 2]),
            _construct_arraymax(elements[len(elements) // 2:]),
        ],
    )
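
A sketch of the shape this produces, assuming snuba_sdk's Function exposes its name and arguments as .function and .parameters: for four elements the calls nest as a balanced tree rather than a linear chain, so the nesting depth grows logarithmically.

def _render(expr):
    # Render plain values as-is and Function(...) nodes as "name(arg, arg)".
    if not hasattr(expr, "parameters"):
        return str(expr)
    return "%s(%s)" % (
        expr.function, ", ".join(_render(p) for p in expr.parameters))

assert (_render(_construct_arraymax([1, 2, 3, 4]))
        == "greatest(greatest(1, 2), greatest(3, 4))")
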
Example #6
    def query(
        self,
        projects: Sequence[Project],
        retention_window_start: Optional[datetime],
        group_queryset: QuerySet,
        environments: Sequence[Environment],
        sort_by: str,
        limit: int,
        cursor: Optional[Cursor],
        count_hits: bool,
        paginator_options: Mapping[str, Any],
        search_filters: Sequence[SearchFilter],
        date_from: Optional[datetime],
        date_to: Optional[datetime],
        max_hits=None,
    ) -> CursorResult:

        if not validate_cdc_search_filters(search_filters):
            raise InvalidQueryForExecutor(
                "Search filters invalid for this query executor")

        start, end, retention_date = self.calculate_start_end(
            retention_window_start, search_filters, date_from, date_to)

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return self.empty_result

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return self.empty_result

        e_event = self.entities["event"]
        e_group = self.entities["group"]

        where_conditions = [
            Condition(Column("project_id", e_event), Op.IN,
                      [p.id for p in projects]),
            Condition(Column("timestamp", e_event), Op.GTE, start),
            Condition(Column("timestamp", e_event), Op.LT, end),
        ]
        # TODO: This is still basically only handling status, handle this better once we introduce
        # more conditions.
        for search_filter in search_filters:
            where_conditions.append(
                Condition(Column(search_filter.key.name, e_group), Op.IN,
                          search_filter.value.raw_value))

        if environments:
            # TODO: Should this be handled via filter_keys, once we have a snql compatible version?
            where_conditions.append(
                Condition(Column("environment", e_event), Op.IN,
                          [e.name for e in environments]))

        sort_func = self.aggregation_defs[self.sort_strategies[sort_by]]

        having = []
        if cursor is not None:
            op = Op.GTE if cursor.is_prev else Op.LTE
            having.append(Condition(sort_func, op, cursor.value))

        query = Query(
            "events",
            match=Join([Relationship(e_event, "grouped", e_group)]),
            select=[
                Column("id", e_group),
                replace(sort_func, alias="score"),
            ],
            where=where_conditions,
            groupby=[Column("id", e_group)],
            having=having,
            orderby=[OrderBy(sort_func, direction=Direction.DESC)],
            limit=Limit(limit + 1),
        )

        data = snuba.raw_snql_query(
            query, referrer="search.snuba.cdc_search.query")["data"]

        hits_query = Query(
            "events",
            match=Join([Relationship(e_event, "grouped", e_group)]),
            select=[
                Function("uniq", [Column("id", e_group)], alias="count"),
            ],
            where=where_conditions,
        )
        hits = None
        if count_hits:
            hits = snuba.raw_snql_query(
                hits_query,
                referrer="search.snuba.cdc_search.hits")["data"][0]["count"]

        paginator_results = SequencePaginator(
            [(row["score"], row["g.id"]) for row in data],
            reverse=True,
            **paginator_options,
        ).get_result(limit, cursor, known_hits=hits, max_hits=max_hits)
        # We filter against `group_queryset` here so that we recheck all conditions in Postgres.
        # Since replay between Postgres and Clickhouse can happen, we might get back results that
        # have changed state in Postgres. By rechecking them we guarantee that any returned results
        # have the correct state.
        # TODO: This can result in us returning less than a full page of results, but shouldn't
        # affect cursors. If we want to, we can iterate and query snuba until we manage to get a
        # full page. In practice, this will likely only skip a couple of results at worst, and
        # probably not be noticeable to the user, so holding off for now to reduce complexity.
        groups = group_queryset.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]
        return paginator_results
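
A small sketch (hypothetical row values) of the shape the join query returns: the group id comes back keyed by the entity alias (row["g.id"], matching the "g" alias this executor defines for its group entity), which is why pagination builds (score, group_id) pairs from row["g.id"].

data = [
    {"g.id": 11, "score": 1_700_000_000_000},
    {"g.id": 12, "score": 1_699_999_000_000},
]
score_id_pairs = [(row["score"], row["g.id"]) for row in data]
assert score_id_pairs[0] == (1_700_000_000_000, 11)
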
Example #7
class CdcPostgresSnubaQueryExecutor(PostgresSnubaQueryExecutor):
    sort_strategies = {
        "date": "last_seen",
        "freq": "times_seen",
        "new": "first_seen",
        "priority": "priority",
        "user": "******",
    }

    entities = {
        "event": Entity("events", alias="e"),
        "group": Entity("groupedmessage", alias="g"),
    }
    times_seen_aggregation = Function(
        "ifNull",
        [Function("count", [Column("group_id", entities["event"])]), 0])
    first_seen_aggregation = Function(
        "ifNull",
        [
            Function(
                "multiply",
                [
                    Function("toUInt64", [
                        Function("min",
                                 [Column("timestamp", entities["event"])])
                    ]),
                    1000,
                ],
            ),
            0,
        ],
    )
    last_seen_aggregation = Function(
        "ifNull",
        [
            Function(
                "multiply",
                [
                    Function("toUInt64", [
                        Function("max",
                                 [Column("timestamp", entities["event"])])
                    ]),
                    1000,
                ],
            ),
            0,
        ],
    )

    aggregation_defs = {
        "times_seen": times_seen_aggregation,
        "first_seen": first_seen_aggregation,
        "last_seen": last_seen_aggregation,
        # https://github.com/getsentry/sentry/blob/804c85100d0003cfdda91701911f21ed5f66f67c/src/sentry/event_manager.py#L241-L271
        "priority": Function(
            "toUInt64",
            [
                Function(
                    "plus",
                    [
                        Function(
                            "multiply",
                            [Function("log", [times_seen_aggregation]), 600],
                        ),
                        last_seen_aggregation,
                    ],
                )
            ],
        ),
        "user_count": Function(
            "ifNull",
            [
                Function("uniq",
                         [Column("tags[sentry:user]", entities["event"])]),
                0,
            ],
        ),
    }

    def calculate_start_end(
        self,
        retention_window_start: Optional[datetime],
        search_filters: Sequence[SearchFilter],
        date_from: Optional[datetime],
        date_to: Optional[datetime],
    ):
        now = timezone.now()
        end = None
        end_params = [
            _f for _f in
            [date_to, get_search_filter(search_filters, "date", "<")] if _f
        ]
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

        retention_date = max(
            _f for _f in [retention_window_start, now - timedelta(days=90)]
            if _f)
        start_params = [
            date_from, retention_date,
            get_search_filter(search_filters, "date", ">")
        ]
        start = max(_f for _f in start_params if _f)
        end = max([retention_date, end])
        return start, end, retention_date

    def query(
        self,
        projects: Sequence[Project],
        retention_window_start: Optional[datetime],
        group_queryset: QuerySet,
        environments: Sequence[Environment],
        sort_by: str,
        limit: int,
        cursor: Optional[Cursor],
        count_hits: bool,
        paginator_options: Mapping[str, Any],
        search_filters: Sequence[SearchFilter],
        date_from: Optional[datetime],
        date_to: Optional[datetime],
        max_hits=None,
    ) -> CursorResult:

        if not validate_cdc_search_filters(search_filters):
            raise InvalidQueryForExecutor(
                "Search filters invalid for this query executor")

        start, end, retention_date = self.calculate_start_end(
            retention_window_start, search_filters, date_from, date_to)

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return self.empty_result

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return self.empty_result

        e_event = self.entities["event"]
        e_group = self.entities["group"]

        where_conditions = [
            Condition(Column("project_id", e_event), Op.IN,
                      [p.id for p in projects]),
            Condition(Column("timestamp", e_event), Op.GTE, start),
            Condition(Column("timestamp", e_event), Op.LT, end),
        ]
        # TODO: This is still basically only handling status, handle this better once we introduce
        # more conditions.
        for search_filter in search_filters:
            where_conditions.append(
                Condition(Column(search_filter.key.name, e_group), Op.IN,
                          search_filter.value.raw_value))

        if environments:
            # TODO: Should this be handled via filter_keys, once we have a snql compatible version?
            where_conditions.append(
                Condition(Column("environment", e_event), Op.IN,
                          [e.name for e in environments]))

        sort_func = self.aggregation_defs[self.sort_strategies[sort_by]]

        having = []
        if cursor is not None:
            op = Op.GTE if cursor.is_prev else Op.LTE
            having.append(Condition(sort_func, op, cursor.value))

        query = Query(
            "events",
            match=Join([Relationship(e_event, "grouped", e_group)]),
            select=[
                Column("id", e_group),
                replace(sort_func, alias="score"),
            ],
            where=where_conditions,
            groupby=[Column("id", e_group)],
            having=having,
            orderby=[OrderBy(sort_func, direction=Direction.DESC)],
            limit=Limit(limit + 1),
        )

        data = snuba.raw_snql_query(
            query, referrer="search.snuba.cdc_search.query")["data"]

        hits_query = Query(
            "events",
            match=Join([Relationship(e_event, "grouped", e_group)]),
            select=[
                Function("uniq", [Column("id", e_group)], alias="count"),
            ],
            where=where_conditions,
        )
        hits = None
        if count_hits:
            hits = snuba.raw_snql_query(
                hits_query,
                referrer="search.snuba.cdc_search.hits")["data"][0]["count"]

        paginator_results = SequencePaginator(
            [(row["score"], row["g.id"]) for row in data],
            reverse=True,
            **paginator_options,
        ).get_result(limit, cursor, known_hits=hits, max_hits=max_hits)
        # We filter against `group_queryset` here so that we recheck all conditions in Postgres.
        # Since replay between Postgres and Clickhouse can happen, we might get back results that
        # have changed state in Postgres. By rechecking them we guarantee that any returned results
        # have the correct state.
        # TODO: This can result in us returning less than a full page of results, but shouldn't
        # affect cursors. If we want to, we can iterate and query snuba until we manage to get a
        # full page. In practice, this will likely only skip a couple of results at worst, and
        # probably not be noticeable to the user, so holding off for now to reduce complexity.
        groups = group_queryset.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]
        return paginator_results
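
A pure-Python sketch of the "priority" aggregation defined above (not the executor's code): times_seen is the event count and last_seen is a millisecond timestamp, so recent and frequently seen groups score highest.

import math

def _priority(times_seen: int, last_seen_ms: int) -> int:
    return int(math.log(times_seen) * 600 + last_seen_ms)

assert _priority(1, 1_600_000_000_000) == 1_600_000_000_000      # log(1) == 0
assert _priority(100, 1_600_000_000_000) > _priority(1, 1_600_000_000_000)
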
Example #8
def _render_trees(group: Group, user):
    materialized_hashes = list({
        gh.hash
        for gh in GroupHash.objects.filter(project=group.project, group=group)
    })

    # Evaluates to the 1-based index of the deepest hierarchical hash that is
    # in materialized_hashes, or to 1 if none of them is present.
    find_hash_expr = _construct_arraymax([1] + [  # type: ignore
        Function("indexOf", [Column("hierarchical_hashes"), hash])
        for hash in materialized_hashes
    ])

    # After much deliberation I (markus) decided that it would be best to
    # render the entire tree using one large Snuba query. A previous
    # implementation incurred n+1 queries on Snuba (n = number of materialized
    # hashes) and was very buggy when it came to missing materialized hashes
    # (which can happen if fallback/secondary grouping is turned on): events
    # were counted twice because those n+1 queries accidentally counted
    # overlapping sets of events, and the endpoint response time suffered
    # because of the n+1 queries.
    #
    # Using one large query may also make it easier to add pagination down
    # the road.

    query = (
        Query("events", Entity("events")).set_select([
            Function("count", [], "event_count"),
            Function(
                "argMax",
                [Column("event_id"), Column("timestamp")], "event_id"),
            Function("max", [Column("timestamp")], "latest_event_timestamp"),
            # If hierarchical_hashes contains any of the materialized
            # hashes, find_hash_expr evaluates to the last found index and
            # arraySlice will give us this hash + the next child hash that
            # we use in groupby
            #
            # If hierarchical_hashes does not contain any of those hashes,
            # find_hash_expr will return 1 so we start slicing at the beginning.
            # This can happen when hierarchical_hashes is empty (=>
            # hash_slice = []), but we also try to recover gracefully from
            # a hypothetical case where we are missing some hashes in
            # postgres (unclear how this could be reached).
            #
            # We select some intermediate computation values here whose
            # results we don't actually need; they are just temp vars.
            Function(
                # First we find the materialized hash using find_hash_expr,
                # and subtract 1 which should be the parent hash if there
                # is one. If there isn't, this now can be an out-of-bounds
                # access by being 0 (arrays are indexed starting with 1)
                "minus",
                [find_hash_expr, 1],
                "parent_hash_i",
            ),
            # We clip the value to be at least 1, this will be where we
            # start slicing hierarchical_hashes. 0 would be an out of
            # bounds access.
            Function("greatest", [Column("parent_hash_i"), 1], "slice_start"),
            # This will return a slice of length 2 if the materialized hash
            # has been found at the beginning of the array, but return a
            # slice of length 3 if not.
            Function(
                "arraySlice",
                [
                    Column("hierarchical_hashes"),
                    Column("slice_start"),
                    Function(
                        "minus",
                        [
                            Function(
                                "plus",
                                [Column("parent_hash_i"), 3],
                            ),
                            Column("slice_start"),
                        ],
                    ),
                ],
                "hash_slice",
            ),
            Column("primary_hash"),
        ]).set_where(_get_group_filters(group)).set_groupby([
            Column("parent_hash_i"),
            Column("slice_start"),
            Column("hash_slice"),
            Column("primary_hash"),
        ]).set_orderby(
            [OrderBy(Column("latest_event_timestamp"), Direction.DESC)]))

    rv = []

    for row in snuba.raw_snql_query(
            query, referrer="api.group_split.render_grouping_tree")["data"]:
        if len(row["hash_slice"]) == 0:
            hash = row["primary_hash"]
            parent_hash = child_hash = None
        elif len(row["hash_slice"]) == 1:
            (hash, ) = row["hash_slice"]
            parent_hash = child_hash = None
        elif len(row["hash_slice"]) == 2:
            hash, child_hash = row["hash_slice"]
            parent_hash = None
        elif len(row["hash_slice"]) == 3:
            parent_hash, hash, child_hash = row["hash_slice"]
        else:
            raise ValueError("unexpected length of hash_slice")

        _add_hash(
            rv,
            group,
            user,
            parent_hash,
            hash,
            child_hash,
            row["event_count"],
            row["latest_event_timestamp"],
            row["event_id"],
        )

    rv.sort(key=lambda tree: (tree["id"] or "", tree["childId"] or ""))

    return rv
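
A pure-Python sketch of the arraySlice computation above, where find_hash_i stands for the (1-based) result of find_hash_expr: a hash found at the start of the array yields a 2-element slice (hash + child), anything deeper yields 3 elements (parent + hash + child).

def _hash_slice(hierarchical_hashes, find_hash_i):
    parent_hash_i = find_hash_i - 1
    slice_start = max(parent_hash_i, 1)             # clip to 1, never 0
    length = (parent_hash_i + 3) - slice_start
    return hierarchical_hashes[slice_start - 1:slice_start - 1 + length]

hashes = ["a" * 32, "b" * 32, "c" * 32, "d" * 32]
assert _hash_slice(hashes, 1) == hashes[0:2]  # found at start: hash + child
assert _hash_slice(hashes, 3) == hashes[1:4]  # parent + hash + child
assert _hash_slice([], 1) == []               # empty hierarchical_hashes
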
def _query_snuba(group: Group, id: int, offset=None, limit=None):
    query = (Query("events", Entity("events")).set_select([
        Function(
            "arrayElement",
            [
                Column("hierarchical_hashes"),
                Function("least", [
                    id + 1,
                    Function("length", [Column("hierarchical_hashes")])
                ]),
            ],
            "new_materialized_hash",
        ),
        Function("argMax",
                 [Column("event_id"), Column("timestamp")], "latest_event_id"),
        Function("max", [Column("timestamp")], "latest_event_timestamp"),
        Function("count", [], "event_count"),
    ]).set_groupby([Column("new_materialized_hash")]).set_orderby(
        [OrderBy(Column("latest_event_timestamp"), Direction.DESC)]))

    levels_overview = get_levels_overview(group)

    # These conditions are always valid
    common_where = [
        Condition(Column("primary_hash"), Op.EQ,
                  levels_overview.only_primary_hash),
        Condition(Column("project_id"), Op.EQ, group.project_id),
    ]

    if id >= levels_overview.current_level:
        # Good path: Since we increase the level we can easily constrain the
        # entire query by group_id and timerange
        query = query.set_where(common_where + _get_group_filters(group))
    else:
        # Bad path: We decreased the level and now we need to count events from
        # other groups. If we cannot filter by group_id, we can also not
        # restrict the timerange to anything at all. The Snuba API still
        # requires us to set a timerange, so we set it to the maximum of 90d.
        #
        # Luckily the minmax index on group_id alone is reasonably efficient so
        # that filtering by timerange (=primary key) is only a little bit
        # faster.
        now = datetime.datetime.now()
        new_materialized_hash = _get_hash_for_parent_level(
            group, id, levels_overview)
        query = query.set_where(common_where + [
            Condition(
                Function("arrayElement",
                         [Column("hierarchical_hashes"), id + 1]),
                Op.EQ,
                new_materialized_hash,
            ),
            Condition(Column("timestamp"), Op.GTE, now -
                      datetime.timedelta(days=90)),
            Condition(Column("timestamp"), Op.LT, now +
                      datetime.timedelta(seconds=10)),
        ])

    if offset is not None:
        query = query.set_offset(offset)

    if limit is not None:
        query = query.set_limit(limit)

    return snuba.raw_snql_query(
        query, referrer="api.group_hashes_levels.get_level_new_issues")["data"]
Example #10
def _render_trees(group: Group, user):
    materialized_hashes = {
        gh.hash
        for gh in GroupHash.objects.filter(project=group.project, group=group)
    }

    rv = []

    common_where = _get_group_filters(group)

    for materialized_hash in materialized_hashes:
        # For every materialized hash we want to render parent and child
        # hashes, a limited view of the entire tree. We fetch one sample event
        # so we know how we need to slice hierarchical_hashes.
        hierarchical_hashes = _get_full_hierarchical_hashes(
            group, materialized_hash)

        if not hierarchical_hashes:
            # No hierarchical_hashes found; the materialized hash is probably
            # from flat grouping.
            parent_pos = None
            hash_pos = None
            child_pos = None
            slice_start = 0
        else:
            materialized_pos = hierarchical_hashes.index(materialized_hash)

            if materialized_pos == 0:
                parent_pos = None
                hash_pos = 0
                child_pos = 1
                slice_start = 1
            else:
                parent_pos = 0
                hash_pos = 1
                child_pos = 2
                slice_start = materialized_pos

        # Select sub-views of the trees that contain materialized_hash.
        query = (Query("events", Entity("events")).set_select([
            Function("count", [], "event_count"),
            Function(
                "argMax",
                [Column("event_id"), Column("timestamp")], "event_id"),
            Function("max", [Column("timestamp")], "latest_event_timestamp"),
            Function("arraySlice",
                     [Column("hierarchical_hashes"), slice_start, 3],
                     "hashes"),
        ]).set_where(common_where + [
            Condition(
                Function(
                    "has",
                    [
                        Column("hierarchical_hashes"),
                        materialized_hash,
                    ],
                ),
                Op.EQ,
                1,
            ),
        ]).set_groupby([Column("hashes")]).set_orderby(
            [OrderBy(Column("latest_event_timestamp"), Direction.DESC)]))

        for row in snuba.raw_snql_query(query)["data"]:
            assert not row["hashes"] or row["hashes"][
                hash_pos] == materialized_hash

            event_id = row["event_id"]
            event = eventstore.get_event_by_id(group.project_id, event_id)

            tree = {
                "parentId": _get_checked(row["hashes"], parent_pos),
                "id": materialized_hash,
                "childId": _get_checked(row["hashes"], child_pos),
                "eventCount": row["event_count"],
                "latestEvent": serialize(event, user, EventSerializer()),
            }

            rv.append(tree)

            if not row["hashes"]:
                continue

            try:
                for variant in event.get_grouping_variants().values():
                    if not isinstance(variant, ComponentVariant):
                        continue

                    if variant.get_hash() == tree["parentId"]:
                        tree["parentLabel"] = variant.component.tree_label

                    if variant.get_hash() == tree["childId"]:
                        tree["childLabel"] = variant.component.tree_label

                    if variant.get_hash() == tree["id"]:
                        tree["label"] = variant.component.tree_label
            except Exception:
                sentry_sdk.capture_exception()

    rv.sort(key=lambda tree:
            (tree["parentId"] or "", tree["id"] or "", tree["childId"] or ""))

    return rv
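
A pure-Python sketch of the windowing logic above (hypothetical hash values): the 3-element arraySlice is positioned so that materialized_hash sits at hash_pos within the window, with the parent preceding it whenever one exists.

def _window(hierarchical_hashes, materialized_hash):
    pos = hierarchical_hashes.index(materialized_hash)
    slice_start = 1 if pos == 0 else pos          # 1-based arraySlice offset
    hash_pos = 0 if pos == 0 else 1
    return hierarchical_hashes[slice_start - 1:slice_start - 1 + 3], hash_pos

hashes = ["a" * 32, "b" * 32, "c" * 32, "d" * 32]
window, hash_pos = _window(hashes, "c" * 32)
assert window == hashes[1:4] and window[hash_pos] == "c" * 32  # parent, hash, child
window, hash_pos = _window(hashes, "a" * 32)
assert window == hashes[0:3] and window[hash_pos] == "a" * 32  # hash, child, ...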