Example #1
    def get_group_tag_value_paginator(
        self, project_id, group_id, environment_ids, key, order_by="-id"
    ):
        from sentry.api.paginator import SequencePaginator

        if order_by in ("-last_seen", "-first_seen", "-times_seen"):
            pass
        elif order_by == "-id":
            # Snuba has no unique id per GroupTagValue so we'll substitute `-first_seen`
            order_by = "-first_seen"
        else:
            raise ValueError("Unsupported order_by: %s" % order_by)

        group_tag_values = self.get_group_tag_value_iter(project_id, group_id, environment_ids, key)

        desc = order_by.startswith("-")
        score_field = order_by.lstrip("-")
        if score_field == "times_seen":
            return SequencePaginator(
                [(int(getattr(gtv, score_field)), gtv) for gtv in group_tag_values],
                reverse=desc,
            )

        return SequencePaginator(
            [
                (int(to_timestamp(getattr(gtv, score_field)) * 1000), gtv)
                for gtv in group_tag_values
            ],
            reverse=desc,
        )
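For orientation, the millisecond scores built above are exactly what SequencePaginator sorts and cuts pages on. A minimal sketch of how such a paginator is consumed, assuming only the constructor and get_result/cursor behaviour demonstrated by the tests further down this page (the scores and string values are made up for illustration):

from sentry.api.paginator import SequencePaginator

# Hypothetical (score, value) pairs; in the method above the score is
# int(to_timestamp(first_seen) * 1000) and the value is a GroupTagValue.
entries = [(1000 + i, "value-%d" % i) for i in range(10)]

paginator = SequencePaginator(entries, reverse=True)  # highest score first

page = paginator.get_result(5)
assert list(page) == ["value-9", "value-8", "value-7", "value-6", "value-5"]

# Following the `next` cursor returns the remaining entries.
next_page = paginator.get_result(5, page.next)
assert list(next_page) == ["value-4", "value-3", "value-2", "value-1", "value-0"]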
Example #2
    def test_empty_results(self):
        paginator = SequencePaginator([])
        result = paginator.get_result(5)
        assert list(result) == []
        assert result.prev == Cursor(0, 0, True, False)
        assert result.next == Cursor(0, 0, False, False)

        paginator = SequencePaginator([], reverse=True)
        result = paginator.get_result(5)
        assert list(result) == []
        assert result.prev == Cursor(0, 0, True, False)
        assert result.next == Cursor(0, 0, False, False)
Example #3
    def get(self, request, organization, key):
        if not TAG_KEY_RE.match(key):
            return Response(
                {"detail": 'Invalid tag key format for "%s"' % (key, )},
                status=400)

        try:
            filter_params = self.get_filter_params(request, organization)
        except NoProjects:
            paginator = SequencePaginator([])
        else:
            paginator = tagstore.get_tag_value_paginator_for_projects(
                filter_params["project_id"],
                filter_params.get("environment"),
                key,
                filter_params["start"],
                filter_params["end"],
                query=request.GET.get("query"),
            )

        return self.paginate(
            request=request,
            paginator=paginator,
            on_results=lambda results: serialize(results, request.user),
        )
Example #4
    def _get_tag_values_for_release_stages(self, projects, environments, query):
        from sentry.api.paginator import SequencePaginator

        organization_id = Project.objects.filter(id=projects[0]).values_list(
            "organization_id", flat=True
        )[0]
        versions = Release.objects.filter_by_stage(
            organization_id,
            "=",
            query,
            project_ids=projects,
            environments=environments,
        )
        if environments:
            versions = versions.filter(
                id__in=ReleaseEnvironment.objects.filter(
                    environment_id__in=environments
                ).values_list("release_id", flat=True)
            )

        versions = versions.order_by("version").values_list("version", flat=True)[:1000]
        return SequencePaginator(
            [
                (i, TagValue(RELEASE_STAGE_ALIAS, v, None, None, None))
                for i, v in enumerate(versions)
            ]
        )
Example #5
    def get(self, request, organization, key):
        if not TAG_KEY_RE.match(key):
            return Response(
                {'detail': 'Invalid tag key format for "%s"' % (key, )},
                status=400)

        try:
            filter_params = self.get_filter_params(request, organization)
        except OrganizationEventsError as exc:
            return Response({'detail': exc.message}, status=400)
        except NoProjects:
            paginator = SequencePaginator([])
        else:
            # TODO(jess): update this when snuba tagstore is the primary backend for us
            tagstore = SnubaTagStorage()

            paginator = tagstore.get_tag_value_paginator_for_projects(
                filter_params['project_id'],
                filter_params.get('environment'),
                key,
                filter_params['start'],
                filter_params['end'],
                query=request.GET.get('query'),
            )

        return self.paginate(
            request=request,
            paginator=paginator,
            on_results=lambda results: serialize(results, request.user),
        )
Example #6
    def get_tag_value_paginator_for_projects(self,
                                             projects,
                                             environments,
                                             key,
                                             start,
                                             end,
                                             query=None,
                                             order_by='-last_seen'):
        from sentry.api.paginator import SequencePaginator

        if order_by != '-last_seen':
            raise ValueError("Unsupported order_by: %s" % order_by)

        snuba_key = snuba.get_snuba_column_name(key)

        conditions = []

        if snuba_key in BLACKLISTED_COLUMNS:
            snuba_key = 'tags[%s]' % (key, )

        if query:
            conditions.append([snuba_key, 'LIKE', u'%{}%'.format(query)])
        else:
            conditions.append([snuba_key, '!=', ''])

        filters = {
            'project_id': projects,
        }
        if environments:
            filters['environment'] = environments

        results = snuba.query(
            start=start,
            end=end,
            groupby=[snuba_key],
            filter_keys=filters,
            aggregations=[
                ['count()', '', 'times_seen'],
                ['min', 'timestamp', 'first_seen'],
                ['max', 'timestamp', 'last_seen'],
            ],
            conditions=conditions,
            orderby=order_by,
            # TODO: This means they can't actually paginate all TagValues.
            limit=1000,
            arrayjoin=snuba.get_arrayjoin(snuba_key),
            referrer='tagstore.get_tag_value_paginator_for_projects',
        )

        tag_values = [
            TagValue(key=key, value=value, **fix_tag_value_data(data))
            for value, data in six.iteritems(results)
        ]

        desc = order_by.startswith('-')
        score_field = order_by.lstrip('-')
        return SequencePaginator(
            [(int(to_timestamp(getattr(tv, score_field)) * 1000), tv)
             for tv in tag_values],
            reverse=desc)
Example #7
    def get(self, request: Request, organization, key) -> Response:
        if not TAG_KEY_RE.match(key):
            return Response({"detail": f'Invalid tag key format for "{key}"'}, status=400)

        sentry_sdk.set_tag("query.tag_key", key)

        try:
            # still used by events v1 which doesn't require global views
            filter_params = self.get_snuba_params(request, organization, check_global_views=False)
        except NoProjects:
            paginator = SequencePaginator([])
        else:
            with self.handle_query_errors():
                paginator = tagstore.get_tag_value_paginator_for_projects(
                    filter_params["project_id"],
                    filter_params.get("environment"),
                    key,
                    filter_params["start"],
                    filter_params["end"],
                    query=request.GET.get("query"),
                    include_transactions=request.GET.get("includeTransactions") == "1",
                )

        return self.paginate(
            request=request,
            paginator=paginator,
            on_results=lambda results: serialize(results, request.user),
        )
Example #8
    def _get_tag_values_for_releases_across_all_datasets(
            self, projects, environments, query):
        from sentry.api.paginator import SequencePaginator

        organization_id = Project.objects.filter(id=projects[0]).values_list(
            "organization_id", flat=True)[0]
        qs = Release.objects.filter(organization_id=organization_id)

        if projects:
            qs = qs.filter(id__in=ReleaseProject.objects.filter(
                project_id__in=projects).values_list("release_id", flat=True))
        if environments:
            qs = qs.filter(id__in=ReleaseEnvironment.objects.filter(
                environment_id__in=environments).values_list("release_id",
                                                             flat=True))

        if query:
            qs = qs.filter(version__startswith=query)

        versions = qs.order_by("version").values_list("version", flat=True)[:1000]

        return SequencePaginator(
            [(i, TagValue(RELEASE_ALIAS, v, None, None, None)) for i, v in enumerate(versions)]
        )
Example #9
    def _get_tag_values_for_semver_build(self, projects, environments, build):
        from sentry.api.paginator import SequencePaginator

        build = build if build else ""
        if not build.endswith("*"):
            build += "*"

        organization_id = Project.objects.filter(id=projects[0]).values_list(
            "organization_id", flat=True
        )[0]
        builds = Release.objects.filter_by_semver_build(organization_id, "exact", build, projects)

        if environments:
            builds = builds.filter(
                id__in=ReleaseEnvironment.objects.filter(
                    environment_id__in=environments
                ).values_list("release_id", flat=True)
            )

        packages = (
            builds.values_list("build_code", flat=True).distinct().order_by("build_code")[:1000]
        )
        return SequencePaginator(
            [(i, TagValue(SEMVER_BUILD_ALIAS, v, None, None, None)) for i, v in enumerate(packages)]
        )
Example #10
    def test_ascending_simple(self):
        paginator = SequencePaginator([(i, i) for i in range(10)], reverse=False)

        result = paginator.get_result(5)
        assert list(result) == [0, 1, 2, 3, 4]
        assert result.prev == Cursor(0, 0, True, False)
        assert result.next == Cursor(5, 0, False, True)

        result = paginator.get_result(5, result.next)
        assert list(result) == [5, 6, 7, 8, 9]
        assert result.prev == Cursor(5, 0, True, True)
        assert result.next == Cursor(9, 1, False, False)

        result = paginator.get_result(5, result.prev)
        assert list(result) == [0, 1, 2, 3, 4]
        assert result.prev == Cursor(0, 0, True, False)
        assert result.next == Cursor(5, 0, False, True)

        result = paginator.get_result(5, Cursor(100, 0, False))
        assert list(result) == []
        assert result.prev == Cursor(9, 1, True, True)
        assert result.next == Cursor(9, 1, False, False)
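Reading the assertions above, the cursor tuple appears to encode the score at the page boundary, an offset among entries that share that score, the paging direction, and whether results exist past the boundary; the offset only really advances in Example #12 below, where every entry shares the same score. A small sketch under that interpretation (the import path for Cursor is an assumption, not shown in these snippets):

from sentry.api.paginator import SequencePaginator
from sentry.utils.cursors import Cursor  # assumed import path

paginator = SequencePaginator([(i, i) for i in range(10)], reverse=False)

# Cursor(value, offset, is_prev, has_results): start at score 5, offset 0,
# paging forward, with results known to exist at or after the boundary.
result = paginator.get_result(5, Cursor(5, 0, False, True))
assert list(result) == [5, 6, 7, 8, 9]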
Example #11
    def test_descending_simple(self):
        paginator = SequencePaginator([(i, i) for i in range(10)], reverse=True)

        result = paginator.get_result(5)
        assert list(result) == [9, 8, 7, 6, 5]
        assert result.prev == Cursor(9, 0, True, False)
        assert result.next == Cursor(4, 0, False, True)

        result = paginator.get_result(5, result.next)
        assert list(result) == [4, 3, 2, 1, 0]
        assert result.prev == Cursor(4, 0, True, True)
        assert result.next == Cursor(0, 1, False, False)

        result = paginator.get_result(5, result.prev)
        assert list(result) == [9, 8, 7, 6, 5]
        assert result.prev == Cursor(9, 0, True, False)
        assert result.next == Cursor(4, 0, False, True)

        result = paginator.get_result(5, Cursor(-10, 0, False))
        assert list(result) == []
        assert result.prev == Cursor(0, 1, True, True)
        assert result.next == Cursor(0, 1, False, False)
Example #12
    def test_descending_repeated_scores(self):
        paginator = SequencePaginator([(1, i) for i in range(10)], reverse=True)

        result = paginator.get_result(5)
        assert list(result) == [9, 8, 7, 6, 5]
        assert result.prev == Cursor(1, 0, True, False)
        assert result.next == Cursor(1, 5, False, True)

        result = paginator.get_result(5, result.next)
        assert list(result) == [4, 3, 2, 1, 0]
        assert result.prev == Cursor(1, 5, True, True)
        assert result.next == Cursor(1, 10, False, False)

        result = paginator.get_result(5, result.prev)
        assert list(result) == [9, 8, 7, 6, 5]
        assert result.prev == Cursor(1, 0, True, False)
        assert result.next == Cursor(1, 5, False, True)

        result = paginator.get_result(5, Cursor(-10, 0, False))
        assert list(result) == []
        assert result.prev == Cursor(1, 10, True, True)
        assert result.next == Cursor(1, 10, False, False)
Example #13
    def get_tag_value_paginator(self,
                                project_id,
                                environment_id,
                                key,
                                query=None,
                                order_by='-last_seen'):
        from sentry.api.paginator import SequencePaginator

        if order_by != '-last_seen':
            raise ValueError("Unsupported order_by: %s" % order_by)

        conditions = []
        if query:
            conditions.append(['tags_value', 'LIKE', u'%{}%'.format(query)])

        start, end = self.get_time_range()
        filters = {
            'project_id': [project_id],
            'tags_key': [key],
        }
        if environment_id:
            filters['environment'] = [environment_id]
        results = snuba.query(
            start=start,
            end=end,
            groupby=['tags_value'],
            filter_keys=filters,
            aggregations=[
                ['count()', '', 'times_seen'],
                ['min', 'timestamp', 'first_seen'],
                ['max', 'timestamp', 'last_seen'],
            ],
            conditions=conditions,
            orderby=order_by,
            # TODO: This means they can't actually paginate all TagValues.
            limit=1000,
            referrer='tagstore.get_tag_value_paginator',
        )

        tag_values = [
            TagValue(key=key, value=value, **fix_tag_value_data(data))
            for value, data in six.iteritems(results)
        ]

        desc = order_by.startswith('-')
        score_field = order_by.lstrip('-')
        return SequencePaginator(
            [(int(to_timestamp(getattr(tv, score_field)) * 1000), tv)
             for tv in tag_values],
            reverse=desc)
Example #15
    def _get_tag_values_for_semver_package(self, projects, environments,
                                           package):
        from sentry.api.paginator import SequencePaginator

        package = package if package else ""

        organization_id = Project.objects.filter(id=projects[0]).values_list(
            "organization_id", flat=True)[0]
        versions = self._get_semver_versions_for_package(
            projects, organization_id, package)
        if environments:
            versions = versions.filter(
                id__in=ReleaseEnvironment.objects.filter(
                    environment_id__in=environments).values_list("release_id",
                                                                 flat=True))
        packages = versions.values_list("package", flat=True).distinct().order_by("package")[:1000]
        return SequencePaginator(
            [(i, TagValue(SEMVER_PACKAGE_ALIAS, v, None, None, None)) for i, v in enumerate(packages)]
        )
Example #19
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be higher than 90 days in the past, but apparently
        #       `retention_window_start` can be None?
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # pre-filter query
        candidate_hashes = dict(
            GroupHash.objects.filter(
                group__in=group_queryset
            ).values_list(
                'hash', 'group_id'
            )[:MAX_PRE_SNUBA_CANDIDATES + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

        if not candidate_hashes:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()
        elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_hashes = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        # {group_id: group_score, ...}
        snuba_groups = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_hashes=candidate_hashes,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

        if candidate_hashes:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering
            result_groups = snuba_groups.items()
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            result_groups = []
            i = 0
            for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in chunk]
                ).values_list('id', flat=True)

                result_groups.extend(
                    (group_id, snuba_groups[group_id])
                    for group_id in filtered_group_ids
                )

            metrics.timing('snuba.search.num_post_filters', i)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
Example #20
    def query(
        self,
        projects: Sequence[Project],
        retention_window_start: Optional[datetime],
        group_queryset: QuerySet,
        environments: Sequence[Environment],
        sort_by: str,
        limit: int,
        cursor: Optional[Cursor],
        count_hits: bool,
        paginator_options: Mapping[str, Any],
        search_filters: Sequence[SearchFilter],
        date_from: Optional[datetime],
        date_to: Optional[datetime],
        max_hits=None,
    ) -> CursorResult:

        if not validate_cdc_search_filters(search_filters):
            raise InvalidQueryForExecutor(
                "Search filters invalid for this query executor")

        start, end, retention_date = self.calculate_start_end(
            retention_window_start, search_filters, date_from, date_to)

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return self.empty_result

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return self.empty_result

        e_event = self.entities["event"]
        e_group = self.entities["group"]

        where_conditions = [
            Condition(Column("project_id", e_event), Op.IN,
                      [p.id for p in projects]),
            Condition(Column("timestamp", e_event), Op.GTE, start),
            Condition(Column("timestamp", e_event), Op.LT, end),
        ]
        # TODO: This is still basically only handling status, handle this better once we introduce
        # more conditions.
        for search_filter in search_filters:
            where_conditions.append(
                Condition(Column(search_filter.key.name, e_group), Op.IN,
                          search_filter.value.raw_value))

        if environments:
            # TODO: Should this be handled via filter_keys, once we have a snql compatible version?
            where_conditions.append(
                Condition(Column("environment", e_event), Op.IN,
                          [e.name for e in environments]))

        sort_func = self.aggregation_defs[self.sort_strategies[sort_by]]

        having = []
        if cursor is not None:
            op = Op.GTE if cursor.is_prev else Op.LTE
            having.append(Condition(sort_func, op, cursor.value))

        query = Query(
            "events",
            match=Join([Relationship(e_event, "grouped", e_group)]),
            select=[
                Column("id", e_group),
                replace(sort_func, alias="score"),
            ],
            where=where_conditions,
            groupby=[Column("id", e_group)],
            having=having,
            orderby=[OrderBy(sort_func, direction=Direction.DESC)],
            limit=Limit(limit + 1),
        )

        data = snuba.raw_snql_query(
            query, referrer="search.snuba.cdc_search.query")["data"]

        hits_query = Query(
            "events",
            match=Join([Relationship(e_event, "grouped", e_group)]),
            select=[
                Function("uniq", [Column("id", e_group)], alias="count"),
            ],
            where=where_conditions,
        )
        hits = None
        if count_hits:
            hits = snuba.raw_snql_query(
                hits_query,
                referrer="search.snuba.cdc_search.hits")["data"][0]["count"]

        paginator_results = SequencePaginator(
            [(row["score"], row["g.id"]) for row in data],
            reverse=True,
            **paginator_options,
        ).get_result(limit, cursor, known_hits=hits, max_hits=max_hits)
        # We filter against `group_queryset` here so that we recheck all conditions in Postgres.
        # Since replay between Postgres and Clickhouse can happen, we might get back results that
        # have changed state in Postgres. By rechecking them we guarantee than any returned results
        # have the correct state.
        # TODO: This can result in us returning less than a full page of results, but shouldn't
        # affect cursors. If we want to, we can iterate and query snuba until we manage to get a
        # full page. In practice, this will likely only skip a couple of results at worst, and
        # probably not be noticeable to the user, so holding off for now to reduce complexity.
        groups = group_queryset.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]
        return paginator_results
Example #21
    def query(
        self,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        date_from,
        date_to,
        max_hits=None,
    ):

        now = timezone.now()
        end = None
        end_params = [
            _f for _f in
            [date_to, get_search_filter(search_filters, "date", "<")] if _f
        ]
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            metrics.incr("snuba.search.postgres_only")

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if (cursor is None and sort_by == "date" and
                    # This handles tags and date parameters for search filters.
                    not [
                        sf for sf in search_filters if sf.key.name not in
                        self.postgres_only_fields.union(["date"])
                    ]):
                group_queryset = group_queryset.order_by("-last_seen")
                paginator = DateTimePaginator(group_queryset, "-last_seen",
                                              **paginator_options)
                # When its a simple django-only search, we count_hits like normal
                return paginator.get_result(limit,
                                            cursor,
                                            count_hits=count_hits,
                                            max_hits=max_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            _f for _f in [retention_window_start, now - timedelta(days=90)]
            if _f)
        start_params = [
            date_from, retention_date,
            get_search_filter(search_filters, "date", ">")
        ]
        start = max(_f for _f in start_params if _f)
        end = max([retention_date, end])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return self.empty_result

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return self.empty_result

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get("snuba.search.max-pre-snuba-candidates")

        with sentry_sdk.start_span(op="snuba_group_query") as span:
            group_ids = list(
                group_queryset.values_list("id",
                                           flat=True)[:max_candidates + 1])
            span.set_data("Max Candidates", max_candidates)
            span.set_data("Result Size", len(group_ids))
        metrics.timing("snuba.search.num_candidates", len(group_ids))

        too_many_candidates = False
        if not group_ids:
            # no matches could possibly be found from this point on
            metrics.incr("snuba.search.no_candidates", skip_internal=False)
            return self.empty_result
        elif len(group_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'status',
            # 'bookmarked_by', 'assigned_to', 'unassigned', or 'subscribed_by')
            # then it might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr("snuba.search.too_many_candidates",
                         skip_internal=False)
            too_many_candidates = True
            group_ids = []

        sort_field = self.sort_strategies[sort_by]
        chunk_growth = options.get("snuba.search.chunk-growth-rate")
        max_chunk_size = options.get("snuba.search.max-chunk-size")
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = self.calculate_hits(
            group_ids,
            too_many_candidates,
            sort_field,
            projects,
            retention_window_start,
            group_queryset,
            environments,
            sort_by,
            limit,
            cursor,
            count_hits,
            paginator_options,
            search_filters,
            start,
            end,
        )
        if count_hits and hits == 0:
            return self.empty_result

        paginator_results = self.empty_result
        result_groups = []
        result_group_ids = set()

        max_time = options.get("snuba.search.max-total-chunk-time-seconds")
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have group_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(group_ids))

            # {group_id: group_score, ...}
            snuba_groups, total = self.snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                organization_id=projects[0].organization_id,
                sort_field=sort_field,
                cursor=cursor,
                group_ids=group_ids,
                limit=chunk_limit,
                offset=offset,
                search_filters=search_filters,
            )
            metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if group_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the group_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits and hits is None:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid
                            for gid, _ in snuba_groups]).values_list("id",
                                                                     flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options).get_result(limit,
                                                cursor,
                                                known_hits=hits,
                                                max_hits=max_hits)

            if group_ids or len(
                    paginator_results.results) >= limit or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev
                                   or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing("snuba.search.num_chunks", num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]

        return paginator_results
Example #22
    def test_hits(self):
        n = 10
        paginator = SequencePaginator([(i, i) for i in range(n)])
        assert paginator.get_result(5, count_hits=True).hits == n
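The query() implementations in Examples 20 and 21 above pass a hit count computed by a separate query instead of asking the paginator to count. A brief sketch of both call shapes, with the known_hits/max_hits arguments taken verbatim from those examples (the data here is illustrative):

from sentry.api.paginator import SequencePaginator

paginator = SequencePaginator([(i, i) for i in range(10)])

# Count hits in-memory, as the test above does.
result = paginator.get_result(5, count_hits=True)
assert result.hits == 10

# Or hand over a total computed elsewhere (e.g. a separate Snuba query),
# the pattern used by the Snuba-backed search executors above.
result = paginator.get_result(5, known_hits=10, max_hits=1000)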
Example #23
    def _query(self, projects, retention_window_start, group_queryset, tags, environments,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        from sentry.models import (Group, Environment, Event, GroupEnvironment, Release)

        # this backend only supports search within one project/environment
        if len(projects) != 1 or (environments is not None and len(environments) > 1):
            raise NotImplementedError

        project = projects[0]
        environment = environments[0] if environments is not None else environments

        if environment is not None:
            if 'environment' in tags:
                environment_name = tags.pop('environment')
                assert environment_name is ANY or Environment.objects.get(
                    projects=project,
                    name=environment_name,
                ).id == environment.id

            event_queryset_builder = QuerySetBuilder({
                'date_from': ScalarCondition('date_added', 'gt'),
                'date_to': ScalarCondition('date_added', 'lt'),
            })

            if any(key in parameters for key in event_queryset_builder.conditions.keys()):
                event_queryset = event_queryset_builder.build(
                    tagstore.get_event_tag_qs(
                        project_id=project.id,
                        environment_id=environment.id,
                        key='environment',
                        value=environment.name,
                    ),
                    parameters,
                )
                if retention_window_start is not None:
                    event_queryset = event_queryset.filter(date_added__gte=retention_window_start)

                group_queryset = group_queryset.filter(
                    id__in=list(event_queryset.distinct().values_list('group_id', flat=True)[:1000])
                )

            _, group_queryset_sort_clause = sort_strategies[sort_by]
            group_queryset = QuerySetBuilder({
                'first_release': CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                get_sql_column(GroupEnvironment, 'first_release_id'),
                                get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
                'times_seen': CallbackCondition(
                    # This condition represents the exact number of times that
                    # an issue has been seen in an environment. Since an issue
                    # can't be seen in an environment more times than the issue
                    # was seen overall, we can safely exclude any groups that
                    # don't have at least that many events.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen,
                    ),
                ),
                'times_seen_lower': CallbackCondition(
                    # This condition represents the lower threshold for the
                    # number of times an issue has been seen in an environment.
                    # Since an issue can't be seen in an environment more times
                    # than the issue was seen overall, we can safely exclude
                    # any groups that haven't met that threshold.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen,
                    ),
                ),
                # The following conditions make a few assertions that are
                # correct in an abstract sense but may not accurately reflect
                # the existing implementation (see GH-5289). These assumptions
                # are that 1. The first seen time for a Group is the minimum
                # value of the first seen time for all of its GroupEnvironment
                # relations; 2. The last seen time for a Group is the maximum
                # value of the last seen time for all of its GroupEnvironment
                # relations; 3. The first seen time is always less than or
                # equal to the last seen time.
                'age_from': CallbackCondition(
                    # This condition represents the lower threshold for "first
                    # seen" time for an environment. Due to assertions #1 and
                    # #3, we can exclude any groups where the "last seen" time
                    # is prior to this timestamp.
                    lambda queryset, first_seen: queryset.exclude(
                        last_seen__lt=first_seen,
                    ),
                ),
                'age_to': CallbackCondition(
                    # This condition represents the upper threshold for "first
                    # seen" time for an environment. Due to assertions #1, we
                    # can exclude any values where the group first seen is
                    # greater than that threshold.
                    lambda queryset, first_seen: queryset.exclude(
                        first_seen__gt=first_seen,
                    ),
                ),
                'last_seen_from': CallbackCondition(
                    # This condition represents the lower threshold for "last
                    # seen" time for an environment. Due to assertion #2, we
                    # can exclude any values where the group last seen value is
                    # less than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        last_seen__lt=last_seen,
                    ),
                ),
                'last_seen_to': CallbackCondition(
                    # This condition represents the upper threshold for "last
                    # seen" time for an environment. Due to assertions #2 and
                    # #3, we can exclude any values where the group first seen
                    # value is greater than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        first_seen__gt=last_seen,
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(Group, 'id'),
                            get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            ).order_by(group_queryset_sort_clause)

            get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[sort_by]

            group_tag_value_queryset = tagstore.get_group_tag_value_qs(
                project_id=project.id,
                group_id=set(group_queryset.values_list('id', flat=True)[:10000]),
                environment_id=environment.id,
                key='environment',
                value=environment.name,
            )

            if retention_window_start is not None:
                group_tag_value_queryset = group_tag_value_queryset.filter(
                    last_seen__gte=retention_window_start
                )

            candidates = dict(
                QuerySetBuilder({
                    'age_from': ScalarCondition('first_seen', 'gt'),
                    'age_to': ScalarCondition('first_seen', 'lt'),
                    'last_seen_from': ScalarCondition('last_seen', 'gt'),
                    'last_seen_to': ScalarCondition('last_seen', 'lt'),
                    'times_seen': CallbackCondition(
                        lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
                    ),
                    'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                    'times_seen_upper': ScalarCondition('times_seen', 'lt'),
                }).build(
                    group_tag_value_queryset,
                    parameters,
                ).extra(
                    select={
                        'sort_value': get_sort_expression(group_tag_value_queryset.model),
                    },
                ).values_list('group_id', 'sort_value')
            )

            if tags:
                # TODO: `get_group_ids_for_search_filter` should be able to
                # utilize the retention window start parameter for additional
                # optimizations.
                matches = tagstore.get_group_ids_for_search_filter(
                    project_id=project.id,
                    environment_id=environment.id,
                    tags=tags,
                    candidates=candidates.keys(),
                    limit=len(candidates),
                )
                for key in set(candidates) - set(matches or []):
                    del candidates[key]

            result = SequencePaginator(
                [(sort_value_to_cursor_value(score), id) for (id, score) in candidates.items()],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=count_hits)

            groups = Group.objects.in_bulk(result.results)
            result.results = [groups[k] for k in result.results if k in groups]

            return result
        else:
            event_queryset_builder = QuerySetBuilder({
                'date_from': ScalarCondition('datetime', 'gt'),
                'date_to': ScalarCondition('datetime', 'lt'),
            })

            if any(key in parameters for key in event_queryset_builder.conditions.keys()):
                group_queryset = group_queryset.filter(
                    id__in=list(
                        event_queryset_builder.build(
                            Event.objects.filter(project_id=project.id),
                            parameters,
                        ).distinct().values_list('group_id', flat=True)[:1000],
                    )
                )

            group_queryset = QuerySetBuilder({
                'first_release': CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
                'age_from': ScalarCondition('first_seen', 'gt'),
                'age_to': ScalarCondition('first_seen', 'lt'),
                'last_seen_from': ScalarCondition('last_seen', 'gt'),
                'last_seen_to': ScalarCondition('last_seen', 'lt'),
                'times_seen': CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
                ),
                'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                'times_seen_upper': ScalarCondition('times_seen', 'lt'),
            }).build(
                group_queryset,
                parameters,
            ).extra(
                select={
                    'sort_value': get_sort_clause(sort_by),
                },
            )

            if tags:
                group_ids = tagstore.get_group_ids_for_search_filter(
                    project_id=project.id,
                    environment_id=None,
                    tags=tags,
                    candidates=None,
                )

                if group_ids:
                    group_queryset = group_queryset.filter(id__in=group_ids)
                else:
                    group_queryset = group_queryset.none()

            paginator_cls, sort_clause = sort_strategies[sort_by]
            group_queryset = group_queryset.order_by(sort_clause)
            paginator = paginator_cls(group_queryset, sort_clause, **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=count_hits)
Example #24
    def _query(self, projects, retention_window_start, group_queryset, tags, environments,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[projects[0].organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} IN ({})'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                            ', '.join(['%s' for e in environments])
                        ),
                    ],
                    params=[environment.id for environment in environments],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        now = timezone.now()
        end = parameters.get('date_to')
        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if cursor is None \
                    and sort_by == 'date' \
                    and not tags \
                    and not environments \
                    and not any(param in parameters for param in [
                        'age_from', 'age_to', 'last_seen_from',
                        'last_seen_to', 'times_seen', 'times_seen_lower',
                        'times_seen_upper'
                    ]):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                return paginator.get_result(limit, cursor, count_hits=False)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        start = max(
            filter(None, [
                retention_date,
                parameters.get('date_from'),
            ])
        )

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # num_candidates is the number of Group IDs to send down to Snuba. If
        # more Group ID candidates than that are found, a "bare" Snuba search is
        # performed and the result groups are then post-filtered via queries to
        # the Sentry DB.
        optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
        if optimizer_enabled:
            keys = [self._get_project_count_cache_key(p.id) for p in projects]

            counts_by_projects = {
                self._get_project_id_from_key(key): count for key, count in cache.get_many(keys).items()
            }

            missed_projects = {p.id for p in projects} - set(counts_by_projects.keys())

            if missed_projects:
                missing_counts = snuba.query(
                    start=max(
                        filter(None, [
                            retention_window_start,
                            now - timedelta(days=90)
                        ])
                    ),
                    end=now,
                    groupby=['project_id'],
                    filter_keys={
                        'project_id': list(missed_projects),
                    },
                    aggregations=[['uniq', 'group_id', 'group_count']],
                    referrer='search',
                )

                cache.set_many({
                    self._get_project_count_cache_key(project_id): count
                    for project_id, count in missing_counts.items()
                }, options.get('snuba.search.project-group-count-cache-time'))

                counts_by_projects.update(missing_counts)

            min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
            max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
            candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

            num_candidates = max(
                min_candidates,
                min(
                    max_candidates,
                    sum(counts_by_projects.values()) * candidates_percentage
                )
            )
        else:
            num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

        # pre-filter query
        candidate_ids = None
        if num_candidates and limit <= num_candidates:
            candidate_ids = list(
                group_queryset.values_list('id', flat=True)[:num_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_ids))

            if not candidate_ids:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates', skip_internal=False)
                return EMPTY_RESULT
            elif len(candidate_ids) > num_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `num_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
                candidate_ids = None

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.
        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
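
The loop above grows the Snuba chunk size geometrically on every iteration, bounded by a maximum chunk size and an overall time budget, and stops as soon as enough results have been collected. Here is a toy, self-contained illustration of that growth pattern; chunked_search, its arguments, and the numbers are made up for the example and stand in for the real snuba_search calls and the `snuba.search.*` option values.

def chunked_search(total_matching, limit, chunk_growth=1.5, max_chunk_size=2000):
    chunk_limit = limit
    offset = 0
    results = []
    while True:
        # grow the chunk size on each iteration, up to a maximum
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # stand-in for one snuba_search() call returning the next `chunk_limit` rows
        chunk = list(range(offset, min(offset + chunk_limit, total_matching)))
        offset += len(chunk)
        results.extend(chunk)
        more_results = offset < total_matching
        if len(results) >= limit or not more_results:
            break
    return results[:limit], offset

rows, scanned = chunked_search(total_matching=10000, limit=100)
print(len(rows), scanned)  # -> 100 150: the limit is satisfied after scanning only 150 rows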
Example #25
0
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be further than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # TODO: If the query didn't include anything to significantly filter
        # down the number of groups at this point ('first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then this
        # queryset might return a *huge* number of groups. In this case, we
        # probably *don't* want to pass candidates down to Snuba, and rather we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba.
        #
        # However, if this did filter down the number of groups significantly,
        # then passing in candidates is, of course, valuable.
        #
        # Should we decide which way to handle it based on the number of
        # group_ids, the number of hashes? Or should we just always start the
        # query with Snuba? Something else?
        candidate_group_ids = list(group_queryset.values_list('id', flat=True))

        sort, extra_aggregations, calculate_cursor_for_group = sort_strategies[sort_by]

        group_data = do_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            candidates=candidate_group_ids,
            **parameters
        )

        group_to_score = {}
        for group_id, data in group_data.items():
            group_to_score[group_id] = calculate_cursor_for_group(data)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in group_to_score.items()],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
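
The (score, id) pairs handed to SequencePaginator above are pre-computed cursor scores, and reverse=True means the highest scores come first. A small sketch of what that ordering amounts to, using plain sorting instead of the real paginator; the group ids and timestamp-like scores are invented for illustration.

def first_page(group_to_score, limit):
    pairs = [(score, group_id) for (group_id, score) in group_to_score.items()]
    # reverse=True in SequencePaginator corresponds to descending score order
    pairs.sort(key=lambda pair: pair[0], reverse=True)
    return [group_id for (_, group_id) in pairs[:limit]]

scores = {11: 1528000000, 12: 1529000000, 13: 1527000000}  # e.g. last_seen timestamps
print(first_page(scores, limit=2))  # -> [12, 11]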
Example #26
0
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be further than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # pre-filter query
        candidate_hashes = dict(
            GroupHash.objects.filter(
                group__in=group_queryset
            ).values_list(
                'hash', 'group_id'
            )[:MAX_PRE_SNUBA_CANDIDATES + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

        if not candidate_hashes:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()
        elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_hashes = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        # {group_id: group_score, ...}
        snuba_groups = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_hashes=candidate_hashes,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

        if candidate_hashes:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering
            result_groups = snuba_groups.items()
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            result_groups = []
            i = 0
            for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in chunk]
                ).values_list('id', flat=True)

                result_groups.extend(
                    (group_id, snuba_groups[group_id])
                    for group_id in filtered_group_ids
                )

            metrics.timing('snuba.search.num_post_filters', i)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
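
When candidates are not passed down to Snuba, the example post-filters the Snuba result ids against the Postgres queryset in chunks. The sketch below mimics that step in plain Python; `chunked` is a simple reimplementation assumed to behave like the helper used in the example, `allowed_ids` stands in for the ids the Group queryset would return, and chunk_size=2 is only for illustration.

def chunked(iterable, size):
    items = list(iterable)
    for i in range(0, len(items), size):
        yield items[i:i + size]

def post_filter(snuba_groups, allowed_ids, chunk_size=2):
    result_groups = []
    for chunk in chunked(snuba_groups.items(), chunk_size):
        # in the real code this is a Group queryset `.filter(id__in=...)`
        filtered_ids = [gid for gid, _ in chunk if gid in allowed_ids]
        result_groups.extend((gid, snuba_groups[gid]) for gid in filtered_ids)
    return result_groups

snuba_groups = {1: 9.0, 2: 8.0, 3: 7.0, 4: 6.0}
print(post_filter(snuba_groups, allowed_ids={2, 4}))  # -> [(2, 8.0), (4, 6.0)]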
Example #27
0
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be further than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # maximum number of Group IDs to send down to Snuba;
        # if more Group ID candidates than that are found, a "bare" Snuba
        # search is performed and the result groups are then
        # post-filtered via queries to the Sentry DB
        max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')

        # pre-filter query
        candidate_ids = None
        if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
            candidate_ids = list(
                group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_ids))

            if not candidate_ids:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates')
                return Paginator(Group.objects.none()).get_result()
            elif len(candidate_ids) > max_pre_snuba_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `max_pre_snuba_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates')
                candidate_ids = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = Paginator(Group.objects.none()).get_result()
        result_groups = []
        result_group_ids = set()
        min_score = float('inf')
        max_score = -1

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                project_id=project.id,
                environment_id=environment and environment.id,
                tags=tags,
                start=start,
                end=end,
                sort=sort,
                extra_aggregations=extra_aggregations,
                score_fn=score_fn,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

                    # used for cursor logic
                    min_score = min(min_score, group_score)
                    max_score = max(max_score, group_score)

            # HACK: If a cursor is being used and there may be more results available
            # in Snuba, we need to detect whether the cursor's value will be
            # found in the result groups. If it isn't in the results yet we need to
            # continue querying before we hand off to the paginator to decide whether
            # enough results are found or not, otherwise the paginator will happily
            # return `limit` worth of results that don't take the cursor into account
            # at all, since it can't know there are more results available.
            # TODO: If chunked search works in practice we should probably extend the
            # paginator to throw something if the cursor value is never found, or do
            # something other than partially leak internal paginator logic up to here.
            # Or make a separate Paginator implementation just for Snuba search?
            if cursor is not None \
                    and not candidate_ids \
                    and more_results:
                if cursor.is_prev and min_score < cursor.value:
                    continue
                elif not cursor.is_prev and max_score > cursor.value:
                    continue

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
Example #28
0
    def _get_tag_values_for_semver(
        self,
        projects: Sequence[int],
        environments: Optional[Sequence[str]],
        query: Optional[str],
    ):
        from sentry.api.paginator import SequencePaginator

        query = query if query else ""
        organization_id = Project.objects.filter(id=projects[0]).values_list(
            "organization_id", flat=True
        )[0]

        if query and "@" not in query and re.search(r"[^\d.\*]", query):
            # Handle searching just on package
            include_package = True
            versions = self._get_semver_versions_for_package(projects, organization_id, query)
        else:
            include_package = "@" in query
            query = query.replace("*", "")
            if "@" in query:
                versions = Release.objects.filter(version__startswith=query)
            else:
                versions = Release.objects.filter(version__contains="@" + query)

        if projects:
            versions = versions.filter(
                id__in=ReleaseProject.objects.filter(project_id__in=projects).values_list(
                    "release_id", flat=True
                )
            )
        if environments:
            versions = versions.filter(
                id__in=ReleaseEnvironment.objects.filter(
                    environment_id__in=environments
                ).values_list("release_id", flat=True)
            )

        order_by = map(_flip_field_sort, Release.SEMVER_COLS + ["package"])
        versions = (
            versions.filter_to_semver()
            .annotate_prerelease_column()
            .order_by(*order_by)
            .values_list("version", flat=True)[:1000]
        )

        seen = set()
        formatted_versions = []
        # We want to format versions here in a way that makes sense for autocomplete, so we:
        # - Only include the package if we think the user entered one
        # - Exclude the build number, since it's not used as part of filtering
        # When we don't include the package, this can result in duplicate version numbers,
        # so we also de-dupe here. This can return fewer than 1000 versions, but we
        # typically use very few values, so this works well enough.
        for version in versions:
            formatted_version = version if include_package else version.split("@", 1)[1]
            formatted_version = formatted_version.split("+", 1)[0]
            if formatted_version in seen:
                continue

            seen.add(formatted_version)
            formatted_versions.append(formatted_version)

        return SequencePaginator(
            [
                (i, TagValue(SEMVER_ALIAS, v, None, None, None))
                for i, v in enumerate(formatted_versions)
            ]
        )
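
The formatting loop above rewrites release versions for autocomplete: the package prefix is kept only when the user appears to have typed one, the build metadata after '+' is dropped, and duplicates are removed while preserving order. A standalone sketch of that logic with made-up release strings:

def format_versions(versions, include_package):
    seen = set()
    formatted = []
    for version in versions:
        v = version if include_package else version.split("@", 1)[1]
        v = v.split("+", 1)[0]  # the build number is not used for filtering
        if v in seen:
            continue
        seen.add(v)
        formatted.append(v)
    return formatted

releases = ["backend@1.2.3+build42", "frontend@1.2.3+build7", "backend@1.3.0"]
print(format_versions(releases, include_package=False))  # -> ['1.2.3', '1.3.0']
print(format_versions(releases, include_package=True))
# -> ['backend@1.2.3', 'frontend@1.2.3', 'backend@1.3.0']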
Example #29
0
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be further than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # TODO: If the query didn't include anything to significantly filter
        # down the number of groups at this point ('first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then this
        # queryset might return a *huge* number of groups. In this case, we
        # probably *don't* want to pass candidates down to Snuba, and rather we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba.
        #
        # However, if this did filter down the number of groups significantly,
        # then passing in candidates is, of course, valuable.
        #
        # Should we decide which way to handle it based on the number of
        # group_ids, the number of hashes? Or should we just always start the
        # query with Snuba? Something else?
        candidate_group_ids = list(group_queryset.values_list('id', flat=True))

        sort, extra_aggregations, calculate_cursor_for_group = sort_strategies[sort_by]

        group_data = do_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            candidates=candidate_group_ids,
            **parameters
        )

        group_to_score = {}
        for group_id, data in group_data.items():
            group_to_score[group_id] = calculate_cursor_for_group(data)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in group_to_score.items()],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
Example #30
0
    def _query(self, projects, retention_window_start, group_queryset, environments,
               sort_by, limit, cursor, count_hits, paginator_options, search_filters,
               date_from, date_to):

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            environment_ids = [environment.id for environment in environments]
            group_queryset = group_queryset.filter(
                groupenvironment__environment_id__in=environment_ids
            )
            group_queryset = QuerySetBuilder({
                'first_release': QCallbackCondition(
                    lambda version: Q(
                        groupenvironment__first_release__organization_id=projects[0].organization_id,
                        groupenvironment__first_release__version=version,
                        groupenvironment__environment_id__in=environment_ids,
                    )
                ),
                'first_seen': ScalarCondition(
                    'groupenvironment__first_seen',
                    {'groupenvironment__environment_id__in': environment_ids}
                ),
            }).build(group_queryset, search_filters)
        else:
            group_queryset = QuerySetBuilder({
                'first_release': QCallbackCondition(
                    lambda version: Q(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
                'first_seen': ScalarCondition('first_seen'),
            }).build(group_queryset, search_filters)

        now = timezone.now()
        end = None
        # materialize the filter() result so the emptiness check below also works on
        # Python 3, where filter() returns a lazy iterator
        end_params = list(filter(
            None,
            [date_to, get_search_filter(search_filters, 'date', '<')],
        ))
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if (
                cursor is None and
                sort_by == 'date' and
                not environments and
                # This handles tags and date parameters for search filters.
                not [
                    sf for sf in search_filters
                    if sf.key.name not in issue_only_fields.union(['date', 'message'])
                ]
            ):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                # When it's a simple Django-only search, we count_hits like normal
                return paginator.get_result(limit, cursor, count_hits=count_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        # TODO: We should try and consolidate all this logic together a little
        # better, maybe outside the backend. Should be easier once we're on
        # just the new search filters
        start_params = [
            date_from,
            retention_date,
            get_search_filter(search_filters, 'date', '>'),
        ]
        start = max(filter(None, start_params))

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        too_many_candidates = False
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:max_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))
        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            too_many_candidates = True
            candidate_ids = []

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = None

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        if count_hits and (too_many_candidates or cursor is not None):
            # If we had too many candidates to reasonably pass down to snuba,
            # or if we have a cursor that bisects the overall result set (such
            # that our query only sees results on one side of the cursor) then
            # we need an alternative way to figure out the total hits that this
            # query has.

            # To do this, we get a sample of groups matching the snuba side of
            # the query, and see how many of those pass the post-filter in
            # postgres. This should give us an estimate of the total number of
            # snuba matches that will be overall matches, which we can use to
            # get an estimate for X-Hits.

            # The sampling is not simple random sampling. It will return *all*
            # matching groups if there are fewer than N groups matching the
            # query, or it will return a random, deterministic subset of N of
            # the groups if there are more than N overall matches. This means
            # that the "estimate" is actually an accurate result when there are
            # fewer than N matching groups.

            # The number of samples required to achieve a certain error bound
            # with a certain confidence interval can be calculated from a
            # rearrangement of the normal approximation (Wald) confidence
            # interval formula:
            #
            # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
            #
            # Effectively if we want the estimate to be within +/- 10% of the
            # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
            # / 0.1^2 samples. With a starting assumption of p=0.5 (this
            # requires the most samples) we would need 96 samples to achieve
            # +/-10% @ 95% confidence.

            sample_size = options.get('snuba.search.hits-sample-size')
            snuba_groups, snuba_total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                sort_field=sort_field,
                limit=sample_size,
                offset=0,
                get_sample=True,
                search_filters=search_filters,
            )
            snuba_count = len(snuba_groups)
            if snuba_count == 0:
                return EMPTY_RESULT
            else:
                filtered_count = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).count()

                hit_ratio = filtered_count / float(snuba_count)
                hits = int(hit_ratio * snuba_total)

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids))

            # {group_id: group_score, ...}
            snuba_groups, total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                search_filters=search_filters,
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the candidate_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits and hits is None:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, known_hits=hits)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
Ejemplo n.º 31
0
    def query(self,
              project,
              tags=None,
              environment=None,
              sort_by='date',
              limit=100,
              cursor=None,
              count_hits=False,
              paginator_options=None,
              **parameters):
        from sentry.models import (Environment, Event, Group, GroupEnvironment,
                                   GroupStatus, GroupSubscription, Release)

        if paginator_options is None:
            paginator_options = {}

        if tags is None:
            tags = {}

        try:
            if tags.get('sentry:release') == 'latest':
                tags['sentry:release'] = get_latest_release(
                    project, environment)

            if parameters.get('first_release') == 'latest':
                parameters['first_release'] = get_latest_release(
                    project, environment)
        except Release.DoesNotExist:
            # no matches could possibly be found from this point on
            return Paginator(Group.objects.none()).get_result()

        group_queryset = QuerySetBuilder({
            'query':
            CallbackCondition(
                lambda queryset, query: queryset.filter(
                    Q(message__icontains=query) | Q(culprit__icontains=query),
                ) if query else queryset,
            ),
            'status':
            CallbackCondition(
                lambda queryset, status: queryset.filter(status=status),
            ),
            'bookmarked_by':
            CallbackCondition(
                lambda queryset, user: queryset.filter(
                    bookmark_set__project=project,
                    bookmark_set__user=user,
                ),
            ),
            'assigned_to':
            CallbackCondition(
                functools.partial(assigned_to_filter, project=project), ),
            'unassigned':
            CallbackCondition(
                lambda queryset, unassigned: queryset.filter(
                    assignee_set__isnull=unassigned, ),
            ),
            'subscribed_by':
            CallbackCondition(
                lambda queryset, user: queryset.filter(
                    id__in=GroupSubscription.objects.filter(
                        project=project,
                        user=user,
                        is_active=True,
                    ).values_list('group'), ),
            ),
            'active_at_from':
            ScalarCondition('active_at', 'gt'),
            'active_at_to':
            ScalarCondition('active_at', 'lt'),
        }).build(
            Group.objects.filter(project=project).exclude(status__in=[
                GroupStatus.PENDING_DELETION,
                GroupStatus.DELETION_IN_PROGRESS,
                GroupStatus.PENDING_MERGE,
            ]),
            parameters,
        )

        # filter out groups which are beyond the retention period
        retention = quotas.get_event_retention(
            organization=project.organization)
        if retention:
            retention_window_start = timezone.now() - timedelta(days=retention)
            # TODO: This could be optimized when building querysets to identify
            # criteria that are logically impossible (e.g. if the upper bound
            # for last seen is before the retention window starts, no results
            # exist.)
            group_queryset = group_queryset.filter(
                last_seen__gte=retention_window_start)
        else:
            retention_window_start = None

        if environment is not None:
            if 'environment' in tags:
                # TODO: This should probably just overwrite the existing tag,
                # rather than asserting on it, but...?
                assert Environment.objects.get(
                    projects=project,
                    name=tags.pop('environment'),
                ).id == environment.id

            event_queryset_builder = QuerySetBuilder({
                'date_from':
                ScalarCondition('date_added', 'gt'),
                'date_to':
                ScalarCondition('date_added', 'lt'),
            })
            if any(key in parameters
                   for key in event_queryset_builder.conditions.keys()):
                event_queryset = event_queryset_builder.build(
                    tagstore.get_event_tag_qs(
                        project.id,
                        environment.id,
                        'environment',
                        environment.name,
                    ),
                    parameters,
                )
                if retention_window_start is not None:
                    event_queryset = event_queryset.filter(
                        date_added__gte=retention_window_start)

                group_queryset = group_queryset.filter(
                    id__in=list(event_queryset.distinct().values_list(
                        'group_id', flat=True)[:1000]))

            group_queryset = QuerySetBuilder({
                'first_release':
                CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                get_sql_column(GroupEnvironment,
                                               'first_release_id'),
                                get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'organization'), ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'version'), ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
                'times_seen':
                CallbackCondition(
                    # This condition represents the exact number of times that
                    # an issue has been seen in an environment. Since an issue
                    # can't be seen in an environment more times than the issue
                    # was seen overall, we can safely exclude any groups that
                    # don't have at least that many events.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen, ),
                ),
                'times_seen_lower':
                CallbackCondition(
                    # This condition represents the lower threshold for the
                    # number of times an issue has been seen in an environment.
                    # Since an issue can't be seen in an environment more times
                    # than the issue was seen overall, we can safely exclude
                    # any groups that haven't met that threshold.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen, ),
                ),
                # The following conditions make a few assertions that are
                # correct in an abstract sense but may not accurately reflect
                # the existing implementation (see GH-5289). These assumptions
                # are that 1. The first seen time for a Group is the minimum
                # value of the first seen time for all of its GroupEnvironment
                # relations; 2. The last seen time for a Group is the maximum
                # value of the last seen time for all of its GroupEnvironment
                # relations; 3. The first seen time is always less than or
                # equal to the last seen time.
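                # Illustrative example of assertion #3 in action (hypothetical
                # dates, not from the original code): an `age_from` lower bound
                # of 2021-01-01 can never match a group whose overall
                # `last_seen` is 2020-12-15, so that group is safely excluded
                # by the condition below.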
                'age_from':
                CallbackCondition(
                    # This condition represents the lower threshold for "first
                    # seen" time for an environment. Due to assertions #1 and
                    # #3, we can exclude any groups where the "last seen" time
                    # is prior to this timestamp.
                    lambda queryset, first_seen: queryset.exclude(
                        last_seen__lt=first_seen, ),
                ),
                'age_to':
                CallbackCondition(
                    # This condition represents the upper threshold for "first
                    # seen" time for an environment. Due to assertions #1, we
                    # can exclude any values where the group first seen is
                    # greater than that threshold.
                    lambda queryset, first_seen: queryset.exclude(
                        first_seen__gt=first_seen, ),
                ),
                'last_seen_from':
                CallbackCondition(
                    # This condition represents the lower threshold for "last
                    # seen" time for an environment. Due to assertion #2, we
                    # can exclude any values where the group last seen value is
                    # less than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        last_seen__lt=last_seen, ),
                ),
                'last_seen_to':
                CallbackCondition(
                    # This condition represents the upper threshold for "last
                    # seen" time for an environment. Due to assertions #2 and
                    # #3, we can exclude any values where the group first seen
                    # value is greater than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        first_seen__gt=last_seen, ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(Group, 'id'),
                            get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(GroupEnvironment,
                                           'environment_id'), ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )

            get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[
                sort_by]

            group_tag_value_queryset = tagstore.get_group_tag_value_qs(
                project.id,
                set(group_queryset.values_list('id',
                                               flat=True)),  # TODO: Limit?
                environment.id,
                'environment',
                environment.name,
            )

            if retention_window_start is not None:
                group_tag_value_queryset = group_tag_value_queryset.filter(
                    last_seen__gte=retention_window_start)

            candidates = dict(
                QuerySetBuilder({
                    'age_from':
                    ScalarCondition('first_seen', 'gt'),
                    'age_to':
                    ScalarCondition('first_seen', 'lt'),
                    'last_seen_from':
                    ScalarCondition('last_seen', 'gt'),
                    'last_seen_to':
                    ScalarCondition('last_seen', 'lt'),
                    'times_seen':
                    CallbackCondition(
                        lambda queryset, times_seen: queryset.filter(
                            times_seen=times_seen),
                    ),
                    'times_seen_lower':
                    ScalarCondition('times_seen', 'gt'),
                    'times_seen_upper':
                    ScalarCondition('times_seen', 'lt'),
                }).build(
                    group_tag_value_queryset,
                    parameters,
                ).extra(select={
                    'sort_value':
                    get_sort_expression(group_tag_value_queryset.model),
                }, ).values_list('group_id', 'sort_value'))

            if tags:
                # TODO: `get_group_ids_for_search_filter` should be able to
                # utilize the retention window start parameter for additional
                # optimizations.
                matches = tagstore.get_group_ids_for_search_filter(
                    project.id,
                    environment.id,
                    tags,
                    candidates.keys(),
                    limit=len(candidates),
                )
                for key in set(candidates) - set(matches or []):
                    del candidates[key]

            result = SequencePaginator([(sort_value_to_cursor_value(score), id)
                                        for (id, score) in candidates.items()],
                                       reverse=True,
                                       **paginator_options).get_result(
                                           limit,
                                           cursor,
                                           count_hits=count_hits)

            groups = Group.objects.in_bulk(result.results)
            result.results = [groups[k] for k in result.results if k in groups]

            return result
        else:
            event_queryset_builder = QuerySetBuilder({
                'date_from':
                ScalarCondition('datetime', 'gt'),
                'date_to':
                ScalarCondition('datetime', 'lt'),
            })
            if any(key in parameters
                   for key in event_queryset_builder.conditions.keys()):
                group_queryset = group_queryset.filter(id__in=list(
                    event_queryset_builder.build(
                        Event.objects.filter(project_id=project.id),
                        parameters,
                    ).distinct().values_list('group_id', flat=True)[:1000], ))

            group_queryset = QuerySetBuilder({
                'first_release':
                CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
                'age_from':
                ScalarCondition('first_seen', 'gt'),
                'age_to':
                ScalarCondition('first_seen', 'lt'),
                'last_seen_from':
                ScalarCondition('last_seen', 'gt'),
                'last_seen_to':
                ScalarCondition('last_seen', 'lt'),
                'times_seen':
                CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(
                        times_seen=times_seen),
                ),
                'times_seen_lower':
                ScalarCondition('times_seen', 'gt'),
                'times_seen_upper':
                ScalarCondition('times_seen', 'lt'),
            }).build(
                group_queryset,
                parameters,
            ).extra(select={
                'sort_value': get_sort_clause(sort_by),
            }, )

            if tags:
                matches = tagstore.get_group_ids_for_search_filter(
                    project.id, None, tags)
                if matches:
                    group_queryset = group_queryset.filter(id__in=matches)
                else:
                    group_queryset = group_queryset.none()

            paginator_cls, sort_clause = sort_strategies[sort_by]
            group_queryset = group_queryset.order_by(sort_clause)
            paginator = paginator_cls(group_queryset, sort_clause,
                                      **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=count_hits)
Ejemplo n.º 32
0
    def get_tag_value_paginator_for_projects(self,
                                             projects,
                                             environments,
                                             key,
                                             start=None,
                                             end=None,
                                             query=None,
                                             order_by="-last_seen"):
        from sentry.api.paginator import SequencePaginator

        if not order_by == "-last_seen":
            raise ValueError("Unsupported order_by: %s" % order_by)

        dataset = Dataset.Events
        snuba_key = snuba.get_snuba_column_name(key)
        if snuba_key.startswith("tags["):
            snuba_key = snuba.get_snuba_column_name(key,
                                                    dataset=Dataset.Discover)
            if not snuba_key.startswith("tags["):
                dataset = Dataset.Discover

        conditions = []

        # transaction status needs a special case so that the user interacts with the names and not codes
        transaction_status = snuba_key == "transaction_status"
        if transaction_status:
            conditions.append([
                snuba_key,
                "IN",
                # Here we want to use the status codes during filtering,
                # but want to do this with names that include our query
                [
                    span_key for span_key, value in six.iteritems(
                        SPAN_STATUS_CODE_TO_NAME)
                    if (query and query in value) or (not query)
                ],
            ])
        elif key in FUZZY_NUMERIC_KEYS:
            converted_query = int(
                query) if query is not None and query.isdigit() else None
            if converted_query is not None:
                conditions.append([
                    snuba_key, ">=", converted_query - FUZZY_NUMERIC_DISTANCE
                ])
                conditions.append([
                    snuba_key, "<=", converted_query + FUZZY_NUMERIC_DISTANCE
                ])
        elif key == PROJECT_ALIAS:
            project_filters = {
                "id__in": projects,
            }
            if query:
                project_filters["slug__icontains"] = query
            project_queryset = Project.objects.filter(
                **project_filters).values("id", "slug")
            project_slugs = {
                project["id"]: project["slug"]
                for project in project_queryset
            }
            if project_queryset.exists():
                projects = [project["id"] for project in project_queryset]
                snuba_key = "project_id"
                dataset = Dataset.Discover
        else:
            if snuba_key in BLACKLISTED_COLUMNS:
                snuba_key = "tags[%s]" % (key, )

            if query:
                conditions.append([snuba_key, "LIKE", u"%{}%".format(query)])
            else:
                conditions.append([snuba_key, "!=", ""])

        filters = {"project_id": projects}
        if environments:
            filters["environment"] = environments

        results = snuba.query(
            dataset=dataset,
            start=start,
            end=end,
            groupby=[snuba_key],
            filter_keys=filters,
            aggregations=[
                ["count()", "", "times_seen"],
                ["min", "timestamp", "first_seen"],
                ["max", "timestamp", "last_seen"],
            ],
            conditions=conditions,
            orderby=order_by,
            # TODO: This means they can't actually paginate all TagValues.
            limit=1000,
            arrayjoin=snuba.get_arrayjoin(snuba_key),
            referrer="tagstore.get_tag_value_paginator_for_projects",
        )

        # With transaction_status we need to map the ids back to their names
        if transaction_status:
            results = OrderedDict([
                (SPAN_STATUS_CODE_TO_NAME[result_key], data)
                for result_key, data in six.iteritems(results)
            ])
        # With project names we map the ids back to the project slugs
        elif key == PROJECT_ALIAS:
            results = OrderedDict([(project_slugs[value], data)
                                   for value, data in six.iteritems(results)])

        tag_values = [
            TagValue(key=key,
                     value=six.text_type(value),
                     **fix_tag_value_data(data))
            for value, data in six.iteritems(results)
        ]

        desc = order_by.startswith("-")
        score_field = order_by.lstrip("-")
        return SequencePaginator(
            [(int(to_timestamp(getattr(tv, score_field)) * 1000), tv)
             for tv in tag_values],
            reverse=desc,
        )
Ejemplo n.º 33
0
    def _query(self, project, retention_window_start, group_queryset, tags,
               environment, sort_by, limit, cursor, count_hits,
               paginator_options, **parameters):

        from sentry.models import (Group, Environment, Event, GroupEnvironment,
                                   Release)

        if environment is not None:
            if 'environment' in tags:
                environment_name = tags.pop('environment')
                assert environment_name is ANY or Environment.objects.get(
                    projects=project,
                    name=environment_name,
                ).id == environment.id

            event_queryset_builder = QuerySetBuilder({
                'date_from':
                ScalarCondition('date_added', 'gt'),
                'date_to':
                ScalarCondition('date_added', 'lt'),
            })

            if any(key in parameters
                   for key in event_queryset_builder.conditions.keys()):
                event_queryset = event_queryset_builder.build(
                    tagstore.get_event_tag_qs(
                        project_id=project.id,
                        environment_id=environment.id,
                        key='environment',
                        value=environment.name,
                    ),
                    parameters,
                )
                if retention_window_start is not None:
                    event_queryset = event_queryset.filter(
                        date_added__gte=retention_window_start)

                group_queryset = group_queryset.filter(
                    id__in=list(event_queryset.distinct().values_list(
                        'group_id', flat=True)[:1000]))

            group_queryset = QuerySetBuilder({
                'first_release':
                CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                get_sql_column(GroupEnvironment,
                                               'first_release_id'),
                                get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'organization'), ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'version'), ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
                'times_seen':
                CallbackCondition(
                    # This condition represents the exact number of times that
                    # an issue has been seen in an environment. Since an issue
                    # can't be seen in an environment more times than the issue
                    # was seen overall, we can safely exclude any groups that
                    # don't have at least that many events.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen, ),
                ),
                'times_seen_lower':
                CallbackCondition(
                    # This condition represents the lower threshold for the
                    # number of times an issue has been seen in an environment.
                    # Since an issue can't be seen in an environment more times
                    # than the issue was seen overall, we can safely exclude
                    # any groups that haven't met that threshold.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen, ),
                ),
                # The following conditions make a few assertions that are
                # correct in an abstract sense but may not accurately reflect
                # the existing implementation (see GH-5289). These assumptions
                # are that 1. The first seen time for a Group is the minimum
                # value of the first seen time for all of its GroupEnvironment
                # relations; 2. The last seen time for a Group is the maximum
                # value of the last seen time for all of its GroupEnvironment
                # relations; 3. The first seen time is always less than or
                # equal to the last seen time.
                'age_from':
                CallbackCondition(
                    # This condition represents the lower threshold for "first
                    # seen" time for an environment. Due to assertions #1 and
                    # #3, we can exclude any groups where the "last seen" time
                    # is prior to this timestamp.
                    lambda queryset, first_seen: queryset.exclude(
                        last_seen__lt=first_seen, ),
                ),
                'age_to':
                CallbackCondition(
                    # This condition represents the upper threshold for "first
                    # seen" time for an environment. Due to assertions #1, we
                    # can exclude any values where the group first seen is
                    # greater than that threshold.
                    lambda queryset, first_seen: queryset.exclude(
                        first_seen__gt=first_seen, ),
                ),
                'last_seen_from':
                CallbackCondition(
                    # This condition represents the lower threshold for "last
                    # seen" time for an environment. Due to assertion #2, we
                    # can exclude any values where the group last seen value is
                    # less than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        last_seen__lt=last_seen, ),
                ),
                'last_seen_to':
                CallbackCondition(
                    # This condition represents the upper threshold for "last
                    # seen" time for an environment. Due to assertions #2 and
                    # #3, we can exclude any values where the group first seen
                    # value is greater than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        first_seen__gt=last_seen, ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(Group, 'id'),
                            get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(GroupEnvironment,
                                           'environment_id'), ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )

            get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[
                sort_by]

            group_tag_value_queryset = tagstore.get_group_tag_value_qs(
                project_id=project.id,
                group_id=set(
                    group_queryset.values_list('id', flat=True)[:10000]),
                environment_id=environment.id,
                key='environment',
                value=environment.name,
            )

            if retention_window_start is not None:
                group_tag_value_queryset = group_tag_value_queryset.filter(
                    last_seen__gte=retention_window_start)

            candidates = dict(
                QuerySetBuilder({
                    'age_from':
                    ScalarCondition('first_seen', 'gt'),
                    'age_to':
                    ScalarCondition('first_seen', 'lt'),
                    'last_seen_from':
                    ScalarCondition('last_seen', 'gt'),
                    'last_seen_to':
                    ScalarCondition('last_seen', 'lt'),
                    'times_seen':
                    CallbackCondition(
                        lambda queryset, times_seen: queryset.filter(
                            times_seen=times_seen),
                    ),
                    'times_seen_lower':
                    ScalarCondition('times_seen', 'gt'),
                    'times_seen_upper':
                    ScalarCondition('times_seen', 'lt'),
                }).build(
                    group_tag_value_queryset,
                    parameters,
                ).extra(select={
                    'sort_value':
                    get_sort_expression(group_tag_value_queryset.model),
                }, ).values_list('group_id', 'sort_value'))

            if tags:
                # TODO: `get_group_ids_for_search_filter` should be able to
                # utilize the retention window start parameter for additional
                # optimizations.
                matches = tagstore.get_group_ids_for_search_filter(
                    project_id=project.id,
                    environment_id=environment.id,
                    tags=tags,
                    candidates=candidates.keys(),
                    limit=len(candidates),
                )
                for key in set(candidates) - set(matches or []):
                    del candidates[key]

            result = SequencePaginator([(sort_value_to_cursor_value(score), id)
                                        for (id, score) in candidates.items()],
                                       reverse=True,
                                       **paginator_options).get_result(
                                           limit,
                                           cursor,
                                           count_hits=count_hits)

            groups = Group.objects.in_bulk(result.results)
            result.results = [groups[k] for k in result.results if k in groups]

            return result
        else:
            event_queryset_builder = QuerySetBuilder({
                'date_from':
                ScalarCondition('datetime', 'gt'),
                'date_to':
                ScalarCondition('datetime', 'lt'),
            })

            if any(key in parameters
                   for key in event_queryset_builder.conditions.keys()):
                group_queryset = group_queryset.filter(id__in=list(
                    event_queryset_builder.build(
                        Event.objects.filter(project_id=project.id),
                        parameters,
                    ).distinct().values_list('group_id', flat=True)[:1000], ))

            group_queryset = QuerySetBuilder({
                'first_release':
                CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
                'age_from':
                ScalarCondition('first_seen', 'gt'),
                'age_to':
                ScalarCondition('first_seen', 'lt'),
                'last_seen_from':
                ScalarCondition('last_seen', 'gt'),
                'last_seen_to':
                ScalarCondition('last_seen', 'lt'),
                'times_seen':
                CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(
                        times_seen=times_seen),
                ),
                'times_seen_lower':
                ScalarCondition('times_seen', 'gt'),
                'times_seen_upper':
                ScalarCondition('times_seen', 'lt'),
            }).build(
                group_queryset,
                parameters,
            ).extra(select={
                'sort_value': get_sort_clause(sort_by),
            }, )

            if tags:
                group_ids = tagstore.get_group_ids_for_search_filter(
                    project_id=project.id,
                    environment_id=None,
                    tags=tags,
                    candidates=None,
                )

                if group_ids:
                    group_queryset = group_queryset.filter(id__in=group_ids)
                else:
                    group_queryset = group_queryset.none()

            paginator_cls, sort_clause = sort_strategies[sort_by]
            group_queryset = group_queryset.order_by(sort_clause)
            paginator = paginator_cls(group_queryset, sort_clause,
                                      **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=count_hits)
Ejemplo n.º 34
0
    def query(
        self,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        date_from,
        date_to,
    ):

        now = timezone.now()
        end = None
        # materialize the filter so the truthiness check below also behaves
        # correctly under Python 3, where filter() returns a lazy iterator
        end_params = list(filter(
            None,
            [date_to, get_search_filter(search_filters, "date", "<")]))
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if (cursor is None and sort_by == "date" and not environments and
                    # This handles tags and date parameters for search filters.
                    not [
                        sf for sf in search_filters
                        if sf.key.name not in issue_only_fields.union(["date"])
                    ]):
                group_queryset = group_queryset.order_by("-last_seen")
                paginator = DateTimePaginator(group_queryset, "-last_seen",
                                              **paginator_options)
                # When it's a simple django-only search, we count_hits like normal
                return paginator.get_result(limit,
                                            cursor,
                                            count_hits=count_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [retention_window_start, now - timedelta(days=90)]))

        # TODO: We should try and consolidate all this logic together a little
        # better, maybe outside the backend. Should be easier once we're on
        # just the new search filters
        start_params = [
            date_from, retention_date,
            get_search_filter(search_filters, "date", ">")
        ]
        start = max(filter(None, start_params))

        end = max([retention_date, end])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get("snuba.search.max-pre-snuba-candidates")
        too_many_candidates = False
        candidate_ids = list(
            group_queryset.values_list("id", flat=True)[:max_candidates + 1])
        metrics.timing("snuba.search.num_candidates", len(candidate_ids))
        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr("snuba.search.no_candidates", skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr("snuba.search.too_many_candidates",
                         skip_internal=False)
            too_many_candidates = True
            candidate_ids = []

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get("snuba.search.chunk-growth-rate")
        max_chunk_size = options.get("snuba.search.max-chunk-size")
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = None

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get("snuba.search.max-total-chunk-time-seconds")
        time_start = time.time()

        if count_hits and (too_many_candidates or cursor is not None):
            # If we had too many candidates to reasonably pass down to snuba,
            # or if we have a cursor that bisects the overall result set (such
            # that our query only sees results on one side of the cursor) then
            # we need an alternative way to figure out the total hits that this
            # query has.

            # To do this, we get a sample of groups matching the snuba side of
            # the query, and see how many of those pass the post-filter in
            # postgres. This should give us an estimate of the total number of
            # snuba matches that will be overall matches, which we can use to
            # get an estimate for X-Hits.

            # The sampling is not simple random sampling. It will return *all*
            # matching groups if there are fewer than N groups matching the
            # query, or it will return a random, deterministic subset of N of
            # the groups if there are more than N overall matches. This means
            # that the "estimate" is actually an accurate result when there are
            # fewer than N matching groups.

            # The number of samples required to achieve a certain error bound
            # with a certain confidence interval can be calculated from a
            # rearrangement of the normal approximation (Wald) confidence
            # interval formula:
            #
            # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
            #
            # Effectively if we want the estimate to be within +/- 10% of the
            # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
            # / 0.1^2 samples. With a starting assumption of p=0.5 (this
            # requires the most samples) we would need 96 samples to achieve
            # +/-10% @ 95% confidence.
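            #
            # Worked example of that arithmetic (illustrative only): with
            # p = 0.5, (1.96 ** 2 * 0.5 * 0.5) / 0.1 ** 2
            # = (3.8416 * 0.25) / 0.01 = 96.04, i.e. about 96 samples.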

            sample_size = options.get("snuba.search.hits-sample-size")
            snuba_groups, snuba_total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                sort_field=sort_field,
                limit=sample_size,
                offset=0,
                get_sample=True,
                search_filters=search_filters,
            )
            snuba_count = len(snuba_groups)
            if snuba_count == 0:
                return EMPTY_RESULT
            else:
                filtered_count = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]).count()

                hit_ratio = filtered_count / float(snuba_count)
                hits = int(hit_ratio * snuba_total)
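                # Illustrative example (hypothetical numbers): if 96 sampled
                # Snuba groups yield 48 that survive the Postgres post-filter,
                # hit_ratio is 0.5, and with snuba_total = 10000 the estimated
                # hits would be int(0.5 * 10000) = 5000.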

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids))
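            # Illustrative example (hypothetical option values): with
            # limit=100, chunk_growth=1.5 and max_chunk_size=2000, successive
            # chunk limits would be 150, 225, 337, ... capped at 2000, and
            # never smaller than len(candidate_ids).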

            # {group_id: group_score, ...}
            snuba_groups, total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                search_filters=search_filters,
            )
            metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the candidate_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits and hits is None:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid
                            for gid, _ in snuba_groups]).values_list("id",
                                                                     flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options).get_result(limit, cursor, known_hits=hits)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids or len(
                    paginator_results.results) >= limit or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev
                                   or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing("snuba.search.num_chunks", num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]

        return paginator_results
Ejemplo n.º 35
0
    def get_tag_value_paginator_for_projects(
        self,
        projects,
        environments,
        key,
        start=None,
        end=None,
        query=None,
        order_by="-last_seen",
        include_transactions=False,
    ):
        from sentry.api.paginator import SequencePaginator

        if not order_by == "-last_seen":
            raise ValueError("Unsupported order_by: %s" % order_by)

        dataset = Dataset.Events
        snuba_key = snuba.get_snuba_column_name(key)
        if include_transactions and snuba_key.startswith("tags["):
            snuba_key = snuba.get_snuba_column_name(key, dataset=Dataset.Discover)
            if not snuba_key.startswith("tags["):
                dataset = Dataset.Discover

        # We cannot search the values of these columns like we do other columns because they are
        # a different type, and as such, LIKE and != do not work on them. Furthermore, because the
        # use case for these values in autosuggestion is minimal, we choose to disable them here.
        #
        # event_id:     This is a FixedString which disallows us to use LIKE on it when searching,
        #               but does work with !=. However, for consistency's sake we disallow it
        #               entirely; furthermore, suggesting an event_id is not a very useful feature
        #               as event ids are not human readable.
        # timestamp:    This is a DateTime which disallows us to use both LIKE and != on it when
        #               searching. Suggesting a timestamp can potentially be useful, but as it does
        #               not work at all, we opt to disable it here. A potential solution could be to
        #               generate a time range to bound where they are searching. e.g. if a user
        #               enters 2020-07 we can generate the following conditions:
        #               >= 2020-07-01T00:00:00 AND <= 2020-07-31T23:59:59
        # time:         This is a column computed from timestamp so it suffers the same issues
        if snuba_key in {"event_id", "timestamp", "time"}:
            return SequencePaginator([])

        # These columns have fixed values and we don't need to emit queries to find out the
        # potential options.
        if key in {"error.handled", "error.unhandled"}:
            return SequencePaginator(
                [
                    (
                        1,
                        TagValue(
                            key=key, value="true", times_seen=None, first_seen=None, last_seen=None
                        ),
                    ),
                    (
                        2,
                        TagValue(
                            key=key, value="false", times_seen=None, first_seen=None, last_seen=None
                        ),
                    ),
                ]
            )

        conditions = []
        # transaction status needs a special case so that the user interacts with the names and not codes
        transaction_status = snuba_key == "transaction_status"
        if include_transactions and transaction_status:
            # Here we want to use the status codes during filtering,
            # but want to do this with names that include our query
            status_codes = [
                span_key
                for span_key, value in six.iteritems(SPAN_STATUS_CODE_TO_NAME)
                if (query and query in value) or (not query)
            ]
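            # Illustrative example: with query="cancel" only the codes whose
            # names contain "cancel" (e.g. the code mapped to "cancelled")
            # remain in status_codes; with no query, every status code is kept.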
            if status_codes:
                conditions.append([snuba_key, "IN", status_codes])
            else:
                return SequencePaginator([])
        elif key in FUZZY_NUMERIC_KEYS:
            converted_query = int(query) if query is not None and query.isdigit() else None
            if converted_query is not None:
                conditions.append([snuba_key, ">=", converted_query - FUZZY_NUMERIC_DISTANCE])
                conditions.append([snuba_key, "<=", converted_query + FUZZY_NUMERIC_DISTANCE])
        elif include_transactions and key == PROJECT_ALIAS:
            project_filters = {
                "id__in": projects,
            }
            if query:
                project_filters["slug__icontains"] = query
            project_queryset = Project.objects.filter(**project_filters).values("id", "slug")

            if not project_queryset.exists():
                return SequencePaginator([])

            project_slugs = {project["id"]: project["slug"] for project in project_queryset}
            projects = [project["id"] for project in project_queryset]
            snuba_key = "project_id"
            dataset = Dataset.Discover
        else:
            snuba_name = snuba_key

            is_user_alias = include_transactions and key == USER_DISPLAY_ALIAS
            if is_user_alias:
                # user.alias is a pseudo column in discover. It is computed by coalescing
                # together multiple user attributes. Here we get the coalesce function used,
                # and resolve it to the corresponding snuba query
                dataset = Dataset.Discover
                resolver = snuba.resolve_column(dataset)
                snuba_name = FIELD_ALIASES[USER_DISPLAY_ALIAS].get_field()
                snuba.resolve_complex_column(snuba_name, resolver)
            elif snuba_name in BLACKLISTED_COLUMNS:
                snuba_name = "tags[%s]" % (key,)

            if query:
                conditions.append([snuba_name, "LIKE", u"%{}%".format(query)])
            else:
                conditions.append([snuba_name, "!=", ""])

        filters = {"project_id": projects}
        if environments:
            filters["environment"] = environments

        results = snuba.query(
            dataset=dataset,
            start=start,
            end=end,
            groupby=[snuba_key],
            filter_keys=filters,
            aggregations=[
                ["count()", "", "times_seen"],
                ["min", "timestamp", "first_seen"],
                ["max", "timestamp", "last_seen"],
            ],
            conditions=conditions,
            orderby=order_by,
            # TODO: This means they can't actually paginate all TagValues.
            limit=1000,
            arrayjoin=snuba.get_arrayjoin(snuba_key),
            referrer="tagstore.get_tag_value_paginator_for_projects",
        )

        if include_transactions:
            # With transaction_status we need to map the ids back to their names
            if transaction_status:
                results = OrderedDict(
                    [
                        (SPAN_STATUS_CODE_TO_NAME[result_key], data)
                        for result_key, data in six.iteritems(results)
                    ]
                )
            # With project names we map the ids back to the project slugs
            elif key == PROJECT_ALIAS:
                results = OrderedDict(
                    [
                        (project_slugs[value], data)
                        for value, data in six.iteritems(results)
                        if value in project_slugs
                    ]
                )

        tag_values = [
            TagValue(key=key, value=six.text_type(value), **fix_tag_value_data(data))
            for value, data in six.iteritems(results)
        ]

        desc = order_by.startswith("-")
        score_field = order_by.lstrip("-")
        return SequencePaginator(
            [(int(to_timestamp(getattr(tv, score_field)) * 1000), tv) for tv in tag_values],
            reverse=desc,
        )
Ejemplo n.º 36
0
    def test_hits(self):
        n = 10
        paginator = SequencePaginator([(i, i) for i in range(n)])
        assert paginator.get_result(5, count_hits=True).hits == n
Ejemplo n.º 37
0
    def _query(self, projects, retention_window_start, group_queryset, tags, environments,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[projects[0].organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} IN ({})'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                            ', '.join(['%s' for e in environments])
                        ),
                    ],
                    params=[environment.id for environment in environments],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        now = timezone.now()
        end = parameters.get('date_to')
        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if cursor is None \
                    and sort_by == 'date' \
                    and not tags \
                    and not environments \
                    and not any(param in parameters for param in [
                        'age_from', 'age_to', 'last_seen_from',
                        'last_seen_to', 'times_seen', 'times_seen_lower',
                        'times_seen_upper'
                    ]):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                # When it's a simple Django-only search, we count_hits like normal
                return paginator.get_result(limit, cursor, count_hits=count_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        start = max(
            filter(None, [
                retention_date,
                parameters.get('date_from'),
            ])
        )

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:max_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))
        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            candidate_ids = None

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = None

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        if count_hits and candidate_ids is None:
            # If we have no candidates, get a random sample of groups matching
            # the snuba side of the query, and see how many of those pass the
            # post-filter in postgres. This should give us an estimate of the
            # total number of snuba matches that will be overall matches, which
            # we can use to get an estimate for X-Hits. Note no cursor, so we
            # are always estimating the total hits.

            # The number of samples required to achieve a certain error bound
            # with a certain confidence interval can be calculated from a
            # rearrangement of the normal approximation (Wald) confidence
            # interval formula:
            #
            # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
            #
            # Effectively if we want the estimate to be within +/- 10% of the
            # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
            # / 0.1^2 samples. With a starting assumption of p=0.5 (this
            # requires the most samples) we would need 96 samples to achieve
            # +/-10% @ 95% confidence.
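            #
            # Worked example (illustrative): with z = 1.96, p = 0.5 and a
            # +/- 10% bound, (1.96 ** 2) * 0.5 * (1 - 0.5) / (0.1 ** 2) ~= 96.04,
            # i.e. roughly 96 samples, matching the figure above.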

            sample_size = options.get('snuba.search.hits-sample-size')
            snuba_groups, snuba_total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                limit=sample_size,
                offset=0,
                get_sample=True,
                **parameters
            )
            snuba_count = len(snuba_groups)
            if snuba_count == 0:
                return EMPTY_RESULT
            else:
                filtered_count = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).count()

                hit_ratio = filtered_count / float(snuba_count)
                hits = int(hit_ratio * snuba_total)
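                # Illustrative: if 48 of 96 sampled groups survive the Postgres
                # filter (hit_ratio = 0.5) and Snuba reports 10,000 total matches,
                # the hit estimate is 5,000.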

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids, always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)
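            # Illustrative growth with limit=25, a growth rate of 1.5 and no
            # candidates: chunks of 37, 55, 82, ... rows, capped at max_chunk_size.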

            # {group_id: group_score, ...}
            snuba_groups, total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the candidate_ids, we know we got all of them (i.e. there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

                if count_hits:
                    if not more_results:
                        # We know we have got all possible groups from snuba and filtered
                        # them all down, so we have all hits.
                        # TODO this probably doesn't work because we could be on page N
                        # and not be including hits from previous pages.
                        hits = len(result_groups)
                    else:
                        # We also could have underestimated hits from our sample and have
                        # already seen more hits than the estimate, so make sure hits is
                        # at least as big as what we have seen.
                        hits = max(hits, len(result_groups))

            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
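            # The (score, id) pairs define the paginator's ordering: reverse=True
            # puts the highest scores first, and known_hits passes our estimated
            # hit count through to the result rather than recounting it.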
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, known_hits=hits)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
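The post-filtering branch above amounts to intersecting Snuba's scored group ids with the ids that survive the Django queryset, keeping Snuba's sort scores and skipping ids already emitted by an earlier chunk. A small self-contained sketch of that pattern follows; the function name and sample values are hypothetical.

def post_filter(snuba_groups, allowed_ids, seen_ids):
    """Keep (id, score) pairs whose id passed the Postgres-side filter,
    skipping ids already returned by a previous chunk."""
    results = []
    score_by_id = dict(snuba_groups)
    for group_id in allowed_ids:
        if group_id in seen_ids:
            continue  # already emitted by an earlier chunk
        seen_ids.add(group_id)
        results.append((group_id, score_by_id[group_id]))
    return results

# Hypothetical chunk: Snuba returned three groups with sort scores, but only
# two of them also match the Django-side filters.
chunk = [(101, 9.0), (102, 7.5), (103, 3.2)]
print(post_filter(chunk, allowed_ids=[101, 103], seen_ids=set()))
# [(101, 9.0), (103, 3.2)]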