Example #1
def debounce_update_release_health_data(organization, project_ids):
    """This causes a flush of snuba health data to the postgres tables once
    per minute for the given projects.
    """
    # Figure out which projects need to get updates from the snuba.
    should_update = {}
    cache_keys = ["debounce-health:%d" % id for id in project_ids]
    cache_data = cache.get_many(cache_keys)
    for project_id, cache_key in izip(project_ids, cache_keys):
        if cache_data.get(cache_key) is None:
            should_update[project_id] = cache_key

    if not should_update:
        return

    projects = {p.id: p for p in Project.objects.get_many_from_cache(should_update.keys())}

    # This gives us updates for all release-projects which have seen new
    # health data over the last days. It will miss releases where the last
    # date is longer than what `get_changed_project_release_model_adoptions`
    # considers recent.
    project_releases = release_health.get_changed_project_release_model_adoptions(
        should_update.keys()
    )

    # Check which we already have rows for.
    existing = set(
        ReleaseProject.objects.filter(
            project_id__in=[x[0] for x in project_releases],
            release__version__in=[x[1] for x in project_releases],
        ).values_list("project_id", "release__version")
    )
    to_upsert = []
    for key in project_releases:
        if key not in existing:
            to_upsert.append(key)

    if to_upsert:
        dates = release_health.get_oldest_health_data_for_releases(to_upsert)

        for project_id, version in to_upsert:
            project = projects.get(project_id)
            if project is None:
                # should not happen
                continue

            # We might have never observed the release.  This for instance can
            # happen if the release only had health data so far.  For these cases
            # we want to create the release the first time we observed it on the
            # health side.
            release = Release.get_or_create(
                project=project, version=version, date_added=dates.get((project_id, version))
            )

            # Make sure that the release knows about this project.  Like we had before
            # the project might not have been associated with this release yet.
            release.add_project(project)

    # Debounce updates for a minute
    cache.set_many(dict(izip(should_update.values(), [True] * len(should_update))), 60)
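
The heart of Example #1 is a cache-based debounce: one key per project is set for 60 seconds, and projects whose key is still present are skipped. A minimal sketch of that step in isolation, assuming Django's low-level cache API; the helper names and the key-prefix default are illustrative and not taken from the example:

# A minimal sketch of the debounce pattern, assuming django.core.cache.
from django.core.cache import cache

DEBOUNCE_TTL = 60  # seconds; the example uses a one-minute window


def filter_debounced(ids, prefix="debounce-health"):
    # Return {id: cache_key} only for ids whose debounce key is not currently set.
    keys = {"%s:%d" % (prefix, id): id for id in ids}
    hits = cache.get_many(list(keys))
    return {id: key for key, id in keys.items() if key not in hits}


def mark_debounced(id_to_key):
    # Set the debounce keys so the next call within the TTL is skipped.
    cache.set_many({key: True for key in id_to_key.values()}, DEBOUNCE_TTL)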
Example #2
def debounce_update_release_health_data(organization, project_ids):
    """This causes a flush of snuba health data to the postgres tables once
    per minute for the given projects.
    """
    # Figure out which projects need to get updates from the snuba.
    should_update = {}
    cache_keys = ["debounce-health:%d" % id for id in project_ids]
    cache_data = cache.get_many(cache_keys)
    for project_id, cache_key in izip(project_ids, cache_keys):
        if cache_data.get(cache_key) is None:
            should_update[project_id] = cache_key

    if not should_update:
        return

    projects = {
        p.id: p
        for p in Project.objects.get_many_from_cache(should_update.keys())
    }

    # This gives us updates for all release-projects which have seen new
    # health data over the last 24 hours. It will miss releases where the last
    # date is more than 24 hours ago.  We need to aggregate the data for the
    # totals per release manually here now.  This does not take environments
    # into account.
    for project_id, version in get_changed_project_release_model_adoptions(
            should_update.keys()):
        project = projects.get(project_id)
        if project is None:
            # should not happen
            continue

        # We might have never observed the release.  This for instance can
        # happen if the release only had health data so far.  For these cases
        # we want to create the release the first time we observed it on the
        # health side.
        release = Release.get_or_create(project=project, version=version)

        # Make sure that the release knows about this project.  Like we had before
        # the project might not have been associated with this release yet.
        release.add_project(project)

    # Debounce updates for a minute
    cache.set_many(
        dict(izip(should_update.values(), [True] * len(should_update))), 60)
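
Both versions above call `izip`, which is not a Python built-in. A hedged way to provide it in a module written for the Python 2/3 transition, assuming `six` is installed; on Python 3 the built-in `zip` is already lazy, so the fallback alias is enough:

# Assumption: the surrounding module provided izip roughly like this.
try:
    from six.moves import zip as izip  # itertools.izip on Python 2, zip on Python 3
except ImportError:
    izip = zip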
Example #3
    def get_or_create_bulk(cls, project_id, environment_id, keys):
        # Attempt to create a bunch of models in one big batch with as few
        # queries and cache calls as possible.
        # In best case, this is all done in 1 cache get.
        # In ideal case, we'll do 3 queries total instead of N.
        # Absolute worst case, we still just do O(n) queries, but this should be rare.
        key_to_model = {key: None for key in keys}
        remaining_keys = set(keys)

        # First attempt to hit from cache, which in theory is the hot case
        cache_key_to_key = {
            cls.get_cache_key(project_id, environment_id, key): key
            for key in keys
        }
        cache_key_to_models = cache.get_many(cache_key_to_key.keys())
        for model in cache_key_to_models.values():
            key_to_model[model.key] = model
            remaining_keys.remove(model.key)

        if not remaining_keys:
            # 100% cache hit on all items, good work team
            return key_to_model

        # If we have some misses, we want to first check if
        # all of the misses actually exist in the database
        # already in one bulk query.
        to_cache = {}
        for model in cls.objects.filter(
                project_id=project_id,
                environment_id=environment_id,
                key__in=remaining_keys,
        ):
            key_to_model[model.key] = to_cache[cls.get_cache_key(
                project_id, environment_id, model.key)] = model
            remaining_keys.remove(model.key)

        # If we have found them all, cache all these misses
        # and return all the hits.
        if not remaining_keys:
            cache.set_many(to_cache, 3600)
            return key_to_model

        # At this point, we need to create all of our keys, since they
        # don't exist in cache or the database.

        # First attempt to create them all in one bulk query
        try:
            with transaction.atomic():
                cls.objects.bulk_create([
                    cls(
                        project_id=project_id,
                        environment_id=environment_id,
                        key=key,
                    ) for key in remaining_keys
                ])
        except IntegrityError:
            pass
        else:
            # If we succeed, the shitty part is we need one
            # more query to get back the actual rows with their ids.
            for model in cls.objects.filter(project_id=project_id,
                                            environment_id=environment_id,
                                            key__in=remaining_keys):
                key_to_model[model.key] = to_cache[cls.get_cache_key(
                    project_id, environment_id, model.key)] = model
                remaining_keys.remove(model.key)

            cache.set_many(to_cache, 3600)

            # Not clear if this could actually happen, but if it does,
            # guard ourselves against returning bad data.
            if not remaining_keys:
                return key_to_model

        # Fall back to just doing it manually
        # This case will only ever happen in a race condition.
        for key in remaining_keys:
            key_to_model[key] = cls.get_or_create(project_id, environment_id,
                                                  key)[0]

        return key_to_model
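
Example #3 resolves every key through up to four tiers: the cache, one bulk SELECT, one bulk INSERT inside a transaction, and finally a per-key get_or_create for rows lost to a race. A generic, self-contained sketch of that tier order; the callables here are plain stand-ins, not Django or Sentry APIs:

# Generic sketch of the tiered lookup; IntegrityError handling and caching of
# the newly fetched rows are omitted for brevity.
def resolve_bulk(keys, cache_get_many, db_get_many, db_bulk_create, get_or_create):
    resolved = {}
    remaining = set(keys)

    for key, value in cache_get_many(remaining).items():  # tier 1: cache
        resolved[key] = value
        remaining.discard(key)
    if not remaining:
        return resolved

    for key, value in db_get_many(remaining).items():     # tier 2: one SELECT
        resolved[key] = value
        remaining.discard(key)
    if not remaining:
        return resolved

    db_bulk_create(remaining)                              # tier 3: one INSERT
    for key, value in db_get_many(remaining).items():     # re-read to get ids
        resolved[key] = value
        remaining.discard(key)

    for key in remaining:                                  # tier 4: race losers
        resolved[key] = get_or_create(key)
    return resolved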
Example #4
    def _query(self, projects, retention_window_start, group_queryset, tags, environments,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[projects[0].organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} IN ({})'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                            ', '.join(['%s' for e in environments])
                        ),
                    ],
                    params=[environment.id for environment in environments],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        now = timezone.now()
        end = parameters.get('date_to')
        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if cursor is None \
                    and sort_by == 'date' \
                    and not tags \
                    and not environments \
                    and not any(param in parameters for param in [
                        'age_from', 'age_to', 'last_seen_from',
                        'last_seen_to', 'times_seen', 'times_seen_lower',
                        'times_seen_upper'
                    ]):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                return paginator.get_result(limit, cursor, count_hits=False)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        start = max(
            filter(None, [
                retention_date,
                parameters.get('date_from'),
            ])
        )

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # num_candidates is the number of Group IDs to send down to Snuba. If
        # more Group ID candidates than that are found, a "bare" Snuba search is
        # performed instead, and the resulting groups are then post-filtered via
        # queries to the Sentry DB.
        optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
        if optimizer_enabled:
            missed_projects = []
            keys = [self._get_project_count_cache_key(p.id) for p in projects]

            counts_by_projects = {
                self._get_project_id_from_key(key): count for key, count in cache.get_many(keys).items()
            }

            missed_projects = {p.id for p in projects} - set(counts_by_projects.keys())

            if missed_projects:
                missing_counts = snuba.query(
                    start=max(
                        filter(None, [
                            retention_window_start,
                            now - timedelta(days=90)
                        ])
                    ),
                    end=now,
                    groupby=['project_id'],
                    filter_keys={
                        'project_id': list(missed_projects),
                    },
                    aggregations=[['uniq', 'group_id', 'group_count']],
                    referrer='search',
                )

                cache.set_many({
                    self._get_project_count_cache_key(project_id): count
                    for project_id, count in missing_counts.items()
                }, options.get('snuba.search.project-group-count-cache-time'))

                counts_by_projects.update(missing_counts)

            min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
            max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
            candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

            num_candidates = max(
                min_candidates,
                min(
                    max_candidates,
                    sum(counts_by_projects.values()) * candidates_percentage
                )
            )
        else:
            num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

        # pre-filter query
        candidate_ids = None
        if num_candidates and limit <= num_candidates:
            candidate_ids = list(
                group_queryset.values_list('id', flat=True)[:num_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_ids))

            if not candidate_ids:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates', skip_internal=False)
                return EMPTY_RESULT
            elif len(candidate_ids) > num_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `num_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
                candidate_ids = None

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.
        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
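
The chunked loop in the second half of Example #4 is a general pattern: query in growing chunks until the page is full, the backend is exhausted, or a time budget runs out. A minimal, generic sketch under those assumptions; `fetch_chunk` stands in for `snuba_search` and is not part of the example:

import time


def chunked_search(fetch_chunk, limit, chunk_growth=1.5, max_chunk=10000, max_seconds=30.0):
    # fetch_chunk(limit, offset) -> (rows, more_results); rows are already sorted.
    results, offset, chunk = [], 0, limit
    deadline = time.time() + max_seconds
    while time.time() < deadline:
        # Grow the chunk on each iteration to cope with huge result sets, up to a cap.
        chunk = min(int(chunk * chunk_growth), max_chunk)
        rows, more = fetch_chunk(limit=chunk, offset=offset)
        offset += len(rows)
        results.extend(rows)
        if len(results) >= limit or not rows or not more:
            break
    return results[:limit]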