Example #1
    def get_or_create_bulk(cls, project_id, tags):
        # Attempt to create a bunch of models in one big batch with as few
        # queries and cache calls as possible.
        # In the best case, this is all done in a single cache get.
        # If we miss the cache here, we have to fall back to the old behavior.
        key_to_model = {tag: None for tag in tags}
        tags_by_key_id = {tag[0].id: tag for tag in tags}
        remaining_keys = set(tags)

        # First attempt to hit from cache, which in theory is the hot case
        cache_key_to_key = {cls.get_cache_key(project_id, tk.id, v): (tk, v) for tk, v in tags}
        cache_key_to_models = cache.get_many(cache_key_to_key.keys())
        for model in cache_key_to_models.values():
            key_to_model[tags_by_key_id[model._key_id]] = model
            remaining_keys.remove(tags_by_key_id[model._key_id])

        if not remaining_keys:
            # 100% cache hit on all items, good work team
            return key_to_model

        # Fall back to just doing it manually.
        # Further optimizations start to yield diminishing returns here.
        # For some reason, a bulk SELECT with all of the key/value pairs in one
        # big OR ends up using the wrong index, ultimately generating a
        # significantly less efficient query. The only alternative is to split
        # this up a bit: do all of the SELECTs, then do a bulk INSERT for the remaining keys.
        for key in remaining_keys:
            key_to_model[key] = cls.get_or_create(project_id, key[0].id, key[1])[0]

        return key_to_model
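
For reference, here is a minimal stand-alone sketch of the read-through pattern this example relies on: one `cache.get_many` up front, then a per-key fallback for misses. The key format and the `load_one` callable are hypothetical stand-ins, not part of the code above.

from django.core.cache import cache

def bulk_read_through(project_id, logical_keys, load_one, ttl=300):
    # One round trip for all cache keys, then fall back per key on misses
    # (mirroring the per-key get_or_create fallback above).
    cache_keys = {"tagvalue:%d:%s" % (project_id, k): k for k in logical_keys}
    cached = cache.get_many(cache_keys.keys())

    results = {}
    to_store = {}
    for cache_key, logical_key in cache_keys.items():
        value = cached.get(cache_key)
        if value is None:
            value = load_one(project_id, logical_key)
            to_store[cache_key] = value
        results[logical_key] = value

    if to_store:
        cache.set_many(to_store, ttl)
    return results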
Example #2
    def get_or_create_bulk(cls, project_id, tags):
        # Attempt to create a bunch of models in one big batch with as few
        # queries and cache calls as possible.
        # In the best case, this is all done in a single cache get.
        # If we miss the cache here, we have to fall back to the old behavior.
        key_to_model = {tag: None for tag in tags}
        tags_by_key_id = {tag[0].id: tag for tag in tags}
        remaining_keys = set(tags)

        # First attempt to hit from cache, which in theory is the hot case
        cache_key_to_key = {cls.get_cache_key(project_id, tk.id, v): (tk, v) for tk, v in tags}
        cache_key_to_models = cache.get_many(cache_key_to_key.keys())
        for model in cache_key_to_models.values():
            key_to_model[tags_by_key_id[model._key_id]] = model
            remaining_keys.remove(tags_by_key_id[model._key_id])

        if not remaining_keys:
            # 100% cache hit on all items, good work team
            return key_to_model

        # Fall back to just doing it manually.
        # Further optimizations start to yield diminishing returns here.
        # For some reason, a bulk SELECT with all of the key/value pairs in one
        # big OR ends up using the wrong index, ultimately generating a
        # significantly less efficient query. The only alternative is to split
        # this up a bit: do all of the SELECTs, then do a bulk INSERT for the remaining keys.
        for key in remaining_keys:
            key_to_model[key] = cls.get_or_create(project_id, key[0].id, key[1])[0]

        return key_to_model
Example #3
def debounce_update_release_health_data(organization, project_ids):
    """This causes a flush of snuba health data to the postgres tables once
    per minute for the given projects.
    """
    # Figure out which projects need to get updates from snuba.
    should_update = {}
    cache_keys = ["debounce-health:%d" % id for id in project_ids]
    cache_data = cache.get_many(cache_keys)
    for project_id, cache_key in izip(project_ids, cache_keys):
        if cache_data.get(cache_key) is None:
            should_update[project_id] = cache_key

    if not should_update:
        return

    projects = {p.id: p for p in Project.objects.get_many_from_cache(should_update.keys())}

    # This gives us updates for all release-projects which have seen new
    # health data over the last few days. It will miss releases whose most
    # recent data is older than what `get_changed_project_release_model_adoptions`
    # considers recent.
    project_releases = release_health.get_changed_project_release_model_adoptions(
        should_update.keys()
    )

    # Check which we already have rows for.
    existing = set(
        ReleaseProject.objects.filter(
            project_id__in=[x[0] for x in project_releases],
            release__version__in=[x[1] for x in project_releases],
        ).values_list("project_id", "release__version")
    )
    to_upsert = []
    for key in project_releases:
        if key not in existing:
            to_upsert.append(key)

    if to_upsert:
        dates = release_health.get_oldest_health_data_for_releases(to_upsert)

        for project_id, version in to_upsert:
            project = projects.get(project_id)
            if project is None:
                # should not happen
                continue

            # We might have never observed the release.  This for instance can
            # happen if the release only had health data so far.  For these cases
            # we want to create the release the first time we observed it on the
            # health side.
            release = Release.get_or_create(
                project=project, version=version, date_added=dates.get((project_id, version))
            )

            # Make sure that the release knows about this project.  As before,
            # the project might not have been associated with this release yet.
            release.add_project(project)

    # Debounce updates for a minute
    cache.set_many(dict(izip(should_update.values(), [True] * len(should_update))), 60)
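
A hedged sketch of the debounce idiom this task relies on: marker keys checked with one `cache.get_many`, work done only for ids whose marker is missing, then the markers set with a short TTL. The key prefix and `do_work` callable are illustrative, not part of the code above.

from django.core.cache import cache

def debounced(ids, do_work, ttl=60, prefix="debounce-health"):
    # Skip any id whose marker key is still present; everything else gets
    # processed once and then suppressed for `ttl` seconds.
    keys = {"%s:%d" % (prefix, id): id for id in ids}
    seen = cache.get_many(keys.keys())
    pending = {key: id for key, id in keys.items() if key not in seen}

    if not pending:
        return

    do_work(list(pending.values()))
    cache.set_many({key: True for key in pending}, ttl)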
Example #4
    def _get_group_snuba_stats(self, item_list, seen_stats):
        start = self._get_start_from_seen_stats(seen_stats)
        unhandled = {}

        cache_keys = []
        for item in item_list:
            cache_keys.append("group-mechanism-handled:%d" % item.id)

        cache_data = cache.get_many(cache_keys)
        for item, cache_key in zip(item_list, cache_keys):
            unhandled[item.id] = cache_data.get(cache_key)

        filter_keys = {}
        for item in item_list:
            if unhandled.get(item.id) is not None:
                continue
            filter_keys.setdefault("project_id", []).append(item.project_id)
            filter_keys.setdefault("group_id", []).append(item.id)

        if filter_keys:
            rv = raw_query(
                dataset=Dataset.Events,
                selected_columns=[
                    "group_id",
                    [
                        "argMax",
                        [["has", ["exception_stacks.mechanism_handled", 0]],
                         "timestamp"],
                        "unhandled",
                    ],
                ],
                groupby=["group_id"],
                filter_keys=filter_keys,
                start=start,
                orderby="group_id",
                referrer="group.unhandled-flag",
            )
            for x in rv["data"]:
                unhandled[x["group_id"]] = x["unhandled"]

                # cache the handled flag for 60 seconds.  This is broadly in line with
                # the time we give for buffer flushes so the user experience is somewhat
                # consistent here.
                cache.set("group-mechanism-handled:%d" % x["group_id"],
                          x["unhandled"], 60)

        return {
            group_id: {"unhandled": flag}
            for group_id, flag in unhandled.items()
        }
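
The same caching shape in isolation, as a hedged sketch: one `cache.get_many` for the per-group flags, a single batch computation for whatever missed, and a short per-key write-back. `compute_many` is a hypothetical stand-in for the Snuba query above.

from django.core.cache import cache

def cached_flags(group_ids, compute_many, ttl=60, prefix="group-mechanism-handled"):
    # compute_many(ids) -> {id: flag} is assumed to answer all misses in one query.
    keys = {gid: "%s:%d" % (prefix, gid) for gid in group_ids}
    cached = cache.get_many(keys.values())

    flags = {gid: cached.get(key) for gid, key in keys.items()}
    missing = [gid for gid, flag in flags.items() if flag is None]

    if missing:
        for gid, flag in compute_many(missing).items():
            flags[gid] = flag
            cache.set(keys[gid], flag, ttl)

    return flags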
Example #5
def debounce_update_release_health_data(organization, project_ids):
    """This causes a flush of snuba health data to the postgres tables once
    per minute for the given projects.
    """
    # Figure out which projects need to get updates from snuba.
    should_update = {}
    cache_keys = ["debounce-health:%d" % id for id in project_ids]
    cache_data = cache.get_many(cache_keys)
    for project_id, cache_key in izip(project_ids, cache_keys):
        if cache_data.get(cache_key) is None:
            should_update[project_id] = cache_key

    if not should_update:
        return

    projects = {
        p.id: p
        for p in Project.objects.get_many_from_cache(should_update.keys())
    }

    # This gives us updates for all release-projects which have seen new
    # health data over the last 24 hours. It will miss releases whose most
    # recent health data is older than 24 hours.  We need to aggregate the data
    # for the totals per release manually here now.  This does not take
    # environments into account.
    for project_id, version in get_changed_project_release_model_adoptions(
            should_update.keys()):
        project = projects.get(project_id)
        if project is None:
            # should not happen
            continue

        # We might have never observed the release.  This for instance can
        # happen if the release only had health data so far.  For these cases
        # we want to create the release the first time we observed it on the
        # health side.
        release = Release.get_or_create(project=project, version=version)

        # Make sure that the release knows about this project.  As before,
        # the project might not have been associated with this release yet.
        release.add_project(project)

    # Debounce updates for a minute
    cache.set_many(
        dict(izip(should_update.values(), [True] * len(should_update))), 60)
Example #6
    def get_many_from_cache(self, values, key="pk"):
        """
        Wrapper around `QuerySet.filter(pk__in=values)` which supports caching of
        the intermediate value.  The caller is responsible for making sure the
        cache key is cleared on save.

        NOTE: We can only query by primary key or some other unique identifier.
        It is not possible to e.g. run `Project.objects.get_many_from_cache([1,
        2, 3], key="organization_id")` and get back all projects belonging to
        those orgs. The length of the return value is bounded by the length of
        `values`.

        For most models, if one attempts to use a non-PK value this will just
        degrade to a DB query, like with `get_from_cache`.
        """

        pk_name = self.model._meta.pk.name

        if key == "pk":
            key = pk_name

        # Kill __exact since it's the default behavior
        if key.endswith("__exact"):
            key = key.split("__exact", 1)[0]

        if key not in self.cache_fields and key != pk_name:
            raise ValueError("We cannot cache this query. Just hit the database.")

        final_results = []
        cache_lookup_cache_keys = []
        cache_lookup_values = []

        local_cache = self._get_local_cache()
        for value in values:
            cache_key = self.__get_lookup_cache_key(**{key: value})
            result = local_cache and local_cache.get(cache_key)
            if result is not None:
                final_results.append(result)
            else:
                cache_lookup_cache_keys.append(cache_key)
                cache_lookup_values.append(value)

        if not cache_lookup_cache_keys:
            return final_results

        cache_results = cache.get_many(cache_lookup_cache_keys, version=self.cache_version)

        db_lookup_cache_keys = []
        db_lookup_values = []

        nested_lookup_cache_keys = []
        nested_lookup_values = []

        for cache_key, value in zip(cache_lookup_cache_keys, cache_lookup_values):
            cache_result = cache_results.get(cache_key)
            if cache_result is None:
                db_lookup_cache_keys.append(cache_key)
                db_lookup_values.append(value)
                continue

            # If we didn't look up by pk we need to resolve the referenced pk
            if key != pk_name:
                nested_lookup_cache_keys.append(cache_key)
                nested_lookup_values.append(cache_result)
                continue

            if not isinstance(cache_result, self.model):
                if settings.DEBUG:
                    raise ValueError("Unexpected value type returned from cache")
                logger.error("Cache response returned invalid value %r", cache_result)
                db_lookup_cache_keys.append(cache_key)
                db_lookup_values.append(value)
                continue

            if key == pk_name and int(value) != cache_result.pk:
                if settings.DEBUG:
                    raise ValueError("Unexpected value returned from cache")
                logger.error("Cache response returned invalid value %r", cache_result)
                db_lookup_cache_keys.append(cache_key)
                db_lookup_values.append(value)
                continue

            final_results.append(cache_result)

        if nested_lookup_values:
            nested_results = self.get_many_from_cache(nested_lookup_values, key=pk_name)
            final_results.extend(nested_results)
            if local_cache is not None:
                for nested_result in nested_results:
                    value = getattr(nested_result, key)
                    cache_key = self.__get_lookup_cache_key(**{key: value})
                    local_cache[cache_key] = nested_result

        if not db_lookup_values:
            return final_results

        cache_writes = []

        db_results = {getattr(x, key): x for x in self.filter(**{key + "__in": db_lookup_values})}
        for cache_key, value in zip(db_lookup_cache_keys, db_lookup_values):
            db_result = db_results.get(value)
            if db_result is None:
                continue  # This model ultimately does not exist

            # Ensure we're pushing it into the cache
            cache_writes.append(db_result)
            if local_cache is not None:
                local_cache[cache_key] = db_result

            final_results.append(db_result)

        # XXX: Should use set_many here, but __post_save code is too complex
        for instance in cache_writes:
            self.__post_save(instance=instance)

        return final_results
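
A usage sketch under the assumptions this manager implies (a `Project` model whose manager provides the method, with "slug" listed in `cache_fields`); the ids and slug are purely illustrative.

# Local request cache first, then one cache.get_many, then a single
# pk__in query for whatever is still missing.
projects = Project.objects.get_many_from_cache([1, 2, 3])

# Non-pk lookups resolve through the cached pk mapping shown above,
# provided the field is a cached unique field.
internal = Project.objects.get_many_from_cache(["internal"], key="slug")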
Example #7
    def get_many_from_cache(self, values, key="pk"):
        """
        Wrapper around `QuerySet.filter(pk__in=values)` which supports caching of
        the intermediate value.  The caller is responsible for making sure the
        cache key is cleared on save.
        """

        pk_name = self.model._meta.pk.name

        if key == "pk":
            key = pk_name

        # Kill __exact since it's the default behavior
        if key.endswith("__exact"):
            key = key.split("__exact", 1)[0]

        if key not in self.cache_fields and key != pk_name:
            return self.filter(**{key + "__in": values})

        final_results = []
        cache_lookup_cache_keys = []
        cache_lookup_values = []

        local_cache = self._get_local_cache()
        for value in values:
            cache_key = self.__get_lookup_cache_key(**{key: value})
            result = local_cache and local_cache.get(cache_key)
            if result is not None:
                final_results.append(result)
            else:
                cache_lookup_cache_keys.append(cache_key)
                cache_lookup_values.append(value)

        if not cache_lookup_cache_keys:
            return final_results

        cache_results = cache.get_many(cache_lookup_cache_keys, version=self.cache_version)

        db_lookup_cache_keys = []
        db_lookup_values = []

        nested_lookup_cache_keys = []
        nested_lookup_values = []

        for cache_key, value in zip(cache_lookup_cache_keys, cache_lookup_values):
            cache_result = cache_results.get(cache_key)
            if cache_result is None:
                db_lookup_cache_keys.append(cache_key)
                db_lookup_values.append(value)
                continue

            # If we didn't look up by pk we need to resolve the referenced pk
            if key != pk_name:
                nested_lookup_cache_keys.append(cache_key)
                nested_lookup_values.append(cache_result)
                continue

            if not isinstance(cache_result, self.model):
                if settings.DEBUG:
                    raise ValueError("Unexpected value type returned from cache")
                logger.error("Cache response returned invalid value %r", cache_result)
                db_lookup_cache_keys.append(cache_key)
                db_lookup_values.append(value)
                continue

            if key == pk_name and int(value) != cache_result.pk:
                if settings.DEBUG:
                    raise ValueError("Unexpected value returned from cache")
                logger.error("Cache response returned invalid value %r", cache_result)
                db_lookup_cache_keys.append(cache_key)
                db_lookup_values.append(value)
                continue

            final_results.append(cache_result)

        if nested_lookup_values:
            nested_results = self.get_many_from_cache(nested_lookup_values, key=pk_name)
            final_results.extend(nested_results)
            if local_cache is not None:
                for nested_result in nested_results:
                    value = getattr(nested_result, key)
                    cache_key = self.__get_lookup_cache_key(**{key: value})
                    local_cache[cache_key] = nested_result

        if not db_lookup_values:
            return final_results

        cache_writes = []

        db_results = {getattr(x, key): x for x in self.filter(**{key + "__in": db_lookup_values})}
        for cache_key, value in zip(db_lookup_cache_keys, db_lookup_values):
            db_result = db_results.get(value)
            if db_result is None:
                continue  # This model ultimately does not exist

            # Ensure we're pushing it into the cache
            cache_writes.append(db_result)
            if local_cache is not None:
                local_cache[cache_key] = db_result

            final_results.append(db_result)

        # XXX: Should use set_many here, but __post_save code is too complex
        for instance in cache_writes:
            self.__post_save(instance=instance)

        return final_results
Example #8
    def get_or_create_bulk(cls, project_id, environment_id, keys):
        # Attempt to create a bunch of models in one big batch with as few
        # queries and cache calls as possible.
        # In the best case, this is all done in a single cache get.
        # In the ideal case, we'll do 3 queries total instead of N.
        # In the absolute worst case, we still just do O(n) queries, but this should be rare.
        key_to_model = {key: None for key in keys}
        remaining_keys = set(keys)

        # First attempt to hit from cache, which in theory is the hot case
        cache_key_to_key = {
            cls.get_cache_key(project_id, environment_id, key): key
            for key in keys
        }
        cache_key_to_models = cache.get_many(cache_key_to_key.keys())
        for model in cache_key_to_models.values():
            key_to_model[model.key] = model
            remaining_keys.remove(model.key)

        if not remaining_keys:
            # 100% cache hit on all items, good work team
            return key_to_model

        # If we have some misses, we want to first check if
        # all of the misses actually exist in the database
        # already in one bulk query.
        to_cache = {}
        for model in cls.objects.filter(
                project_id=project_id,
                environment_id=environment_id,
                key__in=remaining_keys,
        ):
            key_to_model[model.key] = to_cache[cls.get_cache_key(
                project_id, environment_id, model.key)] = model
            remaining_keys.remove(model.key)

        # If we have found them all, cache all these misses
        # and return all the hits.
        if not remaining_keys:
            cache.set_many(to_cache, 3600)
            return key_to_model

        # At this point, we need to create all of our keys, since they
        # don't exist in cache or the database.

        # First attempt to create them all in one bulk query
        try:
            with transaction.atomic():
                cls.objects.bulk_create([
                    cls(
                        project_id=project_id,
                        environment_id=environment_id,
                        key=key,
                    ) for key in remaining_keys
                ])
        except IntegrityError:
            pass
        else:
            # If we succeed, the downside is that we need one
            # more query to get back the actual rows with their ids.
            for model in cls.objects.filter(project_id=project_id,
                                            environment_id=environment_id,
                                            key__in=remaining_keys):
                key_to_model[model.key] = to_cache[cls.get_cache_key(
                    project_id, environment_id, model.key)] = model
                remaining_keys.remove(model.key)

            cache.set_many(to_cache, 3600)

            # Not clear if this could actually happen, but if it does,
            # guard ourselves against returning bad data.
            if not remaining_keys:
                return key_to_model

        # Fall back to just doing it manually
        # This case will only ever happen in a race condition.
        for key in remaining_keys:
            key_to_model[key] = cls.get_or_create(project_id, environment_id,
                                                  key)[0]

        return key_to_model
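
A minimal sketch of just the optimistic insert step, assuming a generic Django model class, pre-built `rows`, and a per-row `fallback` upsert; the real method additionally re-selects the inserted rows afterwards to recover their ids.

from django.db import IntegrityError, transaction

def bulk_create_or_fallback(model_cls, rows, fallback):
    # Try to insert everything in one statement; a duplicate anywhere rolls
    # the whole batch back, and we retry row by row via the fallback.
    try:
        with transaction.atomic():
            model_cls.objects.bulk_create(rows)
    except IntegrityError:
        for row in rows:
            fallback(row)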
Example #9
    def get_attrs(self, item_list, user):
        if not self._collapse("base"):
            attrs = super().get_attrs(item_list, user)
        else:
            seen_stats = self._get_seen_stats(item_list, user)
            if seen_stats:
                attrs = {item: seen_stats.get(item, {}) for item in item_list}
            else:
                attrs = {item: {} for item in item_list}

        if self.stats_period and not self._collapse("stats"):
            partial_get_stats = functools.partial(
                self.get_stats, item_list=item_list, user=user, environment_ids=self.environment_ids
            )
            stats = partial_get_stats()
            filtered_stats = (
                partial_get_stats(conditions=self.conditions)
                if self.conditions and not self._collapse("filtered")
                else None
            )
            for item in item_list:
                if filtered_stats:
                    attrs[item].update({"filtered_stats": filtered_stats[item.id]})
                attrs[item].update({"stats": stats[item.id]})

            if self._expand("sessions"):
                uniq_project_ids = list({item.project_id for item in item_list})
                cache_keys = {pid: self._build_session_cache_key(pid) for pid in uniq_project_ids}
                cache_data = cache.get_many(cache_keys.values())
                missed_items = []
                for item in item_list:
                    num_sessions = cache_data.get(cache_keys[item.project_id])
                    if num_sessions is None:
                        found = "miss"
                        missed_items.append(item)
                    else:
                        found = "hit"
                        attrs[item].update(
                            {
                                "sessionCount": num_sessions,
                            }
                        )
                    metrics.incr(f"group.get_session_counts.{found}")

                if missed_items:
                    filters = {"project_id": list({item.project_id for item in missed_items})}
                    if self.environment_ids:
                        filters["environment"] = self.environment_ids

                    result_totals = raw_query(
                        selected_columns=["sessions"],
                        dataset=Dataset.Sessions,
                        start=self.start,
                        end=self.end,
                        filter_keys=filters,
                        groupby=["project_id"],
                        referrer="serializers.GroupSerializerSnuba.session_totals",
                    )

                    results = {}
                    for data in result_totals["data"]:
                        cache_key = self._build_session_cache_key(data["project_id"])
                        results[data["project_id"]] = data["sessions"]
                        cache.set(cache_key, data["sessions"], 3600)

                    for item in missed_items:
                        if item.project_id in results.keys():
                            attrs[item].update(
                                {
                                    "sessionCount": results[item.project_id],
                                }
                            )
                        else:
                            attrs[item].update({"sessionCount": None})

        if self._expand("inbox"):
            inbox_stats = get_inbox_details(item_list)
            for item in item_list:
                attrs[item].update({"inbox": inbox_stats.get(item.id)})

        if self._expand("owners"):
            owner_details = get_owner_details(item_list)
            for item in item_list:
                attrs[item].update({"owners": owner_details.get(item.id)})

        return attrs
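
The wrinkle worth noting in the "sessions" branch is that the cache is keyed per project while results are attached per group, so items are first deduplicated to unique project ids and the counts are fanned back out. A hedged sketch, with `query_counts` standing in for the Snuba call above:

from django.core.cache import cache

def session_counts_for_items(items, query_counts, ttl=3600, prefix="session-count"):
    # Cache one count per project, then fan it out to every item in that project.
    keys = {pid: "%s:%d" % (prefix, pid) for pid in {item.project_id for item in items}}
    cached = cache.get_many(keys.values())

    counts = {pid: cached.get(key) for pid, key in keys.items()}
    missed = [pid for pid, count in counts.items() if count is None]
    if missed:
        # query_counts(ids) -> iterable of (project_id, count) pairs
        for pid, count in query_counts(missed):
            counts[pid] = count
            cache.set(keys[pid], count, ttl)

    return {item: counts.get(item.project_id) for item in items}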
Example #10
    def _query(self, projects, retention_window_start, group_queryset, tags, environments,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[projects[0].organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} IN ({})'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                            ', '.join(['%s' for e in environments])
                        ),
                    ],
                    params=[environment.id for environment in environments],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        now = timezone.now()
        end = parameters.get('date_to')
        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if cursor is None \
                    and sort_by == 'date' \
                    and not tags \
                    and not environments \
                    and not any(param in parameters for param in [
                        'age_from', 'age_to', 'last_seen_from',
                        'last_seen_to', 'times_seen', 'times_seen_lower',
                        'times_seen_upper'
                    ]):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                return paginator.get_result(limit, cursor, count_hits=False)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        start = max(
            filter(None, [
                retention_date,
                parameters.get('date_from'),
            ])
        )

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # num_candidates is the number of Group IDs to send down to Snuba. If
        # more Group ID candidates than that are found, a "bare" Snuba search is
        # performed and the result groups are then post-filtered via queries to the Sentry DB.
        optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
        if optimizer_enabled:
            keys = [self._get_project_count_cache_key(p.id) for p in projects]

            counts_by_projects = {
                self._get_project_id_from_key(key): count for key, count in cache.get_many(keys).items()
            }

            missed_projects = {p.id for p in projects} - set(counts_by_projects.keys())

            if missed_projects:
                missing_counts = snuba.query(
                    start=max(
                        filter(None, [
                            retention_window_start,
                            now - timedelta(days=90)
                        ])
                    ),
                    end=now,
                    groupby=['project_id'],
                    filter_keys={
                        'project_id': list(missed_projects),
                    },
                    aggregations=[['uniq', 'group_id', 'group_count']],
                    referrer='search',
                )

                cache.set_many({
                    self._get_project_count_cache_key(project_id): count
                    for project_id, count in missing_counts.items()
                }, options.get('snuba.search.project-group-count-cache-time'))

                counts_by_projects.update(missing_counts)

            min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
            max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
            candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

            num_candidates = max(
                min_candidates,
                min(
                    max_candidates,
                    sum(counts_by_projects.values()) * candidates_percentage
                )
            )
        else:
            num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

        # pre-filter query
        candidate_ids = None
        if num_candidates and limit <= num_candidates:
            candidate_ids = list(
                group_queryset.values_list('id', flat=True)[:num_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_ids))

            if not candidate_ids:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates', skip_internal=False)
                return EMPTY_RESULT
            elif len(candidate_ids) > num_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `num_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
                candidate_ids = None

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.
        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
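
A hedged sketch of the chunked query loop at the heart of this search: grow the chunk size each pass and stop once enough results are collected, the source is exhausted, or a wall-clock budget is spent. `fetch_chunk` and the default option values are illustrative, not the real Snuba call or settings.

import time

def chunked_search(fetch_chunk, limit, chunk_growth=1.5, max_chunk_size=10000,
                   max_seconds=30.0):
    # fetch_chunk(limit, offset) -> (rows, more_results)
    results = []
    chunk_limit = limit
    offset = 0
    started = time.time()

    while (time.time() - started) < max_seconds:
        # Grow the chunk size on each iteration, up to a maximum.
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        rows, more_results = fetch_chunk(chunk_limit, offset)
        offset += len(rows)
        results.extend(rows)

        if len(results) >= limit or not more_results or not rows:
            break

    return results[:limit]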
Example #11
    def get_or_create_bulk(cls, project_id, environment_id, keys):
        # Attempt to create a bunch of models in one big batch with as few
        # queries and cache calls as possible.
        # In the best case, this is all done in a single cache get.
        # In the ideal case, we'll do 3 queries total instead of N.
        # In the absolute worst case, we still just do O(n) queries, but this should be rare.
        key_to_model = {key: None for key in keys}
        remaining_keys = set(keys)

        # First attempt to hit from cache, which in theory is the hot case
        cache_key_to_key = {cls.get_cache_key(project_id, environment_id, key): key for key in keys}
        cache_key_to_models = cache.get_many(cache_key_to_key.keys())
        for model in cache_key_to_models.values():
            key_to_model[model.key] = model
            remaining_keys.remove(model.key)

        if not remaining_keys:
            # 100% cache hit on all items, good work team
            return key_to_model

        # If we have some misses, we want to first check if
        # all of the misses actually exist in the database
        # already in one bulk query.
        to_cache = {}
        for model in cls.objects.filter(
            project_id=project_id,
            environment_id=environment_id,
            key__in=remaining_keys,
        ):
            key_to_model[model.key] = to_cache[cls.get_cache_key(
                project_id, environment_id, model.key)] = model
            remaining_keys.remove(model.key)

        # If we have found them all, cache all these misses
        # and return all the hits.
        if not remaining_keys:
            cache.set_many(to_cache, 3600)
            return key_to_model

        # At this point, we need to create all of our keys, since they
        # don't exist in cache or the database.

        # First attempt to create them all in one bulk query
        try:
            with transaction.atomic():
                cls.objects.bulk_create([
                    cls(
                        project_id=project_id,
                        environment_id=environment_id,
                        key=key,
                    )
                    for key in remaining_keys
                ])
        except IntegrityError:
            pass
        else:
            # If we succeed, the downside is that we need one
            # more query to get back the actual rows with their ids.
            for model in cls.objects.filter(
                project_id=project_id,
                environment_id=environment_id,
                key__in=remaining_keys
            ):
                key_to_model[model.key] = to_cache[cls.get_cache_key(
                    project_id, environment_id, model.key)] = model
                remaining_keys.remove(model.key)

            cache.set_many(to_cache, 3600)

            # Not clear if this could actually happen, but if it does,
            # guard ourselves against returning bad data.
            if not remaining_keys:
                return key_to_model

        # Fall back to just doing it manually
        # This case will only ever happen in a race condition.
        for key in remaining_keys:
            key_to_model[key] = cls.get_or_create(project_id, environment_id, key)[0]

        return key_to_model
Example #12
    def get_attrs(self, item_list, user):
        if not self._collapse("base"):
            attrs = super().get_attrs(item_list, user)
        else:
            seen_stats = self._get_seen_stats(item_list, user)
            if seen_stats:
                attrs = {item: seen_stats.get(item, {}) for item in item_list}
            else:
                attrs = {item: {} for item in item_list}

        if self.stats_period and not self._collapse("stats"):
            partial_get_stats = functools.partial(
                self.get_stats,
                item_list=item_list,
                user=user,
                environment_ids=self.environment_ids)
            stats = partial_get_stats()
            filtered_stats = (
                partial_get_stats(conditions=self.conditions)
                if self.conditions and not self._collapse("filtered")
                else None
            )
            for item in item_list:
                if filtered_stats:
                    attrs[item].update(
                        {"filtered_stats": filtered_stats[item.id]})
                attrs[item].update({"stats": stats[item.id]})

            if self._expand("sessions"):
                uniq_project_ids = list({item.project_id for item in item_list})
                cache_keys = {
                    pid: self._build_session_cache_key(pid)
                    for pid in uniq_project_ids
                }
                cache_data = cache.get_many(cache_keys.values())
                missed_items = []
                for item in item_list:
                    num_sessions = cache_data.get(cache_keys[item.project_id])
                    if num_sessions is None:
                        found = "miss"
                        missed_items.append(item)
                    else:
                        found = "hit"
                        attrs[item].update({
                            "sessionCount": num_sessions,
                        })
                    metrics.incr(f"group.get_session_counts.{found}")

                if missed_items:
                    project_ids = list({item.project_id for item in missed_items})
                    project_sessions = release_health.get_num_sessions_per_project(
                        project_ids,
                        self.start,
                        self.end,
                        self.environment_ids,
                    )

                    results = {}
                    for project_id, count in project_sessions:
                        cache_key = self._build_session_cache_key(project_id)
                        results[project_id] = count
                        cache.set(cache_key, count, 3600)

                    for item in missed_items:
                        if item.project_id in results.keys():
                            attrs[item].update({
                                "sessionCount": results[item.project_id],
                            })
                        else:
                            attrs[item].update({"sessionCount": None})

        if self._expand("inbox"):
            inbox_stats = get_inbox_details(item_list)
            for item in item_list:
                attrs[item].update({"inbox": inbox_stats.get(item.id)})

        if self._expand("owners"):
            owner_details = get_owner_details(item_list)
            for item in item_list:
                attrs[item].update({"owners": owner_details.get(item.id)})

        return attrs