def get_or_create_bulk(cls, project_id, tags):
    # Attempt to create a bunch of models in one big batch with as few
    # queries and cache calls as possible.
    # In the best case, this is all done in 1 cache get.
    # If we miss the cache here, we have to fall back to the old behavior.
    key_to_model = {tag: None for tag in tags}
    tags_by_key_id = {tag[0].id: tag for tag in tags}
    remaining_keys = set(tags)

    # First attempt to hit the cache, which in theory is the hot case
    cache_key_to_key = {cls.get_cache_key(project_id, tk.id, v): (tk, v) for tk, v in tags}
    cache_key_to_models = cache.get_many(cache_key_to_key.keys())
    for model in cache_key_to_models.values():
        key_to_model[tags_by_key_id[model._key_id]] = model
        remaining_keys.remove(tags_by_key_id[model._key_id])

    if not remaining_keys:
        # 100% cache hit on all items, good work team
        return key_to_model

    # Fall back to just doing it manually.
    # Further optimizations start to become not so great: for some reason,
    # doing a bulk SELECT with all of the key/value pairs in one big OR ends up
    # using the wrong index and ultimately generating a significantly less
    # efficient query. The only alternative is to split this up a bit and do
    # all of the SELECTs, then do a bulk INSERT for whatever remains.
    for key in remaining_keys:
        key_to_model[key] = cls.get_or_create(project_id, key[0].id, key[1])[0]

    return key_to_model
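# --- Illustrative sketch ---
# A minimal, self-contained sketch of the pattern used by `get_or_create_bulk`:
# resolve a whole batch with one bulk cache read, then fall back to a per-key
# loader only for the misses. `cache` is a plain dict and `load_one` is a
# hypothetical stand-in for `cls.get_or_create`; neither is the real API above.
def bulk_read_through(cache, keys, load_one):
    # One "round trip" for everything that is already cached.
    result = {key: cache[key] for key in keys if key in cache}
    # Per-key fallback only for the misses.
    for key in keys:
        if key not in result:
            value = load_one(key)
            cache[key] = value  # warm the cache for the next caller
            result[key] = value
    return result

# Example: bulk_read_through({}, ["a", "b"], load_one=str.upper) -> {"a": "A", "b": "B"}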
def debounce_update_release_health_data(organization, project_ids):
    """This causes a flush of snuba health data to the postgres tables once
    per minute for the given projects.
    """
    # Figure out which projects need to get updates from snuba.
    should_update = {}
    cache_keys = ["debounce-health:%d" % id for id in project_ids]
    cache_data = cache.get_many(cache_keys)
    for project_id, cache_key in izip(project_ids, cache_keys):
        if cache_data.get(cache_key) is None:
            should_update[project_id] = cache_key

    if not should_update:
        return

    projects = {p.id: p for p in Project.objects.get_many_from_cache(should_update.keys())}

    # This gives us updates for all release-projects which have seen new
    # health data over the last days. It will miss releases whose most recent
    # health data is older than what `get_changed_project_release_model_adoptions`
    # considers recent.
    project_releases = release_health.get_changed_project_release_model_adoptions(
        should_update.keys()
    )

    # Check which ones we already have rows for.
    existing = set(
        ReleaseProject.objects.filter(
            project_id__in=[x[0] for x in project_releases],
            release__version__in=[x[1] for x in project_releases],
        ).values_list("project_id", "release__version")
    )
    to_upsert = []
    for key in project_releases:
        if key not in existing:
            to_upsert.append(key)

    if to_upsert:
        dates = release_health.get_oldest_health_data_for_releases(to_upsert)

        for project_id, version in to_upsert:
            project = projects.get(project_id)
            if project is None:
                # should not happen
                continue

            # We might have never observed the release. This for instance can
            # happen if the release only had health data so far. For these cases
            # we want to create the release the first time we observed it on the
            # health side.
            release = Release.get_or_create(
                project=project, version=version, date_added=dates.get((project_id, version))
            )

            # Make sure that the release knows about this project. Like before,
            # the project might not have been associated with this release yet.
            release.add_project(project)

    # Debounce updates for a minute
    cache.set_many(dict(izip(should_update.values(), [True] * len(should_update))), 60)
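# --- Illustrative sketch ---
# A minimal sketch of the debounce pattern above, assuming a Django-style cache
# object exposing `get_many(keys)` and `set_many(mapping, timeout)`. `do_update`
# is a hypothetical callback standing in for the snuba -> postgres flush.
def debounced_update(cache, project_ids, do_update, ttl=60):
    keys = {pid: "debounce-health:%d" % pid for pid in project_ids}
    hits = cache.get_many(list(keys.values()))
    # Only projects whose debounce key has expired (or was never set) get work.
    stale = [pid for pid, key in keys.items() if hits.get(key) is None]
    if not stale:
        return
    do_update(stale)
    # Mark the processed projects so they are skipped for the next `ttl` seconds.
    cache.set_many({keys[pid]: True for pid in stale}, ttl)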
def _get_group_snuba_stats(self, item_list, seen_stats):
    start = self._get_start_from_seen_stats(seen_stats)
    unhandled = {}

    cache_keys = []
    for item in item_list:
        cache_keys.append("group-mechanism-handled:%d" % item.id)

    cache_data = cache.get_many(cache_keys)
    for item, cache_key in zip(item_list, cache_keys):
        unhandled[item.id] = cache_data.get(cache_key)

    filter_keys = {}
    for item in item_list:
        if unhandled.get(item.id) is not None:
            continue
        filter_keys.setdefault("project_id", []).append(item.project_id)
        filter_keys.setdefault("group_id", []).append(item.id)

    if filter_keys:
        rv = raw_query(
            dataset=Dataset.Events,
            selected_columns=[
                "group_id",
                [
                    "argMax",
                    [["has", ["exception_stacks.mechanism_handled", 0]], "timestamp"],
                    "unhandled",
                ],
            ],
            groupby=["group_id"],
            filter_keys=filter_keys,
            start=start,
            orderby="group_id",
            referrer="group.unhandled-flag",
        )
        for x in rv["data"]:
            unhandled[x["group_id"]] = x["unhandled"]

            # Cache the handled flag for 60 seconds. This is broadly in line with
            # the time we give for buffer flushes, so the user experience is somewhat
            # consistent here.
            cache.set("group-mechanism-handled:%d" % x["group_id"], x["unhandled"], 60)

    return {
        group_id: {"unhandled": unhandled}
        for group_id, unhandled in unhandled.items()
    }
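# --- Illustrative sketch ---
# A rough sketch of what the `argMax(...)` aggregation above computes per group,
# using hypothetical plain-Python data: take the event with the newest timestamp
# and report whether any of its mechanisms was explicitly not handled
# (`has(exception_stacks.mechanism_handled, 0)`).
def latest_unhandled(events):
    # events: iterable of (timestamp, mechanism_handled_values) pairs
    _, handled_values = max(events, key=lambda event: event[0])
    return 0 in handled_values

# Example: latest_unhandled([(1, [1, 1]), (2, [0, 1])]) -> True (latest event unhandled)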
def debounce_update_release_health_data(organization, project_ids):
    """This causes a flush of snuba health data to the postgres tables once
    per minute for the given projects.
    """
    # Figure out which projects need to get updates from snuba.
    should_update = {}
    cache_keys = ["debounce-health:%d" % id for id in project_ids]
    cache_data = cache.get_many(cache_keys)
    for project_id, cache_key in izip(project_ids, cache_keys):
        if cache_data.get(cache_key) is None:
            should_update[project_id] = cache_key

    if not should_update:
        return

    projects = {
        p.id: p for p in Project.objects.get_many_from_cache(should_update.keys())
    }

    # This gives us updates for all release-projects which have seen new
    # health data over the last 24 hours. It will miss releases whose last
    # data is older than 24 hours. We need to aggregate the data for the totals
    # per release manually here now. This does not take environments into account.
    for project_id, version in get_changed_project_release_model_adoptions(
        should_update.keys()
    ):
        project = projects.get(project_id)
        if project is None:
            # should not happen
            continue

        # We might have never observed the release. This for instance can
        # happen if the release only had health data so far. For these cases
        # we want to create the release the first time we observed it on the
        # health side.
        release = Release.get_or_create(project=project, version=version)

        # Make sure that the release knows about this project. Like before,
        # the project might not have been associated with this release yet.
        release.add_project(project)

    # Debounce updates for a minute
    cache.set_many(dict(izip(should_update.values(), [True] * len(should_update))), 60)
def get_many_from_cache(self, values, key="pk"):
    """
    Wrapper around `QuerySet.filter(pk__in=values)` which supports caching of
    the intermediate value. Callee is responsible for making sure the cache
    key is cleared on save.

    NOTE: We can only query by primary key or some other unique identifier.
    It is not possible to e.g. run
    `Project.objects.get_many_from_cache([1, 2, 3], key="organization_id")`
    and get back all projects belonging to those orgs. The length of the
    return value is bounded by the length of `values`.

    For most models, if one attempts to use a non-PK value this will just
    degrade to a DB query, like with `get_from_cache`.
    """
    pk_name = self.model._meta.pk.name

    if key == "pk":
        key = pk_name

    # Kill __exact since it's the default behavior
    if key.endswith("__exact"):
        key = key.split("__exact", 1)[0]

    if key not in self.cache_fields and key != pk_name:
        raise ValueError("We cannot cache this query. Just hit the database.")

    final_results = []
    cache_lookup_cache_keys = []
    cache_lookup_values = []

    local_cache = self._get_local_cache()
    for value in values:
        cache_key = self.__get_lookup_cache_key(**{key: value})
        result = local_cache and local_cache.get(cache_key)
        if result is not None:
            final_results.append(result)
        else:
            cache_lookup_cache_keys.append(cache_key)
            cache_lookup_values.append(value)

    if not cache_lookup_cache_keys:
        return final_results

    cache_results = cache.get_many(cache_lookup_cache_keys, version=self.cache_version)

    db_lookup_cache_keys = []
    db_lookup_values = []

    nested_lookup_cache_keys = []
    nested_lookup_values = []

    for cache_key, value in zip(cache_lookup_cache_keys, cache_lookup_values):
        cache_result = cache_results.get(cache_key)
        if cache_result is None:
            db_lookup_cache_keys.append(cache_key)
            db_lookup_values.append(value)
            continue

        # If we didn't look up by pk we need to hit the referenced key
        if key != pk_name:
            nested_lookup_cache_keys.append(cache_key)
            nested_lookup_values.append(cache_result)
            continue

        if not isinstance(cache_result, self.model):
            if settings.DEBUG:
                raise ValueError("Unexpected value type returned from cache")
            logger.error("Cache response returned invalid value %r", cache_result)
            db_lookup_cache_keys.append(cache_key)
            db_lookup_values.append(value)
            continue

        if key == pk_name and int(value) != cache_result.pk:
            if settings.DEBUG:
                raise ValueError("Unexpected value returned from cache")
            logger.error("Cache response returned invalid value %r", cache_result)
            db_lookup_cache_keys.append(cache_key)
            db_lookup_values.append(value)
            continue

        final_results.append(cache_result)

    if nested_lookup_values:
        nested_results = self.get_many_from_cache(nested_lookup_values, key=pk_name)
        final_results.extend(nested_results)
        if local_cache is not None:
            for nested_result in nested_results:
                value = getattr(nested_result, key)
                cache_key = self.__get_lookup_cache_key(**{key: value})
                local_cache[cache_key] = nested_result

    if not db_lookup_values:
        return final_results

    cache_writes = []

    db_results = {getattr(x, key): x for x in self.filter(**{key + "__in": db_lookup_values})}
    for cache_key, value in zip(db_lookup_cache_keys, db_lookup_values):
        db_result = db_results.get(value)
        if db_result is None:
            continue  # This model ultimately does not exist

        # Ensure we're pushing it into the cache
        cache_writes.append(db_result)
        if local_cache is not None:
            local_cache[cache_key] = db_result

        final_results.append(db_result)

    # XXX: Should use set_many here, but __post_save code is too complex
    for instance in cache_writes:
        self.__post_save(instance=instance)

    return final_results
def get_many_from_cache(self, values, key="pk"):
    """
    Wrapper around `QuerySet.filter(pk__in=values)` which supports caching of
    the intermediate value. Callee is responsible for making sure the cache
    key is cleared on save.
    """
    pk_name = self.model._meta.pk.name

    if key == "pk":
        key = pk_name

    # Kill __exact since it's the default behavior
    if key.endswith("__exact"):
        key = key.split("__exact", 1)[0]

    if key not in self.cache_fields and key != pk_name:
        return self.filter(**{key + "__in": values})

    final_results = []
    cache_lookup_cache_keys = []
    cache_lookup_values = []

    local_cache = self._get_local_cache()
    for value in values:
        cache_key = self.__get_lookup_cache_key(**{key: value})
        result = local_cache and local_cache.get(cache_key)
        if result is not None:
            final_results.append(result)
        else:
            cache_lookup_cache_keys.append(cache_key)
            cache_lookup_values.append(value)

    if not cache_lookup_cache_keys:
        return final_results

    cache_results = cache.get_many(cache_lookup_cache_keys, version=self.cache_version)

    db_lookup_cache_keys = []
    db_lookup_values = []

    nested_lookup_cache_keys = []
    nested_lookup_values = []

    for cache_key, value in zip(cache_lookup_cache_keys, cache_lookup_values):
        cache_result = cache_results.get(cache_key)
        if cache_result is None:
            db_lookup_cache_keys.append(cache_key)
            db_lookup_values.append(value)
            continue

        # If we didn't look up by pk we need to hit the referenced key
        if key != pk_name:
            nested_lookup_cache_keys.append(cache_key)
            nested_lookup_values.append(cache_result)
            continue

        if not isinstance(cache_result, self.model):
            if settings.DEBUG:
                raise ValueError("Unexpected value type returned from cache")
            logger.error("Cache response returned invalid value %r", cache_result)
            db_lookup_cache_keys.append(cache_key)
            db_lookup_values.append(value)
            continue

        if key == pk_name and int(value) != cache_result.pk:
            if settings.DEBUG:
                raise ValueError("Unexpected value returned from cache")
            logger.error("Cache response returned invalid value %r", cache_result)
            db_lookup_cache_keys.append(cache_key)
            db_lookup_values.append(value)
            continue

        final_results.append(cache_result)

    if nested_lookup_values:
        nested_results = self.get_many_from_cache(nested_lookup_values, key=pk_name)
        final_results.extend(nested_results)
        if local_cache is not None:
            for nested_result in nested_results:
                value = getattr(nested_result, key)
                cache_key = self.__get_lookup_cache_key(**{key: value})
                local_cache[cache_key] = nested_result

    if not db_lookup_values:
        return final_results

    cache_writes = []

    db_results = {getattr(x, key): x for x in self.filter(**{key + "__in": db_lookup_values})}
    for cache_key, value in zip(db_lookup_cache_keys, db_lookup_values):
        db_result = db_results.get(value)
        if db_result is None:
            continue  # This model ultimately does not exist

        # Ensure we're pushing it into the cache
        cache_writes.append(db_result)
        if local_cache is not None:
            local_cache[cache_key] = db_result

        final_results.append(db_result)

    # XXX: Should use set_many here, but __post_save code is too complex
    for instance in cache_writes:
        self.__post_save(instance=instance)

    return final_results
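# --- Illustrative sketch ---
# A condensed sketch of the two-level lookup used by `get_many_from_cache`,
# assuming (as above) that cache entries for non-pk cache fields store the
# primary key while pk entries store the whole object. `cache` is a plain dict
# and `fetch_by_pk` is a hypothetical loader; neither is the real manager API.
def cached_lookup(cache, key_field, value, fetch_by_pk, pk_field="pk"):
    hit = cache.get((key_field, value))
    if hit is None:
        return None  # caller falls back to a database query
    if key_field != pk_field:
        # Non-pk entries hold the pk; resolve it with a second, pk-based lookup.
        return fetch_by_pk(hit)
    return hit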
def get_or_create_bulk(cls, project_id, environment_id, keys):
    # Attempt to create a bunch of models in one big batch with as few
    # queries and cache calls as possible.
    # In the best case, this is all done in 1 cache get.
    # In the ideal case, we'll do 3 queries total instead of N.
    # Absolute worst case, we still just do O(n) queries, but this should be rare.
    key_to_model = {key: None for key in keys}
    remaining_keys = set(keys)

    # First attempt to hit the cache, which in theory is the hot case
    cache_key_to_key = {cls.get_cache_key(project_id, environment_id, key): key for key in keys}
    cache_key_to_models = cache.get_many(cache_key_to_key.keys())
    for model in cache_key_to_models.values():
        key_to_model[model.key] = model
        remaining_keys.remove(model.key)

    if not remaining_keys:
        # 100% cache hit on all items, good work team
        return key_to_model

    # If we have some misses, we want to first check if
    # all of the misses actually exist in the database
    # already in one bulk query.
    to_cache = {}
    for model in cls.objects.filter(
        project_id=project_id,
        environment_id=environment_id,
        key__in=remaining_keys,
    ):
        key_to_model[model.key] = to_cache[
            cls.get_cache_key(project_id, environment_id, model.key)
        ] = model
        remaining_keys.remove(model.key)

    # If we have found them all, cache all these misses
    # and return all the hits.
    if not remaining_keys:
        cache.set_many(to_cache, 3600)
        return key_to_model

    # At this point, we need to create all of our keys, since they
    # don't exist in cache or the database.
    # First attempt to create them all in one bulk query.
    try:
        with transaction.atomic():
            cls.objects.bulk_create([
                cls(
                    project_id=project_id,
                    environment_id=environment_id,
                    key=key,
                )
                for key in remaining_keys
            ])
    except IntegrityError:
        pass
    else:
        # If we succeed, the shitty part is we need one
        # more query to get back the actual rows with their ids.
        for model in cls.objects.filter(
            project_id=project_id,
            environment_id=environment_id,
            key__in=remaining_keys,
        ):
            key_to_model[model.key] = to_cache[
                cls.get_cache_key(project_id, environment_id, model.key)
            ] = model
            remaining_keys.remove(model.key)

        cache.set_many(to_cache, 3600)

        # Not clear if this could actually happen, but if it does,
        # guard ourselves against returning bad data.
        if not remaining_keys:
            return key_to_model

    # Fall back to just doing it manually.
    # This case will only ever happen in a race condition.
    for key in remaining_keys:
        key_to_model[key] = cls.get_or_create(project_id, environment_id, key)[0]

    return key_to_model
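# --- Illustrative sketch ---
# A minimal sketch of the "optimistic bulk insert" step above, assuming Django's
# ORM: try to create every missing row in one statement inside an atomic block,
# treat a unique-constraint violation as "a concurrent writer got there first",
# and re-read whatever now exists. `model_cls` is assumed to be any Django model
# with `project_id`/`key` fields under a unique constraint; the names here are
# hypothetical stand-ins, not the model used above.
from django.db import IntegrityError, transaction

def optimistic_bulk_create(model_cls, project_id, missing_keys):
    try:
        with transaction.atomic():
            model_cls.objects.bulk_create(
                [model_cls(project_id=project_id, key=key) for key in missing_keys]
            )
    except IntegrityError:
        # At least one key already existed; fall back to reading what exists and
        # creating the rest one at a time (the race-condition path above).
        pass
    return model_cls.objects.filter(project_id=project_id, key__in=missing_keys)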
def get_attrs(self, item_list, user):
    if not self._collapse("base"):
        attrs = super().get_attrs(item_list, user)
    else:
        seen_stats = self._get_seen_stats(item_list, user)
        if seen_stats:
            attrs = {item: seen_stats.get(item, {}) for item in item_list}
        else:
            attrs = {item: {} for item in item_list}

    if self.stats_period and not self._collapse("stats"):
        partial_get_stats = functools.partial(
            self.get_stats, item_list=item_list, user=user, environment_ids=self.environment_ids
        )
        stats = partial_get_stats()
        filtered_stats = (
            partial_get_stats(conditions=self.conditions)
            if self.conditions and not self._collapse("filtered")
            else None
        )
        for item in item_list:
            if filtered_stats:
                attrs[item].update({"filtered_stats": filtered_stats[item.id]})
            attrs[item].update({"stats": stats[item.id]})

    if self._expand("sessions"):
        uniq_project_ids = list({item.project_id for item in item_list})
        cache_keys = {pid: self._build_session_cache_key(pid) for pid in uniq_project_ids}
        cache_data = cache.get_many(cache_keys.values())
        missed_items = []
        for item in item_list:
            num_sessions = cache_data.get(cache_keys[item.project_id])
            if num_sessions is None:
                found = "miss"
                missed_items.append(item)
            else:
                found = "hit"
                attrs[item].update({"sessionCount": num_sessions})
            metrics.incr(f"group.get_session_counts.{found}")

        if missed_items:
            filters = {"project_id": list({item.project_id for item in missed_items})}
            if self.environment_ids:
                filters["environment"] = self.environment_ids
            result_totals = raw_query(
                selected_columns=["sessions"],
                dataset=Dataset.Sessions,
                start=self.start,
                end=self.end,
                filter_keys=filters,
                groupby=["project_id"],
                referrer="serializers.GroupSerializerSnuba.session_totals",
            )

            results = {}
            for data in result_totals["data"]:
                cache_key = self._build_session_cache_key(data["project_id"])
                results[data["project_id"]] = data["sessions"]
                cache.set(cache_key, data["sessions"], 3600)

            for item in missed_items:
                if item.project_id in results:
                    attrs[item].update({"sessionCount": results[item.project_id]})
                else:
                    attrs[item].update({"sessionCount": None})

    if self._expand("inbox"):
        inbox_stats = get_inbox_details(item_list)
        for item in item_list:
            attrs[item].update({"inbox": inbox_stats.get(item.id)})

    if self._expand("owners"):
        owner_details = get_owner_details(item_list)
        for item in item_list:
            attrs[item].update({"owners": owner_details.get(item.id)})

    return attrs
def _query(self, projects, retention_window_start, group_queryset, tags, environments,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environments is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[projects[0].organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} IN ({})'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ', '.join(['%s' for e in environments]),
                    ),
                ],
                params=[environment.id for environment in environments],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=projects[0].organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    now = timezone.now()
    end = parameters.get('date_to')
    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if cursor is None \
                and sort_by == 'date' \
                and not tags \
                and not environments \
                and not any(param in parameters for param in [
                    'age_from', 'age_to', 'last_seen_from', 'last_seen_to',
                    'times_seen', 'times_seen_lower', 'times_seen_upper',
                ]):
            group_queryset = group_queryset.order_by('-last_seen')
            paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=False)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        filter(None, [
            retention_window_start,
            now - timedelta(days=90),
        ])
    )

    start = max(
        filter(None, [
            retention_date,
            parameters.get('date_from'),
        ])
    )

    end = max([
        retention_date,
        end,
    ])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return EMPTY_RESULT

    # num_candidates is the number of Group IDs to send down to Snuba. If
    # more Group ID candidates are found, a "bare" Snuba search is performed
    # and the result groups are then post-filtered via queries to the Sentry DB.
    optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
    if optimizer_enabled:
        missed_projects = []
        keys = [self._get_project_count_cache_key(p.id) for p in projects]

        counts_by_projects = {
            self._get_project_id_from_key(key): count
            for key, count in cache.get_many(keys).items()
        }

        missed_projects = {p.id for p in projects} - set(counts_by_projects.keys())

        if missed_projects:
            missing_counts = snuba.query(
                start=max(
                    filter(None, [
                        retention_window_start,
                        now - timedelta(days=90),
                    ])
                ),
                end=now,
                groupby=['project_id'],
                filter_keys={
                    'project_id': list(missed_projects),
                },
                aggregations=[['uniq', 'group_id', 'group_count']],
                referrer='search',
            )

            cache.set_many({
                self._get_project_count_cache_key(project_id): count
                for project_id, count in missing_counts.items()
            }, options.get('snuba.search.project-group-count-cache-time'))

            counts_by_projects.update(missing_counts)

        min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

        num_candidates = max(
            min_candidates,
            min(
                max_candidates,
                sum(counts_by_projects.values()) * candidates_percentage,
            )
        )
    else:
        num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

    # pre-filter query
    candidate_ids = None
    if num_candidates and limit <= num_candidates:
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:num_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))

        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > num_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `num_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            candidate_ids = None

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0

    paginator_results = EMPTY_RESULT
    result_groups = []
    result_group_ids = set()

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

        # {group_id: group_score, ...}
        snuba_groups, more_results = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            tags=tags,
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering and these are the
            # only results
            result_groups = snuba_groups
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=False)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing('snuba.search.num_chunks', num_chunks)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
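# --- Illustrative sketch ---
# A stripped-down sketch of the chunked query loop above: grow the page size
# geometrically on each pass and stop on an empty page, once enough rows have
# been collected, or when the time budget runs out. `fetch_chunk` is a
# hypothetical stand-in for `snuba_search`; the default values are made up.
import time

def chunked_fetch(fetch_chunk, want, chunk_growth=1.5, max_chunk=1000, max_seconds=10.0):
    results, offset, chunk = [], 0, want
    started = time.time()
    while (time.time() - started) < max_seconds:
        chunk = min(int(chunk * chunk_growth), max_chunk)
        page = fetch_chunk(limit=chunk, offset=offset)
        offset += len(page)
        results.extend(page)
        if not page or len(results) >= want:
            break
    return results[:want]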
def get_or_create_bulk(cls, project_id, environment_id, keys):
    # Attempt to create a bunch of models in one big batch with as few
    # queries and cache calls as possible.
    # In the best case, this is all done in 1 cache get.
    # In the ideal case, we'll do 3 queries total instead of N.
    # Absolute worst case, we still just do O(n) queries, but this should be rare.
    key_to_model = {key: None for key in keys}
    remaining_keys = set(keys)

    # First attempt to hit the cache, which in theory is the hot case
    cache_key_to_key = {cls.get_cache_key(project_id, environment_id, key): key for key in keys}
    cache_key_to_models = cache.get_many(cache_key_to_key.keys())
    for model in cache_key_to_models.values():
        key_to_model[model.key] = model
        remaining_keys.remove(model.key)

    if not remaining_keys:
        # 100% cache hit on all items, good work team
        return key_to_model

    # If we have some misses, we want to first check if
    # all of the misses actually exist in the database
    # already in one bulk query.
    to_cache = {}
    for model in cls.objects.filter(
        project_id=project_id,
        environment_id=environment_id,
        key__in=remaining_keys,
    ):
        key_to_model[model.key] = to_cache[
            cls.get_cache_key(project_id, environment_id, model.key)
        ] = model
        remaining_keys.remove(model.key)

    # If we have found them all, cache all these misses
    # and return all the hits.
    if not remaining_keys:
        cache.set_many(to_cache, 3600)
        return key_to_model

    # At this point, we need to create all of our keys, since they
    # don't exist in cache or the database.
    # First attempt to create them all in one bulk query.
    try:
        with transaction.atomic():
            cls.objects.bulk_create([
                cls(
                    project_id=project_id,
                    environment_id=environment_id,
                    key=key,
                )
                for key in remaining_keys
            ])
    except IntegrityError:
        pass
    else:
        # If we succeed, the shitty part is we need one
        # more query to get back the actual rows with their ids.
        for model in cls.objects.filter(
            project_id=project_id,
            environment_id=environment_id,
            key__in=remaining_keys,
        ):
            key_to_model[model.key] = to_cache[
                cls.get_cache_key(project_id, environment_id, model.key)
            ] = model
            remaining_keys.remove(model.key)

        cache.set_many(to_cache, 3600)

        # Not clear if this could actually happen, but if it does,
        # guard ourselves against returning bad data.
        if not remaining_keys:
            return key_to_model

    # Fall back to just doing it manually.
    # This case will only ever happen in a race condition.
    for key in remaining_keys:
        key_to_model[key] = cls.get_or_create(project_id, environment_id, key)[0]

    return key_to_model
def get_attrs(self, item_list, user):
    if not self._collapse("base"):
        attrs = super().get_attrs(item_list, user)
    else:
        seen_stats = self._get_seen_stats(item_list, user)
        if seen_stats:
            attrs = {item: seen_stats.get(item, {}) for item in item_list}
        else:
            attrs = {item: {} for item in item_list}

    if self.stats_period and not self._collapse("stats"):
        partial_get_stats = functools.partial(
            self.get_stats, item_list=item_list, user=user, environment_ids=self.environment_ids
        )
        stats = partial_get_stats()
        filtered_stats = (
            partial_get_stats(conditions=self.conditions)
            if self.conditions and not self._collapse("filtered")
            else None
        )
        for item in item_list:
            if filtered_stats:
                attrs[item].update({"filtered_stats": filtered_stats[item.id]})
            attrs[item].update({"stats": stats[item.id]})

    if self._expand("sessions"):
        uniq_project_ids = list({item.project_id for item in item_list})
        cache_keys = {pid: self._build_session_cache_key(pid) for pid in uniq_project_ids}
        cache_data = cache.get_many(cache_keys.values())
        missed_items = []
        for item in item_list:
            num_sessions = cache_data.get(cache_keys[item.project_id])
            if num_sessions is None:
                found = "miss"
                missed_items.append(item)
            else:
                found = "hit"
                attrs[item].update({"sessionCount": num_sessions})
            metrics.incr(f"group.get_session_counts.{found}")

        if missed_items:
            project_ids = list({item.project_id for item in missed_items})
            project_sessions = release_health.get_num_sessions_per_project(
                project_ids,
                self.start,
                self.end,
                self.environment_ids,
            )

            results = {}
            for project_id, count in project_sessions:
                cache_key = self._build_session_cache_key(project_id)
                results[project_id] = count
                cache.set(cache_key, count, 3600)

            for item in missed_items:
                if item.project_id in results:
                    attrs[item].update({"sessionCount": results[item.project_id]})
                else:
                    attrs[item].update({"sessionCount": None})

    if self._expand("inbox"):
        inbox_stats = get_inbox_details(item_list)
        for item in item_list:
            attrs[item].update({"inbox": inbox_stats.get(item.id)})

    if self._expand("owners"):
        owner_details = get_owner_details(item_list)
        for item in item_list:
            attrs[item].update({"owners": owner_details.get(item.id)})

    return attrs