def debounce_update_release_health_data(organization, project_ids):
    """Create missing release rows from recent health data, debounced per project.

    Projects from ``project_ids`` that already have a ``debounce-health:<id>``
    entry in the cache are skipped. For the others, the (project_id, version)
    pairs reported by
    ``release_health.get_changed_project_release_model_adoptions`` are compared
    against the existing ``ReleaseProject`` rows; every pair without a row gets
    a ``Release`` (created on demand) that is then attached to the project.
    Finally the processed projects are marked in the cache with a timeout of
    60 so the work is not repeated for a minute.

    ``organization`` is accepted but not read by this function.
    """
    cache_keys = ["debounce-health:%d" % project_id for project_id in project_ids]
    cached = cache.get_many(cache_keys)

    # Only projects without a debounce marker need to be refreshed.
    should_update = {
        project_id: cache_key
        for project_id, cache_key in izip(project_ids, cache_keys)
        if cached.get(cache_key) is None
    }
    if not should_update:
        return

    projects = {}
    for cached_project in Project.objects.get_many_from_cache(should_update.keys()):
        projects[cached_project.id] = cached_project

    # Release/project pairs with recently changed health data. Pairs older
    # than what `get_changed_project_release_model_adoptions` considers
    # recent are not reported and therefore not handled here.
    project_releases = release_health.get_changed_project_release_model_adoptions(
        should_update.keys()
    )

    # Work out which pairs already have a row so we only touch the new ones.
    candidate_project_ids = [pair[0] for pair in project_releases]
    candidate_versions = [pair[1] for pair in project_releases]
    existing = set(
        ReleaseProject.objects.filter(
            project_id__in=candidate_project_ids,
            release__version__in=candidate_versions,
        ).values_list("project_id", "release__version")
    )
    to_upsert = [pair for pair in project_releases if pair not in existing]

    if to_upsert:
        dates = release_health.get_oldest_health_data_for_releases(to_upsert)

        for project_id, version in to_upsert:
            project = projects.get(project_id)
            # A missing project should not happen; such pairs are skipped.
            if project is not None:
                # The release may never have been observed outside of health
                # data, so create it the first time it shows up there.
                release = Release.get_or_create(
                    project=project,
                    version=version,
                    date_added=dates.get((project_id, version)),
                )
                # The release may exist without being linked to this project.
                release.add_project(project)

    # Debounce updates for a minute.
    cache.set_many(dict.fromkeys(should_update.values(), True), 60)
def debounce_update_release_health_data(organization, project_ids):
    """Make sure releases with changed health data exist, debounced per project.

    Projects from ``project_ids`` that already carry a ``debounce-health:<id>``
    cache entry are left alone. For the remaining ones, every
    (project_id, version) pair returned by
    ``get_changed_project_release_model_adoptions`` gets a ``Release``
    (created on demand) which is then attached to the project. The processed
    projects are marked in the cache with a timeout of 60 afterwards, so the
    work runs at most once per minute per project.

    ``organization`` is accepted but not read by this function.
    """
    cache_keys = ["debounce-health:%d" % pid for pid in project_ids]
    cache_data = cache.get_many(cache_keys)

    # Collect the projects that have no debounce marker yet.
    should_update = {}
    for pid, cache_key in izip(project_ids, cache_keys):
        if cache_data.get(cache_key) is not None:
            continue
        should_update[pid] = cache_key

    if not should_update:
        return

    projects = dict(
        (p.id, p) for p in Project.objects.get_many_from_cache(should_update.keys())
    )

    # Release/project pairs whose health data changed recently, as reported
    # by the helper. Environments are not taken into account here.
    changed = get_changed_project_release_model_adoptions(should_update.keys())
    for project_id, version in changed:
        project = projects.get(project_id)
        # A missing project should not happen; such pairs are skipped.
        if project is not None:
            # The release may only have health data so far; create it the
            # first time it is observed on the health side.
            release = Release.get_or_create(project=project, version=version)
            # The project might not have been associated with it yet.
            release.add_project(project)

    # Debounce updates for a minute.
    debounce_markers = {cache_key: True for cache_key in should_update.values()}
    cache.set_many(debounce_markers, 60)
def get_or_create_bulk(cls, project_id, environment_id, keys):
    """Fetch or create the models for ``keys`` with as few round trips as possible.

    Every key is resolved by the first step that finds it:

    1. one bulk cache lookup,
    2. one bulk database query,
    3. one bulk insert followed by a re-read to obtain the created rows,
    4. a per-key ``cls.get_or_create`` fallback (only reached when the bulk
       insert raised ``IntegrityError`` or the re-read missed a key).

    Rows read from the database are written to the cache with a timeout of
    3600, including when the bulk insert fails, so a lost race does not throw
    away the rows the first lookup already found.

    :param project_id: value matched against / stored in ``project_id``.
    :param environment_id: value matched against / stored in ``environment_id``.
    :param keys: key values to resolve. It is iterated more than once, so pass
        a collection rather than a one-shot iterator; duplicates collapse.
    :returns: dict mapping every key to its model instance.
    """
    key_to_model = {key: None for key in keys}
    remaining_keys = set(keys)

    # First attempt to hit from cache, which in theory is the hot case.
    cache_key_to_key = {
        cls.get_cache_key(project_id, environment_id, key): key for key in keys
    }
    for model in cache.get_many(cache_key_to_key.keys()).values():
        key_to_model[model.key] = model
        remaining_keys.remove(model.key)

    if not remaining_keys:
        # 100% cache hit on all items.
        return key_to_model

    to_cache = {}

    def _collect_from_database():
        # Record every existing row for the still-unresolved keys and queue
        # it for caching.
        for model in cls.objects.filter(
            project_id=project_id,
            environment_id=environment_id,
            key__in=remaining_keys,
        ):
            key_to_model[model.key] = model
            to_cache[cls.get_cache_key(project_id, environment_id, model.key)] = model
            remaining_keys.remove(model.key)

    # If we have some misses, first check in one bulk query whether they
    # already exist in the database.
    _collect_from_database()

    # If we have found them all, cache these misses and return.
    if not remaining_keys:
        cache.set_many(to_cache, 3600)
        return key_to_model

    # Whatever is left exists in neither the cache nor the database, so try
    # to create all of it in one bulk query.
    try:
        with transaction.atomic():
            cls.objects.bulk_create([
                cls(
                    project_id=project_id,
                    environment_id=environment_id,
                    key=key,
                )
                for key in remaining_keys
            ])
    except IntegrityError:
        # Lost a race against another writer; the per-key fallback below
        # resolves the remaining keys.
        pass
    else:
        # The insert does not hand back usable rows here, so one more query
        # is needed to get the actual rows with their ids.
        _collect_from_database()

    # Cache everything read from the database so far. This also covers the
    # IntegrityError path, where the rows found by the first lookup would
    # otherwise never be cached.
    if to_cache:
        cache.set_many(to_cache, 3600)

    # Not clear if keys can still be missing after a successful insert, but
    # if they are, guard against returning bad data.
    if not remaining_keys:
        return key_to_model

    # Fall back to resolving the leftovers one by one.
    for key in remaining_keys:
        key_to_model[key] = cls.get_or_create(project_id, environment_id, key)[0]

    return key_to_model
def _query(self, projects, retention_window_start, group_queryset, tags, environments,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """Search groups by combining a Postgres pre-filter with Snuba queries.

    ``group_queryset`` is first narrowed with the ``first_release`` condition
    (joined against ``GroupEnvironment`` when ``environments`` is given).
    When no ``date_to`` is supplied and the search has no cursor, sorts by
    ``date`` and uses no tags, environments or age/last-seen/times-seen
    parameters, the page is served straight from Postgres. Otherwise the time
    window is clamped to the retention date and Snuba is queried in growing
    chunks, either restricted to pre-filtered candidate ids or post-filtered
    through ``group_queryset``, until enough results are collected, Snuba has
    no more results, or the configured time budget is used up.

    ``count_hits`` is accepted, but every paginator call in this function
    passes ``count_hits=False``.

    Returns a paginator result whose ``results`` are ``Group`` instances on
    the Snuba path, or ``EMPTY_RESULT`` when the time window is outside of
    retention, is empty, or the pre-filter finds no candidates.
    """
    import copy

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environments is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[projects[0].organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} IN ({})'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ', '.join(['%s' for _ in environments])
                    ),
                ],
                params=[environment.id for environment in environments],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=projects[0].organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    now = timezone.now()
    end = parameters.get('date_to')
    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if cursor is None \
                and sort_by == 'date' \
                and not tags \
                and not environments \
                and not any(param in parameters for param in [
                    'age_from', 'age_to', 'last_seen_from',
                    'last_seen_to', 'times_seen', 'times_seen_lower',
                    'times_seen_upper'
                ]):
            group_queryset = group_queryset.order_by('-last_seen')
            paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=False)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        filter(None, [
            retention_window_start,
            now - timedelta(days=90)
        ])
    )

    start = max(
        filter(None, [
            retention_date,
            parameters.get('date_from'),
        ])
    )

    end = max([
        retention_date,
        end
    ])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards
        # compatibility with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their
        # search is invalid.
        return EMPTY_RESULT

    # num_candidates is the number of Group IDs to send down to Snuba, if
    # more Group ID candidates are found, a "bare" Snuba search is performed
    # and the result groups are then post-filtered via queries to the Sentry DB
    optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
    if optimizer_enabled:
        keys = [self._get_project_count_cache_key(p.id) for p in projects]

        counts_by_projects = {
            self._get_project_id_from_key(key): count
            for key, count in cache.get_many(keys).items()
        }

        missed_projects = {p.id for p in projects} - set(counts_by_projects.keys())

        if missed_projects:
            missing_counts = snuba.query(
                # Same lower bound as the search itself: the later of the
                # retention window start and 90 days ago.
                start=retention_date,
                end=now,
                groupby=['project_id'],
                filter_keys={
                    'project_id': list(missed_projects),
                },
                aggregations=[['uniq', 'group_id', 'group_count']],
                referrer='search',
            )

            cache.set_many({
                self._get_project_count_cache_key(project_id): count
                for project_id, count in missing_counts.items()
            }, options.get('snuba.search.project-group-count-cache-time'))

            counts_by_projects.update(missing_counts)

        min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

        num_candidates = max(
            min_candidates,
            min(
                max_candidates,
                sum(counts_by_projects.values()) * candidates_percentage
            )
        )
    else:
        num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

    # pre-filter query
    candidate_ids = None
    if num_candidates and limit <= num_candidates:
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:num_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))

        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > num_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `num_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            candidate_ids = None

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0

    # Work on a private copy: the cursor flags and `results` of this object
    # are modified below, and `EMPTY_RESULT` itself is returned directly by
    # other branches, so it must never be mutated.
    paginator_results = copy.deepcopy(EMPTY_RESULT)
    result_groups = []
    result_group_ids = set()
    # Defined up front so the checks after the loop are safe even if the
    # loop body never runs.
    more_results = False

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)

        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

        # {group_id: group_score, ...}
        snuba_groups, more_results = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            tags=tags,
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering and these are the
            # only results
            result_groups = snuba_groups
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small
                    # possibility of groups moving around in the sort
                    # scoring underneath us, so we at least want to
                    # protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        paginator_results = SequencePaginator(
            [(score, gid) for (gid, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=False)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing('snuba.search.num_chunks', num_chunks)

    # Swap the group ids for the actual Group rows, dropping ids that no
    # longer resolve to a row.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
def get_or_create_bulk(cls, project_id, environment_id, keys):
    """Resolve ``keys`` to model instances, creating the missing ones in bulk.

    Best case this costs a single cache lookup; otherwise it adds one bulk
    database read, one bulk insert and one re-read. Only when the bulk insert
    raises ``IntegrityError`` (or the re-read misses a key) does it fall back
    to one ``cls.get_or_create`` call per unresolved key.

    Every row read from the database is written to the cache with a timeout
    of 3600. That includes the case where the bulk insert fails, so rows found
    by the first database read are cached even when the insert loses a race.

    :param project_id: value matched against / stored in ``project_id``.
    :param environment_id: value matched against / stored in ``environment_id``.
    :param keys: key values to resolve. It is iterated several times, so it
        must be re-iterable; duplicate keys collapse into one entry.
    :returns: dict mapping every key to its model instance.
    """
    key_to_model = {key: None for key in keys}
    remaining_keys = set(keys)

    # First attempt to hit from cache, which in theory is the hot case.
    cache_key_to_key = {
        cls.get_cache_key(project_id, environment_id, key): key for key in keys
    }
    cache_key_to_models = cache.get_many(cache_key_to_key.keys())
    for model in cache_key_to_models.values():
        key_to_model[model.key] = model
        remaining_keys.remove(model.key)

    if not remaining_keys:
        # 100% cache hit on all items.
        return key_to_model

    # If we have some misses, first check whether they already exist in the
    # database, using one bulk query.
    to_cache = {}
    for model in cls.objects.filter(
        project_id=project_id,
        environment_id=environment_id,
        key__in=remaining_keys,
    ):
        model_cache_key = cls.get_cache_key(project_id, environment_id, model.key)
        key_to_model[model.key] = to_cache[model_cache_key] = model
        remaining_keys.remove(model.key)

    # If we have found them all, cache these misses and return all the hits.
    if not remaining_keys:
        cache.set_many(to_cache, 3600)
        return key_to_model

    # At this point the leftover keys exist in neither the cache nor the
    # database, so first attempt to create them all in one bulk query.
    bulk_created = True
    try:
        with transaction.atomic():
            cls.objects.bulk_create([
                cls(
                    project_id=project_id,
                    environment_id=environment_id,
                    key=key,
                )
                for key in remaining_keys
            ])
    except IntegrityError:
        # Another writer got there first; handled by the fallback below.
        bulk_created = False

    if bulk_created:
        # One more query is needed to get back the actual rows with their ids.
        for model in cls.objects.filter(
            project_id=project_id,
            environment_id=environment_id,
            key__in=remaining_keys,
        ):
            model_cache_key = cls.get_cache_key(project_id, environment_id, model.key)
            key_to_model[model.key] = to_cache[model_cache_key] = model
            remaining_keys.remove(model.key)

    # Cache whatever was read from the database, whether or not the bulk
    # insert succeeded; previously the rows from the first read were dropped
    # when the insert raised IntegrityError.
    if to_cache:
        cache.set_many(to_cache, 3600)

    # Not clear if keys can still be unresolved after a successful insert,
    # but if so, guard against returning bad data.
    if not remaining_keys:
        return key_to_model

    # Fall back to doing it one key at a time.
    for key in remaining_keys:
        key_to_model[key] = cls.get_or_create(project_id, environment_id, key)[0]

    return key_to_model