def chart(request, team=None, project=None):
    gid = request.REQUEST.get('gid')
    days = int(request.REQUEST.get('days', '90'))

    if gid:
        try:
            group = Group.objects.get(pk=gid)
        except Group.DoesNotExist:
            return HttpResponseForbidden()

        data = Group.objects.get_chart_data(group, max_days=days)
    elif project:
        data = Project.objects.get_chart_data(project, max_days=days)
    elif team:
        cache_key = 'api.chart:team=%s,days=%s' % (team.id, days)

        data = cache.get(cache_key)
        if data is None:
            project_list = list(Project.objects.filter(team=team))
            data = Project.objects.get_chart_data_for_group(project_list, max_days=days)
            cache.set(cache_key, data, 300)
    else:
        cache_key = 'api.chart:user=%s,days=%s' % (request.user.id, days)

        data = cache.get(cache_key)
        if data is None:
            project_list = Project.objects.get_for_user(request.user)
            data = Project.objects.get_chart_data_for_group(project_list, max_days=days)
            cache.set(cache_key, data, 300)

    response = HttpResponse(json.dumps(data))
    response['Content-Type'] = 'application/json'

    return response
def fetch_url(url, project=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    cache_key = 'source:%s' % (hashlib.md5(url.encode('utf-8')).hexdigest(),)
    result = cache.get(cache_key)
    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:%s' % (hashlib.md5(domain.encode('utf-8')).hexdigest(),)
        domain_result = cache.get(domain_key)
        if domain_result:
            return BAD_SOURCE

        headers = []
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers.append(('X-Sentry-Token', token))

        try:
            request = safe_urlopen(
                url,
                allow_redirects=True,
                headers=headers,
                timeout=settings.SENTRY_SOURCE_FETCH_TIMEOUT,
            )
        except HTTPError:
            result = BAD_SOURCE
        except Exception:
            # it's likely we've failed due to a timeout, dns, etc so let's
            # ensure we can't cascade the failure by pinning this for 5 minutes
            cache.set(domain_key, 1, 300)
            logger.warning('Disabling sources to %s for %ss', domain, 300, exc_info=True)
            return BAD_SOURCE
        else:
            try:
                body = safe_urlread(request)
            except Exception:
                result = BAD_SOURCE
            else:
                result = (dict(request.headers), body)

        cache.set(cache_key, result, 60)

    if result == BAD_SOURCE:
        return result

    return UrlResult(url, *result)
def get_or_create(cls, group, release, environment, datetime, **kwargs):
    if not environment:
        environment = ''

    cache_key = cls.get_cache_key(group.id, release.id, environment)

    instance = cache.get(cache_key)
    if instance is None:
        instance, created = cls.objects.get_or_create(
            release_id=release.id,
            group_id=group.id,
            environment=environment,
            defaults={
                'project_id': group.project_id,
                'first_seen': datetime,
                'last_seen': datetime,
            },
        )
        cache.set(cache_key, instance, 3600)
    else:
        created = False

    # TODO(dcramer): this would be good to buffer
    if not created:
        instance.update(last_seen=datetime)
    return instance
def fetch_url(url, logger=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    import sentry

    cache_key = "fetch_url:%s" % url
    result = cache.get(cache_key)
    if result is not None:
        return result

    try:
        opener = urllib2.build_opener()
        opener.addheaders = [("User-Agent", "Sentry/%s" % sentry.VERSION)]
        req = opener.open(url)
        headers = dict(req.headers)
        body = req.read().rstrip("\n")
    except Exception:
        if logger:
            logger.error("Unable to fetch remote source for %r", url, exc_info=True)
        return BAD_SOURCE

    result = UrlResult(url, headers, body)

    cache.set(cache_key, result, 60 * 5)

    return result
def get_from_cache(self, **kwargs):
    """
    Wrapper around QuerySet.get which supports caching of the
    intermediate value.  Callee is responsible for making sure
    the cache key is cleared on save.
    """
    if not self.cache_fields or len(kwargs) > 1:
        return self.get(**kwargs)

    pk_name = self.model._meta.pk.name
    key, value = kwargs.items()[0]

    # Kill __exact since it's the default behavior
    if key.endswith('__exact'):
        key = key.split('__exact', 1)[0]

    if key in self.cache_fields or key in ('pk', pk_name):
        cache_key = self.__get_lookup_cache_key(**{key: value})

        retval = cache.get(cache_key)
        if retval is None:
            result = self.get(**kwargs)
            # Ensure we're pushing it into the cache
            self.__post_save(instance=result)
            return result

        # If we didn't look up by pk we need to hit the reffed
        # key
        if key not in (pk_name, 'pk'):
            return self.get(pk=retval)

        return retval
def fetch_release_file(filename, release):
    cache_key = "releasefile:%s:%s" % (release.id, md5(filename).hexdigest())
    logger.debug("Checking cache for release artifact %r (release_id=%s)", filename, release.id)
    result = cache.get(cache_key)
    if result is None:
        logger.debug("Checking database for release artifact %r (release_id=%s)", filename, release.id)
        ident = ReleaseFile.get_ident(filename)
        try:
            releasefile = (
                ReleaseFile.objects.filter(release=release, ident=ident)
                .select_related("file", "file__blob")
                .get()
            )
        except ReleaseFile.DoesNotExist:
            logger.debug("Release artifact %r not found in database (release_id=%s)", filename, release.id)
            cache.set(cache_key, -1, 60)
            return None

        logger.debug("Found release artifact %r (id=%s, release_id=%s)", filename, releasefile.id, release.id)
        try:
            with releasefile.file.getfile() as fp:
                body = fp.read()
        except Exception as e:
            logger.exception(unicode(e))
            result = -1
        else:
            result = (releasefile.file.headers, body, 200)
        cache.set(cache_key, result, 3600)

    if result == -1:
        result = None

    return result
def get_or_create(cls, project, release, environment, datetime, **kwargs):
    cache_key = cls.get_cache_key(project.id, release.id, environment.id)

    instance = cache.get(cache_key)
    if instance is None:
        instance, created = cls.objects.get_or_create(
            release_id=release.id,
            organization_id=project.organization_id,
            environment_id=environment.id,
            defaults={
                'first_seen': datetime,
                'last_seen': datetime,
            }
        )
        cache.set(cache_key, instance, 3600)
    else:
        created = False

    # TODO(dcramer): this would be good to buffer, but until then we minimize
    # updates to once a minute, and allow Postgres to optimistically skip
    # it even if we can't
    if not created and instance.last_seen < datetime - timedelta(seconds=60):
        cls.objects.filter(
            id=instance.id,
            last_seen__lt=datetime - timedelta(seconds=60),
        ).update(
            last_seen=datetime,
        )
        instance.last_seen = datetime
        cache.set(cache_key, instance, 3600)
    return instance
def get_send_to(self, project=None):
    """
    Returns a list of email addresses for the users that should be notified of alerts.

    The logic for this is a bit complicated, but it does the following:

    - Includes members if ``send_to_members`` is enabled **and** the user has not
      disabled alerts for this project

    The results of this call can be fairly expensive to calculate, so the send_to
    list gets cached for 60 seconds.
    """
    if project:
        project_id = project.pk
    else:
        project_id = ""

    conf_key = self.get_conf_key()
    cache_key = "%s:send_to:%s" % (conf_key, project_id)

    send_to_list = cache.get(cache_key)
    if send_to_list is None:
        send_to_list = set()

        send_to_members = self.get_option("send_to_members", project)
        if send_to_members and project and project.team:
            member_set = self.get_sendable_users(project)
            send_to_list |= set(self.get_emails_for_users(member_set))

        send_to_list = filter(bool, send_to_list)
        cache.set(cache_key, send_to_list, 60)  # 1 minute cache

    return send_to_list
def fetch_url(url, logger=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    import sentry

    cache_key = 'fetch_url:v2:%s' % (hashlib.md5(url).hexdigest(),)
    result = cache.get(cache_key)
    if result is not None:
        return UrlResult(*result)

    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Sentry/%s' % sentry.VERSION)]
        req = opener.open(url)
        headers = dict(req.headers)
        body = req.read()
        if headers.get('content-encoding') == 'gzip':
            # Content doesn't *have* to respect the Accept-Encoding header
            # and may send gzipped data regardless.
            # See: http://stackoverflow.com/questions/2423866/python-decompressing-gzip-chunk-by-chunk/2424549#2424549
            body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
        body = body.rstrip('\n')
    except Exception:
        if logger:
            logger.error('Unable to fetch remote source for %r', url, exc_info=True)
        return BAD_SOURCE

    result = (url, headers, body)

    cache.set(cache_key, result, 60 * 5)

    return UrlResult(url, headers, body)
def get_rules(self):
    cache_key = 'project:%d:rules' % (self.project.id,)
    rules_list = cache.get(cache_key)
    if rules_list is None:
        rules_list = list(Rule.objects.filter(project=self.project))
        cache.set(cache_key, rules_list, 60)
    return rules_list
def _get_project_enhancements_config(project):
    enhancements = project.get_option('sentry:grouping_enhancements')
    enhancements_base = project.get_option('sentry:grouping_enhancements_base')
    if not enhancements and not enhancements_base:
        return DEFAULT_ENHANCEMENTS_CONFIG

    if enhancements_base is None or enhancements_base not in ENHANCEMENT_BASES:
        enhancements_base = DEFAULT_ENHANCEMENT_BASE

    # Instead of parsing and dumping out config here, we can make a
    # shortcut
    from sentry.utils.cache import cache
    from sentry.utils.hashlib import md5_text
    cache_key = 'grouping-enhancements:' + \
        md5_text('%s|%s' % (enhancements_base, enhancements)).hexdigest()
    rv = cache.get(cache_key)
    if rv is not None:
        return rv

    try:
        rv = Enhancements.from_config_string(
            enhancements or '', bases=[enhancements_base]).dumps()
    except InvalidEnhancerConfig:
        rv = DEFAULT_ENHANCEMENTS_CONFIG
    cache.set(cache_key, rv)
    return rv
def get_or_create(cls, release, project, environment, datetime, **kwargs):
    cache_key = cls.get_cache_key(project.id, release.id, environment.id)

    instance = cache.get(cache_key)
    if instance is None:
        instance, created = cls.objects.get_or_create(
            release=release,
            project=project,
            environment=environment,
            defaults={
                'first_seen': datetime,
                'last_seen': datetime,
            }
        )
        cache.set(cache_key, instance, 3600)
    else:
        created = False

    # Same as releaseenvironment model. Minimizes last_seen updates to once a minute
    if not created and instance.last_seen < datetime - timedelta(seconds=60):
        cls.objects.filter(
            id=instance.id,
            last_seen__lt=datetime - timedelta(seconds=60),
        ).update(
            last_seen=datetime,
        )
        instance.last_seen = datetime
        cache.set(cache_key, instance, 3600)
    return instance
def get_send_to(self, project=None):
    """
    Returns a list of email addresses for the users that should be notified of alerts.

    The results of this call can be fairly expensive to calculate, so the send_to
    list gets cached for 60 seconds.
    """
    if project:
        project_id = project.pk
    else:
        project_id = ''

    if not (project and project.team):
        return []

    conf_key = self.get_conf_key()
    cache_key = '%s:send_to:%s' % (conf_key, project_id)

    send_to_list = cache.get(cache_key)
    if send_to_list is None:
        send_to_list = self.get_sendable_users(project)
        send_to_list = filter(bool, send_to_list)
        cache.set(cache_key, send_to_list, 60)  # 1 minute cache

    return send_to_list
def fetch_release_file(filename, release):
    cache_key = 'release:%s:%s' % (
        release.id,
        hashlib.sha1(filename.encode('utf-8')).hexdigest(),
    )
    logger.debug('Checking cache for release artifact %r (release_id=%s)', filename, release.id)
    result = cache.get(cache_key)
    if result is None:
        logger.debug('Checking database for release artifact %r (release_id=%s)', filename, release.id)
        ident = ReleaseFile.get_ident(filename)
        try:
            releasefile = ReleaseFile.objects.filter(
                release=release,
                ident=ident,
            ).select_related('file').get()
        except ReleaseFile.DoesNotExist:
            logger.debug('Release artifact %r not found in database (release_id=%s)', filename, release.id)
            return None

        logger.debug('Found release artifact %r (id=%s, release_id=%s)', filename, releasefile.id, release.id)
        with releasefile.file.getfile() as fp:
            body = fp.read()
        result = (releasefile.file.headers, body, 200)
        cache.set(cache_key, result, 60)

    return result
def get_cached_photo(self, size):
    if not self.file:
        return
    if size not in self.ALLOWED_SIZES:
        size = min(self.ALLOWED_SIZES, key=lambda x: abs(x - size))
    cache_key = self.get_cache_key(size)
    photo = cache.get(cache_key)
    if photo is None:
        photo_file = self.file.getfile()
        with Image.open(photo_file) as image:
            image = image.resize((size, size))
            image_file = StringIO()
            image.save(image_file, 'PNG')
            photo_file = image_file.getvalue()
            cache.set(cache_key, photo_file)
            photo = cache.get(cache_key)
    return photo
def all_keys(self, project):
    # TODO: cache invalidation via post_save/post_delete signals much like BaseManager
    key = self._get_cache_key(project.id)
    result = cache.get(key)
    if result is None:
        result = list(self.filter(project=project).values_list("key", flat=True))
        cache.set(key, result, 60)
    return result
def _get_service_hooks(project_id):
    from sentry.models import ServiceHook

    cache_key = 'servicehooks:1:{}'.format(project_id)
    result = cache.get(cache_key)

    if result is None:
        result = [(h.id, h.events) for h in ServiceHook.objects.filter(project_id=project_id)]
        cache.set(cache_key, result, 60)
    return result
def _update_cachefiles(self, project, dsym_files):
    rv = []

    # Find all the known bad files we could not convert last time
    # around
    conversion_errors = {}
    for dsym_file in dsym_files:
        cache_key = 'scbe:%s:%s' % (dsym_file.uuid, dsym_file.file.checksum)
        err = cache.get(cache_key)
        if err is not None:
            conversion_errors[dsym_file.uuid] = err

    for dsym_file in dsym_files:
        dsym_uuid = dsym_file.uuid
        if dsym_uuid in conversion_errors:
            continue

        try:
            with dsym_file.file.getfile(as_tempfile=True) as tf:
                fo = FatObject.from_path(tf.name)
                o = fo.get_object(uuid=dsym_file.uuid)
                if o is None:
                    continue
                symcache = o.make_symcache()
        except SymbolicError as e:
            cache.set('scbe:%s:%s' % (dsym_uuid, dsym_file.file.checksum),
                      e.message, CONVERSION_ERROR_TTL)
            conversion_errors[dsym_uuid] = e.message
            logger.error('dsymfile.symcache-build-error',
                         exc_info=True, extra=dict(dsym_uuid=dsym_uuid))
            continue

        file = File.objects.create(
            name=dsym_file.uuid,
            type='project.symcache',
        )
        file.putfile(symcache.open_stream())
        try:
            with transaction.atomic():
                rv.append((dsym_uuid, ProjectSymCacheFile.objects.get_or_create(
                    project=project,
                    cache_file=file,
                    dsym_file=dsym_file,
                    defaults=dict(
                        checksum=dsym_file.file.checksum,
                        version=symcache.file_format_version,
                    )
                )[0]))
        except IntegrityError:
            file.delete()
            rv.append((dsym_uuid, ProjectSymCacheFile.objects.get(
                project=project,
                dsym_file=dsym_file,
            )))

    return rv, conversion_errors
def get_rules(project):
    from sentry.models import Rule

    cache_key = 'project:%d:rules' % (project.id,)
    rules_list = cache.get(cache_key)
    if rules_list is None:
        rules_list = list(Rule.objects.filter(project=project))
        cache.set(cache_key, rules_list, 60)
    return rules_list
def get_for_project(cls, project_id):
    cache_key = 'project:{}:rules'.format(project_id)

    rules_list = cache.get(cache_key)
    if rules_list is None:
        rules_list = list(cls.objects.filter(
            project=project_id,
            status=RuleStatus.ACTIVE,
        ))
        cache.set(cache_key, rules_list, 60)

    return rules_list
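# The 'project:{id}:rules' entries above rely only on their 60-second TTL, so
# rule edits can be served stale for up to a minute. Below is a minimal sketch
# of explicit invalidation using Django's post_save/post_delete signals, keyed
# the same way as get_for_project(); wiring it up like this is an assumption
# for illustration, not something the snippets themselves do.
from django.core.cache import cache
from django.db.models.signals import post_delete, post_save
from django.dispatch import receiver

from sentry.models import Rule  # model referenced by the snippets above


@receiver([post_save, post_delete], sender=Rule)
def clear_project_rules_cache(sender, instance, **kwargs):
    # Drop the cached rule list so the next lookup repopulates it from the DB.
    cache.delete('project:{}:rules'.format(instance.project_id))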
def wrapper(*args, **kwargs):
    def get_cache_key(*args, **kwargs):
        params = list(args) + kwargs.values()
        return md5("".join(map(str, params))).hexdigest()

    key = get_cache_key(func.__name__, *args, **kwargs)
    result = cache.get(key)
    if not result:
        result = func(*args, **kwargs)
        cache.set(key, result, timeout)
    return result
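# The wrapper above is the inner closure of a caching decorator: `func` and
# `timeout` come from an enclosing decorator factory that is not shown in the
# snippet. A minimal, self-contained sketch of that pattern follows; the
# factory name `cached_call` and the key scheme are illustrative assumptions,
# not part of the original code.
import hashlib
from functools import wraps

from django.core.cache import cache


def cached_call(timeout=60):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Key on the function name plus the stringified arguments,
            # mirroring the get_cache_key() helper above.
            params = [func.__name__] + list(args) + list(kwargs.values())
            key = hashlib.md5("".join(map(str, params)).encode("utf-8")).hexdigest()
            result = cache.get(key)
            if result is None:
                result = func(*args, **kwargs)
                cache.set(key, result, timeout)
            return result
        return wrapper
    return decorator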
def get_choices(self):
    key = 'filters:%s:%s' % (self.project.id, self.column)
    result = cache.get(key)
    if result is None:
        result = list(FilterValue.objects.filter(
            project=self.project,
            key=self.column,
        ).values_list('value', flat=True).order_by('value')[:self.max_choices])
        cache.set(key, result, 60)
    return SortedDict((l, l) for l in result)
def get_choices(self):
    key = 'filters:%s:%s' % (self.project.id, hashlib.md5(self.column.encode('utf8')).hexdigest())
    result = cache.get(key)
    if result is None:
        result = list(TagValue.objects.filter(
            project=self.project,
            key=self.column,
        ).values_list('value', flat=True).order_by('value')[:self.max_choices])
        cache.set(key, result, 60)
    return SortedDict((l, l) for l in result)
def get_from_cache(self, **kwargs):
    """
    Wrapper around QuerySet.get which supports caching of the
    intermediate value.  Callee is responsible for making sure
    the cache key is cleared on save.
    """
    if not self.cache_fields or len(kwargs) > 1:
        return self.get(**kwargs)

    key, value = next(six.iteritems(kwargs))
    pk_name = self.model._meta.pk.name
    if key == "pk":
        key = pk_name

    # We store everything by key references (vs instances)
    if isinstance(value, Model):
        value = value.pk

    # Kill __exact since it's the default behavior
    if key.endswith("__exact"):
        key = key.split("__exact", 1)[0]

    if key in self.cache_fields or key == pk_name:
        cache_key = self.__get_lookup_cache_key(**{key: value})

        retval = cache.get(cache_key, version=self.cache_version)
        if retval is None:
            result = self.get(**kwargs)
            # Ensure we're pushing it into the cache
            self.__post_save(instance=result)
            return result

        # If we didn't look up by pk we need to hit the reffed
        # key
        if key != pk_name:
            return self.get_from_cache(**{pk_name: retval})

        if type(retval) != self.model:
            if settings.DEBUG:
                raise ValueError("Unexpected value type returned from cache")
            logger.error("Cache response returned invalid value %r", retval)
            return self.get(**kwargs)

        if key == pk_name and int(value) != retval.pk:
            if settings.DEBUG:
                raise ValueError("Unexpected value returned from cache")
            logger.error("Cache response returned invalid value %r", retval)
            return self.get(**kwargs)

        retval._state.db = router.db_for_read(self.model, **kwargs)

        return retval
    else:
        return self.get(**kwargs)
def get_or_create(cls, project, name):
    name = name or ""

    cache_key = cls.get_cache_key(project.id, name)

    env = cache.get(cache_key)
    if env is None:
        env = cls.objects.get_or_create(project_id=project.id, name=name)[0]
        cache.set(cache_key, env, 3600)
    return env
def add_project(self, project):
    cache_key = 'envproj:c:%s:%s' % (self.id, project.id)

    if cache.get(cache_key) is None:
        try:
            with transaction.atomic():
                EnvironmentProject.objects.create(project=project, environment=self)
            cache.set(cache_key, 1, 3600)
        except IntegrityError:
            # We've already created the object, should still cache the action.
            cache.set(cache_key, 1, 3600)
def get_cached(self, full_url):
    """
    Basic caching mechanism for requests and responses. It only caches
    responses based on URL.
    TODO: Implement GET attr in cache as well. (see self.create_meta for example)
    """
    key = 'sentry-jira-2:' + md5(full_url, self.base_url).hexdigest()
    cached_result = cache.get(key)
    if not cached_result:
        cached_result = self.get(full_url)
        cache.set(key, cached_result, 60)
    return cached_result
def get_cached(self, full_url):
    """
    Basic caching mechanism for requests and responses. It only caches
    responses based on URL.
    TODO: Implement GET attr in cache as well. (see self.create_meta for example)
    """
    key = CACHE_KEY % (full_url, self.instance_url)
    cached_result = cache.get(key)
    if not cached_result:
        cached_result = self.make_request('get', full_url)
        cache.set(key, cached_result, 60)
    return cached_result
def all_keys(self, project):
    # TODO: cache invalidation via post_save/post_delete signals much like BaseManager
    key = self._get_cache_key(project.id)
    result = cache.get(key)
    if result is None:
        result = list(
            self.filter(project=project, status=TagKeyStatus.VISIBLE)
            .order_by("-values_seen")
            .values_list("key", flat=True)[:20]
        )
        cache.set(key, result, 60)
    return result
def get_or_create(cls, project, release, environment, datetime, **kwargs):
    cache_key = cls.get_cache_key(project.id, release.id, environment.id)

    instance = cache.get(cache_key)
    if instance is None:
        release_envs = list(cls.objects.filter(
            release_id=release.id,
            organization_id=project.organization_id,
            environment_id=environment.id,
        ))
        if release_envs:
            instance = release_envs[0]
            for re in release_envs:
                if re.project_id == project.id:
                    instance = re
            created = False
        else:
            lock_key = cls.get_lock_key(project.organization_id, release.id, environment.id)
            lock = locks.get(lock_key, duration=5)
            with TimedRetryPolicy(10)(lock.acquire):
                try:
                    instance, created = cls.objects.get(
                        release_id=release.id,
                        organization_id=project.organization_id,
                        environment_id=environment.id,
                    ), False
                except cls.DoesNotExist:
                    instance, created = cls.objects.create(
                        release_id=release.id,
                        project_id=project.id,
                        organization_id=project.organization_id,
                        environment_id=environment.id,
                        first_seen=datetime,
                        last_seen=datetime,
                    ), True
        cache.set(cache_key, instance, 3600)
    else:
        created = False

    # TODO(dcramer): this would be good to buffer, but until then we minimize
    # updates to once a minute, and allow Postgres to optimistically skip
    # it even if we can't
    if not created and instance.last_seen < datetime - timedelta(seconds=60):
        cls.objects.filter(
            id=instance.id,
            last_seen__lt=datetime - timedelta(seconds=60),
        ).update(
            last_seen=datetime,
        )
        instance.last_seen = datetime
        cache.set(cache_key, instance, 3600)
    return instance
def fetch_release_file(filename, release, dist=None):
    cache_key = "releasefile:v1:%s:%s" % (release.id, md5_text(filename).hexdigest())

    logger.debug("Checking cache for release artifact %r (release_id=%s)", filename, release.id)
    result = cache.get(cache_key)

    dist_name = dist and dist.name or None

    if result is None:
        filename_choices = ReleaseFile.normalize(filename)
        filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices]

        logger.debug(
            "Checking database for release artifact %r (release_id=%s)", filename, release.id
        )

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release, dist=dist, ident__in=filename_idents
            ).select_related("file")
        )

        if len(possible_files) == 0:
            logger.debug(
                "Release artifact %r not found in database (release_id=%s)", filename, release.id
            )
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Pick first one that matches in priority order.
            # This is O(N*M) but there are only ever at most 4 things here
            # so not really worth optimizing.
            releasefile = next(
                (rf for ident in filename_idents for rf in possible_files if rf.ident == ident)
            )

        logger.debug(
            "Found release artifact %r (id=%s, release_id=%s)", filename, releasefile.id, release.id
        )
        try:
            with metrics.timer("sourcemaps.release_file_read"):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception:
            logger.error("sourcemap.compress_read_failed", exc_info=sys.exc_info())
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = http.UrlResult(filename, headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None

        result = http.UrlResult(
            filename, result[0], zlib.decompress(result[1]), result[2], encoding
        )

    return result
def fetch_release_file(filename, release):
    cache_key = 'releasefile:v1:%s:%s' % (
        release.id,
        md5_text(filename).hexdigest(),
    )

    filename_path = None
    if filename is not None:
        # Reconstruct url without protocol + host
        # e.g. http://example.com/foo?bar => ~/foo?bar
        parsed_url = urlparse(filename)
        filename_path = '~' + parsed_url.path
        if parsed_url.query:
            filename_path += '?' + parsed_url.query

    logger.debug('Checking cache for release artifact %r (release_id=%s)', filename, release.id)
    result = cache.get(cache_key)

    if result is None:
        logger.debug('Checking database for release artifact %r (release_id=%s)', filename, release.id)

        filename_idents = [ReleaseFile.get_ident(filename)]
        if filename_path is not None and filename_path != filename:
            filename_idents.append(ReleaseFile.get_ident(filename_path))

        possible_files = list(ReleaseFile.objects.filter(
            release=release,
            ident__in=filename_idents,
        ).select_related('file'))

        if len(possible_files) == 0:
            logger.debug('Release artifact %r not found in database (release_id=%s)', filename, release.id)
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Prioritize releasefile that matches full url (w/ host)
            # over hostless releasefile
            target_ident = filename_idents[0]
            releasefile = next((f for f in possible_files if f.ident == target_ident))

        logger.debug('Found release artifact %r (id=%s, release_id=%s)', filename, releasefile.id, release.id)
        try:
            with metrics.timer('sourcemaps.release_file_read'):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception as e:
            logger.exception(six.text_type(e))
            cache.set(cache_key, -1, 3600)
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = (headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None

        result = (result[0], zlib.decompress(result[1]), result[2], encoding)

    return result
def fetch_file(url, project=None, release=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    if release:
        with metrics.timer('sourcemaps.release_file'):
            result = fetch_release_file(url, release)
    else:
        result = None

    cache_key = 'source:cache:v3:%s' % (md5_text(url).hexdigest(), )

    if result is None:
        if not allow_scraping or not url.startswith(('http:', 'https:')):
            error = {
                'type': EventError.JS_MISSING_SOURCE,
                'url': expose_url(url),
            }
            raise CannotFetchSource(error)

        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 3-tuple instead of a 4-tuple,
            # so this is being maintained for backwards compatibility
            try:
                encoding = result[3]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = (result[0], zlib.decompress(result[1]), result[2], encoding)

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (md5_text(domain).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetchSource(domain_result)

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers['X-Sentry-Token'] = token

        logger.debug('Fetching %r from the internet', url)

        with metrics.timer('sourcemaps.fetch'):
            http_session = http.build_session()
            response = None
            try:
                try:
                    start = time.time()
                    response = http_session.get(
                        url,
                        allow_redirects=True,
                        verify=False,
                        headers=headers,
                        timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT,
                        stream=True,
                    )

                    try:
                        cl = int(response.headers['content-length'])
                    except (LookupError, ValueError):
                        cl = 0
                    if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                        raise OverflowError()

                    contents = []
                    cl = 0

                    # Only need to even attempt to read the response body if we
                    # got a 200 OK
                    if response.status_code == 200:
                        for chunk in response.iter_content(16 * 1024):
                            if time.time() - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT:
                                raise Timeout()
                            contents.append(chunk)
                            cl += len(chunk)
                            if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                                raise OverflowError()

                except Exception as exc:
                    logger.debug('Unable to fetch %r', url, exc_info=True)
                    if isinstance(exc, RestrictedIPAddress):
                        error = {
                            'type': EventError.RESTRICTED_IP,
                            'url': expose_url(url),
                        }
                    elif isinstance(exc, SuspiciousOperation):
                        error = {
                            'type': EventError.SECURITY_VIOLATION,
                            'url': expose_url(url),
                        }
                    elif isinstance(exc, Timeout):
                        error = {
                            'type': EventError.JS_FETCH_TIMEOUT,
                            'url': expose_url(url),
                            'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT,
                        }
                    elif isinstance(exc, OverflowError):
                        error = {
                            'type': EventError.JS_TOO_LARGE,
                            'url': expose_url(url),
                            # We want size in megabytes to format nicely
                            'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024,
                        }
                    elif isinstance(exc, (RequestException, ZeroReturnError)):
                        error = {
                            'type': EventError.JS_GENERIC_FETCH_ERROR,
                            'value': six.text_type(type(exc)),
                            'url': expose_url(url),
                        }
                    else:
                        logger.exception(six.text_type(exc))
                        error = {
                            'type': EventError.UNKNOWN_ERROR,
                            'url': expose_url(url),
                        }

                    # TODO(dcramer): we want to be less aggressive on disabling domains
                    cache.set(domain_key, error or '', 300)
                    logger.warning('Disabling sources to %s for %ss', domain, 300, exc_info=True)
                    raise CannotFetchSource(error)

                body = b''.join(contents)
                z_body = zlib.compress(body)
                headers = {k.lower(): v for k, v in response.headers.items()}
                encoding = response.encoding

                cache.set(cache_key, (headers, z_body, response.status_code, encoding), 60)
                result = (headers, body, response.status_code, encoding)
            finally:
                if response is not None:
                    response.close()

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)

        error = {
            'type': EventError.JS_INVALID_HTTP_CODE,
            'value': result[2],
            'url': expose_url(url),
        }
        raise CannotFetchSource(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but this should catch 99% of cases
    if url.endswith('.js'):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        body_start = result[1][:20].lstrip()  # Discard leading whitespace (often found before doctype)

        if body_start[:1] == u'<':
            error = {
                'type': EventError.JS_INVALID_CONTENT,
                'url': url,
            }
            raise CannotFetchSource(error)

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result[1], six.binary_type):
        try:
            result = (result[0], result[1].encode('utf8'), result[2], 'utf8')
        except UnicodeEncodeError:
            error = {
                'type': EventError.JS_INVALID_SOURCE_ENCODING,
                'value': 'utf8',
                'url': expose_url(url),
            }
            raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1], result[3])
def expand_frames(self, frames):
    last_state = None
    state = None

    cache = self.cache
    sourcemaps = self.sourcemaps
    all_errors = []

    for frame in frames:
        errors = cache.get_errors(frame.abs_path)
        if errors:
            all_errors.extend(errors)

        source = cache.get(frame.abs_path)
        if source is None:
            logger.info('No source found for %s', frame.abs_path)
            continue

        sourcemap_url, sourcemap_idx = sourcemaps.get_link(frame.abs_path)
        if sourcemap_idx and frame.colno is not None:
            last_state = state
            state = find_source(sourcemap_idx, frame.lineno, frame.colno)

            if is_data_uri(sourcemap_url):
                sourcemap_label = frame.abs_path
            else:
                sourcemap_label = sourcemap_url

            abs_path = urljoin(sourcemap_url, state.src)

            logger.debug('Mapping compressed source %r to mapping in %r', frame.abs_path, abs_path)
            source = cache.get(abs_path)
            if not source:
                frame.data = {
                    'sourcemap': sourcemap_label,
                }
                errors = cache.get_errors(abs_path)
                if errors:
                    all_errors.extend(errors)
                else:
                    all_errors.append({
                        'type': EventError.JS_MISSING_SOURCE,
                        'url': force_bytes(abs_path, errors='replace'),
                    })

            # Store original data in annotation
            frame.data = {
                'orig_lineno': frame.lineno,
                'orig_colno': frame.colno,
                'orig_function': frame.function,
                'orig_abs_path': frame.abs_path,
                'orig_filename': frame.filename,
                'sourcemap': sourcemap_label,
            }

            # SourceMap's return zero-indexed lineno's
            frame.lineno = state.src_line + 1
            frame.colno = state.src_col
            # The offending function is always the previous function in the stack
            # Honestly, no idea what the bottom most frame is, so we're ignoring that atm
            if last_state:
                frame.function = last_state.name or frame.function
            else:
                frame.function = state.name or frame.function

            filename = state.src
            # special case webpack support
            if filename.startswith('webpack://'):
                abs_path = filename
                # webpack seems to use ~ to imply "relative to resolver root"
                # which is generally seen for third party deps
                # (i.e. node_modules)
                if '/~/' in filename:
                    filename = '~/' + abs_path.split('/~/', 1)[-1]
                else:
                    filename = filename.split('webpack:///', 1)[-1]
                if filename.startswith('~/') and frame.in_app is None:
                    frame.in_app = False

            frame.abs_path = abs_path
            frame.filename = filename
            if abs_path.startswith(('http:', 'https:')):
                frame.module = generate_module(abs_path)

        elif sourcemap_url:
            frame.data = {
                'sourcemap': sourcemap_url,
            }

        # TODO: theoretically a minified source could point to another mapped, minified source
        frame.pre_context, frame.context_line, frame.post_context = get_source_context(
            source=source, lineno=frame.lineno, colno=frame.colno or 0)
    return all_errors
def get_send_to(self, project, event=None):
    """
    Returns a list of user IDs for the users that should receive
    notifications for the provided project.

    This result may come from cached data.
    """
    if not (project and project.teams.exists()):
        logger.debug('Tried to send notification to invalid project: %r', project)
        return []

    if event:
        owners, _ = ProjectOwnership.get_owners(project.id, event.data)
        if owners != ProjectOwnership.Everyone:
            if not owners:
                metrics.incr(
                    'features.owners.send_to',
                    tags={
                        'organization': project.organization_id,
                        'outcome': 'empty',
                    },
                    skip_internal=True,
                )
                return []

            metrics.incr(
                'features.owners.send_to',
                tags={
                    'organization': project.organization_id,
                    'outcome': 'match',
                },
                skip_internal=True,
            )
            send_to_list = []
            teams_to_resolve = []
            for owner in owners:
                if owner.type == User:
                    send_to_list.append(owner.id)
                else:
                    teams_to_resolve.append(owner.id)

            # get all users in teams
            if teams_to_resolve:
                send_to_list += User.objects.filter(
                    is_active=True,
                    sentry_orgmember_set__organizationmemberteam__team__id__in=teams_to_resolve,
                ).values_list('id', flat=True)
            return send_to_list
        else:
            metrics.incr(
                'features.owners.send_to',
                tags={
                    'organization': project.organization_id,
                    'outcome': 'everyone',
                },
                skip_internal=True,
            )

    cache_key = '%s:send_to:%s' % (self.get_conf_key(), project.pk)
    send_to_list = cache.get(cache_key)
    if send_to_list is None:
        send_to_list = [s for s in self.get_sendable_users(project) if s]
        cache.set(cache_key, send_to_list, 60)  # 1 minute cache

    return send_to_list
def fetch_file(url, domain_lock_enabled=True, outfile=None,
               headers=None, allow_redirects=True, verify_ssl=False,
               timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT,
               **kwargs):
    """
    Pull down a URL, returning a UrlResult object.
    """
    # lock down domains that are problematic
    if domain_lock_enabled:
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (md5_text(domain).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetch(domain_result)

    logger.debug('Fetching %r from the internet', url)

    http_session = build_session()
    response = None

    try:
        try:
            start = time.time()
            response = http_session.get(
                url,
                allow_redirects=allow_redirects,
                verify=verify_ssl,
                headers=headers,
                timeout=timeout,
                stream=True,
                **kwargs
            )

            try:
                cl = int(response.headers['content-length'])
            except (LookupError, ValueError):
                cl = 0
            if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                raise OverflowError()

            return_body = False
            if outfile is None:
                outfile = six.BytesIO()
                return_body = True

            cl = 0

            # Only need to even attempt to read the response body if we
            # got a 200 OK
            if response.status_code == 200:
                for chunk in response.iter_content(16 * 1024):
                    if time.time() - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT:
                        raise Timeout()
                    outfile.write(chunk)
                    cl += len(chunk)
                    if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                        raise OverflowError()

        except Exception as exc:
            logger.debug('Unable to fetch %r', url, exc_info=True)
            if isinstance(exc, RestrictedIPAddress):
                error = {
                    'type': EventError.RESTRICTED_IP,
                    'url': expose_url(url),
                }
            elif isinstance(exc, SuspiciousOperation):
                error = {
                    'type': EventError.SECURITY_VIOLATION,
                    'url': expose_url(url),
                }
            elif isinstance(exc, (Timeout, ReadTimeout)):
                error = {
                    'type': EventError.FETCH_TIMEOUT,
                    'url': expose_url(url),
                    'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT,
                }
            elif isinstance(exc, OverflowError):
                error = {
                    'type': EventError.FETCH_TOO_LARGE,
                    'url': expose_url(url),
                    # We want size in megabytes to format nicely
                    'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024,
                }
            elif isinstance(exc, (RequestException, ZeroReturnError)):
                error = {
                    'type': EventError.FETCH_GENERIC_ERROR,
                    'value': six.text_type(type(exc)),
                    'url': expose_url(url),
                }
            else:
                logger.exception(six.text_type(exc))
                error = {
                    'type': EventError.UNKNOWN_ERROR,
                    'url': expose_url(url),
                }

            # TODO(dcramer): we want to be less aggressive on disabling domains
            if domain_lock_enabled:
                cache.set(domain_key, error or '', 300)
                logger.warning('source.disabled', extra=error)
            raise CannotFetch(error)

        headers = {k.lower(): v for k, v in response.headers.items()}
        encoding = response.encoding

        body = None
        if return_body:
            body = outfile.getvalue()
            outfile.close()  # we only want to close StringIO

        result = (headers, body, response.status_code, encoding)
    finally:
        if response is not None:
            response.close()

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)

        error = {
            'type': EventError.FETCH_INVALID_HTTP_CODE,
            'value': result[2],
            'url': expose_url(url),
        }
        raise CannotFetch(error)

    return UrlResult(url, result[0], result[1], result[2], result[3])
def fetch_release_artifact(url, release, dist):
    """
    Get a release artifact either by extracting it or fetching it directly.

    If a release archive was saved, the individual file will be extracted
    from the archive.
    """
    cache_key, cache_key_meta = get_cache_keys(url, release, dist)

    result = cache.get(cache_key)

    if result == -1:  # Cached as unavailable
        return None

    if result:
        return result_from_cache(url, result)

    start = time.monotonic()
    release_file = fetch_release_archive(release, dist)
    if release_file is not None:
        try:
            archive = ReleaseArchive(release_file)
        except BaseException as exc:
            logger.error("Failed to initialize archive for release %s", release.id, exc_info=exc)
            # TODO(jjbayer): cache error and return here
        else:
            with archive:
                try:
                    fp, headers = get_from_archive(url, archive)
                except KeyError:
                    logger.debug(
                        "Release artifact %r not found in archive (release_id=%s)", url, release.id
                    )
                    cache.set(cache_key, -1, 60)
                    metrics.timing(
                        "sourcemaps.release_artifact_from_archive", time.monotonic() - start
                    )
                    return None
                except BaseException as exc:
                    logger.error("Failed to read %s from release %s", url, release.id, exc_info=exc)
                    # TODO(jjbayer): cache error and return here
                else:
                    result = fetch_and_cache_artifact(
                        url,
                        lambda: fp,
                        cache_key,
                        cache_key_meta,
                        headers,
                        # Cannot use `compress_file` because `ZipExtFile` does not support chunks
                        compress_fn=compress,
                    )
                    metrics.timing(
                        "sourcemaps.release_artifact_from_archive", time.monotonic() - start
                    )
                    return result

    # Fall back to maintain compatibility with old releases and versions of
    # sentry-cli which upload files individually
    result = fetch_release_file(url, release, dist)

    return result
def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the database first (assuming there's a release on the
    event), then the internet. Caches the result of each of those two attempts
    separately, whether or not those attempts are successful. Used for both
    source files and source maps.
    """
    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == "...":
        raise http.CannotFetch({"type": EventError.JS_MISSING_SOURCE, "url": http.expose_url(url)})

    # if we've got a release to look on, try that first (incl associated cache)
    if release:
        with metrics.timer("sourcemaps.release_file"):
            result = fetch_release_file(url, release, dist)
    else:
        result = None

    # otherwise, try the web-scraping cache and then the web itself
    cache_key = "source:cache:v4:%s" % (md5_text(url).hexdigest(),)

    if result is None:
        if not allow_scraping or not url.startswith(("http:", "https:")):
            error = {"type": EventError.JS_MISSING_SOURCE, "url": http.expose_url(url)}
            raise http.CannotFetch(error)

        logger.debug("Checking cache for url %r", url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 3-tuple instead of a 4-tuple,
            # so this is being maintained for backwards compatibility
            try:
                encoding = result[4]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = http.UrlResult(
                result[0], result[1], zlib.decompress(result[2]), result[3], encoding
            )

    if result is None:
        headers = {}
        verify_ssl = False
        if project and is_valid_origin(url, project=project):
            verify_ssl = bool(project.get_option("sentry:verify_ssl", False))
            token = project.get_option("sentry:token")
            if token:
                token_header = project.get_option("sentry:token_header") or "X-Sentry-Token"
                headers[token_header] = token

        with metrics.timer("sourcemaps.fetch"):
            result = http.fetch_file(url, headers=headers, verify_ssl=verify_ssl)
            z_body = zlib.compress(result.body)
            cache.set(
                cache_key,
                (url, result.headers, z_body, result.status, result.encoding),
                get_max_age(result.headers),
            )

    # If we did not get a 200 OK we just raise a cannot fetch here.
    if result.status != 200:
        raise http.CannotFetch({
            "type": EventError.FETCH_INVALID_HTTP_CODE,
            "value": result.status,
            "url": http.expose_url(url),
        })

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result.body, six.binary_type):
        try:
            result = http.UrlResult(
                result.url,
                result.headers,
                result.body.encode("utf8"),
                result.status,
                result.encoding,
            )
        except UnicodeEncodeError:
            error = {
                "type": EventError.FETCH_INVALID_ENCODING,
                "value": "utf8",
                "url": http.expose_url(url),
            }
            raise http.CannotFetch(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but
    # this should catch 99% of cases
    if urlsplit(url).path.endswith(".js"):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result.body[:20].lstrip()

        if body_start[:1] == u"<":
            error = {"type": EventError.JS_INVALID_CONTENT, "url": url}
            raise http.CannotFetch(error)

    return result
def get_send_to(self, project, event=None):
    """
    Returns a list of user IDs for the users that should receive
    notifications for the provided project.

    This result may come from cached data.
    """
    if not (project and project.teams.exists()):
        logger.debug("Tried to send notification to invalid project: %r", project)
        return []

    if event:
        owners, _ = ProjectOwnership.get_owners(project.id, event.data)
        if owners != ProjectOwnership.Everyone:
            if not owners:
                metrics.incr(
                    "features.owners.send_to",
                    tags={"organization": project.organization_id, "outcome": "empty"},
                    skip_internal=True,
                )
                return []

            metrics.incr(
                "features.owners.send_to",
                tags={"organization": project.organization_id, "outcome": "match"},
                skip_internal=True,
            )
            send_to_list = set()
            teams_to_resolve = set()
            for owner in owners:
                if owner.type == User:
                    send_to_list.add(owner.id)
                else:
                    teams_to_resolve.add(owner.id)

            # get all users in teams
            if teams_to_resolve:
                send_to_list |= set(
                    User.objects.filter(
                        is_active=True,
                        sentry_orgmember_set__organizationmemberteam__team__id__in=teams_to_resolve,
                    ).values_list("id", flat=True)
                )

            alert_settings = project.get_member_alert_settings(self.alert_option_key)
            disabled_users = set(
                user for user, setting in alert_settings.items() if setting == 0
            )
            return send_to_list - disabled_users
        else:
            metrics.incr(
                "features.owners.send_to",
                tags={"organization": project.organization_id, "outcome": "everyone"},
                skip_internal=True,
            )

    cache_key = "%s:send_to:%s" % (self.get_conf_key(), project.pk)
    send_to_list = cache.get(cache_key)
    if send_to_list is None:
        send_to_list = [s for s in self.get_sendable_users(project) if s]
        cache.set(cache_key, send_to_list, 60)  # 1 minute cache

    return send_to_list
def lookup_frame_cache(keys):
    rv = {}
    for key in keys:
        rv[key] = cache.get(key)
    return rv
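# lookup_frame_cache above issues one cache round trip per key. Django's
# low-level cache API also provides cache.get_many, which fetches several keys
# in a single call; the helper below is a hypothetical sketch of an equivalent
# batched lookup, not code from the original module.
from django.core.cache import cache


def lookup_frame_cache_batched(keys):
    # get_many returns only the keys that were found, so fill in misses with
    # None to keep the same shape as the per-key loop above.
    found = cache.get_many(keys)
    return {key: found.get(key) for key in keys}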
def _get_or_create_impl(cls, project, version, date_added, metric_tags):
    from sentry.models import Project

    if date_added is None:
        date_added = timezone.now()

    cache_key = cls.get_cache_key(project.organization_id, version)

    release = cache.get(cache_key)

    if release in (None, -1):
        # TODO(dcramer): if the cache result is -1 we could attempt a
        # default create here instead of default get
        project_version = ("%s-%s" % (project.slug, version))[:DB_VERSION_LENGTH]
        releases = list(
            cls.objects.filter(
                organization_id=project.organization_id,
                version__in=[version, project_version],
                projects=project,
            )
        )

        if releases:
            try:
                release = [r for r in releases if r.version == project_version][0]
            except IndexError:
                release = releases[0]
            metric_tags["created"] = "false"
        else:
            try:
                with transaction.atomic():
                    release = cls.objects.create(
                        organization_id=project.organization_id,
                        version=version,
                        date_added=date_added,
                        total_deploys=0,
                    )

                metric_tags["created"] = "true"
            except IntegrityError:
                metric_tags["created"] = "false"
                release = cls.objects.get(
                    organization_id=project.organization_id, version=version
                )

        release.add_project(project)
        if not project.flags.has_releases:
            project.flags.has_releases = True
            project.update(flags=F("flags").bitor(Project.flags.has_releases))

        # TODO(dcramer): upon creating a new release, check if it should be
        # the new "latest release" for this project
        cache.set(cache_key, release, 3600)
        metric_tags["cache_hit"] = "false"
    else:
        metric_tags["cache_hit"] = "true"

    return release
def fetch_url(url, project=None, release=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    cache_key = 'source:cache:v2:%s' % (md5(url).hexdigest(), )

    if release:
        result = fetch_release_file(url, release)
    else:
        result = None

    if result is None:
        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (md5(domain).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetchSource(domain_result)

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers['X-Sentry-Token'] = token

        logger.debug('Fetching %r from the internet', url)

        http_session = http.build_session()
        try:
            response = http_session.get(
                url,
                allow_redirects=True,
                verify=False,
                headers=headers,
                timeout=settings.SENTRY_SOURCE_FETCH_TIMEOUT,
            )
        except Exception as exc:
            logger.debug('Unable to fetch %r', url, exc_info=True)
            if isinstance(exc, SuspiciousOperation):
                error = {
                    'type': EventError.SECURITY_VIOLATION,
                    'value': unicode(exc),
                    'url': url,
                }
            elif isinstance(exc, (RequestException, ZeroReturnError)):
                error = {
                    'type': EventError.JS_GENERIC_FETCH_ERROR,
                    'value': str(type(exc)),
                    'url': url,
                }
            else:
                logger.exception(unicode(exc))
                error = {
                    'type': EventError.UNKNOWN_ERROR,
                    'url': url,
                }

            # TODO(dcramer): we want to be less aggressive on disabling domains
            cache.set(domain_key, error or '', 300)
            logger.warning('Disabling sources to %s for %ss', domain, 300, exc_info=True)

            raise CannotFetchSource(error)

        # requests' attempts to use chardet internally when no encoding is found
        # and we want to avoid that slow behavior
        if not response.encoding:
            response.encoding = 'utf-8'

        result = (
            {k.lower(): v for k, v in response.headers.items()},
            response.text,
            response.status_code,
        )
        cache.set(cache_key, result, 60)

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)
        error = {
            'type': EventError.JS_INVALID_HTTP_CODE,
            'value': result[2],
            'url': url,
        }
        raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1])
def post_process_group(
    is_new, is_regression, is_new_group_environment, cache_key, group_id=None, **kwargs
):
    """
    Fires post processing hooks for a group.
    """
    from sentry.eventstore.models import Event
    from sentry.eventstore.processing import event_processing_store
    from sentry.reprocessing2 import is_reprocessed_event
    from sentry.utils import snuba

    with snuba.options_override({"consistent": True}):
        # We use the data being present/missing in the processing store
        # to ensure that we don't duplicate work should the forwarding consumers
        # need to rewind history.
        data = event_processing_store.get(cache_key)
        if not data:
            logger.info(
                "post_process.skipped",
                extra={"cache_key": cache_key, "reason": "missing_cache"},
            )
            return
        event = Event(
            project_id=data["project"], event_id=data["event_id"], group_id=group_id, data=data
        )

        set_current_event_project(event.project_id)

        is_reprocessed = is_reprocessed_event(event.data)

        # NOTE: we must pass through the full Event object, and not an
        # event_id since the Event object may not actually have been stored
        # in the database due to sampling.
        from sentry.models import Commit, EventDict, GroupInboxReason, Organization, Project
        from sentry.models.group import get_group_with_redirect
        from sentry.models.groupinbox import add_group_to_inbox
        from sentry.rules.processor import RuleProcessor
        from sentry.tasks.groupowner import process_suspect_commits
        from sentry.tasks.servicehooks import process_service_hook

        # Re-bind node data to avoid renormalization. We only want to
        # renormalize when loading old data from the database.
        event.data = EventDict(event.data, skip_renormalization=True)

        # Re-bind Project and Org since we're reading the Event object
        # from cache which may contain stale parent models.
        event.project = Project.objects.get_from_cache(id=event.project_id)
        event.project._organization_cache = Organization.objects.get_from_cache(
            id=event.project.organization_id
        )

        if event.group_id:
            # Re-bind Group since we're reading the Event object
            # from cache, which may contain a stale group and project
            event.group, _ = get_group_with_redirect(event.group_id)
            event.group_id = event.group.id

            event.group.project = event.project
            event.group.project._organization_cache = event.project._organization_cache

        bind_organization_context(event.project.organization)

        _capture_stats(event, is_new)

        if event.group_id and is_reprocessed and is_new:
            add_group_to_inbox(event.group, GroupInboxReason.REPROCESSED)

        if event.group_id and not is_reprocessed:
            # we process snoozes before rules as it might create a regression
            # but not if it's new because you can't immediately snooze a new group
            has_reappeared = False if is_new else process_snoozes(event.group)
            if not has_reappeared:  # If true, we added the .UNIGNORED reason already
                if is_new:
                    add_group_to_inbox(event.group, GroupInboxReason.NEW)
                elif is_regression:
                    add_group_to_inbox(event.group, GroupInboxReason.REGRESSION)

            handle_owner_assignment(event.project, event.group, event)

            rp = RuleProcessor(
                event, is_new, is_regression, is_new_group_environment, has_reappeared
            )
            has_alert = False
            # TODO(dcramer): ideally this would fanout, but serializing giant
            # objects back and forth isn't super efficient
            for callback, futures in rp.apply():
                has_alert = True
                safe_execute(callback, event, futures, _with_transaction=False)

            try:
                lock = locks.get(
                    f"w-o:{event.group_id}-d-l",
                    duration=10,
                )
                with lock.acquire():
                    has_commit_key = f"w-o:{event.project.organization_id}-h-c"
                    org_has_commit = cache.get(has_commit_key)
                    if org_has_commit is None:
                        org_has_commit = Commit.objects.filter(
                            organization_id=event.project.organization_id
                        ).exists()
                        cache.set(has_commit_key, org_has_commit, 3600)

                    if org_has_commit:
                        group_cache_key = f"w-o-i:g-{event.group_id}"
                        if cache.get(group_cache_key):
                            metrics.incr(
                                "sentry.tasks.process_suspect_commits.debounce",
                                tags={"detail": "w-o-i:g debounce"},
                            )
                        else:
                            from sentry.utils.committers import get_frame_paths

                            cache.set(group_cache_key, True, 604800)  # 1 week in seconds
                            event_frames = get_frame_paths(event.data)
                            process_suspect_commits.delay(
                                event_id=event.event_id,
                                event_platform=event.platform,
                                event_frames=event_frames,
                                group_id=event.group_id,
                                project_id=event.project_id,
                            )
            except UnableToAcquireLock:
                pass
            except Exception:
                logger.exception("Failed to process suspect commits")

            if features.has("projects:servicehooks", project=event.project):
                allowed_events = {"event.created"}
                if has_alert:
                    allowed_events.add("event.alert")

                if allowed_events:
                    for servicehook_id, events in _get_service_hooks(project_id=event.project_id):
                        if any(e in allowed_events for e in events):
                            process_service_hook.delay(servicehook_id=servicehook_id, event=event)

            from sentry.tasks.sentry_apps import process_resource_change_bound

            if event.get_event_type() == "error" and _should_send_error_created_hooks(
                event.project
            ):
                process_resource_change_bound.delay(
                    action="created", sender="Error", instance_id=event.event_id, instance=event
                )
            if is_new:
                process_resource_change_bound.delay(
                    action="created", sender="Group", instance_id=event.group_id
                )

            from sentry.plugins.base import plugins

            for plugin in plugins.for_project(event.project):
                plugin_post_process_group(
                    plugin_slug=plugin.slug, event=event, is_new=is_new, is_regresion=is_regression
                )

            from sentry import similarity

            safe_execute(similarity.record, event.project, [event], _with_transaction=False)

        if event.group_id:
            # Patch attachments that were ingested on the standalone path.
            update_existing_attachments(event)

        if not is_reprocessed:
            event_processed.send_robust(
                sender=post_process_group,
                project=event.project,
                event=event,
                primary_hash=kwargs.get("primary_hash"),
            )

        with metrics.timer("tasks.post_process.delete_event_cache"):
            event_processing_store.delete_by_key(cache_key)
def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == '...':
        raise http.CannotFetch({
            'type': EventError.JS_MISSING_SOURCE,
            'url': http.expose_url(url),
        })

    if release:
        with metrics.timer('sourcemaps.release_file'):
            result = fetch_release_file(url, release, dist)
    else:
        result = None

    cache_key = 'source:cache:v4:%s' % (md5_text(url).hexdigest(), )

    if result is None:
        if not allow_scraping or not url.startswith(('http:', 'https:')):
            error = {
                'type': EventError.JS_MISSING_SOURCE,
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 3-tuple instead of a 4-tuple,
            # so this is being maintained for backwards compatibility
            try:
                encoding = result[4]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = http.UrlResult(
                result[0], result[1], zlib.decompress(result[2]), result[3], encoding
            )

    if result is None:
        headers = {}
        verify_ssl = False
        if project and is_valid_origin(url, project=project):
            verify_ssl = bool(project.get_option('sentry:verify_ssl', False))
            token = project.get_option('sentry:token')
            if token:
                token_header = project.get_option('sentry:token_header') or 'X-Sentry-Token'
                headers[token_header] = token

        with metrics.timer('sourcemaps.fetch'):
            result = http.fetch_file(url, headers=headers, verify_ssl=verify_ssl)
            z_body = zlib.compress(result.body)
            cache.set(
                cache_key,
                (url, result.headers, z_body, result.status, result.encoding),
                get_max_age(result.headers))

    # If we did not get a 200 OK we just raise a cannot fetch here.
    if result.status != 200:
        raise http.CannotFetch({
            'type': EventError.FETCH_INVALID_HTTP_CODE,
            'value': result.status,
            'url': http.expose_url(url),
        })

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result.body, six.binary_type):
        try:
            result = http.UrlResult(
                result.url, result.headers, result.body.encode('utf8'),
                result.status, result.encoding
            )
        except UnicodeEncodeError:
            error = {
                'type': EventError.FETCH_INVALID_ENCODING,
                'value': 'utf8',
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but
    # this should catch 99% of cases
    if url.endswith('.js'):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result.body[:20].lstrip()

        if body_start[:1] == u'<':
            error = {
                'type': EventError.JS_INVALID_CONTENT,
                'url': url,
            }
            raise http.CannotFetch(error)

    return result
def fetch_release_file(filename, release, dist=None):
    """
    Attempt to retrieve a release artifact from the database.

    Caches the result of that attempt (whether successful or not).
    """
    dist_name = dist and dist.name or None
    cache_key = "releasefile:v1:%s:%s" % (release.id, ReleaseFile.get_ident(filename, dist_name))

    logger.debug("Checking cache for release artifact %r (release_id=%s)", filename, release.id)
    result = cache.get(cache_key)

    # not in the cache (meaning we haven't checked the database recently), so check the database
    if result is None:
        filename_choices = ReleaseFile.normalize(filename)
        filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices]

        logger.debug(
            "Checking database for release artifact %r (release_id=%s)", filename, release.id
        )

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release, dist=dist, ident__in=filename_idents
            ).select_related("file")
        )

        if len(possible_files) == 0:
            logger.debug(
                "Release artifact %r not found in database (release_id=%s)", filename, release.id
            )
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Pick first one that matches in priority order.
            # This is O(N*M) but there are only ever at most 4 things here
            # so not really worth optimizing.
            releasefile = next(
                (rf for ident in filename_idents for rf in possible_files if rf.ident == ident)
            )

        logger.debug(
            "Found release artifact %r (id=%s, release_id=%s)", filename, releasefile.id, release.id
        )
        try:
            with metrics.timer("sourcemaps.release_file_read"):
                with ReleaseFile.cache.getfile(releasefile) as fp:
                    z_body, body = compress_file(fp)
        except Exception:
            logger.error("sourcemap.compress_read_failed", exc_info=sys.exc_info())
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = http.UrlResult(filename, headers, body, 200, encoding)
            # This will implicitly skip too large payloads. Those will be cached
            # on the file system by `ReleaseFile.cache`, instead.
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    # in the cache as an unsuccessful attempt
    elif result == -1:
        result = None

    # in the cache as a successful attempt, including the zipped contents of the file
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = http.UrlResult(
            filename, result[0], zlib.decompress(result[1]), result[2], encoding
        )

    return result
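`fetch_release_file` above (and the refactored variant that follows) uses negative caching: a miss is recorded as `-1` for 60 seconds so repeated lookups for a missing artifact do not hammer the database, while a hit is cached for an hour. A minimal, self-contained sketch of that pattern, assuming a Django-style cache; the helper and sentinel names are illustrative:

from django.core.cache import cache

MISS = -1  # sentinel for "we looked and found nothing"


def cached_lookup(cache_key, load, miss_ttl=60, hit_ttl=3600):
    """Look up `cache_key`, falling back to `load()` and caching the outcome either way."""
    result = cache.get(cache_key)
    if result == MISS:
        return None
    if result is not None:
        return result
    value = load()
    if value is None:
        cache.set(cache_key, MISS, miss_ttl)  # short TTL: retry soon in case the artifact appears
    else:
        cache.set(cache_key, value, hit_ttl)  # long TTL: artifacts rarely change once uploaded
    return value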
def fetch_release_file(filename, release, dist=None):
    """
    Attempt to retrieve a release artifact from the database.

    Caches the result of that attempt (whether successful or not).
    """
    dist_name = dist and dist.name or None
    cache_key, cache_key_meta = get_cache_keys(filename, release, dist)

    logger.debug("Checking cache for release artifact %r (release_id=%s)", filename, release.id)
    result = cache.get(cache_key)

    # not in the cache (meaning we haven't checked the database recently), so check the database
    if result is None:
        with metrics.timer("sourcemaps.release_artifact_from_file"):
            filename_choices = ReleaseFile.normalize(filename)
            filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices]

            logger.debug(
                "Checking database for release artifact %r (release_id=%s)", filename, release.id
            )

            possible_files = list(
                ReleaseFile.objects.filter(
                    release=release, dist=dist, ident__in=filename_idents
                ).select_related("file")
            )

            if len(possible_files) == 0:
                logger.debug(
                    "Release artifact %r not found in database (release_id=%s)",
                    filename,
                    release.id,
                )
                cache.set(cache_key, -1, 60)
                return None
            elif len(possible_files) == 1:
                releasefile = possible_files[0]
            else:
                # Pick first one that matches in priority order.
                # This is O(N*M) but there are only ever at most 4 things here
                # so not really worth optimizing.
                releasefile = next(
                    rf for ident in filename_idents for rf in possible_files if rf.ident == ident
                )

            logger.debug(
                "Found release artifact %r (id=%s, release_id=%s)",
                filename,
                releasefile.id,
                release.id,
            )

            result = fetch_and_cache_artifact(
                filename,
                lambda: ReleaseFile.cache.getfile(releasefile),
                cache_key,
                cache_key_meta,
                releasefile.file.headers,
                compress_file,
            )

    # in the cache as an unsuccessful attempt
    elif result == -1:
        result = None

    # in the cache as a successful attempt, including the zipped contents of the file
    else:
        result = result_from_cache(filename, result)

    return result
def expand_frames(self, frames):
    last_state = None
    state = None
    has_changes = False

    cache = self.cache
    sourcemaps = self.sourcemaps

    for frame in frames:
        errors = cache.get_errors(frame.abs_path)
        if errors:
            has_changes = True

        frame.errors = errors

        source = cache.get(frame.abs_path)
        if source is None:
            logger.info('No source found for %s', frame.abs_path)
            continue

        sourcemap_url, sourcemap_idx = sourcemaps.get_link(frame.abs_path)
        if sourcemap_idx and frame.colno is not None:
            last_state = state
            state = find_source(sourcemap_idx, frame.lineno, frame.colno)
            abs_path = urljoin(sourcemap_url, state.src)

            logger.debug(
                'Mapping compressed source %r to mapping in %r', frame.abs_path, abs_path
            )
            source = cache.get(abs_path)
            if not source:
                frame.data = {
                    'sourcemap': sourcemap_url,
                }
                errors = cache.get_errors(abs_path)
                if errors:
                    frame.errors.extend(errors)
                else:
                    frame.errors.append(
                        ERR_MISSING_SOURCE.format(
                            filename=abs_path.encode('utf-8'),
                        )
                    )

            # Store original data in annotation
            frame.data = {
                'orig_lineno': frame.lineno,
                'orig_colno': frame.colno,
                'orig_function': frame.function,
                'orig_abs_path': frame.abs_path,
                'orig_filename': frame.filename,
                'sourcemap': sourcemap_url,
            }

            # SourceMap's return zero-indexed lineno's
            frame.lineno = state.src_line + 1
            frame.colno = state.src_col
            # The offending function is always the previous function in the stack
            # Honestly, no idea what the bottom most frame is, so we're ignoring that atm
            if last_state:
                frame.function = last_state.name or frame.function
            else:
                frame.function = state.name or frame.function
            frame.abs_path = abs_path
            frame.filename = state.src
            frame.module = generate_module(state.src)

        elif sourcemap_url:
            frame.data = {
                'sourcemap': sourcemap_url,
            }

        # TODO: theoretically a minified source could point to another mapped, minified source
        frame.pre_context, frame.context_line, frame.post_context = get_source_context(
            source=source, lineno=frame.lineno, colno=frame.colno or 0
        )
def get_from_cache(self, **kwargs: Any) -> M:
    """
    Wrapper around QuerySet.get which supports caching of the
    intermediate value. Callee is responsible for making sure
    the cache key is cleared on save.
    """
    if not self.cache_fields or len(kwargs) > 1:
        raise ValueError("We cannot cache this query. Just hit the database.")

    key, value = next(iter(kwargs.items()))
    pk_name = self.model._meta.pk.name
    if key == "pk":
        key = pk_name

    # We store everything by key references (vs instances)
    if isinstance(value, Model):
        value = value.pk

    # Kill __exact since it's the default behavior
    if key.endswith("__exact"):
        key = key.split("__exact", 1)[0]

    if key in self.cache_fields or key == pk_name:
        cache_key = self.__get_lookup_cache_key(**{key: value})
        local_cache = self._get_local_cache()
        if local_cache is not None:
            result = local_cache.get(cache_key)
            if result is not None:
                return result

        retval = cache.get(cache_key, version=self.cache_version)
        if retval is None:
            result = self.get(**kwargs)
            # Ensure we're pushing it into the cache
            self.__post_save(instance=result)
            if local_cache is not None:
                local_cache[cache_key] = result
            return result

        # If we didn't look up by pk we need to hit the reffed key
        if key != pk_name:
            result = self.get_from_cache(**{pk_name: retval})
            if local_cache is not None:
                local_cache[cache_key] = result
            return result

        if not isinstance(retval, self.model):
            if settings.DEBUG:
                raise ValueError("Unexpected value type returned from cache")
            logger.error("Cache response returned invalid value %r", retval)
            return self.get(**kwargs)

        if key == pk_name and int(value) != retval.pk:
            if settings.DEBUG:
                raise ValueError("Unexpected value returned from cache")
            logger.error("Cache response returned invalid value %r", retval)
            return self.get(**kwargs)

        retval._state.db = router.db_for_read(self.model, **kwargs)

        # Explicitly typing to satisfy mypy.
        r: M = retval
        return r
    else:
        raise ValueError("We cannot cache this query. Just hit the database.")
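The manager above stores full instances only under the primary-key entry; lookups by other cache fields store a pk reference and then recurse through the pk path. A framework-free sketch of that two-level scheme, using a plain dict in place of the cache and a dict of rows in place of the database; all names here are illustrative, not Sentry's API:

# In-process stand-in for the shared cache.
_cache = {}


def get_from_cache_sketch(objects_by_pk, *, pk=None, slug=None):
    """Mimic the pk-reference scheme: non-pk lookups cache only the pk, pk lookups cache the object."""
    if pk is not None:
        key = ("pk", pk)
        obj = _cache.get(key)
        if obj is None:
            obj = objects_by_pk[pk]  # "hit the database"
            _cache[key] = obj
        return obj
    key = ("slug", slug)
    ref = _cache.get(key)
    if ref is None:
        obj = next(o for o in objects_by_pk.values() if o["slug"] == slug)
        _cache[key] = obj["pk"]  # store a reference, not the instance
        return obj
    return get_from_cache_sketch(objects_by_pk, pk=ref)


# Example: rows = {1: {"pk": 1, "slug": "backend"}}
# get_from_cache_sketch(rows, slug="backend") caches ("slug", "backend") -> 1, then ("pk", 1) -> row.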
def fetch_url(url, project=None, release=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    cache_key = 'source:cache:v2:%s' % (hashlib.md5(url.encode('utf-8')).hexdigest(), )

    if release:
        result = fetch_release_file(url, release)
    else:
        result = None

    if result is None:
        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:%s' % (hashlib.md5(domain.encode('utf-8')).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            raise DomainBlacklisted(ERR_DOMAIN_BLACKLISTED.format(reason=domain_result, ))

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers['X-Sentry-Token'] = token

        logger.debug('Fetching %r from the internet', url)

        http_session = http.build_session()
        try:
            response = http_session.get(
                url,
                allow_redirects=True,
                verify=False,
                headers=headers,
                timeout=settings.SENTRY_SOURCE_FETCH_TIMEOUT,
            )
        except Exception as exc:
            logger.debug('Unable to fetch %r', url, exc_info=True)
            if isinstance(exc, SuspiciousOperation):
                error = unicode(exc)
            elif isinstance(exc, RequestException):
                error = ERR_GENERIC_FETCH_FAILURE.format(type=type(exc), )
            else:
                logger.exception(unicode(exc))
                error = ERR_UNKNOWN_INTERNAL_ERROR

            # TODO(dcramer): we want to be less aggressive on disabling domains
            cache.set(domain_key, error or '', 300)
            logger.warning('Disabling sources to %s for %ss', domain, 300, exc_info=True)
            raise CannotFetchSource(error)

        result = (
            {k.lower(): v for k, v in response.headers.items()},
            response.content,
            response.status_code,
        )
        cache.set(cache_key, result, 60)

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)
        error = ERR_HTTP_CODE.format(status_code=result[2], )
        raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1])
def fetch_file(url, project=None, release=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    if release:
        result = fetch_release_file(url, release)
    elif not allow_scraping or not url.startswith(('http:', 'https:')):
        error = {
            'type': EventError.JS_MISSING_SOURCE,
            'url': url,
        }
        raise CannotFetchSource(error)
    else:
        result = None

    cache_key = 'source:cache:v3:%s' % (md5(url).hexdigest(), )

    if result is None:
        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            body = zlib.decompress(result[1])
            result = (result[0], force_text(body), result[2])

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (md5(domain).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetchSource(domain_result)

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers['X-Sentry-Token'] = token

        logger.debug('Fetching %r from the internet', url)

        http_session = http.build_session()
        try:
            response = http_session.get(
                url,
                allow_redirects=True,
                verify=False,
                headers=headers,
                timeout=settings.SENTRY_SOURCE_FETCH_TIMEOUT,
            )
        except Exception as exc:
            logger.debug('Unable to fetch %r', url, exc_info=True)
            if isinstance(exc, RestrictedIPAddress):
                error = {
                    'type': EventError.RESTRICTED_IP,
                    'url': url,
                }
            elif isinstance(exc, SuspiciousOperation):
                error = {
                    'type': EventError.SECURITY_VIOLATION,
                    'url': url,
                }
            elif isinstance(exc, (RequestException, ZeroReturnError)):
                error = {
                    'type': EventError.JS_GENERIC_FETCH_ERROR,
                    'value': str(type(exc)),
                    'url': url,
                }
            else:
                logger.exception(unicode(exc))
                error = {
                    'type': EventError.UNKNOWN_ERROR,
                    'url': url,
                }

            # TODO(dcramer): we want to be less aggressive on disabling domains
            cache.set(domain_key, error or '', 300)
            logger.warning('Disabling sources to %s for %ss', domain, 300, exc_info=True)
            raise CannotFetchSource(error)

        # requests' attempts to use chardet internally when no encoding is found
        # and we want to avoid that slow behavior
        if not response.encoding:
            response.encoding = 'utf-8'

        body = response.text
        z_body = zlib.compress(force_bytes(body))
        headers = {k.lower(): v for k, v in response.headers.items()}

        cache.set(cache_key, (headers, z_body, response.status_code), 60)
        result = (headers, body, response.status_code)

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)
        error = {
            'type': EventError.JS_INVALID_HTTP_CODE,
            'value': result[2],
            'url': url,
        }
        raise CannotFetchSource(error)

    # Make sure the file we're getting back is unicode, if it's not,
    # it's either some encoding that we don't understand, or it's binary
    # data which we can't process.
    if not isinstance(result[1], unicode):
        try:
            result = (result[0], result[1].decode('utf8'), result[2])
        except UnicodeDecodeError:
            error = {
                'type': EventError.JS_INVALID_SOURCE_ENCODING,
                'value': 'utf8',
                'url': url,
            }
            raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1])
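Several of these fetchers compress the body with zlib before caching and decompress it on the way out, trading a little CPU for much smaller cache values. A minimal sketch of that round trip, assuming a Django-style cache; the key layout and TTL are illustrative:

import zlib

from django.core.cache import cache


def cache_source(cache_key, headers, body, status, ttl=60):
    """Cache a fetched source body in compressed form (`body` must be bytes)."""
    cache.set(cache_key, (headers, zlib.compress(body), status), ttl)


def get_cached_source(cache_key):
    """Return (headers, body, status) or None, decompressing the stored body."""
    entry = cache.get(cache_key)
    if entry is None:
        return None
    headers, z_body, status = entry
    return headers, zlib.decompress(z_body), status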
def fetch_release_archive_for_url(release, dist, url) -> Optional[IO]:
    """Fetch release archive and cache if possible.

    Multiple archives might have been uploaded, so we need the URL
    to get the correct archive from the artifact index.

    If return value is not empty, the caller is responsible for closing the stream.
    """
    with sentry_sdk.start_span(op="fetch_release_archive_for_url.get_index_entry"):
        info = get_index_entry(release, dist, url)
    if info is None:
        # Cannot write negative cache entry here because ID of release archive
        # is not yet known
        return None

    archive_ident = info["archive_ident"]

    # TODO(jjbayer): Could already extract filename from info and return
    # it later

    cache_key = get_release_file_cache_key(release_id=release.id, releasefile_ident=archive_ident)

    result = cache.get(cache_key)

    if result == -1:
        return None
    elif result:
        return BytesIO(result)
    else:
        try:
            with sentry_sdk.start_span(op="fetch_release_archive_for_url.get_releasefile_db_entry"):
                qs = ReleaseFile.objects.filter(
                    release_id=release.id, dist_id=dist.id if dist else dist, ident=archive_ident
                ).select_related("file")
                releasefile = qs[0]
        except IndexError:
            # This should not happen when there is an archive_ident in the manifest
            logger.error("sourcemaps.missing_archive", exc_info=sys.exc_info())
            # Cache as nonexistent:
            cache.set(cache_key, -1, 60)
            return None
        else:
            try:
                with sentry_sdk.start_span(op="fetch_release_archive_for_url.fetch_releasefile"):
                    if releasefile.file.size <= options.get("releasefile.cache-max-archive-size"):
                        getfile = lambda: ReleaseFile.cache.getfile(releasefile)
                    else:
                        # For very large ZIP archives, pulling the entire file into cache takes
                        # too long. Only the blobs required to extract the current artifact
                        # (central directory and the file entry itself) should be loaded in
                        # this case.
                        getfile = releasefile.file.getfile

                    file_ = fetch_retry_policy(getfile)
            except Exception:
                logger.error("sourcemaps.read_archive_failed", exc_info=sys.exc_info())
                return None

            # `cache.set` will only keep values up to a certain size,
            # so we should not read the entire file if it's too large for caching
            if CACHE_MAX_VALUE_SIZE is not None and file_.size > CACHE_MAX_VALUE_SIZE:
                return file_

            with sentry_sdk.start_span(op="fetch_release_archive_for_url.read_for_caching") as span:
                span.set_data("file_size", file_.size)
                contents = file_.read()
            with sentry_sdk.start_span(op="fetch_release_archive_for_url.write_to_cache") as span:
                span.set_data("file_size", len(contents))
                cache.set(cache_key, contents, 3600)

            file_.seek(0)

            return file_
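Because the function may return either a cached BytesIO or an open file handle from blob storage, the caller owns the stream, as the docstring notes. A hypothetical caller sketch; `release`, `dist`, and `url` are assumed to come from the surrounding processing code, and `contextlib.closing` ensures both return types are closed:

import zipfile
from contextlib import closing

# Hypothetical usage: open the archive, list its members, and always close the handle.
archive_file = fetch_release_archive_for_url(release, dist, url)
if archive_file is not None:
    with closing(archive_file):
        with zipfile.ZipFile(archive_file) as archive:
            names = archive.namelist()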