Example #1
def chart(request, team=None, project=None):
    gid = request.REQUEST.get('gid')
    days = int(request.REQUEST.get('days', '90'))
    if gid:
        try:
            group = Group.objects.get(pk=gid)
        except Group.DoesNotExist:
            return HttpResponseForbidden()

        data = Group.objects.get_chart_data(group, max_days=days)
    elif project:
        data = Project.objects.get_chart_data(project, max_days=days)
    elif team:
        cache_key = 'api.chart:team=%s,days=%s' % (team.id, days)

        data = cache.get(cache_key)
        if data is None:
            project_list = list(Project.objects.filter(team=team))
            data = Project.objects.get_chart_data_for_group(project_list, max_days=days)
            cache.set(cache_key, data, 300)
    else:
        cache_key = 'api.chart:user=%s,days=%s' % (request.user.id, days)

        data = cache.get(cache_key)
        if data is None:
            project_list = Project.objects.get_for_user(request.user)
            data = Project.objects.get_chart_data_for_group(project_list, max_days=days)
            cache.set(cache_key, data, 300)

    response = HttpResponse(json.dumps(data))
    response['Content-Type'] = 'application/json'
    return response
Example #2
def fetch_url(url, project=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """

    cache_key = 'source:%s' % (
        hashlib.md5(url.encode('utf-8')).hexdigest(),)
    result = cache.get(cache_key)
    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:%s' % (hashlib.md5(domain.encode('utf-8')).hexdigest(),)
        domain_result = cache.get(domain_key)
        if domain_result:
            return BAD_SOURCE

        headers = []
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers.append(('X-Sentry-Token', token))

        try:
            request = safe_urlopen(
                url,
                allow_redirects=True,
                headers=headers,
                timeout=settings.SENTRY_SOURCE_FETCH_TIMEOUT,
            )
        except HTTPError:
            result = BAD_SOURCE
        except Exception:
            # it's likely we've failed due to a timeout, dns, etc so let's
            # ensure we can't cascade the failure by pinning this for 5 minutes
            cache.set(domain_key, 1, 300)
            logger.warning('Disabling sources to %s for %ss', domain, 300,
                           exc_info=True)
            return BAD_SOURCE
        else:
            try:
                body = safe_urlread(request)
            except Exception:
                result = BAD_SOURCE
            else:
                result = (dict(request.headers), body)

        cache.set(cache_key, result, 60)

    if result == BAD_SOURCE:
        return result

    return UrlResult(url, *result)
Example #3
    def get_or_create(cls, group, release, environment, datetime, **kwargs):
        if not environment:
            environment = ''
        cache_key = cls.get_cache_key(group.id, release.id, environment)

        instance = cache.get(cache_key)
        if instance is None:
            instance, created = cls.objects.get_or_create(
                release_id=release.id,
                group_id=group.id,
                environment=environment,
                defaults={
                    'project_id': group.project_id,
                    'first_seen': datetime,
                    'last_seen': datetime,
                },
            )
            cache.set(cache_key, instance, 3600)
        else:
            created = False

        # TODO(dcramer): this would be good to buffer
        if not created:
            instance.update(last_seen=datetime)
        return instance
Example #4
def fetch_url(url, logger=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    import sentry

    cache_key = "fetch_url:%s" % url
    result = cache.get(cache_key)
    if result is not None:
        return result

    try:
        opener = urllib2.build_opener()
        opener.addheaders = [("User-Agent", "Sentry/%s" % sentry.VERSION)]
        req = opener.open(url)
        headers = dict(req.headers)
        body = req.read().rstrip("\n")
    except Exception:
        if logger:
            logger.error("Unable to fetch remote source for %r", url, exc_info=True)
        return BAD_SOURCE

    result = UrlResult(url, headers, body)

    cache.set(cache_key, result, 60 * 5)

    return result
Example #5
    def get_from_cache(self, **kwargs):
        """
        Wrapper around QuerySet.get which supports caching of the
        intermediate value.  Caller is responsible for making sure
        the cache key is cleared on save.
        """
        if not self.cache_fields or len(kwargs) > 1:
            return self.get(**kwargs)

        pk_name = self.model._meta.pk.name
        key, value = kwargs.items()[0]

        # Kill __exact since it's the default behavior
        if key.endswith('__exact'):
            key = key.split('__exact', 1)[0]

        if key in self.cache_fields or key in ('pk', pk_name):
            cache_key = self.__get_lookup_cache_key(**{key: value})

            retval = cache.get(cache_key)
            if retval is None:
                result = self.get(**kwargs)
                # Ensure we're pushing it into the cache
                self.__post_save(instance=result)
                return result

            # If we didn't look up by pk we need to hit the reffed
            # key
            if key not in (pk_name, 'pk'):
                return self.get(pk=retval)

            return retval
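The docstring above notes that whoever saves the model is responsible for clearing the cache key. A minimal sketch of that invalidation side, assuming Django's low-level cache API, a post_save receiver, and an assumed key format; this is not the actual BaseManager implementation:

from django.core.cache import cache
from django.db.models.signals import post_save
from django.dispatch import receiver

from sentry.models import Project  # assumed import path, for illustration only


@receiver(post_save, sender=Project)
def clear_lookup_cache(sender, instance, **kwargs):
    # Drop the per-pk lookup entry so get_from_cache() repopulates it on
    # the next read instead of returning a stale cached instance.
    cache.delete('sentry:project:pk:%s' % instance.pk)  # assumed key format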
Example #6
def fetch_release_file(filename, release):
    cache_key = "releasefile:%s:%s" % (release.id, md5(filename).hexdigest())
    logger.debug("Checking cache for release artifact %r (release_id=%s)", filename, release.id)
    result = cache.get(cache_key)
    if result is None:
        logger.debug("Checking database for release artifact %r (release_id=%s)", filename, release.id)
        ident = ReleaseFile.get_ident(filename)
        try:
            releasefile = (
                ReleaseFile.objects.filter(release=release, ident=ident).select_related("file", "file__blob").get()
            )
        except ReleaseFile.DoesNotExist:
            logger.debug("Release artifact %r not found in database (release_id=%s)", filename, release.id)
            cache.set(cache_key, -1, 60)
            return None

        logger.debug("Found release artifact %r (id=%s, release_id=%s)", filename, releasefile.id, release.id)
        try:
            with releasefile.file.getfile() as fp:
                body = fp.read()
        except Exception as e:
            logger.exception(unicode(e))
            result = -1
        else:
            result = (releasefile.file.headers, body, 200)
        cache.set(cache_key, result, 3600)

    if result == -1:
        result = None

    return result
Example #7
    def get_or_create(cls, project, release, environment, datetime, **kwargs):
        cache_key = cls.get_cache_key(project.id, release.id, environment.id)

        instance = cache.get(cache_key)
        if instance is None:
            instance, created = cls.objects.get_or_create(
                release_id=release.id,
                organization_id=project.organization_id,
                environment_id=environment.id,
                defaults={
                    'first_seen': datetime,
                    'last_seen': datetime,
                }
            )
            cache.set(cache_key, instance, 3600)
        else:
            created = False

        # TODO(dcramer): this would be good to buffer, but until then we minimize
        # updates to once a minute, and allow Postgres to optimistically skip
        # it even if we can't
        if not created and instance.last_seen < datetime - timedelta(seconds=60):
            cls.objects.filter(
                id=instance.id,
                last_seen__lt=datetime - timedelta(seconds=60),
            ).update(
                last_seen=datetime,
            )
            instance.last_seen = datetime
            cache.set(cache_key, instance, 3600)
        return instance
Example #8
    def get_send_to(self, project=None):
        """
        Returns a list of email addresses for the users that should be notified of alerts.

        The logic for this is a bit complicated, but it does the following:

        - Includes members if ``send_to_members`` is enabled **and** the user has not disabled alerts
          for this project

        The results of this call can be fairly expensive to calculate, so the send_to list gets cached
        for 60 seconds.
        """
        if project:
            project_id = project.pk
        else:
            project_id = ""
        conf_key = self.get_conf_key()
        cache_key = "%s:send_to:%s" % (conf_key, project_id)

        send_to_list = cache.get(cache_key)
        if send_to_list is None:
            send_to_list = set()

            send_to_members = self.get_option("send_to_members", project)
            if send_to_members and project and project.team:
                member_set = self.get_sendable_users(project)
                send_to_list |= set(self.get_emails_for_users(member_set))

            send_to_list = filter(bool, send_to_list)
            cache.set(cache_key, send_to_list, 60)  # 1 minute cache
        return send_to_list
Example #9
def fetch_url(url, logger=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    import sentry

    cache_key = 'fetch_url:v2:%s' % (hashlib.md5(url).hexdigest(),)
    result = cache.get(cache_key)
    if result is not None:
        return UrlResult(*result)

    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Sentry/%s' % sentry.VERSION)]
        req = opener.open(url)
        headers = dict(req.headers)
        body = req.read()
        if headers.get('content-encoding') == 'gzip':
            # Content doesn't *have* to respect the Accept-Encoding header
            # and may send gzipped data regardless.
            # See: http://stackoverflow.com/questions/2423866/python-decompressing-gzip-chunk-by-chunk/2424549#2424549
            body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
        body = body.rstrip('\n')
    except Exception:
        if logger:
            logger.error('Unable to fetch remote source for %r', url, exc_info=True)
        return BAD_SOURCE

    result = (url, headers, body)

    cache.set(cache_key, result, 60 * 5)

    return UrlResult(url, headers, body)
Example #10
 def get_rules(self):
     cache_key = 'project:%d:rules' % (self.project.id,)
     rules_list = cache.get(cache_key)
     if rules_list is None:
         rules_list = list(Rule.objects.filter(project=self.project))
         cache.set(cache_key, rules_list, 60)
     return rules_list
Example #11
def _get_project_enhancements_config(project):
    enhancements = project.get_option('sentry:grouping_enhancements')
    enhancements_base = project.get_option('sentry:grouping_enhancements_base')
    if not enhancements and not enhancements_base:
        return DEFAULT_ENHANCEMENTS_CONFIG

    if enhancements_base is None or enhancements_base not in ENHANCEMENT_BASES:
        enhancements_base = DEFAULT_ENHANCEMENT_BASE

    # Instead of parsing and dumping out config here, we can make a
    # shortcut
    from sentry.utils.cache import cache
    from sentry.utils.hashlib import md5_text
    cache_key = 'grouping-enhancements:' + \
        md5_text('%s|%s' % (enhancements_base, enhancements)).hexdigest()
    rv = cache.get(cache_key)
    if rv is not None:
        return rv

    try:
        rv = Enhancements.from_config_string(
            enhancements or '', bases=[enhancements_base]).dumps()
    except InvalidEnhancerConfig:
        rv = DEFAULT_ENHANCEMENTS_CONFIG
    cache.set(cache_key, rv)
    return rv
Example #12
    def get_or_create(cls, release, project, environment, datetime, **kwargs):
        cache_key = cls.get_cache_key(project.id, release.id, environment.id)

        instance = cache.get(cache_key)
        if instance is None:
            instance, created = cls.objects.get_or_create(
                release=release,
                project=project,
                environment=environment,
                defaults={
                    'first_seen': datetime,
                    'last_seen': datetime,
                }
            )
            cache.set(cache_key, instance, 3600)
        else:
            created = False

        # Same as releaseenvironment model. Minimizes last_seen updates to once a minute
        if not created and instance.last_seen < datetime - timedelta(seconds=60):
            cls.objects.filter(
                id=instance.id,
                last_seen__lt=datetime - timedelta(seconds=60),
            ).update(
                last_seen=datetime,
            )
            instance.last_seen = datetime
            cache.set(cache_key, instance, 3600)
        return instance
Example #13
    def get_send_to(self, project=None):
        """
        Returns a list of email addresses for the users that should be notified of alerts.

        The logic for this is a bit complicated, but it does the following:

        The results of this call can be fairly expensive to calculate, so the send_to list gets cached
        for 60 seconds.
        """
        if project:
            project_id = project.pk
        else:
            project_id = ''

        if not (project and project.team):
            return []

        conf_key = self.get_conf_key()
        cache_key = '%s:send_to:%s' % (conf_key, project_id)

        send_to_list = cache.get(cache_key)
        if send_to_list is None:
            send_to_list = self.get_sendable_users(project)

            send_to_list = filter(bool, send_to_list)
            cache.set(cache_key, send_to_list, 60)  # 1 minute cache

        return send_to_list
Example #14
def fetch_release_file(filename, release):
    cache_key = 'release:%s:%s' % (
        release.id,
        hashlib.sha1(filename.encode('utf-8')).hexdigest(),
    )
    logger.debug('Checking cache for release artifact %r (release_id=%s)',
                 filename, release.id)
    result = cache.get(cache_key)
    if result is None:
        logger.debug('Checking database for release artifact %r (release_id=%s)',
                     filename, release.id)
        ident = ReleaseFile.get_ident(filename)
        try:
            releasefile = ReleaseFile.objects.filter(
                release=release,
                ident=ident,
            ).select_related('file').get()
        except ReleaseFile.DoesNotExist:
            logger.debug('Release artifact %r not found in database (release_id=%s)',
                         filename, release.id)
            return None

        logger.debug('Found release artifact %r (id=%s, release_id=%s)',
                     filename, releasefile.id, release.id)
        with releasefile.file.getfile() as fp:
            body = fp.read()
        result = (releasefile.file.headers, body, 200)
        cache.set(cache_key, result, 60)

    return result
Example #15
 def get_cached_photo(self, size):
     if not self.file:
         return
     if size not in self.ALLOWED_SIZES:
         size = min(self.ALLOWED_SIZES, key=lambda x: abs(x - size))
     cache_key = self.get_cache_key(size)
     photo = cache.get(cache_key)
     if photo is None:
         photo_file = self.file.getfile()
         with Image.open(photo_file) as image:
             image = image.resize((size, size))
             image_file = StringIO()
             image.save(image_file, 'PNG')
             photo_file = image_file.getvalue()
             cache.set(cache_key, photo_file)
             photo = cache.get(cache_key)
     return photo
Example #16
 def all_keys(self, project):
     # TODO: cache invalidation via post_save/post_delete signals much like BaseManager
     key = self._get_cache_key(project.id)
     result = cache.get(key)
     if result is None:
         result = list(self.filter(project=project).values_list("key", flat=True))
         cache.set(key, result, 60)
     return result
Example #17
def _get_service_hooks(project_id):
    from sentry.models import ServiceHook
    cache_key = 'servicehooks:1:{}'.format(project_id)
    result = cache.get(cache_key)
    if result is None:
        result = [(h.id, h.events) for h in
                  ServiceHook.objects.filter(project_id=project_id)]
        cache.set(cache_key, result, 60)
    return result
Example #18
    def _update_cachefiles(self, project, dsym_files):
        rv = []

        # Find all the known bad files we could not convert last time
        # around
        conversion_errors = {}
        for dsym_file in dsym_files:
            cache_key = 'scbe:%s:%s' % (dsym_file.uuid, dsym_file.file.checksum)
            err = cache.get(cache_key)
            if err is not None:
                conversion_errors[dsym_file.uuid] = err

        for dsym_file in dsym_files:
            dsym_uuid = dsym_file.uuid
            if dsym_uuid in conversion_errors:
                continue

            try:
                with dsym_file.file.getfile(as_tempfile=True) as tf:
                    fo = FatObject.from_path(tf.name)
                    o = fo.get_object(uuid=dsym_file.uuid)
                    if o is None:
                        continue
                    symcache = o.make_symcache()
            except SymbolicError as e:
                cache.set('scbe:%s:%s' % (
                    dsym_uuid, dsym_file.file.checksum), e.message,
                    CONVERSION_ERROR_TTL)
                conversion_errors[dsym_uuid] = e.message
                logger.error('dsymfile.symcache-build-error',
                             exc_info=True, extra=dict(dsym_uuid=dsym_uuid))
                continue

            file = File.objects.create(
                name=dsym_file.uuid,
                type='project.symcache',
            )
            file.putfile(symcache.open_stream())
            try:
                with transaction.atomic():
                    rv.append((dsym_uuid, ProjectSymCacheFile.objects.get_or_create(
                        project=project,
                        cache_file=file,
                        dsym_file=dsym_file,
                        defaults=dict(
                            checksum=dsym_file.file.checksum,
                            version=symcache.file_format_version,
                        )
                    )[0]))
            except IntegrityError:
                file.delete()
                rv.append((dsym_uuid, ProjectSymCacheFile.objects.get(
                    project=project,
                    dsym_file=dsym_file,
                )))

        return rv, conversion_errors
Example #19
def get_rules(project):
    from sentry.models import Rule

    cache_key = 'project:%d:rules' % (project.id,)
    rules_list = cache.get(cache_key)
    if rules_list is None:
        rules_list = list(Rule.objects.filter(project=project))
        cache.set(cache_key, rules_list, 60)
    return rules_list
Example #20
 def get_for_project(cls, project_id):
     cache_key = 'project:{}:rules'.format(project_id)
     rules_list = cache.get(cache_key)
     if rules_list is None:
         rules_list = list(cls.objects.filter(
             project=project_id,
             status=RuleStatus.ACTIVE,
         ))
         cache.set(cache_key, rules_list, 60)
     return rules_list
Example #21
 def wrapper(*args, **kwargs):
     def get_cache_key(*args, **kwargs):
         params = list(args) + kwargs.values()
         return md5("".join(map(str, params))).hexdigest()
     key = get_cache_key(func.__name__, *args, **kwargs)
     result = cache.get(key)
     if not result:
         result = func(*args, **kwargs)
         cache.set(key, result, timeout)
     return result
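The wrapper above is only the inner closure of a caching decorator; func and timeout come from an enclosing scope that is not shown. A minimal sketch of how such a decorator might be assembled, assuming Django's cache backend (the name cached and the key scheme are illustrative, not the original helper):

import functools
from hashlib import md5

from django.core.cache import cache


def cached(timeout=300):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Key on the function name plus a stable rendering of the arguments.
            params = [func.__name__] + list(args) + sorted(kwargs.items())
            key = md5(''.join(map(str, params)).encode('utf-8')).hexdigest()
            result = cache.get(key)
            if result is None:
                result = func(*args, **kwargs)
                cache.set(key, result, timeout)
            return result
        return wrapper
    return decorator

Unlike the original wrapper's 'if not result' check, this sketch treats only a missing key as a miss, so falsy return values are cached rather than recomputed on every call.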
Example #22
 def get_choices(self):
     key = 'filters:%s:%s' % (self.project.id, self.column)
     result = cache.get(key)
     if result is None:
         result = list(FilterValue.objects.filter(
             project=self.project,
             key=self.column,
         ).values_list('value', flat=True).order_by('value')[:self.max_choices])
         cache.set(key, result, 60)
     return SortedDict((l, l) for l in result)
Example #23
 def get_choices(self):
     key = 'filters:%s:%s' % (self.project.id, hashlib.md5(self.column.encode('utf8')).hexdigest())
     result = cache.get(key)
     if result is None:
         result = list(TagValue.objects.filter(
             project=self.project,
             key=self.column,
         ).values_list('value', flat=True).order_by('value')[:self.max_choices])
         cache.set(key, result, 60)
     return SortedDict((l, l) for l in result)
Example #24
    def get_from_cache(self, **kwargs):
        """
        Wrapper around QuerySet.get which supports caching of the
        intermediate value.  Caller is responsible for making sure
        the cache key is cleared on save.
        """
        if not self.cache_fields or len(kwargs) > 1:
            return self.get(**kwargs)

        key, value = next(six.iteritems(kwargs))
        pk_name = self.model._meta.pk.name
        if key == "pk":
            key = pk_name

        # We store everything by key references (vs instances)
        if isinstance(value, Model):
            value = value.pk

        # Kill __exact since it's the default behavior
        if key.endswith("__exact"):
            key = key.split("__exact", 1)[0]

        if key in self.cache_fields or key == pk_name:
            cache_key = self.__get_lookup_cache_key(**{key: value})

            retval = cache.get(cache_key, version=self.cache_version)
            if retval is None:
                result = self.get(**kwargs)
                # Ensure we're pushing it into the cache
                self.__post_save(instance=result)
                return result

            # If we didn't look up by pk we need to hit the reffed
            # key
            if key != pk_name:
                return self.get_from_cache(**{pk_name: retval})

            if type(retval) != self.model:
                if settings.DEBUG:
                    raise ValueError("Unexpected value type returned from cache")
                logger.error("Cache response returned invalid value %r", retval)
                return self.get(**kwargs)

            if key == pk_name and int(value) != retval.pk:
                if settings.DEBUG:
                    raise ValueError("Unexpected value returned from cache")
                logger.error("Cache response returned invalid value %r", retval)
                return self.get(**kwargs)

            retval._state.db = router.db_for_read(self.model, **kwargs)

            return retval
        else:
            return self.get(**kwargs)
Example #25
    def get_or_create(cls, project, name):
        name = name or ""

        cache_key = cls.get_cache_key(project.id, name)

        env = cache.get(cache_key)
        if env is None:
            env = cls.objects.get_or_create(project_id=project.id, name=name)[0]
            cache.set(cache_key, env, 3600)

        return env
Example #26
    def add_project(self, project):
        cache_key = 'envproj:c:%s:%s' % (self.id, project.id)

        if cache.get(cache_key) is None:
            try:
                with transaction.atomic():
                    EnvironmentProject.objects.create(project=project, environment=self)
                cache.set(cache_key, 1, 3600)
            except IntegrityError:
                # We've already created the object, should still cache the action.
                cache.set(cache_key, 1, 3600)
Example #27
 def get_cached(self, full_url):
     """
     Basic Caching mechanism for requests and responses. It only caches responses
     based on URL
     TODO: Implement GET attr in cache as well. (see self.create_meta for example)
     """
     key = 'sentry-jira-2:' + md5(full_url, self.base_url).hexdigest()
     cached_result = cache.get(key)
     if not cached_result:
         cached_result = self.get(full_url)
         cache.set(key, cached_result, 60)
     return cached_result
Example #28
 def get_cached(self, full_url):
     """
     Basic Caching mechanism for requests and responses. It only caches responses
     based on URL
     TODO: Implement GET attr in cache as well. (see self.create_meta for example)
     """
     key = CACHE_KEY % (full_url, self.instance_url)
     cached_result = cache.get(key)
     if not cached_result:
         cached_result = self.make_request('get', full_url)
         cache.set(key, cached_result, 60)
     return cached_result
Example #29
 def all_keys(self, project):
     # TODO: cache invalidation via post_save/post_delete signals much like BaseManager
     key = self._get_cache_key(project.id)
     result = cache.get(key)
     if result is None:
         result = list(
             self.filter(project=project, status=TagKeyStatus.VISIBLE)
             .order_by("-values_seen")
             .values_list("key", flat=True)[:20]
         )
         cache.set(key, result, 60)
     return result
Example #30
    def get_or_create(cls, project, release, environment, datetime, **kwargs):
        cache_key = cls.get_cache_key(project.id, release.id, environment.id)

        instance = cache.get(cache_key)
        if instance is None:
            release_envs = list(cls.objects.filter(
                release_id=release.id,
                organization_id=project.organization_id,
                environment_id=environment.id,
            ))
            if release_envs:
                instance = release_envs[0]
                for re in release_envs:
                    if re.project_id == project.id:
                        instance = re
                created = False
            else:
                lock_key = cls.get_lock_key(project.organization_id, release.id, environment.id)
                lock = locks.get(lock_key, duration=5)
                with TimedRetryPolicy(10)(lock.acquire):
                    try:
                        instance, created = cls.objects.get(
                            release_id=release.id,
                            organization_id=project.organization_id,
                            environment_id=environment.id,
                        ), False
                    except cls.DoesNotExist:
                        instance, created = cls.objects.create(
                            release_id=release.id,
                            project_id=project.id,
                            organization_id=project.organization_id,
                            environment_id=environment.id,
                            first_seen=datetime,
                            last_seen=datetime,
                        ), True
            cache.set(cache_key, instance, 3600)
        else:
            created = False

        # TODO(dcramer): this would be good to buffer, but until then we minimize
        # updates to once a minute, and allow Postgres to optimistically skip
        # it even if we can't
        if not created and instance.last_seen < datetime - timedelta(seconds=60):
            cls.objects.filter(
                id=instance.id,
                last_seen__lt=datetime - timedelta(seconds=60),
            ).update(
                last_seen=datetime,
            )
            instance.last_seen = datetime
            cache.set(cache_key, instance, 3600)
        return instance
Example #31
def fetch_release_file(filename, release, dist=None):
    cache_key = "releasefile:v1:%s:%s" % (release.id,
                                          md5_text(filename).hexdigest())

    logger.debug("Checking cache for release artifact %r (release_id=%s)",
                 filename, release.id)
    result = cache.get(cache_key)

    dist_name = dist and dist.name or None

    if result is None:
        filename_choices = ReleaseFile.normalize(filename)
        filename_idents = [
            ReleaseFile.get_ident(f, dist_name) for f in filename_choices
        ]

        logger.debug(
            "Checking database for release artifact %r (release_id=%s)",
            filename, release.id)

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release, dist=dist,
                ident__in=filename_idents).select_related("file"))

        if len(possible_files) == 0:
            logger.debug(
                "Release artifact %r not found in database (release_id=%s)",
                filename, release.id)
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Pick first one that matches in priority order.
            # This is O(N*M) but there are only ever at most 4 things here
            # so not really worth optimizing.
            releasefile = next((rf for ident in filename_idents
                                for rf in possible_files if rf.ident == ident))

        logger.debug("Found release artifact %r (id=%s, release_id=%s)",
                     filename, releasefile.id, release.id)
        try:
            with metrics.timer("sourcemaps.release_file_read"):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception:
            logger.error("sourcemap.compress_read_failed",
                         exc_info=sys.exc_info())
            result = None
        else:
            headers = {
                k.lower(): v
                for k, v in releasefile.file.headers.items()
            }
            encoding = get_encoding_from_headers(headers)
            result = http.UrlResult(filename, headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = http.UrlResult(filename, result[0],
                                zlib.decompress(result[1]), result[2],
                                encoding)

    return result
Example #32
def fetch_release_file(filename, release):
    cache_key = 'releasefile:v1:%s:%s' % (
        release.id,
        md5_text(filename).hexdigest(),
    )

    filename_path = None
    if filename is not None:
        # Reconstruct url without protocol + host
        # e.g. http://example.com/foo?bar => ~/foo?bar
        parsed_url = urlparse(filename)
        filename_path = '~' + parsed_url.path
        if parsed_url.query:
            filename_path += '?' + parsed_url.query

    logger.debug('Checking cache for release artifact %r (release_id=%s)',
                 filename, release.id)
    result = cache.get(cache_key)

    if result is None:
        logger.debug('Checking database for release artifact %r (release_id=%s)',
                     filename, release.id)

        filename_idents = [ReleaseFile.get_ident(filename)]
        if filename_path is not None and filename_path != filename:
            filename_idents.append(ReleaseFile.get_ident(filename_path))

        possible_files = list(ReleaseFile.objects.filter(
            release=release,
            ident__in=filename_idents,
        ).select_related('file'))

        if len(possible_files) == 0:
            logger.debug('Release artifact %r not found in database (release_id=%s)',
                         filename, release.id)
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Prioritize releasefile that matches full url (w/ host)
            # over hostless releasefile
            target_ident = filename_idents[0]
            releasefile = next((f for f in possible_files if f.ident == target_ident))

        logger.debug('Found release artifact %r (id=%s, release_id=%s)',
                     filename, releasefile.id, release.id)
        try:
            with metrics.timer('sourcemaps.release_file_read'):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception as e:
            logger.exception(six.text_type(e))
            cache.set(cache_key, -1, 3600)
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = (headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = (result[0], zlib.decompress(result[1]), result[2], encoding)

    return result
Example #33
def fetch_file(url, project=None, release=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    if release:
        with metrics.timer('sourcemaps.release_file'):
            result = fetch_release_file(url, release)
    else:
        result = None

    cache_key = 'source:cache:v3:%s' % (
        md5_text(url).hexdigest(),
    )

    if result is None:
        if not allow_scraping or not url.startswith(('http:', 'https:')):
            error = {
                'type': EventError.JS_MISSING_SOURCE,
                'url': expose_url(url),
            }
            raise CannotFetchSource(error)

        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 3-tuple instead of a 4-tuple,
            # so this is being maintained for backwards compatibility
            try:
                encoding = result[3]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = (result[0], zlib.decompress(result[1]), result[2], encoding)

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (
            md5_text(domain).hexdigest(),
        )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetchSource(domain_result)

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers['X-Sentry-Token'] = token

        logger.debug('Fetching %r from the internet', url)

        with metrics.timer('sourcemaps.fetch'):
            http_session = http.build_session()
            response = None
            try:
                try:
                    start = time.time()
                    response = http_session.get(
                        url,
                        allow_redirects=True,
                        verify=False,
                        headers=headers,
                        timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT,
                        stream=True,
                    )

                    try:
                        cl = int(response.headers['content-length'])
                    except (LookupError, ValueError):
                        cl = 0
                    if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                        raise OverflowError()

                    contents = []
                    cl = 0

                    # Only need to even attempt to read the response body if we
                    # got a 200 OK
                    if response.status_code == 200:
                        for chunk in response.iter_content(16 * 1024):
                            if time.time() - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT:
                                raise Timeout()
                            contents.append(chunk)
                            cl += len(chunk)
                            if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                                raise OverflowError()

                except Exception as exc:
                    logger.debug('Unable to fetch %r', url, exc_info=True)
                    if isinstance(exc, RestrictedIPAddress):
                        error = {
                            'type': EventError.RESTRICTED_IP,
                            'url': expose_url(url),
                        }
                    elif isinstance(exc, SuspiciousOperation):
                        error = {
                            'type': EventError.SECURITY_VIOLATION,
                            'url': expose_url(url),
                        }
                    elif isinstance(exc, Timeout):
                        error = {
                            'type': EventError.JS_FETCH_TIMEOUT,
                            'url': expose_url(url),
                            'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT,
                        }
                    elif isinstance(exc, OverflowError):
                        error = {
                            'type': EventError.JS_TOO_LARGE,
                            'url': expose_url(url),
                            # We want size in megabytes to format nicely
                            'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024,
                        }
                    elif isinstance(exc, (RequestException, ZeroReturnError)):
                        error = {
                            'type': EventError.JS_GENERIC_FETCH_ERROR,
                            'value': six.text_type(type(exc)),
                            'url': expose_url(url),
                        }
                    else:
                        logger.exception(six.text_type(exc))
                        error = {
                            'type': EventError.UNKNOWN_ERROR,
                            'url': expose_url(url),
                        }

                    # TODO(dcramer): we want to be less aggressive on disabling domains
                    cache.set(domain_key, error or '', 300)
                    logger.warning('Disabling sources to %s for %ss', domain, 300,
                                   exc_info=True)
                    raise CannotFetchSource(error)

                body = b''.join(contents)
                z_body = zlib.compress(body)
                headers = {k.lower(): v for k, v in response.headers.items()}
                encoding = response.encoding

                cache.set(cache_key, (headers, z_body, response.status_code, encoding), 60)
                result = (headers, body, response.status_code, encoding)
            finally:
                if response is not None:
                    response.close()

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url,
                     exc_info=True)
        error = {
            'type': EventError.JS_INVALID_HTTP_CODE,
            'value': result[2],
            'url': expose_url(url),
        }
        raise CannotFetchSource(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but this should catch 99% of cases
    if url.endswith('.js'):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        body_start = result[1][:20].lstrip()  # Discard leading whitespace (often found before doctype)

        if body_start[:1] == u'<':
            error = {
                'type': EventError.JS_INVALID_CONTENT,
                'url': url,
            }
            raise CannotFetchSource(error)

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result[1], six.binary_type):
        try:
            result = (result[0], result[1].encode('utf8'), result[2], 'utf8')
        except UnicodeEncodeError:
            error = {
                'type': EventError.JS_INVALID_SOURCE_ENCODING,
                'value': 'utf8',
                'url': expose_url(url),
            }
            raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1], result[3])
Example #34
    def expand_frames(self, frames):
        last_state = None
        state = None

        cache = self.cache
        sourcemaps = self.sourcemaps
        all_errors = []

        for frame in frames:
            errors = cache.get_errors(frame.abs_path)
            if errors:
                all_errors.extend(errors)

            source = cache.get(frame.abs_path)
            if source is None:
                logger.info('No source found for %s', frame.abs_path)
                continue

            sourcemap_url, sourcemap_idx = sourcemaps.get_link(frame.abs_path)
            if sourcemap_idx and frame.colno is not None:
                last_state = state
                state = find_source(sourcemap_idx, frame.lineno, frame.colno)

                if is_data_uri(sourcemap_url):
                    sourcemap_label = frame.abs_path
                else:
                    sourcemap_label = sourcemap_url

                abs_path = urljoin(sourcemap_url, state.src)

                logger.debug('Mapping compressed source %r to mapping in %r',
                             frame.abs_path, abs_path)
                source = cache.get(abs_path)
                if not source:
                    frame.data = {
                        'sourcemap': sourcemap_label,
                    }
                    errors = cache.get_errors(abs_path)
                    if errors:
                        all_errors.extend(errors)
                    else:
                        all_errors.append({
                            'type': EventError.JS_MISSING_SOURCE,
                            'url': force_bytes(abs_path, errors='replace'),
                        })

                # Store original data in annotation
                frame.data = {
                    'orig_lineno': frame.lineno,
                    'orig_colno': frame.colno,
                    'orig_function': frame.function,
                    'orig_abs_path': frame.abs_path,
                    'orig_filename': frame.filename,
                    'sourcemap': sourcemap_label,
                }

                # SourceMap's return zero-indexed lineno's
                frame.lineno = state.src_line + 1
                frame.colno = state.src_col
                # The offending function is always the previous function in the stack
                # Honestly, no idea what the bottom most frame is, so we're ignoring that atm
                if last_state:
                    frame.function = last_state.name or frame.function
                else:
                    frame.function = state.name or frame.function

                filename = state.src
                # special case webpack support
                if filename.startswith('webpack://'):
                    abs_path = filename
                    # webpack seems to use ~ to imply "relative to resolver root"
                    # which is generally seen for third party deps
                    # (i.e. node_modules)
                    if '/~/' in filename:
                        filename = '~/' + abs_path.split('/~/', 1)[-1]
                    else:
                        filename = filename.split('webpack:///', 1)[-1]

                    if filename.startswith('~/') and frame.in_app is None:
                        frame.in_app = False

                frame.abs_path = abs_path
                frame.filename = filename
                if abs_path.startswith(('http:', 'https:')):
                    frame.module = generate_module(abs_path)

            elif sourcemap_url:
                frame.data = {
                    'sourcemap': sourcemap_url,
                }

            # TODO: theoretically a minified source could point to another mapped, minified source
            frame.pre_context, frame.context_line, frame.post_context = get_source_context(
                source=source, lineno=frame.lineno, colno=frame.colno or 0)
        return all_errors
Example #35
    def get_send_to(self, project, event=None):
        """
        Returns a list of user IDs for the users that should receive
        notifications for the provided project.

        This result may come from cached data.
        """
        if not (project and project.teams.exists()):
            logger.debug('Tried to send notification to invalid project: %r',
                         project)
            return []

        if event:
            owners, _ = ProjectOwnership.get_owners(project.id, event.data)
            if owners != ProjectOwnership.Everyone:
                if not owners:
                    metrics.incr(
                        'features.owners.send_to',
                        tags={
                            'organization': project.organization_id,
                            'outcome': 'empty',
                        },
                        skip_internal=True,
                    )
                    return []

                metrics.incr(
                    'features.owners.send_to',
                    tags={
                        'organization': project.organization_id,
                        'outcome': 'match',
                    },
                    skip_internal=True,
                )
                send_to_list = []
                teams_to_resolve = []
                for owner in owners:
                    if owner.type == User:
                        send_to_list.append(owner.id)
                    else:
                        teams_to_resolve.append(owner.id)

                # get all users in teams
                if teams_to_resolve:
                    send_to_list += User.objects.filter(
                        is_active=True,
                        sentry_orgmember_set__organizationmemberteam__team__id__in=teams_to_resolve,
                    ).values_list('id', flat=True)
                return send_to_list
            else:
                metrics.incr(
                    'features.owners.send_to',
                    tags={
                        'organization': project.organization_id,
                        'outcome': 'everyone',
                    },
                    skip_internal=True,
                )

        cache_key = '%s:send_to:%s' % (self.get_conf_key(), project.pk)
        send_to_list = cache.get(cache_key)
        if send_to_list is None:
            send_to_list = [s for s in self.get_sendable_users(project) if s]
            cache.set(cache_key, send_to_list, 60)  # 1 minute cache

        return send_to_list
Example #36
def fetch_file(url,
               domain_lock_enabled=True,
               outfile=None,
               headers=None,
               allow_redirects=True,
               verify_ssl=False,
               timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT,
               **kwargs):
    """
    Pull down a URL, returning a UrlResult object.
    """
    # lock down domains that are problematic
    if domain_lock_enabled:
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (
            md5_text(domain).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetch(domain_result)

    logger.debug('Fetching %r from the internet', url)

    http_session = build_session()
    response = None

    try:
        try:
            start = time.time()
            response = http_session.get(url,
                                        allow_redirects=allow_redirects,
                                        verify=verify_ssl,
                                        headers=headers,
                                        timeout=timeout,
                                        stream=True,
                                        **kwargs)

            try:
                cl = int(response.headers['content-length'])
            except (LookupError, ValueError):
                cl = 0
            if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                raise OverflowError()

            return_body = False
            if outfile is None:
                outfile = six.BytesIO()
                return_body = True

            cl = 0

            # Only need to even attempt to read the response body if we
            # got a 200 OK
            if response.status_code == 200:
                for chunk in response.iter_content(16 * 1024):
                    if time.time() - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT:
                        raise Timeout()
                    outfile.write(chunk)
                    cl += len(chunk)
                    if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                        raise OverflowError()

        except Exception as exc:
            logger.debug('Unable to fetch %r', url, exc_info=True)
            if isinstance(exc, RestrictedIPAddress):
                error = {
                    'type': EventError.RESTRICTED_IP,
                    'url': expose_url(url),
                }
            elif isinstance(exc, SuspiciousOperation):
                error = {
                    'type': EventError.SECURITY_VIOLATION,
                    'url': expose_url(url),
                }
            elif isinstance(exc, (Timeout, ReadTimeout)):
                error = {
                    'type': EventError.FETCH_TIMEOUT,
                    'url': expose_url(url),
                    'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT,
                }
            elif isinstance(exc, OverflowError):
                error = {
                    'type': EventError.FETCH_TOO_LARGE,
                    'url': expose_url(url),
                    # We want size in megabytes to format nicely
                    'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024,
                }
            elif isinstance(exc, (RequestException, ZeroReturnError)):
                error = {
                    'type': EventError.FETCH_GENERIC_ERROR,
                    'value': six.text_type(type(exc)),
                    'url': expose_url(url),
                }
            else:
                logger.exception(six.text_type(exc))
                error = {
                    'type': EventError.UNKNOWN_ERROR,
                    'url': expose_url(url),
                }

            # TODO(dcramer): we want to be less aggressive on disabling domains
            if domain_lock_enabled:
                cache.set(domain_key, error or '', 300)
                logger.warning('source.disabled', extra=error)
            raise CannotFetch(error)

        headers = {k.lower(): v for k, v in response.headers.items()}
        encoding = response.encoding

        body = None
        if return_body:
            body = outfile.getvalue()
            outfile.close()  # we only want to close StringIO

        result = (headers, body, response.status_code, encoding)
    finally:
        if response is not None:
            response.close()

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)
        error = {
            'type': EventError.FETCH_INVALID_HTTP_CODE,
            'value': result[2],
            'url': expose_url(url),
        }
        raise CannotFetch(error)

    return UrlResult(url, result[0], result[1], result[2], result[3])
Example #37
def fetch_release_artifact(url, release, dist):
    """
    Get a release artifact either by extracting it or fetching it directly.

    If a release archive was saved, the individual file will be extracted
    from the archive.

    """
    cache_key, cache_key_meta = get_cache_keys(url, release, dist)

    result = cache.get(cache_key)

    if result == -1:  # Cached as unavailable
        return None

    if result:
        return result_from_cache(url, result)

    start = time.monotonic()

    release_file = fetch_release_archive(release, dist)
    if release_file is not None:
        try:
            archive = ReleaseArchive(release_file)
        except BaseException as exc:
            logger.error("Failed to initialize archive for release %s", release.id, exc_info=exc)
            # TODO(jjbayer): cache error and return here
        else:
            with archive:
                try:
                    fp, headers = get_from_archive(url, archive)
                except KeyError:
                    logger.debug(
                        "Release artifact %r not found in archive (release_id=%s)", url, release.id
                    )
                    cache.set(cache_key, -1, 60)
                    metrics.timing(
                        "sourcemaps.release_artifact_from_archive", time.monotonic() - start
                    )
                    return None
                except BaseException as exc:
                    logger.error("Failed to read %s from release %s", url, release.id, exc_info=exc)
                    # TODO(jjbayer): cache error and return here
                else:
                    result = fetch_and_cache_artifact(
                        url,
                        lambda: fp,
                        cache_key,
                        cache_key_meta,
                        headers,
                        # Cannot use `compress_file` because `ZipExtFile` does not support chunks
                        compress_fn=compress,
                    )
                    metrics.timing(
                        "sourcemaps.release_artifact_from_archive", time.monotonic() - start
                    )

                    return result

    # Fall back to maintain compatibility with old releases and versions of
    # sentry-cli which upload files individually
    result = fetch_release_file(url, release, dist)

    return result
Example #38
def fetch_file(url,
               project=None,
               release=None,
               dist=None,
               allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the database first (assuming there's a release on the
    event), then the internet. Caches the result of each of those two attempts
    separately, whether or not those attempts are successful. Used for both
    source files and source maps.
    """

    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == "...":
        raise http.CannotFetch({
            "type": EventError.JS_MISSING_SOURCE,
            "url": http.expose_url(url)
        })

    # if we've got a release to look on, try that first (incl associated cache)
    if release:
        with metrics.timer("sourcemaps.release_file"):
            result = fetch_release_file(url, release, dist)
    else:
        result = None

    # otherwise, try the web-scraping cache and then the web itself

    cache_key = "source:cache:v4:%s" % (md5_text(url).hexdigest(), )

    if result is None:
        if not allow_scraping or not url.startswith(("http:", "https:")):
            error = {
                "type": EventError.JS_MISSING_SOURCE,
                "url": http.expose_url(url)
            }
            raise http.CannotFetch(error)

        logger.debug("Checking cache for url %r", url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 4-tuple instead of a 5-tuple
            # (missing the encoding), so this is being maintained for
            # backwards compatibility
            try:
                encoding = result[4]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = http.UrlResult(result[0], result[1],
                                    zlib.decompress(result[2]), result[3],
                                    encoding)

    if result is None:
        headers = {}
        verify_ssl = False
        if project and is_valid_origin(url, project=project):
            verify_ssl = bool(project.get_option("sentry:verify_ssl", False))
            token = project.get_option("sentry:token")
            if token:
                token_header = project.get_option(
                    "sentry:token_header") or "X-Sentry-Token"
                headers[token_header] = token

        with metrics.timer("sourcemaps.fetch"):
            result = http.fetch_file(url,
                                     headers=headers,
                                     verify_ssl=verify_ssl)
            z_body = zlib.compress(result.body)
            cache.set(
                cache_key,
                (url, result.headers, z_body, result.status, result.encoding),
                get_max_age(result.headers),
            )

    # If we did not get a 200 OK we just raise a cannot fetch here.
    if result.status != 200:
        raise http.CannotFetch({
            "type": EventError.FETCH_INVALID_HTTP_CODE,
            "value": result.status,
            "url": http.expose_url(url),
        })

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result.body, six.binary_type):
        try:
            result = http.UrlResult(
                result.url,
                result.headers,
                result.body.encode("utf8"),
                result.status,
                result.encoding,
            )
        except UnicodeEncodeError:
            error = {
                "type": EventError.FETCH_INVALID_ENCODING,
                "value": "utf8",
                "url": http.expose_url(url),
            }
            raise http.CannotFetch(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but
    # this should catch 99% of cases
    if urlsplit(url).path.endswith(".js"):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result.body[:20].lstrip()

        if body_start[:1] == u"<":
            error = {"type": EventError.JS_INVALID_CONTENT, "url": url}
            raise http.CannotFetch(error)

    return result
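
fetch_file above derives the cache timeout for scraped sources from the response headers via get_max_age, which is not shown in these excerpts. A plausible sketch, under the assumption that it reads Cache-Control's max-age directive and clamps it between illustrative bounds (MIN_AGE and MAX_AGE are placeholder values, not the real limits):

import re

MIN_AGE = 60    # illustrative: never cache for less than a minute
MAX_AGE = 3600  # illustrative: never cache for more than an hour

MAX_AGE_RE = re.compile(r"max-age\s*=\s*(\d+)", re.IGNORECASE)

def get_max_age(headers):
    """Derive a cache TTL from a response's Cache-Control header."""
    cache_control = headers.get("cache-control", "")
    match = MAX_AGE_RE.search(cache_control)
    if not match:
        return MIN_AGE
    return min(max(int(match.group(1)), MIN_AGE), MAX_AGE)

# Example: get_max_age({"cache-control": "public, max-age=300"}) -> 300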
Example #39
0
    def get_send_to(self, project, event=None):
        """
        Returns a list of user IDs for the users that should receive
        notifications for the provided project.

        This result may come from cached data.
        """
        if not (project and project.teams.exists()):
            logger.debug("Tried to send notification to invalid project: %r",
                         project)
            return []

        if event:
            owners, _ = ProjectOwnership.get_owners(project.id, event.data)
            if owners != ProjectOwnership.Everyone:
                if not owners:
                    metrics.incr(
                        "features.owners.send_to",
                        tags={
                            "organization": project.organization_id,
                            "outcome": "empty"
                        },
                        skip_internal=True,
                    )
                    return []

                metrics.incr(
                    "features.owners.send_to",
                    tags={
                        "organization": project.organization_id,
                        "outcome": "match"
                    },
                    skip_internal=True,
                )
                send_to_list = set()
                teams_to_resolve = set()
                for owner in owners:
                    if owner.type == User:
                        send_to_list.add(owner.id)
                    else:
                        teams_to_resolve.add(owner.id)

                # get all users in teams
                if teams_to_resolve:
                    send_to_list |= set(
                        User.objects.filter(
                            is_active=True,
                            sentry_orgmember_set__organizationmemberteam__team__id__in=teams_to_resolve,
                        ).values_list("id", flat=True))

                alert_settings = project.get_member_alert_settings(
                    self.alert_option_key)
                disabled_users = set(
                    user for user, setting in alert_settings.items()
                    if setting == 0)
                return send_to_list - disabled_users
            else:
                metrics.incr(
                    "features.owners.send_to",
                    tags={
                        "organization": project.organization_id,
                        "outcome": "everyone"
                    },
                    skip_internal=True,
                )

        cache_key = "%s:send_to:%s" % (self.get_conf_key(), project.pk)
        send_to_list = cache.get(cache_key)
        if send_to_list is None:
            send_to_list = [s for s in self.get_sendable_users(project) if s]
            cache.set(cache_key, send_to_list, 60)  # 1 minute cache

        return send_to_list
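
The fallback branch above is a plain cache-aside read with a deliberately short TTL: the recipient list is recomputed at most once a minute per project, accepting up to a minute of staleness. A generic sketch of that pattern, where compute_fn and the in-memory cache are placeholders:

import time

_cache = {}  # key -> (value, expires_at); stand-in for a shared cache

def get_cached_list(key, compute_fn, ttl=60):
    """Cache-aside with a short TTL: results may be stale for at most ttl seconds."""
    entry = _cache.get(key)
    if entry is not None and entry[1] > time.time():
        return entry[0]
    value = [item for item in compute_fn() if item]  # drop falsy entries, as above
    _cache[key] = (value, time.time() + ttl)
    return value

# Usage: get_cached_list("mail:send_to:42", lambda: [1, None, 3]) -> [1, 3]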
Example #40
0
def lookup_frame_cache(keys):
    rv = {}
    for key in keys:
        rv[key] = cache.get(key)
    return rv
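
lookup_frame_cache issues one cache.get per key. Assuming a Django-style cache backend that exposes get_many, the same helper can usually be collapsed into a single round trip; a hedged sketch:

from django.core.cache import cache  # assumption: a Django-style cache backend

def lookup_frame_cache_batched(keys):
    # get_many returns a dict containing only the keys that were found;
    # fill in None for misses so callers see the same shape as above.
    found = cache.get_many(keys)
    return {key: found.get(key) for key in keys}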
Example #41
0
    def _get_or_create_impl(cls, project, version, date_added, metric_tags):
        from sentry.models import Project

        if date_added is None:
            date_added = timezone.now()

        cache_key = cls.get_cache_key(project.organization_id, version)

        release = cache.get(cache_key)

        if release in (None, -1):
            # TODO(dcramer): if the cache result is -1 we could attempt a
            # default create here instead of default get
            project_version = ("%s-%s" %
                               (project.slug, version))[:DB_VERSION_LENGTH]
            releases = list(
                cls.objects.filter(
                    organization_id=project.organization_id,
                    version__in=[version, project_version],
                    projects=project,
                ))

            if releases:
                try:
                    release = [
                        r for r in releases if r.version == project_version
                    ][0]
                except IndexError:
                    release = releases[0]
                metric_tags["created"] = "false"
            else:
                try:
                    with transaction.atomic():
                        release = cls.objects.create(
                            organization_id=project.organization_id,
                            version=version,
                            date_added=date_added,
                            total_deploys=0,
                        )

                    metric_tags["created"] = "true"
                except IntegrityError:
                    metric_tags["created"] = "false"
                    release = cls.objects.get(
                        organization_id=project.organization_id,
                        version=version)

                release.add_project(project)
                if not project.flags.has_releases:
                    project.flags.has_releases = True
                    project.update(
                        flags=F("flags").bitor(Project.flags.has_releases))

            # TODO(dcramer): upon creating a new release, check if it should be
            # the new "latest release" for this project
            cache.set(cache_key, release, 3600)
            metric_tags["cache_hit"] = "false"
        else:
            metric_tags["cache_hit"] = "true"

        return release
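
The create branch above follows the usual get-or-create race pattern: attempt the insert inside a transaction and treat IntegrityError as "another writer got there first", then fall back to reading the winner's row before caching the result. The same idea reduced to a self-contained sketch; every name here is a stand-in:

import threading

_registry = {}               # stand-in for a table with a unique constraint
_registry_lock = threading.Lock()
_cache = {}                  # stand-in for the shared cache

class IntegrityError(Exception):
    """Raised when a concurrent writer already created the row."""

def _db_create(key, value):
    with _registry_lock:
        if key in _registry:
            raise IntegrityError(key)
        _registry[key] = value
    return value

def get_or_create_cached(key, factory):
    obj = _cache.get(key)
    if obj is not None:
        return obj, False                     # cache hit
    try:
        obj, created = _db_create(key, factory()), True
    except IntegrityError:
        obj, created = _registry[key], False  # lost the race; reuse the winner's row
    _cache[key] = obj
    return obj, created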
Example #42
0
def fetch_url(url, project=None, release=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    cache_key = 'source:cache:v2:%s' % (md5(url).hexdigest(), )

    if release:
        result = fetch_release_file(url, release)
    else:
        result = None

    if result is None:
        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (md5(domain).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetchSource(domain_result)

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers['X-Sentry-Token'] = token

        logger.debug('Fetching %r from the internet', url)

        http_session = http.build_session()
        try:
            response = http_session.get(
                url,
                allow_redirects=True,
                verify=False,
                headers=headers,
                timeout=settings.SENTRY_SOURCE_FETCH_TIMEOUT,
            )
        except Exception as exc:
            logger.debug('Unable to fetch %r', url, exc_info=True)
            if isinstance(exc, SuspiciousOperation):
                error = {
                    'type': EventError.SECURITY_VIOLATION,
                    'value': unicode(exc),
                    'url': url,
                }
            elif isinstance(exc, (RequestException, ZeroReturnError)):
                error = {
                    'type': EventError.JS_GENERIC_FETCH_ERROR,
                    'value': str(type(exc)),
                    'url': url,
                }
            else:
                logger.exception(unicode(exc))
                error = {
                    'type': EventError.UNKNOWN_ERROR,
                    'url': url,
                }

            # TODO(dcramer): we want to be less aggressive on disabling domains
            cache.set(domain_key, error or '', 300)
            logger.warning('Disabling sources to %s for %ss',
                           domain,
                           300,
                           exc_info=True)
            raise CannotFetchSource(error)

        # requests attempts to use chardet internally when no encoding is found,
        # and we want to avoid that slow behavior
        if not response.encoding:
            response.encoding = 'utf-8'

        result = (
            {k.lower(): v
             for k, v in response.headers.items()},
            response.text,
            response.status_code,
        )
        cache.set(cache_key, result, 60)

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)
        error = {
            'type': EventError.JS_INVALID_HTTP_CODE,
            'value': result[2],
            'url': url,
        }
        raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1])
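
When a fetch fails, the code above pins an error record against the whole domain for five minutes so a flaky host cannot stall every subsequent event. A reduced sketch of that circuit-breaker idea; requests is assumed, the cache is an in-memory stand-in, and CannotFetch is an illustrative exception rather than the one used above:

import hashlib
import time
from urllib.parse import urlparse

import requests

_blocked = {}  # domain_key -> expiry timestamp; stand-in for a shared cache

class CannotFetch(Exception):
    pass

def fetch_with_domain_breaker(url, timeout=5, block_seconds=300):
    domain = urlparse(url).netloc
    domain_key = "source:blacklist:%s" % hashlib.md5(domain.encode("utf-8")).hexdigest()

    if _blocked.get(domain_key, 0) > time.time():
        raise CannotFetch("domain %s is temporarily disabled" % domain)

    try:
        return requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Pin the failure so the next few minutes of events skip this host.
        _blocked[domain_key] = time.time() + block_seconds
        raise CannotFetch("failed to fetch %s" % url)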
Example #43
0
def post_process_group(
    is_new, is_regression, is_new_group_environment, cache_key, group_id=None, **kwargs
):
    """
    Fires post processing hooks for a group.
    """
    from sentry.eventstore.models import Event
    from sentry.eventstore.processing import event_processing_store
    from sentry.reprocessing2 import is_reprocessed_event
    from sentry.utils import snuba

    with snuba.options_override({"consistent": True}):
        # We use the data being present/missing in the processing store
        # to ensure that we don't duplicate work should the forwarding consumers
        # need to rewind history.
        data = event_processing_store.get(cache_key)
        if not data:
            logger.info(
                "post_process.skipped",
                extra={"cache_key": cache_key, "reason": "missing_cache"},
            )
            return
        event = Event(
            project_id=data["project"], event_id=data["event_id"], group_id=group_id, data=data
        )

        set_current_event_project(event.project_id)

        is_reprocessed = is_reprocessed_event(event.data)

        # NOTE: we must pass through the full Event object, and not an
        # event_id since the Event object may not actually have been stored
        # in the database due to sampling.
        from sentry.models import Commit, EventDict, GroupInboxReason, Organization, Project
        from sentry.models.group import get_group_with_redirect
        from sentry.models.groupinbox import add_group_to_inbox
        from sentry.rules.processor import RuleProcessor
        from sentry.tasks.groupowner import process_suspect_commits
        from sentry.tasks.servicehooks import process_service_hook

        # Re-bind node data to avoid renormalization. We only want to
        # renormalize when loading old data from the database.
        event.data = EventDict(event.data, skip_renormalization=True)

        # Re-bind Project and Org since we're reading the Event object
        # from cache which may contain stale parent models.
        event.project = Project.objects.get_from_cache(id=event.project_id)
        event.project._organization_cache = Organization.objects.get_from_cache(
            id=event.project.organization_id
        )

        if event.group_id:
            # Re-bind Group since we're reading the Event object
            # from cache, which may contain a stale group and project
            event.group, _ = get_group_with_redirect(event.group_id)
            event.group_id = event.group.id

            event.group.project = event.project
            event.group.project._organization_cache = event.project._organization_cache

        bind_organization_context(event.project.organization)

        _capture_stats(event, is_new)

        if event.group_id and is_reprocessed and is_new:
            add_group_to_inbox(event.group, GroupInboxReason.REPROCESSED)

        if event.group_id and not is_reprocessed:
            # we process snoozes before rules as it might create a regression
            # but not if it's new because you can't immediately snooze a new group
            has_reappeared = False if is_new else process_snoozes(event.group)
            if not has_reappeared:  # If true, we added the .UNIGNORED reason already
                if is_new:
                    add_group_to_inbox(event.group, GroupInboxReason.NEW)
                elif is_regression:
                    add_group_to_inbox(event.group, GroupInboxReason.REGRESSION)

            handle_owner_assignment(event.project, event.group, event)

            rp = RuleProcessor(
                event, is_new, is_regression, is_new_group_environment, has_reappeared
            )
            has_alert = False
            # TODO(dcramer): ideally this would fanout, but serializing giant
            # objects back and forth isn't super efficient
            for callback, futures in rp.apply():
                has_alert = True
                safe_execute(callback, event, futures, _with_transaction=False)

            try:
                lock = locks.get(
                    f"w-o:{event.group_id}-d-l",
                    duration=10,
                )
                with lock.acquire():
                    has_commit_key = f"w-o:{event.project.organization_id}-h-c"
                    org_has_commit = cache.get(has_commit_key)
                    if org_has_commit is None:
                        org_has_commit = Commit.objects.filter(
                            organization_id=event.project.organization_id
                        ).exists()
                        cache.set(has_commit_key, org_has_commit, 3600)

                    if org_has_commit:
                        group_cache_key = f"w-o-i:g-{event.group_id}"
                        if cache.get(group_cache_key):
                            metrics.incr(
                                "sentry.tasks.process_suspect_commits.debounce",
                                tags={"detail": "w-o-i:g debounce"},
                            )
                        else:
                            from sentry.utils.committers import get_frame_paths

                            cache.set(group_cache_key, True, 604800)  # 1 week in seconds
                            event_frames = get_frame_paths(event.data)
                            process_suspect_commits.delay(
                                event_id=event.event_id,
                                event_platform=event.platform,
                                event_frames=event_frames,
                                group_id=event.group_id,
                                project_id=event.project_id,
                            )
            except UnableToAcquireLock:
                pass
            except Exception:
                logger.exception("Failed to process suspect commits")

            if features.has("projects:servicehooks", project=event.project):
                allowed_events = {"event.created"}
                if has_alert:
                    allowed_events.add("event.alert")

                if allowed_events:
                    for servicehook_id, events in _get_service_hooks(project_id=event.project_id):
                        if any(e in allowed_events for e in events):
                            process_service_hook.delay(servicehook_id=servicehook_id, event=event)

            from sentry.tasks.sentry_apps import process_resource_change_bound

            if event.get_event_type() == "error" and _should_send_error_created_hooks(
                event.project
            ):
                process_resource_change_bound.delay(
                    action="created", sender="Error", instance_id=event.event_id, instance=event
                )
            if is_new:
                process_resource_change_bound.delay(
                    action="created", sender="Group", instance_id=event.group_id
                )

            from sentry.plugins.base import plugins

            for plugin in plugins.for_project(event.project):
                plugin_post_process_group(
                    plugin_slug=plugin.slug, event=event, is_new=is_new, is_regresion=is_regression
                )

            from sentry import similarity

            safe_execute(similarity.record, event.project, [event], _with_transaction=False)

        if event.group_id:
            # Patch attachments that were ingested on the standalone path.
            update_existing_attachments(event)

        if not is_reprocessed:
            event_processed.send_robust(
                sender=post_process_group,
                project=event.project,
                event=event,
                primary_hash=kwargs.get("primary_hash"),
            )

        with metrics.timer("tasks.post_process.delete_event_cache"):
            event_processing_store.delete_by_key(cache_key)
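
The suspect-commit block above uses the cache itself as a debounce: a per-group key with a one-week TTL means the expensive task is scheduled at most once per group per week. The same idea in isolation; the cache is an in-memory stand-in, and on a real backend cache.add would give the same set-if-absent behavior atomically:

import time

_cache = {}  # key -> expiry; stand-in for a shared cache backend

def debounce(key, ttl):
    """Return True the first time a key is seen within its TTL window."""
    now = time.time()
    if _cache.get(key, 0) > now:
        return False          # already fired recently; skip the work
    _cache[key] = now + ttl   # on a real backend: cache.add(key, True, ttl)
    return True

# Usage: only the first call within a week actually schedules the task.
if debounce("w-o-i:g-1234", 604800):
    pass  # e.g. enqueue process_suspect_commits here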
Example #44
0
def fetch_file(url,
               project=None,
               release=None,
               dist=None,
               allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == '...':
        raise http.CannotFetch({
            'type': EventError.JS_MISSING_SOURCE,
            'url': http.expose_url(url),
        })
    if release:
        with metrics.timer('sourcemaps.release_file'):
            result = fetch_release_file(url, release, dist)
    else:
        result = None

    cache_key = 'source:cache:v4:%s' % (md5_text(url).hexdigest(), )

    if result is None:
        if not allow_scraping or not url.startswith(('http:', 'https:')):
            error = {
                'type': EventError.JS_MISSING_SOURCE,
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 4-tuple instead of a 5-tuple
            # (missing the encoding), so this is being maintained for
            # backwards compatibility
            try:
                encoding = result[4]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = http.UrlResult(result[0], result[1],
                                    zlib.decompress(result[2]), result[3],
                                    encoding)

    if result is None:
        headers = {}
        verify_ssl = False
        if project and is_valid_origin(url, project=project):
            verify_ssl = bool(project.get_option('sentry:verify_ssl', False))
            token = project.get_option('sentry:token')
            if token:
                token_header = project.get_option(
                    'sentry:token_header') or 'X-Sentry-Token'
                headers[token_header] = token

        with metrics.timer('sourcemaps.fetch'):
            result = http.fetch_file(url,
                                     headers=headers,
                                     verify_ssl=verify_ssl)
            z_body = zlib.compress(result.body)
            cache.set(
                cache_key,
                (url, result.headers, z_body, result.status, result.encoding),
                get_max_age(result.headers))

    # If we did not get a 200 OK we just raise a cannot fetch here.
    if result.status != 200:
        raise http.CannotFetch({
            'type': EventError.FETCH_INVALID_HTTP_CODE,
            'value': result.status,
            'url': http.expose_url(url),
        })

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result.body, six.binary_type):
        try:
            result = http.UrlResult(result.url, result.headers,
                                    result.body.encode('utf8'), result.status,
                                    result.encoding)
        except UnicodeEncodeError:
            error = {
                'type': EventError.FETCH_INVALID_ENCODING,
                'value': 'utf8',
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but
    # this should catch 99% of cases
    if url.endswith('.js'):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result.body[:20].lstrip()

        if body_start[:1] == u'<':
            error = {
                'type': EventError.JS_INVALID_CONTENT,
                'url': url,
            }
            raise http.CannotFetch(error)

    return result
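
Both fetch_file variants guard against HTML error pages being cached as JavaScript by sniffing the first non-whitespace byte instead of trusting the Content-Type header. That check, extracted into a tiny illustrative helper:

def looks_like_html(body: bytes) -> bool:
    """True if the first non-whitespace character is '<' (HTML, never valid JS/JSON)."""
    return body[:20].lstrip()[:1] == b"<"

# looks_like_html(b"  <!doctype html><html>...")  -> True
# looks_like_html(b"function f() { return 1; }")  -> False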
Example #45
0
def fetch_release_file(filename, release, dist=None):
    """
    Attempt to retrieve a release artifact from the database.

    Caches the result of that attempt (whether successful or not).
    """

    dist_name = dist and dist.name or None
    cache_key = "releasefile:v1:%s:%s" % (
        release.id, ReleaseFile.get_ident(filename, dist_name))

    logger.debug("Checking cache for release artifact %r (release_id=%s)",
                 filename, release.id)
    result = cache.get(cache_key)

    # not in the cache (meaning we haven't checked the database recently), so check the database
    if result is None:
        filename_choices = ReleaseFile.normalize(filename)
        filename_idents = [
            ReleaseFile.get_ident(f, dist_name) for f in filename_choices
        ]

        logger.debug(
            "Checking database for release artifact %r (release_id=%s)",
            filename, release.id)

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release, dist=dist,
                ident__in=filename_idents).select_related("file"))

        if len(possible_files) == 0:
            logger.debug(
                "Release artifact %r not found in database (release_id=%s)",
                filename, release.id)
            cache.set(cache_key, -1, 60)
            return None

        elif len(possible_files) == 1:
            releasefile = possible_files[0]

        else:
            # Pick first one that matches in priority order.
            # This is O(N*M) but there are only ever at most 4 things here
            # so not really worth optimizing.
            releasefile = next((rf for ident in filename_idents
                                for rf in possible_files if rf.ident == ident))

        logger.debug("Found release artifact %r (id=%s, release_id=%s)",
                     filename, releasefile.id, release.id)
        try:
            with metrics.timer("sourcemaps.release_file_read"):
                with ReleaseFile.cache.getfile(releasefile) as fp:
                    z_body, body = compress_file(fp)
        except Exception:
            logger.error("sourcemap.compress_read_failed",
                         exc_info=sys.exc_info())
            result = None
        else:
            headers = {
                k.lower(): v
                for k, v in releasefile.file.headers.items()
            }
            encoding = get_encoding_from_headers(headers)
            result = http.UrlResult(filename, headers, body, 200, encoding)
            # This will implicitly skip too large payloads. Those will be cached
            # on the file system by `ReleaseFile.cache`, instead.
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    # in the cache as an unsuccessful attempt
    elif result == -1:
        result = None

    # in the cache as a successful attempt, including the zipped contents of the file
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = http.UrlResult(filename, result[0],
                                zlib.decompress(result[1]), result[2],
                                encoding)

    return result
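
The cached tuple above stores the body zlib-compressed so large source files stay within the cache value limit, and the body is decompressed on read before it becomes a UrlResult again. A round-trip sketch of just that encode/decode step; pack_for_cache and unpack_from_cache are illustrative names:

import zlib

def pack_for_cache(headers, body, status, encoding):
    # Compress only the body; headers/status/encoding are stored as-is.
    return (headers, zlib.compress(body), status, encoding)

def unpack_from_cache(entry):
    headers, z_body, status, encoding = entry
    return (headers, zlib.decompress(z_body), status, encoding)

# Round trip:
packed = pack_for_cache({"content-type": "application/javascript"}, b"var x=1;", 200, "utf-8")
assert unpack_from_cache(packed)[1] == b"var x=1;"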
Example #46
0
def fetch_release_file(filename, release, dist=None):
    """
    Attempt to retrieve a release artifact from the database.

    Caches the result of that attempt (whether successful or not).
    """
    dist_name = dist and dist.name or None
    cache_key, cache_key_meta = get_cache_keys(filename, release, dist)

    logger.debug("Checking cache for release artifact %r (release_id=%s)", filename, release.id)
    result = cache.get(cache_key)

    # not in the cache (meaning we haven't checked the database recently), so check the database
    if result is None:
        with metrics.timer("sourcemaps.release_artifact_from_file"):
            filename_choices = ReleaseFile.normalize(filename)
            filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices]

            logger.debug(
                "Checking database for release artifact %r (release_id=%s)", filename, release.id
            )

            possible_files = list(
                ReleaseFile.objects.filter(
                    release=release, dist=dist, ident__in=filename_idents
                ).select_related("file")
            )

            if len(possible_files) == 0:
                logger.debug(
                    "Release artifact %r not found in database (release_id=%s)",
                    filename,
                    release.id,
                )
                cache.set(cache_key, -1, 60)
                return None

            elif len(possible_files) == 1:
                releasefile = possible_files[0]

            else:
                # Pick first one that matches in priority order.
                # This is O(N*M) but there are only ever at most 4 things here
                # so not really worth optimizing.
                releasefile = next(
                    rf for ident in filename_idents for rf in possible_files if rf.ident == ident
                )

            logger.debug(
                "Found release artifact %r (id=%s, release_id=%s)",
                filename,
                releasefile.id,
                release.id,
            )

            result = fetch_and_cache_artifact(
                filename,
                lambda: ReleaseFile.cache.getfile(releasefile),
                cache_key,
                cache_key_meta,
                releasefile.file.headers,
                compress_file,
            )

    # in the cache as an unsuccessful attempt
    elif result == -1:
        result = None

    # in the cache as a successful attempt, including the zipped contents of the file
    else:
        result = result_from_cache(filename, result)

    return result
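
fetch_and_cache_artifact itself does not appear in these excerpts. Purely as a sketch of what a helper with that call signature might plausibly do (open the file via the supplied callable, compress it, write the cache entry, and return the result); the Django cache import and every behavioral detail here are assumptions, not the real implementation:

from django.core.cache import cache  # assumption: a Django-style cache backend

def fetch_and_cache_artifact(filename, open_file, cache_key, cache_key_meta,
                             headers, compress_fn, ttl=3600):
    """Hypothetical sketch only; not the actual helper used above."""
    headers = {k.lower(): v for k, v in headers.items()}
    with open_file() as fp:
        # Mirrors `z_body, body = compress_file(fp)` from the earlier example.
        z_body, body = compress_fn(fp)
    encoding = headers.get("content-encoding")
    # cache_key_meta is accepted for signature parity; metadata handling is out of scope here.
    cache.set(cache_key, (headers, z_body, 200, encoding), ttl)
    return (filename, headers, body, 200, encoding)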
Example #47
0
    def expand_frames(self, frames):
        last_state = None
        state = None
        has_changes = False

        cache = self.cache
        sourcemaps = self.sourcemaps

        for frame in frames:
            errors = cache.get_errors(frame.abs_path)
            if errors:
                has_changes = True

            frame.errors = errors

            source = cache.get(frame.abs_path)
            if source is None:
                logger.info('No source found for %s', frame.abs_path)
                continue

            sourcemap_url, sourcemap_idx = sourcemaps.get_link(frame.abs_path)
            if sourcemap_idx and frame.colno is not None:
                last_state = state
                state = find_source(sourcemap_idx, frame.lineno, frame.colno)
                abs_path = urljoin(sourcemap_url, state.src)

                logger.debug('Mapping compressed source %r to mapping in %r',
                             frame.abs_path, abs_path)
                source = cache.get(abs_path)
                if not source:
                    frame.data = {
                        'sourcemap': sourcemap_url,
                    }
                    errors = cache.get_errors(abs_path)
                    if errors:
                        frame.errors.extend(errors)
                    else:
                        frame.errors.append(
                            ERR_MISSING_SOURCE.format(
                                filename=abs_path.encode('utf-8'), ))

                # Store original data in annotation
                frame.data = {
                    'orig_lineno': frame.lineno,
                    'orig_colno': frame.colno,
                    'orig_function': frame.function,
                    'orig_abs_path': frame.abs_path,
                    'orig_filename': frame.filename,
                    'sourcemap': sourcemap_url,
                }

                # SourceMaps return zero-indexed linenos
                frame.lineno = state.src_line + 1
                frame.colno = state.src_col
                # The offending function is always the previous function in the stack
                # Honestly, no idea what the bottom most frame is, so we're ignoring that atm
                if last_state:
                    frame.function = last_state.name or frame.function
                else:
                    frame.function = state.name or frame.function
                frame.abs_path = abs_path
                frame.filename = state.src
                frame.module = generate_module(state.src)

            elif sourcemap_url:
                frame.data = {
                    'sourcemap': sourcemap_url,
                }

            # TODO: theoretically a minified source could point to another mapped, minified source
            frame.pre_context, frame.context_line, frame.post_context = get_source_context(
                source=source, lineno=frame.lineno, colno=frame.colno or 0)
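
Two details in expand_frames are easy to miss: the mapped source path from the sourcemap is resolved relative to the sourcemap's own URL, and sourcemap line numbers are zero-indexed while stack frames are one-indexed. Both shown in isolation with only the standard library:

from urllib.parse import urljoin

# Resolving a mapped source path against the sourcemap URL:
sourcemap_url = "https://example.com/static/js/app.min.js.map"
assert urljoin(sourcemap_url, "../src/app.js") == "https://example.com/static/src/app.js"

# Converting a zero-indexed sourcemap line to a one-indexed frame line:
src_line = 41                 # as stored in the sourcemap token
frame_lineno = src_line + 1   # 42, what the stack frame should display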
Example #48
0
    def get_from_cache(self, **kwargs: Any) -> M:
        """
        Wrapper around QuerySet.get which supports caching of the
        intermediate value.  Callee is responsible for making sure
        the cache key is cleared on save.
        """
        if not self.cache_fields or len(kwargs) > 1:
            raise ValueError(
                "We cannot cache this query. Just hit the database.")

        key, value = next(iter(kwargs.items()))
        pk_name = self.model._meta.pk.name
        if key == "pk":
            key = pk_name

        # We store everything by key references (vs instances)
        if isinstance(value, Model):
            value = value.pk

        # Kill __exact since it's the default behavior
        if key.endswith("__exact"):
            key = key.split("__exact", 1)[0]

        if key in self.cache_fields or key == pk_name:
            cache_key = self.__get_lookup_cache_key(**{key: value})
            local_cache = self._get_local_cache()
            if local_cache is not None:
                result = local_cache.get(cache_key)
                if result is not None:
                    return result

            retval = cache.get(cache_key, version=self.cache_version)
            if retval is None:
                result = self.get(**kwargs)
                # Ensure we're pushing it into the cache
                self.__post_save(instance=result)
                if local_cache is not None:
                    local_cache[cache_key] = result
                return result

            # If we didn't look up by pk we need to hit the reffed
            # key
            if key != pk_name:
                result = self.get_from_cache(**{pk_name: retval})
                if local_cache is not None:
                    local_cache[cache_key] = result
                return result

            if not isinstance(retval, self.model):
                if settings.DEBUG:
                    raise ValueError(
                        "Unexpected value type returned from cache")
                logger.error("Cache response returned invalid value %r",
                             retval)
                return self.get(**kwargs)

            if key == pk_name and int(value) != retval.pk:
                if settings.DEBUG:
                    raise ValueError("Unexpected value returned from cache")
                logger.error("Cache response returned invalid value %r",
                             retval)
                return self.get(**kwargs)

            retval._state.db = router.db_for_read(self.model, **kwargs)

            # Explicitly typing to satisfy mypy.
            r: M = retval
            return r
        else:
            raise ValueError(
                "We cannot cache this query. Just hit the database.")
Example #49
0
def fetch_url(url, project=None, release=None):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    cache_key = 'source:cache:v2:%s' % (hashlib.md5(
        url.encode('utf-8')).hexdigest(), )

    if release:
        result = fetch_release_file(url, release)
    else:
        result = None

    if result is None:
        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:%s' % (hashlib.md5(
            domain.encode('utf-8')).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            raise DomainBlacklisted(
                ERR_DOMAIN_BLACKLISTED.format(reason=domain_result, ))

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers['X-Sentry-Token'] = token

        logger.debug('Fetching %r from the internet', url)

        http_session = http.build_session()
        try:
            response = http_session.get(
                url,
                allow_redirects=True,
                verify=False,
                headers=headers,
                timeout=settings.SENTRY_SOURCE_FETCH_TIMEOUT,
            )
        except Exception as exc:
            logger.debug('Unable to fetch %r', url, exc_info=True)
            if isinstance(exc, SuspiciousOperation):
                error = unicode(exc)
            elif isinstance(exc, RequestException):
                error = ERR_GENERIC_FETCH_FAILURE.format(type=type(exc), )
            else:
                logger.exception(unicode(exc))
                error = ERR_UNKNOWN_INTERNAL_ERROR

            # TODO(dcramer): we want to be less aggressive on disabling domains
            cache.set(domain_key, error or '', 300)
            logger.warning('Disabling sources to %s for %ss',
                           domain,
                           300,
                           exc_info=True)
            raise CannotFetchSource(error)

        result = (
            {k.lower(): v
             for k, v in response.headers.items()},
            response.content,
            response.status_code,
        )
        cache.set(cache_key, result, 60)

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)
        error = ERR_HTTP_CODE.format(status_code=result[2], )
        raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1])
Example #50
0
def fetch_file(url, project=None, release=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    if release:
        result = fetch_release_file(url, release)
    elif not allow_scraping or not url.startswith(('http:', 'https:')):
        error = {
            'type': EventError.JS_MISSING_SOURCE,
            'url': url,
        }
        raise CannotFetchSource(error)
    else:
        result = None

    cache_key = 'source:cache:v3:%s' % (md5(url).hexdigest(), )

    if result is None:
        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            body = zlib.decompress(result[1])
            result = (result[0], force_text(body), result[2])

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (md5(domain).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetchSource(domain_result)

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                headers['X-Sentry-Token'] = token

        logger.debug('Fetching %r from the internet', url)

        http_session = http.build_session()
        try:
            response = http_session.get(
                url,
                allow_redirects=True,
                verify=False,
                headers=headers,
                timeout=settings.SENTRY_SOURCE_FETCH_TIMEOUT,
            )
        except Exception as exc:
            logger.debug('Unable to fetch %r', url, exc_info=True)
            if isinstance(exc, RestrictedIPAddress):
                error = {
                    'type': EventError.RESTRICTED_IP,
                    'url': url,
                }
            elif isinstance(exc, SuspiciousOperation):
                error = {
                    'type': EventError.SECURITY_VIOLATION,
                    'url': url,
                }
            elif isinstance(exc, (RequestException, ZeroReturnError)):
                error = {
                    'type': EventError.JS_GENERIC_FETCH_ERROR,
                    'value': str(type(exc)),
                    'url': url,
                }
            else:
                logger.exception(unicode(exc))
                error = {
                    'type': EventError.UNKNOWN_ERROR,
                    'url': url,
                }

            # TODO(dcramer): we want to be less aggressive on disabling domains
            cache.set(domain_key, error or '', 300)
            logger.warning('Disabling sources to %s for %ss',
                           domain,
                           300,
                           exc_info=True)
            raise CannotFetchSource(error)

        # requests attempts to use chardet internally when no encoding is found,
        # and we want to avoid that slow behavior
        if not response.encoding:
            response.encoding = 'utf-8'

        body = response.text
        z_body = zlib.compress(force_bytes(body))
        headers = {k.lower(): v for k, v in response.headers.items()}

        cache.set(cache_key, (headers, z_body, response.status_code), 60)
        result = (headers, body, response.status_code)

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)
        error = {
            'type': EventError.JS_INVALID_HTTP_CODE,
            'value': result[2],
            'url': url,
        }
        raise CannotFetchSource(error)

    # Make sure the file we're getting back is unicode, if it's not,
    # it's either some encoding that we don't understand, or it's binary
    # data which we can't process.
    if not isinstance(result[1], unicode):
        try:
            result = (result[0], result[1].decode('utf8'), result[2])
        except UnicodeDecodeError:
            error = {
                'type': EventError.JS_INVALID_SOURCE_ENCODING,
                'value': 'utf8',
                'url': url,
            }
            raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1])
Example #51
0
def fetch_release_archive_for_url(release, dist, url) -> Optional[IO]:
    """Fetch release archive and cache if possible.

    Multiple archives might have been uploaded, so we need the URL
    to get the correct archive from the artifact index.

    If return value is not empty, the caller is responsible for closing the stream.
    """
    with sentry_sdk.start_span(
            op="fetch_release_archive_for_url.get_index_entry"):
        info = get_index_entry(release, dist, url)
    if info is None:
        # Cannot write negative cache entry here because ID of release archive
        # is not yet known
        return None

    archive_ident = info["archive_ident"]

    # TODO(jjbayer): Could already extract filename from info and return
    # it later

    cache_key = get_release_file_cache_key(release_id=release.id,
                                           releasefile_ident=archive_ident)

    result = cache.get(cache_key)

    if result == -1:
        return None
    elif result:
        return BytesIO(result)
    else:
        try:
            with sentry_sdk.start_span(
                    op="fetch_release_archive_for_url.get_releasefile_db_entry"
            ):
                qs = ReleaseFile.objects.filter(
                    release_id=release.id,
                    dist_id=dist.id if dist else dist,
                    ident=archive_ident).select_related("file")
                releasefile = qs[0]
        except IndexError:
            # This should not happen when there is an archive_ident in the manifest
            logger.error("sourcemaps.missing_archive", exc_info=sys.exc_info())
            # Cache as nonexistent:
            cache.set(cache_key, -1, 60)
            return None
        else:
            try:
                with sentry_sdk.start_span(
                        op="fetch_release_archive_for_url.fetch_releasefile"):
                    if releasefile.file.size <= options.get(
                            "releasefile.cache-max-archive-size"):
                        getfile = lambda: ReleaseFile.cache.getfile(releasefile)
                    else:
                        # For very large ZIP archives, pulling the entire file into cache takes too long.
                        # Only the blobs required to extract the current artifact (central directory and the file entry itself)
                        # should be loaded in this case.
                        getfile = releasefile.file.getfile

                    file_ = fetch_retry_policy(getfile)
            except Exception:
                logger.error("sourcemaps.read_archive_failed",
                             exc_info=sys.exc_info())

                return None

            # `cache.set` will only keep values up to a certain size,
            # so we should not read the entire file if it's too large for caching
            if CACHE_MAX_VALUE_SIZE is not None and file_.size > CACHE_MAX_VALUE_SIZE:

                return file_

            with sentry_sdk.start_span(
                    op="fetch_release_archive_for_url.read_for_caching"
            ) as span:
                span.set_data("file_size", file_.size)
                contents = file_.read()
            with sentry_sdk.start_span(
                    op="fetch_release_archive_for_url.write_to_cache") as span:
                span.set_data("file_size", len(contents))
                cache.set(cache_key, contents, 3600)

            file_.seek(0)

            return file_
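
fetch_release_archive_for_url only reads the archive into memory and writes it to the cache when the file is small enough; oversized archives are handed back as a plain file handle instead. That decision reduced to a sketch, where the size limit is an illustrative constant and cache_set is a placeholder for the real cache call:

CACHE_MAX_VALUE_SIZE = 10 * 1024 * 1024  # illustrative limit, not the real setting

def maybe_cache_file(file_obj, size, cache_set, cache_key, ttl=3600):
    """Cache small files whole; hand back large ones untouched."""
    if CACHE_MAX_VALUE_SIZE is not None and size > CACHE_MAX_VALUE_SIZE:
        return file_obj                  # too big: skip the cache entirely
    contents = file_obj.read()
    cache_set(cache_key, contents, ttl)  # small enough: cache the raw bytes
    file_obj.seek(0)                     # rewind so the caller can still read it
    return file_obj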