Example 1
    def _save(self, name, content):
        with metrics.timer('filestore.save', instance='s3'):
            cleaned_name = self._clean_name(name)
            name = self._normalize_name(cleaned_name)
            parameters = self.object_parameters.copy()
            content_type = getattr(
                content, 'content_type', mimetypes.guess_type(name)[0] or self.default_content_type
            )

            # setting the content_type in the key object is not enough.
            parameters.update({'ContentType': content_type})

            if self.gzip and content_type in self.gzip_content_types:
                content = self._compress_content(content)
                parameters.update({'ContentEncoding': 'gzip'})

            encoded_name = self._encode_name(name)
            obj = self.bucket.Object(encoded_name)
            if self.preload_metadata:
                self._entries[encoded_name] = obj

            self._save_content(obj, content, parameters=parameters)
            # Note: In boto3, after a put, last_modified is automatically reloaded
            # the next time it is accessed; no need to specifically reload it.
        return cleaned_name
Example 2
File: base.py Project: DZTPY/sentry
 def _wrapped(*args, **kwargs):
     key = 'jobs.duration.{name}'.format(name=name)
     if stat_suffix:
         key += '.{key}'.format(key=stat_suffix(*args, **kwargs))
     with metrics.timer(key):
         result = func(*args, **kwargs)
     return result
Example 3
    def process(self, data):
        stacktraces = self.get_stacktraces(data)
        if not stacktraces:
            logger.debug('No stacktrace for event %r', data['event_id'])
            return

        # TODO(dcramer): we need this to do more than just sourcemaps
        frames = self.get_valid_frames(stacktraces)
        if not frames:
            logger.debug('Event %r has no frames with enough context to fetch remote source', data['event_id'])
            return

        data.setdefault('errors', [])
        errors = data['errors']

        release = self.get_release(data)
        # all of these methods assume mutation on the original
        # objects rather than re-creation
        self.populate_source_cache(frames, release)
        with metrics.timer('sourcemaps.expand_frames'):
            expand_errors, sourcemap_applied = self.expand_frames(frames, release)
        errors.extend(expand_errors or [])
        self.ensure_module_names(frames)
        self.fix_culprit(data, stacktraces)
        self.update_stacktraces(stacktraces)
        if sourcemap_applied:
            self.add_raw_stacktraces(data, release)
        return data
Example 4
    def normalize(self):
        with metrics.timer('events.store.normalize.duration'):
            self._normalize_impl()

        metrics.timing(
            'events.store.normalize.errors',
            len(self._data.get("errors") or ()),
        )
Example 5
 def _wrapped(*args, **kwargs):
     key = 'jobs.duration'
     if stat_suffix:
         instance = '{}.{}'.format(name, stat_suffix(*args, **kwargs))
     else:
         instance = name
     with metrics.timer(key, instance=instance):
         result = func(*args, **kwargs)
     return result
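The `_wrapped` closures shown here (and in Examples 2, 11, 13, 15, 16 and 18) are excerpts from a task decorator: `name`, `stat_suffix` and `func` are captured from the enclosing scope. A minimal sketch of such a decorator factory, assuming only the `metrics.timer` context manager used throughout these examples (the factory name `instrumented` is hypothetical, not Sentry's actual API):

import functools

from sentry.utils import metrics


def instrumented(name, stat_suffix=None):
    """Hypothetical decorator factory timing each call under 'jobs.duration'."""
    def decorator(func):
        @functools.wraps(func)
        def _wrapped(*args, **kwargs):
            key = 'jobs.duration'
            if stat_suffix:
                instance = '{}.{}'.format(name, stat_suffix(*args, **kwargs))
            else:
                instance = name
            # Time the wrapped call; the instance distinguishes tasks sharing the key.
            with metrics.timer(key, instance=instance):
                return func(*args, **kwargs)
        return _wrapped
    return decorator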
Example 6
 def _get_file(self):
     if self._file is None:
         with metrics.timer('filestore.read', instance='s3'):
             self._file = BytesIO()
             if 'r' in self._mode:
                 self._is_dirty = False
                 self._file.write(self.obj.get()['Body'].read())
                 self._file.seek(0)
             if self._storage.gzip and self.obj.content_encoding == 'gzip':
                 self._file = GzipFile(mode=self._mode, fileobj=self._file, mtime=0.0)
     return self._file
Example 7
    def _save(self, name, content):
        with metrics.timer('filestore.save', instance='gcs'):
            cleaned_name = clean_name(name)
            name = self._normalize_name(cleaned_name)

            content.name = cleaned_name
            encoded_name = self._encode_name(name)
            file = GoogleCloudFile(encoded_name, 'w', self)
            content.seek(0, os.SEEK_SET)
            file.blob.upload_from_file(content, size=content.size,
                                       content_type=file.mime_type)
        return cleaned_name
Example 8
def test_timer_success():
    with mock.patch('sentry.utils.metrics.timing') as timing:
        with timer('key', tags={'foo': True}) as tags:
            tags['bar'] = False

        assert timing.call_count == 1
        args, kwargs = timing.call_args
        assert args[0] == 'key'
        assert args[3] == {
            'foo': True,
            'bar': False,
            'result': 'success',
        }
Example 9
def test_timer_failure():
    with mock.patch('sentry.utils.metrics.timing') as timing:
        with pytest.raises(ExpectedError):
            with timer('key', tags={'foo': True}) as tags:
                raise ExpectedError

        assert timing.call_count == 1
        args, kwargs = timing.call_args
        assert args[0] == 'key'
        assert args[3] == {
            'foo': True,
            'result': 'failure',
        }
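The two tests above pin down the contract of `timer`: it yields a mutable tags dict, adds a `result` tag of `success` or `failure` depending on whether the block raised, and reports the elapsed time through `timing()` (the function the tests mock). A minimal sketch consistent with those assertions, not Sentry's actual implementation; the positional `timing(key, value, instance, tags)` call is what makes the `args[0]`/`args[3]` checks pass:

import time
from contextlib import contextmanager


def timing(key, value, instance=None, tags=None):
    """Stand-in for sentry.utils.metrics.timing, which the tests above mock."""


@contextmanager
def timer(key, instance=None, tags=None):
    # Yield a mutable dict so callers can attach extra tags inside the block.
    current_tags = dict(tags or {})
    start = time.monotonic()
    try:
        yield current_tags
    except Exception:
        current_tags['result'] = 'failure'
        raise
    else:
        current_tags['result'] = 'success'
    finally:
        timing(key, time.monotonic() - start, instance, current_tags)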
Example 10
 def _get_file(self):
     if self._file is None:
         with metrics.timer('filestore.read', instance='gcs'):
             self._file = SpooledTemporaryFile(
                 max_size=self._storage.max_memory_size,
                 suffix=".GSStorageFile",
                 dir=None,
             )
             if 'r' in self._mode:
                 self._is_dirty = False
                 self.blob.download_to_file(self._file)
                 self._file.seek(0)
     return self._file
Example 11
 def _wrapped(*args, **kwargs):
     key = 'jobs.duration'
     if stat_suffix:
         instance = '{}.{}'.format(name, stat_suffix(*args, **kwargs))
     else:
         instance = name
     Raven.tags_context({'task_name': name})
     with metrics.timer(key, instance=instance):
         try:
             result = func(*args, **kwargs)
         finally:
             Raven.context.clear()
     return result
Example 12
def preprocess_event(data):
    if settings.SENTRY_SCRAPE_JAVASCRIPT_CONTEXT:
        project = Project.objects.get_from_cache(id=data["project"])

        allow_scraping = bool(project.get_option("sentry:scrape_javascript", True))

        processor = SourceProcessor(project=project, allow_scraping=allow_scraping)
        with metrics.timer("sourcemaps.process", instance=project.id):
            processor.process(data)

    rewrite_exception(data)

    inject_device_data(data)

    return data
Example 13
        def _wrapped(*args, **kwargs):
            # TODO(dcramer): we want to tag a transaction ID, but overriding
            # the base on app.task seems to cause problems w/ Celery internals
            transaction_id = kwargs.pop("__transaction_id", None)

            key = "jobs.duration"
            if stat_suffix:
                instance = "{}.{}".format(name, stat_suffix(*args, **kwargs))
            else:
                instance = name
            Raven.tags_context({"task_name": name, "transaction_id": transaction_id})
            with metrics.timer(key, instance=instance), track_memory_usage("jobs.memory_change", instance=instance):
                try:
                    result = func(*args, **kwargs)
                finally:
                    Raven.context.clear()
            return result
Example 14
def process_stacktraces(data, make_processors=None):
    infos = find_stacktraces_in_data(data)
    if make_processors is None:
        processors = get_processors_for_stacktraces(data, infos)
    else:
        processors = make_processors(data, infos)

    # Early out if we have no processors.  We don't want to record a timer
    # in that case.
    if not processors:
        return

    changed = False

    mkey = get_metrics_key(infos)
    with metrics.timer(mkey, instance=data['project']):
        # Build a new processing task
        processing_task = get_stacktrace_processing_task(infos, processors)

        # Preprocess step
        for processor in processing_task.iter_processors():
            if processor.preprocess_step(processing_task):
                changed = True

        # Process all stacktraces
        for stacktrace_info, processable_frames in processing_task.iter_processable_stacktraces():
            new_frames, new_raw_frames, errors = process_single_stacktrace(
                processing_task, stacktrace_info, processable_frames)
            if new_frames is not None:
                stacktrace_info.stacktrace['frames'] = new_frames
                changed = True
            if new_raw_frames is not None and \
               stacktrace_info.container is not None:
                stacktrace_info.container['raw_stacktrace'] = dict(
                    stacktrace_info.stacktrace, frames=new_raw_frames)
                changed = True
            if errors:
                data.setdefault('errors', []).extend(errors)
                changed = True

        # Close down everything
        for processor in processors:
            processor.close()

    if changed:
        return data
Example 15
 def _wrapped(*args, **kwargs):
     key = 'jobs.duration'
     if stat_suffix:
         instance = '{}.{}'.format(name, stat_suffix(*args, **kwargs))
     else:
         instance = name
     Raven.tags_context({'task_name': name})
     with metrics.timer(key, instance=instance), \
             track_memory_usage('jobs.memory_change', instance=instance):
         try:
             return func(*args, **kwargs)
         except SoftTimeLimitExceeded as error:
             Raven.context.merge({
                 'fingerprint': [type(error).__name__, instance],
             })
             raise
         finally:
             Raven.context.clear()
Example 16
        def _wrapped(*args, **kwargs):
            # TODO(dcramer): we want to tag a transaction ID, but overriding
            # the base on app.task seems to cause problems w/ Celery internals
            transaction_id = kwargs.pop('__transaction_id', None)

            key = 'jobs.duration'
            if stat_suffix:
                instance = u'{}.{}'.format(name, stat_suffix(*args, **kwargs))
            else:
                instance = name

            with push_scope() as scope:
                scope.set_tag('task_name', name)
                scope.set_tag('transaction_id', transaction_id)

                with metrics.timer(key, instance=instance), \
                        track_memory_usage('jobs.memory_change', instance=instance):
                    result = func(*args, **kwargs)
            return result
Example 17
def process_stacktraces(data, make_processors=None):
    infos = find_stacktraces_in_data(data)
    if make_processors is None:
        processors = get_processors_for_stacktraces(data, infos)
    else:
        processors = make_processors(data, infos)

    # Early out if we have no processors.  We don't want to record a timer
    # in that case.
    if not processors:
        return

    changed = False

    mkey = get_metrics_key(infos)

    with metrics.timer(mkey, instance=data['project']):
        for processor in processors:
            if processor.preprocess_related_data():
                changed = True

        for stacktrace_info in infos:
            new_stacktrace, raw_stacktrace, errors = process_single_stacktrace(
                stacktrace_info, processors)
            if new_stacktrace is not None:
                stacktrace_info.stacktrace.clear()
                stacktrace_info.stacktrace.update(new_stacktrace)
                changed = True
            if raw_stacktrace is not None and \
               stacktrace_info.container is not None:
                stacktrace_info.container['raw_stacktrace'] = raw_stacktrace
                changed = True
            if errors:
                data.setdefault('errors', []).extend(errors)
                changed = True

        for processor in processors:
            processor.close()

    if changed:
        return data
Example 18
        def _wrapped(*args, **kwargs):
            # TODO(dcramer): we want to tag a transaction ID, but overriding
            # the base on app.task seems to cause problems w/ Celery internals
            transaction_id = kwargs.pop('__transaction_id', None)

            key = 'jobs.duration'
            if stat_suffix:
                instance = '{}.{}'.format(name, stat_suffix(*args, **kwargs))
            else:
                instance = name
            Raven.tags_context({
                'task_name': name,
                'transaction_id': transaction_id,
            })
            with metrics.timer(key, instance=instance), \
                    track_memory_usage('jobs.memory_change', instance=instance), \
                    SqlQueryCountMonitor(name):
                try:
                    result = func(*args, **kwargs)
                finally:
                    Raven.context.clear()
            return result
Example 19
def fetch_release_file(filename, release, dist=None):
    """
    Attempt to retrieve a release artifact from the database.

    Caches the result of that attempt (whether successful or not).
    """
    dist_name = dist and dist.name or None
    cache_key, cache_key_meta = get_cache_keys(filename, release, dist)

    logger.debug("Checking cache for release artifact %r (release_id=%s)",
                 filename, release.id)
    result = cache.get(cache_key)

    # not in the cache (meaning we haven't checked the database recently), so check the database
    if result is None:
        with metrics.timer("sourcemaps.release_artifact_from_file"):
            filename_choices = ReleaseFile.normalize(filename)
            filename_idents = [
                ReleaseFile.get_ident(f, dist_name) for f in filename_choices
            ]

            logger.debug(
                "Checking database for release artifact %r (release_id=%s)",
                filename, release.id)

            possible_files = list(
                ReleaseFile.objects.filter(
                    release_id=release.id,
                    dist_id=dist.id if dist else dist,
                    ident__in=filename_idents,
                ).select_related("file"))

            if len(possible_files) == 0:
                logger.debug(
                    "Release artifact %r not found in database (release_id=%s)",
                    filename,
                    release.id,
                )
                cache.set(cache_key, -1, 60)
                return None

            elif len(possible_files) == 1:
                releasefile = possible_files[0]

            else:
                # Pick first one that matches in priority order.
                # This is O(N*M) but there are only ever at most 4 things here
                # so not really worth optimizing.
                releasefile = next(rf for ident in filename_idents
                                   for rf in possible_files
                                   if rf.ident == ident)

            logger.debug(
                "Found release artifact %r (id=%s, release_id=%s)",
                filename,
                releasefile.id,
                release.id,
            )

            result = fetch_and_cache_artifact(
                filename,
                lambda: ReleaseFile.cache.getfile(releasefile),
                cache_key,
                cache_key_meta,
                releasefile.file.headers,
                compress_file,
            )

    # in the cache as an unsuccessful attempt
    elif result == -1:
        result = None

    # in the cache as a successful attempt, including the zipped contents of the file
    else:
        result = result_from_cache(filename, result)

    return result
Example 20
    def _post(self, request):
        relay = request.relay
        assert relay is not None  # should be provided during Authentication

        full_config_requested = request.relay_request_data.get("fullConfig")

        if full_config_requested and not relay.is_internal:
            return Response("Relay unauthorized for full config information",
                            403)

        with Hub.current.start_span(op="relay_fetch_projects"):
            project_ids = set(request.relay_request_data.get("projects") or ())
            if project_ids:
                with metrics.timer(
                        "relay_project_configs.fetching_projects.duration"):
                    projects = {
                        p.id: p
                        for p in Project.objects.get_many_from_cache(
                            project_ids)
                    }
            else:
                projects = {}

        with Hub.current.start_span(op="relay_fetch_orgs"):
            # Preload all organizations and their options to prevent repeated
            # database access when computing the project configuration.
            org_ids = set(project.organization_id
                          for project in six.itervalues(projects))
            if org_ids:
                with metrics.timer(
                        "relay_project_configs.fetching_orgs.duration"):
                    orgs = {
                        o.id: o
                        for o in Organization.objects.get_many_from_cache(
                            org_ids) if request.relay.has_org_access(o)
                    }
            else:
                orgs = {}
            org_options = {
                i: OrganizationOption.objects.get_all_values(i)
                for i in six.iterkeys(orgs)
            }

        with Hub.current.start_span(op="relay_fetch_keys"):
            project_keys = {}
            for key in ProjectKey.objects.get_many_from_cache(
                    project_ids, key="project_id"):
                project_keys.setdefault(key.project_id, []).append(key)

        metrics.timing("relay_project_configs.projects_requested",
                       len(project_ids))
        metrics.timing("relay_project_configs.projects_fetched", len(projects))
        metrics.timing("relay_project_configs.orgs_fetched", len(orgs))

        configs = {}
        for project_id in project_ids:
            configs[six.text_type(project_id)] = None

            project = projects.get(int(project_id))
            if project is None:
                continue

            organization = orgs.get(project.organization_id)
            if organization is None:
                continue

            project.organization = organization
            org_opts = org_options.get(organization.id) or {}

            with Hub.current.start_span(op="get_config"):
                with metrics.timer(
                        "relay_project_configs.get_config.duration"):
                    project_config = config.get_project_config(
                        project,
                        org_options=org_opts,
                        full_config=full_config_requested,
                        project_keys=project_keys.get(project.id, []),
                    )

            configs[six.text_type(project_id)] = project_config.to_dict()

        return Response({"configs": configs}, status=200)
Example 21
    def _flush_batch(self, batch: Sequence[Message]):
        attachment_chunks = []

        # Processing functions may be either synchronous or asynchronous.
        # Functions that return an ``AsyncResult`` may perform a combination of
        # synchronous and asynchronous work, and need to be explicitly waited on
        # to ensure they have completed and callbacks have been invoked before
        # returning. Functions that return anything else are assumed to have
        # completed successfully after they have returned.
        other_messages: MutableSequence[
            Tuple[Callable[[Message, Mapping[int, Project]], Union[Any, AsyncResult]], Message]
        ] = []

        projects_to_fetch = set()

        with metrics.timer("ingest_consumer.prepare_messages"):
            for message in batch:
                message_type = message["type"]
                projects_to_fetch.add(message["project_id"])

                if message_type == "event":
                    other_messages.append((self.__process_event, message))
                elif message_type == "attachment_chunk":
                    attachment_chunks.append(message)
                elif message_type == "attachment":
                    other_messages.append(
                        (process_individual_attachment, message))
                elif message_type == "user_report":
                    other_messages.append((process_userreport, message))
                else:
                    raise ValueError(f"Unknown message type: {message_type}")
                metrics.incr("ingest_consumer.flush.messages_seen",
                             tags={"message_type": message_type})

        with metrics.timer("ingest_consumer.fetch_projects"):
            projects = {
                p.id: p
                for p in Project.objects.get_many_from_cache(projects_to_fetch)
            }

        if attachment_chunks:
            # attachment_chunk messages need to be processed before attachment/event messages.
            with metrics.timer(
                    "ingest_consumer.process_attachment_chunk_batch"):
                for attachment_chunk in attachment_chunks:
                    process_attachment_chunk(attachment_chunk,
                                             projects=projects)

        if other_messages:
            with metrics.timer("ingest_consumer.process_other_messages_batch"):
                other_messages_flush_start = time.monotonic()

                # Keep a mapping of futures to their metadata so that we can
                # easily associate a future with its callback once completed.
                results: MutableMapping["Future[Any]", "AsyncResult[Any]"] = {}

                # Execute synchronous tasks and dispatch asynchronous tasks.
                for processing_func, message in other_messages:
                    result = processing_func(message, projects)
                    if isinstance(result, AsyncResult):
                        results[result.future] = result

                # Wait for any asynchronous work to be completed, invoking
                # callbacks (on the main thread) as results are ready.
                for future in as_completed(results.keys()):
                    results[future].callback(future)

                metrics.timing(
                    "ingest_consumer.process_other_messages_batch.normalized",
                    (time.monotonic() - other_messages_flush_start) /
                    len(other_messages),
                )
Example 22
    def handle_message(self, message):
        """
        Parses the value from Kafka, and if valid passes the payload to the callback defined by the
        subscription. If the subscription has been removed, or no longer has a valid callback then
        just log metrics/errors and continue.
        :param message:
        :return:
        """
        with sentry_sdk.push_scope() as scope:
            try:
                with metrics.timer(
                        "snuba_query_subscriber.parse_message_value"):
                    contents = self.parse_message_value(message.value())
            except InvalidMessageError:
                # If the message is in an invalid format, just log the error
                # and continue
                logger.exception(
                    "Subscription update could not be parsed",
                    extra={
                        "offset": message.offset(),
                        "partition": message.partition(),
                        "value": message.value(),
                    },
                )
                return
            scope.set_tag("query_subscription_id", contents["subscription_id"])

            try:
                with metrics.timer(
                        "snuba_query_subscriber.fetch_subscription"):
                    subscription = QuerySubscription.objects.get_from_cache(
                        subscription_id=contents["subscription_id"])
                    if subscription.status != QuerySubscription.Status.ACTIVE.value:
                        metrics.incr(
                            "snuba_query_subscriber.subscription_inactive")
                        return
            except QuerySubscription.DoesNotExist:
                metrics.incr(
                    "snuba_query_subscriber.subscription_doesnt_exist")
                logger.error(
                    "Received subscription update, but subscription does not exist",
                    extra={
                        "offset": message.offset(),
                        "partition": message.partition(),
                        "value": message.value(),
                    },
                )
                try:
                    _delete_from_snuba(self.topic_to_dataset[message.topic()],
                                       contents["subscription_id"])
                except Exception:
                    logger.exception(
                        "Failed to delete unused subscription from snuba.")
                return

            if subscription.type not in subscriber_registry:
                metrics.incr(
                    "snuba_query_subscriber.subscription_type_not_registered")
                logger.error(
                    "Received subscription update, but no subscription handler registered",
                    extra={
                        "offset": message.offset(),
                        "partition": message.partition(),
                        "value": message.value(),
                    },
                )
                return

            logger.info(
                "query-subscription-consumer.handle_message",
                extra={
                    "timestamp": contents["timestamp"],
                    "query_subscription_id": contents["subscription_id"],
                    "project_id": subscription.project_id,
                    "subscription_dataset": subscription.snuba_query.dataset,
                    "subscription_query": subscription.snuba_query.query,
                    "subscription_aggregation":
                    subscription.snuba_query.aggregate,
                    "subscription_time_window":
                    subscription.snuba_query.time_window,
                    "subscription_resolution":
                    subscription.snuba_query.resolution,
                    "offset": message.offset(),
                    "partition": message.partition(),
                    "value": message.value(),
                },
            )

            callback = subscriber_registry[subscription.type]
            with sentry_sdk.start_span(
                    op="process_message") as span, metrics.timer(
                        "snuba_query_subscriber.callback.duration",
                        instance=subscription.type):
                span.set_data("payload", contents)
                callback(contents, subscription)
Example 23
def fetch_release_file(filename, release):
    cache_key = 'releasefile:v1:%s:%s' % (
        release.id,
        md5_text(filename).hexdigest(),
    )

    filename_path = None
    if filename is not None:
        # Reconstruct url without protocol + host
        # e.g. http://example.com/foo?bar => ~/foo?bar
        parsed_url = urlparse(filename)
        filename_path = '~' + parsed_url.path
        if parsed_url.query:
            filename_path += '?' + parsed_url.query

    logger.debug('Checking cache for release artifact %r (release_id=%s)',
                 filename, release.id)
    result = cache.get(cache_key)

    if result is None:
        logger.debug('Checking database for release artifact %r (release_id=%s)',
                     filename, release.id)

        filename_idents = [ReleaseFile.get_ident(filename)]
        if filename_path is not None and filename_path != filename:
            filename_idents.append(ReleaseFile.get_ident(filename_path))

        possible_files = list(ReleaseFile.objects.filter(
            release=release,
            ident__in=filename_idents,
        ).select_related('file'))

        if len(possible_files) == 0:
            logger.debug('Release artifact %r not found in database (release_id=%s)',
                         filename, release.id)
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Prioritize releasefile that matches full url (w/ host)
            # over hostless releasefile
            target_ident = filename_idents[0]
            releasefile = next((f for f in possible_files if f.ident == target_ident))

        logger.debug('Found release artifact %r (id=%s, release_id=%s)',
                     filename, releasefile.id, release.id)
        try:
            with metrics.timer('sourcemaps.release_file_read'):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception as e:
            logger.exception(six.text_type(e))
            cache.set(cache_key, -1, 3600)
            result = None
        else:
            try:
                result = (releasefile.file.headers, body.decode('utf-8'), 200)
            except UnicodeDecodeError:
                error = {
                    'type': EventError.JS_INVALID_SOURCE_ENCODING,
                    'value': 'utf8',
                    'url': expose_url(releasefile.name),
                }
                raise CannotFetchSource(error)
            else:
                # Write the compressed version to cache, but return the deflated version
                cache.set(cache_key, (releasefile.file.headers, z_body, 200), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # We got a cache hit, but the body is compressed, so we
        # need to decompress it before handing it off
        body = zlib.decompress(result[1])
        try:
            result = (result[0], body.decode('utf-8'), result[2])
        except UnicodeDecodeError:
            error = {
                'type': EventError.JS_INVALID_SOURCE_ENCODING,
                'value': 'utf8',
                'url': expose_url(releasefile.name),
            }
            raise CannotFetchSource(error)

    return result
Example 24
    def _post_by_project(self, request, full_config_requested):
        project_ids = set(request.relay_request_data.get("projects") or ())

        with start_span(op="relay_fetch_projects"):
            if project_ids:
                with metrics.timer(
                        "relay_project_configs.fetching_projects.duration"):
                    projects = {
                        p.id: p
                        for p in Project.objects.get_many_from_cache(
                            project_ids)
                    }
            else:
                projects = {}

        with start_span(op="relay_fetch_orgs"):
            # Preload all organizations and their options to prevent repeated
            # database access when computing the project configuration.
            org_ids = set(project.organization_id
                          for project in six.itervalues(projects))
            if org_ids:
                with metrics.timer(
                        "relay_project_configs.fetching_orgs.duration"):
                    orgs = Organization.objects.get_many_from_cache(org_ids)
                    orgs = {
                        o.id: o
                        for o in orgs if request.relay.has_org_access(o)
                    }
            else:
                orgs = {}

            with metrics.timer(
                    "relay_project_configs.fetching_org_options.duration"):
                for org_id in six.iterkeys(orgs):
                    OrganizationOption.objects.get_all_values(org_id)

        with start_span(op="relay_fetch_keys"):
            project_keys = {}
            for key in ProjectKey.objects.filter(project_id__in=project_ids):
                project_keys.setdefault(key.project_id, []).append(key)

        metrics.timing("relay_project_configs.projects_requested",
                       len(project_ids))
        metrics.timing("relay_project_configs.projects_fetched", len(projects))
        metrics.timing("relay_project_configs.orgs_fetched", len(orgs))

        configs = {}
        for project_id in project_ids:
            configs[six.text_type(project_id)] = {"disabled": True}

            project = projects.get(int(project_id))
            if project is None:
                continue

            organization = orgs.get(project.organization_id)
            if organization is None:
                continue

            # Try to prevent organization from being fetched again in quotas.
            project.organization = organization
            project._organization_cache = organization

            with start_span(op="get_config"):
                with metrics.timer(
                        "relay_project_configs.get_config.duration"):
                    project_config = config.get_project_config(
                        project,
                        full_config=full_config_requested,
                        project_keys=project_keys.get(project.id) or [],
                    )

            configs[six.text_type(project_id)] = project_config.to_dict()

        if full_config_requested:
            projectconfig_cache.set_many(configs)

        return Response({"configs": configs}, status=200)
Example 25
 def get_or_create(cls, project, version, date_added=None):
     with metrics.timer("models.release.get_or_create") as metric_tags:
         return cls._get_or_create_impl(project, version, date_added, metric_tags)
Example 26
def process_suspect_commits(event_id, event_platform, event_frames, group_id, project_id, **kwargs):
    metrics.incr("sentry.tasks.process_suspect_commits.start")
    set_current_event_project(project_id)

    project = Project.objects.get_from_cache(id=project_id)
    owners = GroupOwner.objects.filter(
        group_id=group_id,
        project=project,
        organization_id=project.organization_id,
        type=GroupOwnerType.SUSPECT_COMMIT.value,
    )
    owner_count = owners.count()
    if owner_count >= PREFERRED_GROUP_OWNERS:
        owners = owners.filter(date_added__lte=timezone.now() - PREFERRED_GROUP_OWNER_AGE).order_by(
            "-date_added"
        )
        if not owners.exists():
            metrics.incr(
                "sentry.tasks.process_suspect_commits.aborted",
                tags={"detail": "maxed_owners_none_old"},
            )
            return

    with metrics.timer("sentry.tasks.process_suspect_commits.process_loop"):
        try:
            with metrics.timer(
                "sentry.tasks.process_suspect_commits.get_serialized_event_file_committers"
            ):
                committers = get_event_file_committers(
                    project, group_id, event_frames, event_platform
                )
            owner_scores = {}
            for committer in committers:
                if "id" in committer["author"]:
                    author_id = committer["author"]["id"]
                    for commit, score in committer["commits"]:
                        if score >= MIN_COMMIT_SCORE:
                            owner_scores[author_id] = max(score, owner_scores.get(author_id, 0))

            if owner_scores:
                for owner_id in sorted(owner_scores, reverse=True, key=owner_scores.get)[
                    :PREFERRED_GROUP_OWNERS
                ]:
                    go, created = GroupOwner.objects.update_or_create(
                        group_id=group_id,
                        type=GroupOwnerType.SUSPECT_COMMIT.value,
                        user_id=owner_id,
                        project=project,
                        organization_id=project.organization_id,
                        defaults={
                            "date_added": timezone.now()
                        },  # Updates date of an existing owner, since we just matched them with this new event
                    )
                    if created:
                        owner_count += 1
                        if owner_count > PREFERRED_GROUP_OWNERS:
                            try:
                                owner = owners[0]
                            except IndexError:
                                pass
                            else:
                                owner.delete()

        except Commit.DoesNotExist:
            logger.info(
                "process_suspect_commits.skipped",
                extra={"event": event_id, "reason": "no_commit"},
            )
        except Release.DoesNotExist:
            logger.info(
                "process_suspect_commits.skipped",
                extra={"event": event_id, "reason": "no_release"},
            )
Example 27
    def normalize(self):
        with metrics.timer("events.store.normalize.duration"):
            self._normalize_impl()

        metrics.timing("events.store.normalize.errors",
                       len(self._data.get("errors") or ()))
Example 28
 def _get_event_user(self, project, data):
     with metrics.timer("event_manager.get_event_user") as metrics_tags:
         return self._get_event_user_impl(project, data, metrics_tags)
Example 29
    def run_post_process_forwarder(
        self,
        consumer_group,
        commit_log_topic,
        synchronize_commit_group,
        commit_batch_size=100,
        initial_offset_reset="latest",
    ):
        logger.debug("Starting post-process forwarder...")

        cluster_name = settings.KAFKA_TOPICS[settings.KAFKA_EVENTS]["cluster"]

        consumer = SynchronizedConsumer(
            cluster_name=cluster_name,
            consumer_group=consumer_group,
            commit_log_topic=commit_log_topic,
            synchronize_commit_group=synchronize_commit_group,
            initial_offset_reset=initial_offset_reset,
        )

        owned_partition_offsets = {}

        def commit(partitions):
            results = consumer.commit(offsets=partitions, asynchronous=False)

            errors = [i for i in results if i.error is not None]
            if errors:
                raise Exception(
                    "Failed to commit {}/{} partitions: {!r}".format(
                        len(errors), len(partitions), errors))

            return results

        def on_assign(consumer, partitions):
            logger.debug("Received partition assignment: %r", partitions)

            for i in partitions:
                if i.offset == OFFSET_INVALID:
                    updated_offset = None
                elif i.offset < 0:
                    raise Exception(
                        f"Received unexpected negative offset during partition assignment: {i!r}"
                    )
                else:
                    updated_offset = i.offset

                key = (i.topic, i.partition)
                previous_offset = owned_partition_offsets.get(key, None)
                if previous_offset is not None and previous_offset != updated_offset:
                    logger.warning(
                        "Received new offset for owned partition %r, will overwrite previous stored offset %r with %r.",
                        key,
                        previous_offset,
                        updated_offset,
                    )

                owned_partition_offsets[key] = updated_offset

        def on_revoke(consumer, partitions):
            logger.debug("Revoked partition assignment: %r", partitions)

            offsets_to_commit = []

            for i in partitions:
                key = (i.topic, i.partition)

                try:
                    offset = owned_partition_offsets.pop(key)
                except KeyError:
                    logger.warning(
                        "Received unexpected partition revocation for unowned partition: %r",
                        i,
                        exc_info=True,
                    )
                    continue

                if offset is None:
                    logger.debug(
                        "Skipping commit of unprocessed partition: %r", i)
                    continue

                offsets_to_commit.append(
                    TopicPartition(i.topic, i.partition, offset))

            if offsets_to_commit:
                logger.debug(
                    "Committing offset(s) for %s revoked partition(s): %r",
                    len(offsets_to_commit),
                    offsets_to_commit,
                )
                commit(offsets_to_commit)

        consumer.subscribe([self.topic],
                           on_assign=on_assign,
                           on_revoke=on_revoke)

        def commit_offsets():
            offsets_to_commit = []
            for (topic, partition), offset in owned_partition_offsets.items():
                if offset is None:
                    logger.debug(
                        "Skipping commit of unprocessed partition: %r",
                        (topic, partition))
                    continue

                offsets_to_commit.append(
                    TopicPartition(topic, partition, offset))

            if offsets_to_commit:
                logger.debug(
                    "Committing offset(s) for %s owned partition(s): %r",
                    len(offsets_to_commit),
                    offsets_to_commit,
                )
                commit(offsets_to_commit)

        try:
            i = 0
            while True:
                message = consumer.poll(0.1)
                if message is None:
                    continue

                error = message.error()
                if error is not None:
                    raise Exception(error)

                key = (message.topic(), message.partition())
                if key not in owned_partition_offsets:
                    logger.warning(
                        "Skipping message for unowned partition: %r", key)
                    continue

                i = i + 1
                owned_partition_offsets[key] = message.offset() + 1

                with metrics.timer("eventstream.duration",
                                   instance="get_task_kwargs_for_message"):
                    task_kwargs = get_task_kwargs_for_message(message.value())

                if task_kwargs is not None:
                    with metrics.timer(
                            "eventstream.duration",
                            instance="dispatch_post_process_group_task"):
                        self._dispatch_post_process_group_task(**task_kwargs)

                if i % commit_batch_size == 0:
                    commit_offsets()
        except KeyboardInterrupt:
            pass

        logger.debug("Committing offsets and closing consumer...")
        commit_offsets()

        consumer.close()
Example 30
    def save(self,
             project_id,
             raw=False,
             assume_normalized=False,
             cache_key=None):
        """
        We re-insert events with duplicate IDs into Snuba, which is responsible
        for deduplicating events. Since deduplication in Snuba is on the primary
        key (based on event ID, project ID and day), events with same IDs are only
        deduplicated if their timestamps fall on the same day. The latest event
        always wins and overwrites the value of events received earlier in that day.

        Since we increment counters and frequencies here before events get inserted
        to eventstream these numbers may be larger than the total number of
        events if we receive duplicate event IDs that fall on the same day
        (that do not hit cache first).
        """

        # Normalize if needed
        if not self._normalized:
            if not assume_normalized:
                self.normalize()
            self._normalized = True

        data = self._data

        with metrics.timer("event_manager.save.project.get_from_cache"):
            project = Project.objects.get_from_cache(id=project_id)

        with metrics.timer("event_manager.save.organization.get_from_cache"):
            project._organization_cache = Organization.objects.get_from_cache(
                id=project.organization_id)

        # Pull out the culprit
        culprit = self.get_culprit()

        # Pull the toplevel data we're interested in
        level = data.get("level")

        # TODO(mitsuhiko): this code path should be gone by July 2018.
        # This is going to be fine because no code actually still depends
        # on integers here.  When we need an integer it will be converted
        # into one later.  Old workers used to send integers here.
        if level is not None and isinstance(level, six.integer_types):
            level = LOG_LEVELS[level]

        transaction_name = data.get("transaction")
        logger_name = data.get("logger")
        release = data.get("release")
        dist = data.get("dist")
        environment = data.get("environment")
        recorded_timestamp = data.get("timestamp")

        # We need to swap out the data with the one internal to the newly
        # created event object
        event = self._get_event_instance(project_id=project_id)
        self._data = data = event.data.data

        event._project_cache = project

        date = event.datetime
        platform = event.platform
        event_id = event.event_id

        if transaction_name:
            transaction_name = force_text(transaction_name)

        # Right now the event type is the signal to skip the group. This
        # is going to change a lot.
        if event.get_event_type() == "transaction":
            issueless_event = True
        else:
            issueless_event = False

        # Some of the data that are toplevel attributes are duplicated
        # into tags (logger, level, environment, transaction).  These are
        # different from legacy attributes which are normalized into tags
        # ahead of time (site, server_name).
        setdefault_path(data, "tags", value=[])
        set_tag(data, "level", level)
        if logger_name:
            set_tag(data, "logger", logger_name)
        if environment:
            set_tag(data, "environment", environment)
        if transaction_name:
            set_tag(data, "transaction", transaction_name)

        if release:
            # don't allow a conflicting 'release' tag
            pop_tag(data, "release")
            release = Release.get_or_create(project=project,
                                            version=release,
                                            date_added=date)
            set_tag(data, "sentry:release", release.version)

        if dist and release:
            dist = release.add_dist(dist, date)
            # don't allow a conflicting 'dist' tag
            pop_tag(data, "dist")
            set_tag(data, "sentry:dist", dist.name)
        else:
            dist = None

        event_user = self._get_event_user(project, data)
        if event_user:
            # don't allow a conflicting 'user' tag
            pop_tag(data, "user")
            set_tag(data, "sentry:user", event_user.tag_value)

        with metrics.timer("event_manager.load_grouping_config"):
            # At this point we want to normalize the in_app values in case the
            # clients did not set this appropriately so far.
            grouping_config = load_grouping_config(
                get_grouping_config_dict_for_event_data(data, project))

        with metrics.timer("event_manager.normalize_stacktraces_for_grouping"):
            normalize_stacktraces_for_grouping(data, grouping_config)

        with metrics.timer("event_manager.plugins"):
            for plugin in plugins.for_project(project, version=None):
                added_tags = safe_execute(plugin.get_tags,
                                          event,
                                          _with_transaction=False)
                if added_tags:
                    # plugins should not override user provided tags
                    for key, value in added_tags:
                        if get_tag(data, key) is None:
                            set_tag(data, key, value)

        with metrics.timer("event_manager.set_tags"):
            for path, iface in six.iteritems(event.interfaces):
                for k, v in iface.iter_tags():
                    set_tag(data, k, v)
                # Get rid of ephemeral interface data
                if iface.ephemeral:
                    data.pop(iface.path, None)

        with metrics.timer("event_manager.apply_server_fingerprinting"):
            # The active grouping config was put into the event in the
            # normalize step before.  We now also make sure that the
            # fingerprint was set to `'{{ default }}'` just in case someone
            # removed it from the payload.  The call to get_hashes will then
            # look at `grouping_config` to pick the right parameters.
            data["fingerprint"] = data.get("fingerprint") or ["{{ default }}"]
            apply_server_fingerprinting(
                data, get_fingerprinting_config_for_project(project))

        with metrics.timer("event_manager.event.get_hashes"):
            # Here we try to use the grouping config that was requested in the
            # event.  If that config has since been deleted (because it was an
            # experimental grouping config) we fall back to the default.
            try:
                hashes = event.get_hashes()
            except GroupingConfigNotFound:
                data["grouping_config"] = get_grouping_config_dict_for_project(
                    project)
                hashes = event.get_hashes()

        data["hashes"] = hashes

        with metrics.timer("event_manager.materialize_metadata"):
            # we want to freeze not just the metadata and type in but also the
            # derived attributes.  The reason for this is that we push this
            # data into kafka for snuba processing and our postprocessing
            # picks up the data right from the snuba topic.  For most usage
            # however the data is dynamically overridden by Event.title and
            # Event.location (See Event.as_dict)
            materialized_metadata = self.materialize_metadata()
            data.update(materialized_metadata)
            data["culprit"] = culprit

        received_timestamp = event.data.get("received") or float(
            event.datetime.strftime("%s"))

        if not issueless_event:
            # The group gets the same metadata as the event when it's flushed but
            # additionally the `last_received` key is set.  This key is used by
            # _save_aggregate.
            group_metadata = dict(materialized_metadata)
            group_metadata["last_received"] = received_timestamp
            kwargs = {
                "platform": platform,
                "message": event.search_message,
                "culprit": culprit,
                "logger": logger_name,
                "level": LOG_LEVELS_MAP.get(level),
                "last_seen": date,
                "first_seen": date,
                "active_at": date,
                "data": group_metadata,
            }

            if release:
                kwargs["first_release"] = release

            try:
                group, is_new, is_regression = self._save_aggregate(
                    event=event, hashes=hashes, release=release, **kwargs)
            except HashDiscarded:
                event_discarded.send_robust(project=project,
                                            sender=EventManager)

                metrics.incr(
                    "events.discarded",
                    skip_internal=True,
                    tags={
                        "organization_id": project.organization_id,
                        "platform": platform
                    },
                )
                raise
            event.group = group
        else:
            group = None
            is_new = False
            is_regression = False

        with metrics.timer("event_manager.event_saved_signal"):
            event_saved.send_robust(project=project,
                                    event_size=event.size,
                                    sender=EventManager)

        # store a reference to the group id to guarantee validation of isolation
        event.data.bind_ref(event)

        environment = Environment.get_or_create(project=project,
                                                name=environment)

        if group:
            group_environment, is_new_group_environment = GroupEnvironment.get_or_create(
                group_id=group.id,
                environment_id=environment.id,
                defaults={"first_release": release if release else None},
            )
        else:
            is_new_group_environment = False

        if release:
            ReleaseEnvironment.get_or_create(project=project,
                                             release=release,
                                             environment=environment,
                                             datetime=date)

            ReleaseProjectEnvironment.get_or_create(project=project,
                                                    release=release,
                                                    environment=environment,
                                                    datetime=date)

            if group:
                grouprelease = GroupRelease.get_or_create(
                    group=group,
                    release=release,
                    environment=environment,
                    datetime=date)

        counters = [(tsdb.models.project, project.id)]

        if group:
            counters.append((tsdb.models.group, group.id))

        if release:
            counters.append((tsdb.models.release, release.id))

        with metrics.timer("event_manager.tsdb_incr_group_and_release_counters"
                           ) as metrics_tags:
            metrics_tags["has_group"] = "true" if group else "false"
            tsdb.incr_multi(counters,
                            timestamp=event.datetime,
                            environment_id=environment.id)

        frequencies = []

        if group:
            frequencies.append((tsdb.models.frequent_environments_by_group, {
                group.id: {
                    environment.id: 1
                }
            }))

            if release:
                frequencies.append((tsdb.models.frequent_releases_by_group, {
                    group.id: {
                        grouprelease.id: 1
                    }
                }))
        if frequencies:
            tsdb.record_frequency_multi(frequencies, timestamp=event.datetime)

        if group:
            UserReport.objects.filter(project=project,
                                      event_id=event_id).update(
                                          group=group, environment=environment)

        # Ensure the _metrics key exists. This is usually created during
        # ingestion and prefilled with ingestion sizes.
        event_metrics = event.data.get("_metrics") or {}
        event.data["_metrics"] = event_metrics

        # Capture the actual size that goes into node store.
        event_metrics["bytes.stored.event"] = len(
            json.dumps(dict(event.data.items())))

        if not issueless_event:
            # Load attachments first, but persist them at the very last after
            # posting to eventstream to make sure all counters and eventstream are
            # incremented for sure.
            attachments = self.get_attachments(cache_key, event)
            for attachment in attachments:
                key = "bytes.stored.%s" % (attachment.type, )
                event_metrics[key] = (event_metrics.get(key) or 0) + len(
                    attachment.data)

        with metrics.timer("event_manager.nodestore.save"):
            # Write the event to Nodestore
            event.data.save()

        if event_user:
            counters = [(tsdb.models.users_affected_by_project, project.id,
                         (event_user.tag_value, ))]

            if group:
                counters.append((tsdb.models.users_affected_by_group, group.id,
                                 (event_user.tag_value, )))

            with metrics.timer("event_manager.tsdb_record_users_affected"
                               ) as metrics_tags:
                metrics_tags["has_group"] = "true" if group else "false"
                tsdb.record_multi(counters,
                                  timestamp=event.datetime,
                                  environment_id=environment.id)

        if release:
            if is_new:
                buffer.incr(
                    ReleaseProject,
                    {"new_groups": 1},
                    {
                        "release_id": release.id,
                        "project_id": project.id
                    },
                )
            if is_new_group_environment:
                buffer.incr(
                    ReleaseProjectEnvironment,
                    {"new_issues_count": 1},
                    {
                        "project_id": project.id,
                        "release_id": release.id,
                        "environment_id": environment.id,
                    },
                )

        if not raw:
            if not project.first_event:
                project.update(first_event=date)
                first_event_received.send_robust(project=project,
                                                 event=event,
                                                 sender=Project)

        with metrics.timer("event_manager.eventstream.insert"):
            eventstream.insert(
                group=group,
                event=event,
                is_new=is_new,
                is_regression=is_regression,
                is_new_group_environment=is_new_group_environment,
                primary_hash=hashes[0],
                # We are choosing to skip consuming the event back
                # in the eventstream if it's flagged as raw.
                # This means that we want to publish the event
                # through the event stream, but we don't care
                # about post processing and handling the commit.
                skip_consume=raw,
            )

        if not issueless_event:
            # Do this last to ensure signals get emitted even if connection to the
            # file store breaks temporarily.
            self.save_attachments(attachments, event)

        metric_tags = {"from_relay": "_relay_processed" in self._data}

        metrics.timing("events.latency",
                       received_timestamp - recorded_timestamp,
                       tags=metric_tags)
        metrics.timing("events.size.data.post_save",
                       event.size,
                       tags=metric_tags)
        metrics.incr(
            "events.post_save.normalize.errors",
            amount=len(self._data.get("errors") or ()),
            tags=metric_tags,
        )

        return event
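The save path above leans on three metrics helpers: metrics.timer as a context manager that yields a mutable tag dict, plus metrics.timing and metrics.incr for one-off measurements. Below is a minimal sketch of that tagging pattern, assuming the same metrics module is importable as sentry.utils.metrics; the metric names, the JSON payload and the counters argument are illustrative only.

import json

from sentry.utils import metrics


def store_with_instrumentation(event_data, counters):
    # timer() yields a mutable dict; tags added before the block exits
    # are attached to the emitted timing metric.
    with metrics.timer("example.store.duration") as tags:
        tags["has_counters"] = "true" if counters else "false"
        payload = json.dumps(event_data)  # stand-in for the real nodestore write

    # Point-in-time measurements and plain counters.
    metrics.timing("example.store.size", len(payload))
    metrics.incr("example.store.count", amount=1)
    return payload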
Esempio n. 31
0
 def flush_batch(self, batch):
     mark_scope_as_unsafe()
     with metrics.timer("ingest_consumer.flush_batch"):
         return self._flush_batch(batch)
Esempio n. 32
0
    def _post_by_key(self, request, full_config_requested):
        public_keys = request.relay_request_data.get("publicKeys")
        public_keys = set(public_keys or ())

        project_keys = {}  # type: dict[str, ProjectKey]
        project_ids = set()  # type: set[int]

        with start_span(op="relay_fetch_keys"):
            with metrics.timer("relay_project_configs.fetching_keys.duration"):
                for key in ProjectKey.objects.get_many_from_cache(
                        public_keys, key="public_key"):
                    if key.status != ProjectKeyStatus.ACTIVE:
                        continue

                    project_keys[key.public_key] = key
                    project_ids.add(key.project_id)

        projects = {}  # type: dict[int, Project]
        organization_ids = set()  # type: set[int]

        with start_span(op="relay_fetch_projects"):
            with metrics.timer(
                    "relay_project_configs.fetching_projects.duration"):
                for project in Project.objects.get_many_from_cache(
                        project_ids):
                    projects[project.id] = project
                    organization_ids.add(project.organization_id)

        # Preload all organizations and their options to prevent repeated
        # database access when computing the project configuration.

        orgs = {}  # type: dict[int, Organization]

        with start_span(op="relay_fetch_orgs"):
            with metrics.timer("relay_project_configs.fetching_orgs.duration"):
                for org in Organization.objects.get_many_from_cache(
                        organization_ids):
                    if request.relay.has_org_access(org):
                        orgs[org.id] = org

        with start_span(op="relay_fetch_org_options"):
            with metrics.timer(
                    "relay_project_configs.fetching_org_options.duration"):
                for org_id in orgs:
                    OrganizationOption.objects.get_all_values(org_id)

        metrics.timing("relay_project_configs.projects_requested",
                       len(project_ids))
        metrics.timing("relay_project_configs.projects_fetched", len(projects))
        metrics.timing("relay_project_configs.orgs_fetched", len(orgs))

        configs = {}
        for public_key in public_keys:
            configs[public_key] = {"disabled": True}

            key = project_keys.get(public_key)
            if key is None:
                continue

            project = projects.get(key.project_id)
            if project is None:
                continue

            organization = orgs.get(project.organization_id)
            if organization is None:
                continue

            # Try to prevent organization from being fetched again in quotas.
            project.organization = organization
            project._organization_cache = organization

            with Hub.current.start_span(op="get_config"):
                with metrics.timer(
                        "relay_project_configs.get_config.duration"):
                    project_config = config.get_project_config(
                        project,
                        full_config=full_config_requested,
                        project_keys=[key],
                    )

            configs[public_key] = project_config.to_dict()

        if full_config_requested:
            projectconfig_cache.set_many(configs)

        return Response({"configs": configs}, status=200)
Esempio n. 33
0
def fetch_release_file(filename, release):
    cache_key = 'releasefile:v1:%s:%s' % (
        release.id,
        md5_text(filename).hexdigest(),
    )

    filename_path = None
    if filename is not None:
        # Reconstruct url without protocol + host
        # e.g. http://example.com/foo?bar => ~/foo?bar
        parsed_url = urlparse(filename)
        filename_path = '~' + parsed_url.path
        if parsed_url.query:
            filename_path += '?' + parsed_url.query

    logger.debug('Checking cache for release artifact %r (release_id=%s)',
                 filename, release.id)
    result = cache.get(cache_key)

    if result is None:
        logger.debug('Checking database for release artifact %r (release_id=%s)',
                     filename, release.id)

        filename_idents = [ReleaseFile.get_ident(filename)]
        if filename_path is not None and filename_path != filename:
            filename_idents.append(ReleaseFile.get_ident(filename_path))

        possible_files = list(ReleaseFile.objects.filter(
            release=release,
            ident__in=filename_idents,
        ).select_related('file'))

        if len(possible_files) == 0:
            logger.debug('Release artifact %r not found in database (release_id=%s)',
                         filename, release.id)
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Prioritize releasefile that matches full url (w/ host)
            # over hostless releasefile
            target_ident = filename_idents[0]
            releasefile = next((f for f in possible_files if f.ident == target_ident))

        logger.debug('Found release artifact %r (id=%s, release_id=%s)',
                     filename, releasefile.id, release.id)
        try:
            with metrics.timer('sourcemaps.release_file_read'):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception as e:
            logger.exception(six.text_type(e))
            cache.set(cache_key, -1, 3600)
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = (headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = (result[0], zlib.decompress(result[1]), result[2], encoding)

    return result
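The cache handling in fetch_release_file follows one pattern throughout these examples: cache -1 briefly on a miss, cache the compressed body for an hour on a hit, and normalize on read. Here is a minimal sketch of that skeleton, assuming a Django-style cache object with the same get/set signature; load_body and the key are placeholders.

import zlib

from django.core.cache import cache  # assumed stand-in for the cache used above


def cached_body(cache_key, load_body):
    result = cache.get(cache_key)

    if result is None:
        body = load_body()  # hypothetical database/file lookup
        if body is None:
            cache.set(cache_key, -1, 60)  # remember the miss, but only briefly
            return None
        cache.set(cache_key, zlib.compress(body), 3600)
        return body

    if result == -1:
        return None  # a cached miss normalizes back to None

    return zlib.decompress(result)  # cached hits store the compressed body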
Esempio n. 34
0
def fetch_release_file(filename, release, dist=None):
    cache_key = 'releasefile:v1:%s:%s' % (release.id, md5_text(filename).hexdigest(), )

    logger.debug('Checking cache for release artifact %r (release_id=%s)', filename, release.id)
    result = cache.get(cache_key)

    dist_name = dist and dist.name or None

    if result is None:
        filename_choices = ReleaseFile.normalize(filename)
        filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices]

        logger.debug(
            'Checking database for release artifact %r (release_id=%s)', filename, release.id
        )

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release,
                dist=dist,
                ident__in=filename_idents,
            ).select_related('file')
        )

        if len(possible_files) == 0:
            logger.debug(
                'Release artifact %r not found in database (release_id=%s)', filename, release.id
            )
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Pick first one that matches in priority order.
            # This is O(N*M) but there are only ever at most 4 things here
            # so not really worth optimizing.
            releasefile = next((
                rf
                for ident in filename_idents
                for rf in possible_files
                if rf.ident == ident
            ))

        logger.debug(
            'Found release artifact %r (id=%s, release_id=%s)', filename, releasefile.id, release.id
        )
        try:
            with metrics.timer('sourcemaps.release_file_read'):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception:
            logger.error('sourcemap.compress_read_failed', exc_info=sys.exc_info())
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = http.UrlResult(filename, headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = http.UrlResult(
            filename, result[0], zlib.decompress(result[1]), result[2], encoding
        )

    return result
Esempio n. 35
0
 def apply_async(self, *args, **kwargs):
     with metrics.timer('jobs.delay', instance=self.name):
         return Task.apply_async(self, *args, **kwargs)
Esempio n. 36
0
 def apply_async(self, *args, **kwargs):
     key = 'jobs.delay'
     instance = self.name
     with metrics.timer(key, instance=instance):
         return OriginalTask.apply_async(self, *args, **kwargs)
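Both apply_async overrides above wrap Celery's dispatch in a jobs.delay timer keyed by task name. A sketch of the same idea as a reusable base class, assuming a standard Celery Task; the class name is invented.

from celery import Task

from sentry.utils import metrics


class InstrumentedTask(Task):
    """Base class that times how long enqueueing each task takes."""

    def apply_async(self, *args, **kwargs):
        with metrics.timer("jobs.delay", instance=self.name):
            return super().apply_async(*args, **kwargs)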
Esempio n. 37
0
    def run(self, is_shutdown_requested=lambda: False):
        """
        Runs the message processing loop
        """
        logger.debug(
            "Staring kafka consumer for topic:%s with consumer group:%s",
            self.topic_name,
            self.consumer_group,
        )

        consumer = kafka.Consumer(self.consumer_configuration)
        consumer.subscribe([self.topic_name])

        metrics_tags = {
            "topic": self.topic_name,
            "consumer_group": self.consumer_group,
            "type": self.__class__.__name__,
        }

        # set up a flag to mark termination signals received; see below for why we use an array
        termination_signal_received = [False]

        def termination_signal_handler(_sig_id, _frame):
            """
            Function to use a hook for SIGINT and SIGTERM

            This signal handler only remembers that the signal was emitted.
            The batch processing loop detects that the signal was emitted
            and stops once the whole batch is processed.
            """
            # We need to use an array so that termination_signal_received is not a
            # local variable assignment, but a lookup in the closure's outer scope.
            termination_signal_received[0] = True

        with set_termination_request_handlers(termination_signal_handler):
            while not (is_shutdown_requested()
                       or termination_signal_received[0]):
                # get up to commit_batch_size messages
                messages = consumer.consume(
                    num_messages=self.commit_batch_size,
                    timeout=self.max_fetch_time_seconds)

                for message in messages:
                    message_error = message.error()
                    if message_error is not None:
                        logger.error("Received message with error on %s: %s",
                                     self.topic_name, message_error)
                        raise ValueError("Bad message received from consumer",
                                         self.topic_name, message_error)

                    with metrics.timer("simple_consumer.processing_time",
                                       tags=metrics_tags):
                        self.process_message(message)

                if len(messages) > 0:
                    # we have read some messages in the previous consume, commit the offset
                    consumer.commit(asynchronous=False)

                metrics.timing("simple_consumer.committed_batch.size",
                               len(messages),
                               tags=metrics_tags)

        consumer.close()
        logger.debug(
            "Closing kafka consumer for topic:%s with consumer group:%s",
            self.topic_name,
            self.consumer_group,
        )
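The consumer above installs SIGINT/SIGTERM handlers that only flip a flag, then checks that flag between batches so a whole batch is always finished before shutdown. Below is a standalone sketch of that flag trick using the standard signal module; the helper name is invented.

import signal


def install_shutdown_flag():
    """Return a callable that reports whether SIGINT or SIGTERM has fired."""
    received = [False]  # a list, so the handler mutates the closure's outer scope

    def handler(_sig, _frame):
        # Only remember the signal; the processing loop decides when to stop.
        received[0] = True

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)
    return lambda: received[0]

The processing loop can then run as "while not shutdown_requested(): ...", exactly as the example does with termination_signal_received.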
Esempio n. 38
0
def fetch_release_file(filename, release):
    cache_key = 'releasefile:v1:%s:%s' % (
        release.id,
        md5_text(filename).hexdigest(),
    )

    filename_path = None
    if filename is not None:
        # Reconstruct url without protocol + host
        # e.g. http://example.com/foo?bar => ~/foo?bar
        parsed_url = urlparse(filename)
        filename_path = '~' + parsed_url.path
        if parsed_url.query:
            filename_path += '?' + parsed_url.query

    logger.debug('Checking cache for release artifact %r (release_id=%s)',
                 filename, release.id)
    result = cache.get(cache_key)

    if result is None:
        logger.debug(
            'Checking database for release artifact %r (release_id=%s)',
            filename, release.id)

        filename_idents = [ReleaseFile.get_ident(filename)]
        if filename_path is not None and filename_path != filename:
            filename_idents.append(ReleaseFile.get_ident(filename_path))

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release,
                ident__in=filename_idents,
            ).select_related('file'))

        if len(possible_files) == 0:
            logger.debug(
                'Release artifact %r not found in database (release_id=%s)',
                filename, release.id)
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Prioritize releasefile that matches full url (w/ host)
            # over hostless releasefile
            target_ident = filename_idents[0]
            releasefile = next(
                (f for f in possible_files if f.ident == target_ident))

        logger.debug('Found release artifact %r (id=%s, release_id=%s)',
                     filename, releasefile.id, release.id)
        try:
            with metrics.timer('sourcemaps.release_file_read'):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception as e:
            logger.exception(six.text_type(e))
            cache.set(cache_key, -1, 3600)
            result = None
        else:
            headers = {
                k.lower(): v
                for k, v in releasefile.file.headers.items()
            }
            encoding = get_encoding_from_headers(headers)
            result = (headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = (result[0], zlib.decompress(result[1]), result[2], encoding)

    return result
Esempio n. 39
0
    def run(self) -> None:
        logger.debug("Starting snuba query subscriber")
        self.offsets.clear()

        def on_assign(consumer: Consumer,
                      partitions: List[TopicPartition]) -> None:
            updated_partitions: List[TopicPartition] = []
            for partition in partitions:
                if self.resolve_partition_force_offset:
                    partition = self.resolve_partition_force_offset(partition)
                    updated_partitions.append(partition)

                if partition.offset == OFFSET_INVALID:
                    updated_offset = None
                else:
                    updated_offset = partition.offset
                self.offsets[partition.partition] = updated_offset
            if updated_partitions:
                self.consumer.assign(updated_partitions)
            logger.info(
                "query-subscription-consumer.on_assign",
                extra={
                    "offsets": str(self.offsets),
                    "partitions": str(partitions),
                },
            )

        def on_revoke(consumer: Consumer,
                      partitions: List[TopicPartition]) -> None:
            partition_numbers = [
                partition.partition for partition in partitions
            ]
            self.commit_offsets(partition_numbers)
            for partition_number in partition_numbers:
                self.offsets.pop(partition_number, None)
            logger.info(
                "query-subscription-consumer.on_revoke",
                extra={
                    "offsets": str(self.offsets),
                    "partitions": str(partitions),
                },
            )

        self.consumer = Consumer(self.cluster_options)
        if settings.KAFKA_CONSUMER_AUTO_CREATE_TOPICS:
            # This is required for confluent-kafka>=1.5.0, otherwise the topics will
            # not be automatically created.
            admin_client = AdminClient(self.admin_cluster_options)
            wait_for_topics(admin_client, [self.topic])

        self.consumer.subscribe([self.topic],
                                on_assign=on_assign,
                                on_revoke=on_revoke)

        try:
            i = 0
            while True:
                message = self.consumer.poll(0.1)
                if message is None:
                    continue

                error = message.error()
                if error is not None:
                    raise KafkaException(error)

                i = i + 1

                with sentry_sdk.start_transaction(
                        op="handle_message",
                        name="query_subscription_consumer_process_message",
                        sampled=True,
                ), metrics.timer("snuba_query_subscriber.handle_message"):
                    self.handle_message(message)

                # Track latest completed message here, for use in `shutdown` handler.
                self.offsets[message.partition()] = message.offset() + 1

                if i % self.commit_batch_size == 0:
                    logger.debug("Committing offsets")
                    self.commit_offsets()
        except KeyboardInterrupt:
            pass

        self.shutdown()
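Stripped of the Sentry-specific pieces, the subscriber above is a poll loop that raises on message errors, handles each message, tracks the next offset, and commits every commit_batch_size messages. A reduced sketch of that loop follows, assuming an already-subscribed confluent_kafka Consumer; handle is a placeholder.

def poll_and_commit(consumer, handle, commit_batch_size=100):
    """Poll, process and periodically commit; a sketch of the loop above."""
    processed = 0
    try:
        while True:
            message = consumer.poll(0.1)
            if message is None:
                continue
            if message.error() is not None:
                raise RuntimeError(message.error())

            handle(message)
            processed += 1

            if processed % commit_batch_size == 0:
                consumer.commit(asynchronous=False)
    except KeyboardInterrupt:
        pass
    finally:
        consumer.close()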
Esempio n. 40
0
def fetch_file(url, project=None, release=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == '...':
        raise CannotFetchSource({
            'type': EventError.JS_MISSING_SOURCE,
            'url': expose_url(url),
        })
    if release:
        with metrics.timer('sourcemaps.release_file'):
            result = fetch_release_file(url, release)
    else:
        result = None

    cache_key = 'source:cache:v3:%s' % (md5_text(url).hexdigest(), )

    if result is None:
        if not allow_scraping or not url.startswith(('http:', 'https:')):
            error = {
                'type': EventError.JS_MISSING_SOURCE,
                'url': expose_url(url),
            }
            raise CannotFetchSource(error)

        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 3-tuple instead of a 4-tuple,
            # so this is being maintained for backwards compatibility
            try:
                encoding = result[3]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = (result[0], zlib.decompress(result[1]), result[2],
                      encoding)

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (
            md5_text(domain).hexdigest(), )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetchSource(domain_result)

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                token_header = project.get_option(
                    'sentry:token_header',
                    'X-Sentry-Token',
                )
                headers[token_header] = token

        logger.debug('Fetching %r from the internet', url)

        with metrics.timer('sourcemaps.fetch'):
            http_session = http.build_session()
            response = None
            try:
                try:
                    start = time.time()
                    response = http_session.get(
                        url,
                        allow_redirects=True,
                        verify=False,
                        headers=headers,
                        timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT,
                        stream=True,
                    )

                    try:
                        cl = int(response.headers['content-length'])
                    except (LookupError, ValueError):
                        cl = 0
                    if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                        raise OverflowError()

                    contents = []
                    cl = 0

                    # Only need to even attempt to read the response body if we
                    # got a 200 OK
                    if response.status_code == 200:
                        for chunk in response.iter_content(16 * 1024):
                            if time.time() - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT:
                                raise Timeout()
                            contents.append(chunk)
                            cl += len(chunk)
                            if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                                raise OverflowError()

                except Exception as exc:
                    logger.debug('Unable to fetch %r', url, exc_info=True)
                    if isinstance(exc, RestrictedIPAddress):
                        error = {
                            'type': EventError.RESTRICTED_IP,
                            'url': expose_url(url),
                        }
                    elif isinstance(exc, SuspiciousOperation):
                        error = {
                            'type': EventError.SECURITY_VIOLATION,
                            'url': expose_url(url),
                        }
                    elif isinstance(exc, Timeout):
                        error = {
                            'type': EventError.JS_FETCH_TIMEOUT,
                            'url': expose_url(url),
                            'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT,
                        }
                    elif isinstance(exc, OverflowError):
                        error = {
                            'type': EventError.JS_TOO_LARGE,
                            'url': expose_url(url),
                            # We want size in megabytes to format nicely
                            'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024,
                        }
                    elif isinstance(exc, (RequestException, ZeroReturnError)):
                        error = {
                            'type': EventError.JS_GENERIC_FETCH_ERROR,
                            'value': six.text_type(type(exc)),
                            'url': expose_url(url),
                        }
                    else:
                        logger.exception(six.text_type(exc))
                        error = {
                            'type': EventError.UNKNOWN_ERROR,
                            'url': expose_url(url),
                        }

                    # TODO(dcramer): we want to be less aggressive on disabling domains
                    cache.set(domain_key, error or '', 300)
                    logger.warning('source.disabled', extra=error)
                    raise CannotFetchSource(error)

                body = b''.join(contents)
                z_body = zlib.compress(body)
                headers = {k.lower(): v for k, v in response.headers.items()}
                encoding = response.encoding

                cache.set(cache_key,
                          (headers, z_body, response.status_code, encoding),
                          60)
                result = (headers, body, response.status_code, encoding)
            finally:
                if response is not None:
                    response.close()

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True)
        error = {
            'type': EventError.JS_INVALID_HTTP_CODE,
            'value': result[2],
            'url': expose_url(url),
        }
        raise CannotFetchSource(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but this should catch 99% of cases
    if url.endswith('.js'):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result[1][:20].lstrip()

        if body_start[:1] == u'<':
            error = {
                'type': EventError.JS_INVALID_CONTENT,
                'url': url,
            }
            raise CannotFetchSource(error)

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result[1], six.binary_type):
        try:
            result = (result[0], result[1].encode('utf8'), result[2], 'utf8')
        except UnicodeEncodeError:
            error = {
                'type': EventError.JS_INVALID_SOURCE_ENCODING,
                'value': 'utf8',
                'url': expose_url(url),
            }
            raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1], result[3])
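The scraping branch above streams the response while enforcing both a size and a time budget. Here is a sketch of that bounded download on its own, using requests directly; the hard-coded limits stand in for the SENTRY_SOURCE_FETCH_* settings and are illustrative only.

import time

import requests

MAX_SIZE = 40 * 1024 * 1024   # stand-in for SENTRY_SOURCE_FETCH_MAX_SIZE
FETCH_TIMEOUT = 5             # stand-in for SENTRY_SOURCE_FETCH_TIMEOUT (seconds)


def bounded_download(url):
    start = time.time()
    response = requests.get(url, stream=True, allow_redirects=True, timeout=FETCH_TIMEOUT)
    try:
        chunks, size = [], 0
        # Only read the body on a 200 OK, as the example does.
        if response.status_code == 200:
            for chunk in response.iter_content(16 * 1024):
                if time.time() - start > FETCH_TIMEOUT:
                    raise TimeoutError("fetch exceeded time budget")
                chunks.append(chunk)
                size += len(chunk)
                if size > MAX_SIZE:
                    raise OverflowError("fetch exceeded size budget")
        return b"".join(chunks), response.status_code
    finally:
        response.close()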
Esempio n. 41
0
def fetch_release_file(filename, release, dist=None):
    cache_key = "releasefile:v1:%s:%s" % (release.id, md5_text(filename).hexdigest())

    logger.debug("Checking cache for release artifact %r (release_id=%s)", filename, release.id)
    result = cache.get(cache_key)

    dist_name = dist and dist.name or None

    if result is None:
        filename_choices = ReleaseFile.normalize(filename)
        filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices]

        logger.debug(
            "Checking database for release artifact %r (release_id=%s)", filename, release.id
        )

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release, dist=dist, ident__in=filename_idents
            ).select_related("file")
        )

        if len(possible_files) == 0:
            logger.debug(
                "Release artifact %r not found in database (release_id=%s)", filename, release.id
            )
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Pick first one that matches in priority order.
            # This is O(N*M) but there are only ever at most 4 things here
            # so not really worth optimizing.
            releasefile = next(
                (rf for ident in filename_idents for rf in possible_files if rf.ident == ident)
            )

        logger.debug(
            "Found release artifact %r (id=%s, release_id=%s)", filename, releasefile.id, release.id
        )
        try:
            with metrics.timer("sourcemaps.release_file_read"):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception:
            logger.error("sourcemap.compress_read_failed", exc_info=sys.exc_info())
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = http.UrlResult(filename, headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = http.UrlResult(
            filename, result[0], zlib.decompress(result[1]), result[2], encoding
        )

    return result
Esempio n. 42
0
def _process_signal_with_timer(message):
    with metrics.timer("outcomes_consumer.process_signal"):
        return _process_signal(message)
Esempio n. 43
0
    def run(self):
        logger.debug("Starting snuba query subscriber")
        self.offsets.clear()

        conf = {
            "bootstrap.servers": self.bootstrap_servers,
            "group.id": self.group_id,
            "session.timeout.ms": 6000,
            "auto.offset.reset": self.initial_offset_reset,
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "false",
            "default.topic.config": {
                "auto.offset.reset": self.initial_offset_reset
            },
        }

        def on_assign(consumer, partitions):
            for partition in partitions:
                if partition.offset == OFFSET_INVALID:
                    updated_offset = None
                else:
                    updated_offset = partition.offset
                self.offsets[partition.partition] = updated_offset

        def on_revoke(consumer, partitions):
            partition_numbers = [
                partition.partition for partition in partitions
            ]
            self.commit_offsets(partition_numbers)
            for partition_number in partition_numbers:
                self.offsets.pop(partition_number, None)

        self.consumer = Consumer(conf)
        self.consumer.subscribe([self.topic],
                                on_assign=on_assign,
                                on_revoke=on_revoke)

        try:
            i = 0
            while True:
                message = self.consumer.poll(0.1)
                if message is None:
                    continue

                error = message.error()
                if error is not None:
                    raise KafkaException(error)

                i = i + 1

                with sentry_sdk.start_span(
                        Span(
                            op="handle_message",
                            transaction=
                            "query_subscription_consumer_process_message",
                            sampled=True,
                        )), metrics.timer(
                            "snuba_query_subscriber.handle_message"):
                    self.handle_message(message)

                # Track latest completed message here, for use in `shutdown` handler.
                self.offsets[message.partition()] = message.offset() + 1

                if i % self.commit_batch_size == 0:
                    logger.debug("Committing offsets")
                    self.commit_offsets()
        except KeyboardInterrupt:
            pass

        self.shutdown()
Esempio n. 44
0
def fetch_release_file(filename, release, dist=None):
    """
    Attempt to retrieve a release artifact from the database.

    Caches the result of that attempt (whether successful or not).
    """

    dist_name = dist and dist.name or None
    releasefile_ident = ReleaseFile.get_ident(filename, dist_name)
    cache_key = get_release_file_cache_key(
        release_id=release.id, releasefile_ident=releasefile_ident
    )
    # Cache key to store file metadata, currently only the size of the
    # compressed version of the file. We cannot reuse the cache_key because large
    # payloads (silently) fail to cache due to e.g. the memcached payload size
    # limitation, and we use the metadata to avoid compressing such files again.
    cache_key_meta = get_release_file_cache_key_meta(
        release_id=release.id, releasefile_ident=releasefile_ident
    )

    logger.debug("Checking cache for release artifact %r (release_id=%s)", filename, release.id)
    result = cache.get(cache_key)

    # not in the cache (meaning we haven't checked the database recently), so check the database
    if result is None:
        filename_choices = ReleaseFile.normalize(filename)
        filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices]

        logger.debug(
            "Checking database for release artifact %r (release_id=%s)", filename, release.id
        )

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release, dist=dist, ident__in=filename_idents
            ).select_related("file")
        )

        if len(possible_files) == 0:
            logger.debug(
                "Release artifact %r not found in database (release_id=%s)", filename, release.id
            )
            cache.set(cache_key, -1, 60)
            return None

        elif len(possible_files) == 1:
            releasefile = possible_files[0]

        else:
            # Pick first one that matches in priority order.
            # This is O(N*M) but there are only ever at most 4 things here
            # so not really worth optimizing.
            releasefile = next(
                rf for ident in filename_idents for rf in possible_files if rf.ident == ident
            )

        # If the release file is not in cache, check if we can retrieve at
        # least the size metadata from cache and prevent compression and
        # caching if payload exceeds the backend limit.
        z_body = None
        z_body_size = None

        if CACHE_MAX_VALUE_SIZE:
            cache_meta = cache.get(cache_key_meta)
            if cache_meta:
                z_body_size = int(cache_meta.get("compressed_size"))

        logger.debug(
            "Found release artifact %r (id=%s, release_id=%s)", filename, releasefile.id, release.id
        )
        try:
            with metrics.timer("sourcemaps.release_file_read"):
                with ReleaseFile.cache.getfile(releasefile) as fp:
                    if z_body_size and z_body_size > CACHE_MAX_VALUE_SIZE:
                        body = fp.read()
                    else:
                        z_body, body = compress_file(fp)
        except Exception:
            logger.error("sourcemap.compress_read_failed", exc_info=sys.exc_info())
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = http.UrlResult(filename, headers, body, 200, encoding)

            # If we don't have the compressed body for caching because the
            # cached metadata said it is too large payload for the cache
            # backend, do not attempt to cache.
            if z_body:
                # This will implicitly skip too large payloads. Those will be cached
                # on the file system by `ReleaseFile.cache`, instead.
                cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

                # In case the previous cache call implicitly failed, store the
                # metadata so the next read can avoid pointless compression,
                # which is done only for caching.
                cache.set(cache_key_meta, {"compressed_size": len(z_body)}, 3600)

    # in the cache as an unsuccessful attempt
    elif result == -1:
        result = None

    # in the cache as a successful attempt, including the zipped contents of the file
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = http.UrlResult(
            filename, result[0], zlib.decompress(result[1]), result[2], encoding
        )

    return result
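This newer variant adds a size-metadata guard: a tiny "meta" cache entry records the compressed size from a previous attempt, so payloads known to exceed the cache backend's limit skip pointless recompression. The sketch below isolates that guard; the cache object, keys and limit are passed in as placeholders.

import zlib


def read_release_file(fp, cache, cache_key, cache_key_meta, cache_max_value_size):
    """Sketch of the size-metadata guard above; cache and keys are placeholders."""
    z_body = None

    # Look up the compressed size recorded by a previous attempt, if any.
    cache_meta = cache.get(cache_key_meta)
    known_size = int(cache_meta["compressed_size"]) if cache_meta else None

    body = fp.read()
    if not (known_size and known_size > cache_max_value_size):
        z_body = zlib.compress(body)

    if z_body:
        # Oversized payloads silently fail to cache, so also persist the size
        # to skip compression next time around.
        cache.set(cache_key, z_body, 3600)
        cache.set(cache_key_meta, {"compressed_size": len(z_body)}, 3600)

    return body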
Esempio n. 45
0
 def __instrumented_method_call(self, method, scope, *args, **kwargs):
     with timer(self.template.format(method),
                tags={self.scope_tag_name: scope}):
         return getattr(self.backend, method)(scope, *args, **kwargs)
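The one-liner above is the core of a metrics proxy: every delegated backend call is timed under a key derived from the method name and tagged with its scope. A fuller sketch of that wrapper follows, assuming timer is importable from the same metrics module used throughout (sentry.utils.metrics); the class name and defaults are invented.

from sentry.utils.metrics import timer


class InstrumentedBackend:
    def __init__(self, backend, template="example.backend.{}", scope_tag_name="scope"):
        self.backend = backend
        self.template = template
        self.scope_tag_name = scope_tag_name

    def __instrumented_method_call(self, method, scope, *args, **kwargs):
        # Time the delegated call, tagging the metric with the scope it ran under.
        with timer(self.template.format(method), tags={self.scope_tag_name: scope}):
            return getattr(self.backend, method)(scope, *args, **kwargs)

    def get(self, scope, *args, **kwargs):
        return self.__instrumented_method_call("get", scope, *args, **kwargs)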
Esempio n. 46
0
def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the database first (assuming there's a release on the
    event), then the internet. Caches the result of each of those two attempts
    separately, whether or not those attempts are successful. Used for both
    source files and source maps.
    """

    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == "...":
        raise http.CannotFetch({"type": EventError.JS_MISSING_SOURCE, "url": http.expose_url(url)})

    # if we've got a release to look on, try that first (incl associated cache)
    if release:
        with metrics.timer("sourcemaps.release_file"):
            result = fetch_release_file(url, release, dist)
    else:
        result = None

    # otherwise, try the web-scraping cache and then the web itself

    cache_key = "source:cache:v4:%s" % (md5_text(url).hexdigest(),)

    if result is None:
        if not allow_scraping or not url.startswith(("http:", "https:")):
            error = {"type": EventError.JS_MISSING_SOURCE, "url": http.expose_url(url)}
            raise http.CannotFetch(error)

        logger.debug("Checking cache for url %r", url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous cache entries would be a 4-tuple instead of the current
            # 5-tuple, so this is being maintained for backwards compatibility
            try:
                encoding = result[4]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = http.UrlResult(
                result[0], result[1], zlib.decompress(result[2]), result[3], encoding
            )

    if result is None:
        headers = {}
        verify_ssl = False
        if project and is_valid_origin(url, project=project):
            verify_ssl = bool(project.get_option("sentry:verify_ssl", False))
            token = project.get_option("sentry:token")
            if token:
                token_header = project.get_option("sentry:token_header") or "X-Sentry-Token"
                headers[token_header] = token

        with metrics.timer("sourcemaps.fetch"):
            result = http.fetch_file(url, headers=headers, verify_ssl=verify_ssl)
            z_body = zlib.compress(result.body)
            cache.set(
                cache_key,
                (url, result.headers, z_body, result.status, result.encoding),
                get_max_age(result.headers),
            )

    # If we did not get a 200 OK we just raise a cannot fetch here.
    if result.status != 200:
        raise http.CannotFetch(
            {
                "type": EventError.FETCH_INVALID_HTTP_CODE,
                "value": result.status,
                "url": http.expose_url(url),
            }
        )

    # Make sure the file we're getting back is bytes. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result.body, bytes):
        try:
            result = http.UrlResult(
                result.url,
                result.headers,
                result.body.encode("utf8"),
                result.status,
                result.encoding,
            )
        except UnicodeEncodeError:
            error = {
                "type": EventError.FETCH_INVALID_ENCODING,
                "value": "utf8",
                "url": http.expose_url(url),
            }
            raise http.CannotFetch(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but
    # this should catch 99% of cases
    if urlsplit(url).path.endswith(".js"):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result.body[:20].lstrip()

        if body_start[:1] == b"<":
            error = {"type": EventError.JS_INVALID_CONTENT, "url": url}
            raise http.CannotFetch(error)

    return result
Esempio n. 47
0
 def apply_async(self, *args, **kwargs):
     key = 'jobs.delay'
     instance = self.name
     with metrics.timer(key, instance=instance):
         return OriginalTask.apply_async(self, *args, **kwargs)
Esempio n. 48
0
def post_process_group(
    is_new, is_regression, is_new_group_environment, cache_key, group_id=None, **kwargs
):
    """
    Fires post processing hooks for a group.
    """
    from sentry.eventstore.models import Event
    from sentry.eventstore.processing import event_processing_store
    from sentry.utils import snuba
    from sentry.reprocessing2 import is_reprocessed_event

    with snuba.options_override({"consistent": True}):
        # We use the data being present/missing in the processing store
        # to ensure that we don't duplicate work should the forwarding consumers
        # need to rewind history.
        data = event_processing_store.get(cache_key)
        if not data:
            logger.info(
                "post_process.skipped", extra={"cache_key": cache_key, "reason": "missing_cache"},
            )
            return
        event = Event(
            project_id=data["project"], event_id=data["event_id"], group_id=group_id, data=data
        )

        set_current_project(event.project_id)

        is_reprocessed = is_reprocessed_event(event.data)

        # NOTE: we must pass through the full Event object, and not an
        # event_id since the Event object may not actually have been stored
        # in the database due to sampling.
        from sentry.models import (
            Commit,
            Project,
            Organization,
            EventDict,
            GroupInboxReason,
        )
        from sentry.models.groupinbox import add_group_to_inbox
        from sentry.models.group import get_group_with_redirect
        from sentry.rules.processor import RuleProcessor
        from sentry.tasks.servicehooks import process_service_hook
        from sentry.tasks.groupowner import process_suspect_commits

        # Re-bind node data to avoid renormalization. We only want to
        # renormalize when loading old data from the database.
        event.data = EventDict(event.data, skip_renormalization=True)

        # Re-bind Project and Org since we're reading the Event object
        # from cache which may contain stale parent models.
        event.project = Project.objects.get_from_cache(id=event.project_id)
        event.project._organization_cache = Organization.objects.get_from_cache(
            id=event.project.organization_id
        )

        if event.group_id:
            # Re-bind Group since we're reading the Event object
            # from cache, which may contain a stale group and project
            event.group, _ = get_group_with_redirect(event.group_id)
            event.group_id = event.group.id

            event.group.project = event.project
            event.group.project._organization_cache = event.project._organization_cache

        bind_organization_context(event.project.organization)

        _capture_stats(event, is_new)

        if event.group_id and not is_reprocessed:
            # We process snoozes before rules because it might create a regression,
            # but skip it for new groups since you can't immediately snooze a new group.
            has_reappeared = False if is_new else process_snoozes(event.group)
            if not has_reappeared:  # If true, we added the .UNIGNORED reason already
                if is_new:
                    add_group_to_inbox(event.group, GroupInboxReason.NEW)
                elif is_regression:
                    add_group_to_inbox(event.group, GroupInboxReason.REGRESSION)

            handle_owner_assignment(event.project, event.group, event)

            rp = RuleProcessor(
                event, is_new, is_regression, is_new_group_environment, has_reappeared
            )
            has_alert = False
            # TODO(dcramer): ideally this would fanout, but serializing giant
            # objects back and forth isn't super efficient
            for callback, futures in rp.apply():
                has_alert = True
                with sentry_sdk.start_transaction(
                    op="post_process_group", name="rule_processor_apply", sampled=True
                ):
                    safe_execute(callback, event, futures)

            has_commit_key = "workflow-owners-ingestion:org-{}-has-commits".format(
                event.project.organization_id
            )

            try:
                org_has_commit = cache.get(has_commit_key)
                if org_has_commit is None:
                    org_has_commit = Commit.objects.filter(
                        organization_id=event.project.organization_id
                    ).exists()
                    cache.set(has_commit_key, org_has_commit, 3600)

                if org_has_commit and features.has(
                    "projects:workflow-owners-ingestion", event.project,
                ):
                    process_suspect_commits(event=event)
            except Exception:
                logger.exception("Failed to process suspect commits")

            if features.has("projects:servicehooks", project=event.project):
                allowed_events = set(["event.created"])
                if has_alert:
                    allowed_events.add("event.alert")

                if allowed_events:
                    for servicehook_id, events in _get_service_hooks(project_id=event.project_id):
                        if any(e in allowed_events for e in events):
                            process_service_hook.delay(servicehook_id=servicehook_id, event=event)

            from sentry.tasks.sentry_apps import process_resource_change_bound

            if event.get_event_type() == "error" and _should_send_error_created_hooks(
                event.project
            ):
                process_resource_change_bound.delay(
                    action="created", sender="Error", instance_id=event.event_id, instance=event
                )
            if is_new:
                process_resource_change_bound.delay(
                    action="created", sender="Group", instance_id=event.group_id
                )

            from sentry.plugins.base import plugins

            for plugin in plugins.for_project(event.project):
                plugin_post_process_group(
                    plugin_slug=plugin.slug, event=event, is_new=is_new, is_regresion=is_regression
                )

            from sentry import similarity

            safe_execute(similarity.record, event.project, [event])

        if event.group_id:
            # Patch attachments that were ingested on the standalone path.
            update_existing_attachments(event)

        if not is_reprocessed:
            event_processed.send_robust(
                sender=post_process_group,
                project=event.project,
                event=event,
                primary_hash=kwargs.get("primary_hash"),
            )

        with metrics.timer("tasks.post_process.delete_event_cache"):
            event_processing_store.delete_by_key(cache_key)
Esempio n. 49
0
    def run_post_process_forwarder(self, consumer_group, commit_log_topic,
                                   synchronize_commit_group, commit_batch_size=100, initial_offset_reset='latest'):
        logger.debug('Starting post-process forwarder...')

        cluster_name = settings.KAFKA_TOPICS[settings.KAFKA_EVENTS]['cluster']
        bootstrap_servers = settings.KAFKA_CLUSTERS[cluster_name]['bootstrap.servers']

        consumer = SynchronizedConsumer(
            bootstrap_servers=bootstrap_servers,
            consumer_group=consumer_group,
            commit_log_topic=commit_log_topic,
            synchronize_commit_group=synchronize_commit_group,
            initial_offset_reset=initial_offset_reset,
        )

        owned_partition_offsets = {}

        def commit(partitions):
            results = consumer.commit(offsets=partitions, asynchronous=False)

            errors = [i for i in results if i.error is not None]
            if errors:
                raise Exception(
                    'Failed to commit %s/%s partitions: %r' %
                    (len(errors), len(partitions), errors))

            return results

        def on_assign(consumer, partitions):
            logger.debug('Received partition assignment: %r', partitions)

            for i in partitions:
                if i.offset == OFFSET_INVALID:
                    updated_offset = None
                elif i.offset < 0:
                    raise Exception(
                        'Received unexpected negative offset during partition assignment: %r' %
                        (i,))
                else:
                    updated_offset = i.offset

                key = (i.topic, i.partition)
                previous_offset = owned_partition_offsets.get(key, None)
                if previous_offset is not None and previous_offset != updated_offset:
                    logger.warning(
                        'Received new offset for owned partition %r, will overwrite previous stored offset %r with %r.',
                        key,
                        previous_offset,
                        updated_offset)

                owned_partition_offsets[key] = updated_offset

        def on_revoke(consumer, partitions):
            logger.debug('Revoked partition assignment: %r', partitions)

            offsets_to_commit = []

            for i in partitions:
                key = (i.topic, i.partition)

                try:
                    offset = owned_partition_offsets.pop(key)
                except KeyError:
                    logger.warning(
                        'Received unexpected partition revocation for unowned partition: %r',
                        i,
                        exc_info=True)
                    continue

                if offset is None:
                    logger.debug('Skipping commit of unprocessed partition: %r', i)
                    continue

                offsets_to_commit.append(TopicPartition(i.topic, i.partition, offset))

            if offsets_to_commit:
                logger.debug(
                    'Committing offset(s) for %s revoked partition(s): %r',
                    len(offsets_to_commit),
                    offsets_to_commit)
                commit(offsets_to_commit)

        consumer.subscribe(
            [self.topic],
            on_assign=on_assign,
            on_revoke=on_revoke,
        )

        def commit_offsets():
            offsets_to_commit = []
            for (topic, partition), offset in owned_partition_offsets.items():
                if offset is None:
                    logger.debug('Skipping commit of unprocessed partition: %r', (topic, partition))
                    continue

                offsets_to_commit.append(TopicPartition(topic, partition, offset))

            if offsets_to_commit:
                logger.debug(
                    'Committing offset(s) for %s owned partition(s): %r',
                    len(offsets_to_commit),
                    offsets_to_commit)
                commit(offsets_to_commit)

        try:
            i = 0
            while True:
                message = consumer.poll(0.1)
                if message is None:
                    continue

                error = message.error()
                if error is not None:
                    raise Exception(error)

                key = (message.topic(), message.partition())
                if key not in owned_partition_offsets:
                    logger.warning('Skipping message for unowned partition: %r', key)
                    continue

                i = i + 1
                owned_partition_offsets[key] = message.offset() + 1

                with metrics.timer('eventstream.duration', instance='get_task_kwargs_for_message'):
                    task_kwargs = get_task_kwargs_for_message(message.value())

                if task_kwargs is not None:
                    with metrics.timer('eventstream.duration', instance='dispatch_post_process_group_task'):
                        self._dispatch_post_process_group_task(**task_kwargs)

                if i % commit_batch_size == 0:
                    commit_offsets()
        except KeyboardInterrupt:
            pass

        logger.debug('Committing offsets and closing consumer...')
        commit_offsets()

        consumer.close()
Esempio n. 50
0
 def get_or_create(cls, release, project, environment, datetime, **kwargs):
     with metrics.timer("models.releaseprojectenvironment.get_or_create") as metrics_tags:
         return cls._get_or_create_impl(
             release, project, environment, datetime, metrics_tags, **kwargs
         )
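
Note: the snippet above passes the object yielded by metrics.timer(...) into the
implementation as a tag collector. This assumes, as the code implies, that
Sentry's metrics.timer context manager yields a mutable tag dictionary whose
contents are attached to the recorded timing. A minimal illustrative sketch of
that pattern (the metric name, the wrapper function and the "created" tag are
hypothetical, not taken from the original code):

from sentry.utils import metrics


def get_or_create_timed(model, **kwargs):
    # Hypothetical wrapper: time the lookup/creation and tag the recorded
    # duration with whether a new row had to be created.
    with metrics.timer("example.get_or_create") as tags:
        instance, created = model.objects.get_or_create(**kwargs)
        tags["created"] = "true" if created else "false"
        return instance, created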
Esempio n. 51
0
def fetch_file(url, project=None, release=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == '...':
        raise CannotFetchSource({
            'type': EventError.JS_MISSING_SOURCE,
            'url': expose_url(url),
        })
    if release:
        with metrics.timer('sourcemaps.release_file'):
            result = fetch_release_file(url, release)
    else:
        result = None

    cache_key = 'source:cache:v3:%s' % (
        md5_text(url).hexdigest(),
    )

    if result is None:
        if not allow_scraping or not url.startswith(('http:', 'https:')):
            error = {
                'type': EventError.JS_MISSING_SOURCE,
                'url': expose_url(url),
            }
            raise CannotFetchSource(error)

        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 3-tuple instead of a 4-tuple,
            # so this is being maintained for backwards compatibility
            try:
                encoding = result[3]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = (result[0], zlib.decompress(result[1]), result[2], encoding)

    if result is None:
        # lock down domains that are problematic
        domain = urlparse(url).netloc
        domain_key = 'source:blacklist:v2:%s' % (
            md5_text(domain).hexdigest(),
        )
        domain_result = cache.get(domain_key)
        if domain_result:
            domain_result['url'] = url
            raise CannotFetchSource(domain_result)

        headers = {}
        if project and is_valid_origin(url, project=project):
            token = project.get_option('sentry:token')
            if token:
                token_header = project.get_option(
                    'sentry:token_header',
                    'X-Sentry-Token',
                )
                headers[token_header] = token

        logger.debug('Fetching %r from the internet', url)

        with metrics.timer('sourcemaps.fetch'):
            http_session = http.build_session()
            response = None
            try:
                try:
                    start = time.time()
                    response = http_session.get(
                        url,
                        allow_redirects=True,
                        verify=False,
                        headers=headers,
                        timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT,
                        stream=True,
                    )

                    try:
                        cl = int(response.headers['content-length'])
                    except (LookupError, ValueError):
                        cl = 0
                    if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                        raise OverflowError()

                    contents = []
                    cl = 0
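                    # `cl` is reused below as a running byte count for the
                    # streamed body, so the size cap is enforced even when no
                    # Content-Length header was sent.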

                    # Only need to even attempt to read the response body if we
                    # got a 200 OK
                    if response.status_code == 200:
                        for chunk in response.iter_content(16 * 1024):
                            if time.time() - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT:
                                raise Timeout()
                            contents.append(chunk)
                            cl += len(chunk)
                            if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE:
                                raise OverflowError()

                except Exception as exc:
                    logger.debug('Unable to fetch %r', url, exc_info=True)
                    if isinstance(exc, RestrictedIPAddress):
                        error = {
                            'type': EventError.RESTRICTED_IP,
                            'url': expose_url(url),
                        }
                    elif isinstance(exc, SuspiciousOperation):
                        error = {
                            'type': EventError.SECURITY_VIOLATION,
                            'url': expose_url(url),
                        }
                    elif isinstance(exc, Timeout):
                        error = {
                            'type': EventError.JS_FETCH_TIMEOUT,
                            'url': expose_url(url),
                            'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT,
                        }
                    elif isinstance(exc, OverflowError):
                        error = {
                            'type': EventError.JS_TOO_LARGE,
                            'url': expose_url(url),
                            # We want size in megabytes to format nicely
                            'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024,
                        }
                    elif isinstance(exc, (RequestException, ZeroReturnError)):
                        error = {
                            'type': EventError.JS_GENERIC_FETCH_ERROR,
                            'value': six.text_type(type(exc)),
                            'url': expose_url(url),
                        }
                    else:
                        logger.exception(six.text_type(exc))
                        error = {
                            'type': EventError.UNKNOWN_ERROR,
                            'url': expose_url(url),
                        }

                    # TODO(dcramer): we want to be less aggressive on disabling domains
                    cache.set(domain_key, error or '', 300)
                    logger.warning('source.disabled', extra=error)
                    raise CannotFetchSource(error)

                body = b''.join(contents)
                z_body = zlib.compress(body)
                headers = {k.lower(): v for k, v in response.headers.items()}
                encoding = response.encoding

                cache.set(cache_key, (headers, z_body, response.status_code, encoding), 60)
                result = (headers, body, response.status_code, encoding)
            finally:
                if response is not None:
                    response.close()

    if result[2] != 200:
        logger.debug('HTTP %s when fetching %r', result[2], url,
                     exc_info=True)
        error = {
            'type': EventError.JS_INVALID_HTTP_CODE,
            'value': result[2],
            'url': expose_url(url),
        }
        raise CannotFetchSource(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but this should catch 99% of cases
    if url.endswith('.js'):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        body_start = result[1][:20].lstrip()  # Discard leading whitespace (often found before doctype)

        if body_start[:1] == u'<':
            error = {
                'type': EventError.JS_INVALID_CONTENT,
                'url': url,
            }
            raise CannotFetchSource(error)

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result[1], six.binary_type):
        try:
            # Preserve the 4-tuple shape (headers, body, status, encoding)
            # expected by the UrlResult construction below.
            result = (result[0], result[1].encode('utf8'), result[2], 'utf8')
        except UnicodeEncodeError:
            error = {
                'type': EventError.JS_INVALID_SOURCE_ENCODING,
                'value': 'utf8',
                'url': expose_url(url),
            }
            raise CannotFetchSource(error)

    return UrlResult(url, result[0], result[1], result[3])
Esempio n. 52
0
 def normalize(self):
     with metrics.timer("events.store.normalize.duration"):
         self._normalize_impl()
Esempio n. 53
0
def fetch_release_file(filename, release, dist=None):
    """
    Attempt to retrieve a release artifact from the database.

    Caches the result of that attempt (whether successful or not).
    """

    dist_name = dist and dist.name or None
    cache_key = "releasefile:v1:%s:%s" % (release.id, ReleaseFile.get_ident(filename, dist_name))

    logger.debug("Checking cache for release artifact %r (release_id=%s)", filename, release.id)
    result = cache.get(cache_key)

    # not in the cache (meaning we haven't checked the database recently), so check the database
    if result is None:
        filename_choices = ReleaseFile.normalize(filename)
        filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices]

        logger.debug(
            "Checking database for release artifact %r (release_id=%s)", filename, release.id
        )

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release, dist=dist, ident__in=filename_idents
            ).select_related("file")
        )

        if len(possible_files) == 0:
            logger.debug(
                "Release artifact %r not found in database (release_id=%s)", filename, release.id
            )
            cache.set(cache_key, -1, 60)
            return None

        elif len(possible_files) == 1:
            releasefile = possible_files[0]

        else:
            # Pick first one that matches in priority order.
            # This is O(N*M) but there are only ever at most 4 things here
            # so not really worth optimizing.
            releasefile = next(
                (rf for ident in filename_idents for rf in possible_files if rf.ident == ident)
            )

        logger.debug(
            "Found release artifact %r (id=%s, release_id=%s)", filename, releasefile.id, release.id
        )
        try:
            with metrics.timer("sourcemaps.release_file_read"):
                with ReleaseFile.cache.getfile(releasefile) as fp:
                    z_body, body = compress_file(fp)
        except Exception:
            logger.error("sourcemap.compress_read_failed", exc_info=sys.exc_info())
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = http.UrlResult(filename, headers, body, 200, encoding)
            # This will implicitly skip too large payloads. Those will be cached
            # on the file system by `ReleaseFile.cache`, instead.
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    # in the cache as an unsuccessful attempt
    elif result == -1:
        result = None

    # in the cache as a successful attempt, including the zipped contents of the file
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = http.UrlResult(
            filename, result[0], zlib.decompress(result[1]), result[2], encoding
        )

    return result
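
The function above also caches misses: when no matching ReleaseFile exists it
stores the sentinel value -1 for 60 seconds, so repeated lookups for a missing
artifact skip the database entirely. A minimal sketch of that negative-caching
pattern in isolation (the cache argument, the loader callback and the timeouts
here are illustrative, not Sentry's API):

MISS = -1  # sentinel cached for a known-missing record


def cached_lookup(cache, key, load_from_db):
    result = cache.get(key)
    if result is None:
        # Nothing cached yet: hit the database and remember the outcome,
        # caching misses for a shorter time than hits.
        result = load_from_db(key)
        if result is None:
            cache.set(key, MISS, 60)
            return None
        cache.set(key, result, 3600)
        return result
    if result == MISS:
        # Cached negative result: skip the database.
        return None
    return result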
Esempio n. 54
0
    def _request(self, method, path, **kwargs):
        self._ensure_open()

        url = urljoin(self.url, path)

        # required for load balancing
        kwargs.setdefault("headers",
                          {})["x-sentry-project-id"] = self.project_id
        kwargs.setdefault("headers", {})["x-sentry-event-id"] = self.event_id

        attempts = 0
        wait = 0.5
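        # Retry with exponential backoff (0.5s, 1s, 2s, ...) until MAX_ATTEMPTS
        # is exceeded, then re-raise the last error to the caller.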

        while True:
            try:
                with metrics.timer("events.symbolicator.session.request",
                                   tags={"attempt": attempts}):
                    response = self.session.request(
                        method,
                        url,
                        timeout=settings.SYMBOLICATOR_POLL_TIMEOUT + 1,
                        **kwargs)

                metrics.incr(
                    "events.symbolicator.status_code",
                    tags={
                        "status_code": response.status_code,
                        "project_id": self.project_id
                    },
                )

                if (method.lower() == "get" and path.startswith("requests/")
                        and response.status_code == 404):
                    # The symbolicator does not know this task. This is
                    # expected to happen when we're currently deploying
                    # symbolicator (which will clear all of its state). Re-send
                    # the symbolication task.
                    return None

                if response.status_code in (502, 503):
                    raise ServiceUnavailable()

                if response.ok:
                    json = response.json()
                else:
                    json = {
                        "status": "failed",
                        "message": "internal server error"
                    }

                return self._process_response(json)
            except (OSError, RequestException) as e:
                metrics.incr(
                    "events.symbolicator.request_error",
                    tags={
                        "exc": ".".join([e.__class__.__module__, e.__class__.__name__]),
                        "attempt": attempts,
                    },
                )

                attempts += 1
                # Any server error needs to be treated as a failure. We can
                # retry a couple of times, but ultimately need to bail out.
                #
                # This can happen for any network failure.
                if attempts > MAX_ATTEMPTS:
                    logger.error("Failed to contact symbolicator",
                                 exc_info=True)
                    raise

                time.sleep(wait)
                wait *= 2.0
Esempio n. 55
0
def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == '...':
        raise http.CannotFetch(
            {
                'type': EventError.JS_MISSING_SOURCE,
                'url': http.expose_url(url),
            }
        )
    if release:
        with metrics.timer('sourcemaps.release_file'):
            result = fetch_release_file(url, release, dist)
    else:
        result = None

    cache_key = 'source:cache:v4:%s' % (md5_text(url).hexdigest(), )

    if result is None:
        if not allow_scraping or not url.startswith(('http:', 'https:')):
            error = {
                'type': EventError.JS_MISSING_SOURCE,
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 4-tuple (no encoding) instead of a
            # 5-tuple, so this is being maintained for backwards compatibility
            try:
                encoding = result[4]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = http.UrlResult(
                result[0], result[1], zlib.decompress(result[2]), result[3], encoding
            )

    if result is None:
        headers = {}
        verify_ssl = False
        if project and is_valid_origin(url, project=project):
            verify_ssl = bool(project.get_option('sentry:verify_ssl', False))
            token = project.get_option('sentry:token')
            if token:
                token_header = project.get_option('sentry:token_header') or 'X-Sentry-Token'
                headers[token_header] = token

        with metrics.timer('sourcemaps.fetch'):
            result = http.fetch_file(url, headers=headers, verify_ssl=verify_ssl)
            z_body = zlib.compress(result.body)
            cache.set(
                cache_key,
                (url,
                 result.headers,
                 z_body,
                 result.status,
                 result.encoding),
                get_max_age(result.headers))

    # If we did not get a 200 OK we just raise a cannot fetch here.
    if result.status != 200:
        raise http.CannotFetch(
            {
                'type': EventError.FETCH_INVALID_HTTP_CODE,
                'value': result.status,
                'url': http.expose_url(url),
            }
        )

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result.body, six.binary_type):
        try:
            result = http.UrlResult(
                result.url, result.headers,
                result.body.encode('utf8'), result.status, result.encoding
            )
        except UnicodeEncodeError:
            error = {
                'type': EventError.FETCH_INVALID_ENCODING,
                'value': 'utf8',
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but
    # this should catch 99% of cases
    if url.endswith('.js'):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result.body[:20].lstrip()

        if body_start[:1] == u'<':
            error = {
                'type': EventError.JS_INVALID_CONTENT,
                'url': url,
            }
            raise http.CannotFetch(error)

    return result
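
For context, a caller of the version above would typically wrap the call and
collect the structured error data carried by http.CannotFetch rather than let
the exception abort processing. A hedged usage sketch (the wrapper name and the
errors list are illustrative; it assumes CannotFetch exposes the error dict it
was raised with as a .data attribute, as the snippets above suggest):

def fetch_with_error_capture(url, project, release, dist, errors):
    # Fetch the source file, recording structured fetch errors instead of
    # propagating them.
    try:
        return fetch_file(url, project=project, release=release,
                          dist=dist, allow_scraping=True)
    except http.CannotFetch as exc:
        # exc.data is assumed to be the error dict built inside fetch_file,
        # e.g. {'type': EventError.JS_MISSING_SOURCE, 'url': ...}.
        errors.append(exc.data)
        return None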
Esempio n. 56
0
def fetch_file(url,
               project=None,
               release=None,
               dist=None,
               allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == '...':
        raise http.CannotFetch({
            'type': EventError.JS_MISSING_SOURCE,
            'url': http.expose_url(url),
        })
    if release:
        with metrics.timer('sourcemaps.release_file'):
            result = fetch_release_file(url, release, dist)
    else:
        result = None

    cache_key = 'source:cache:v4:%s' % (md5_text(url).hexdigest(), )

    if result is None:
        if not allow_scraping or not url.startswith(('http:', 'https:')):
            error = {
                'type': EventError.JS_MISSING_SOURCE,
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous caches would be a 4-tuple (no encoding) instead of a
            # 5-tuple, so this is being maintained for backwards compatibility
            try:
                encoding = result[4]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = http.UrlResult(result[0], result[1],
                                    zlib.decompress(result[2]), result[3],
                                    encoding)

    if result is None:
        headers = {}
        verify_ssl = False
        if project and is_valid_origin(url, project=project):
            verify_ssl = bool(project.get_option('sentry:verify_ssl', False))
            token = project.get_option('sentry:token')
            if token:
                token_header = project.get_option(
                    'sentry:token_header',
                    'X-Sentry-Token',
                )
                headers[token_header] = token

        with metrics.timer('sourcemaps.fetch'):
            result = http.fetch_file(url,
                                     headers=headers,
                                     verify_ssl=verify_ssl)
            z_body = zlib.compress(result.body)
            cache.set(
                cache_key,
                (url, result.headers, z_body, result.status, result.encoding),
                60)

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result.body, six.binary_type):
        try:
            result = http.UrlResult(result.url, result.headers,
                                    result.body.encode('utf8'), result.status,
                                    result.encoding)
        except UnicodeEncodeError:
            error = {
                'type': EventError.FETCH_INVALID_ENCODING,
                'value': 'utf8',
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but
    # this should catch 99% of cases
    if url.endswith('.js'):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result.body[:20].lstrip()

        if body_start[:1] == u'<':
            error = {
                'type': EventError.JS_INVALID_CONTENT,
                'url': url,
            }
            raise http.CannotFetch(error)

    return result