def test_is_fennec(self):
    """A crash whose ProductName is Fennec is expected to be accepted by ``is_fennec``."""
    # Default configuration: no overrides are passed to the throttler
    throttler = Throttler(ConfigManager.from_dict({}))

    outcome = throttler.throttle({'ProductName': 'Fennec'})

    assert outcome == (ACCEPT, 'is_fennec', 100)
def test_is_thunderbird_seamonkey(self, product):
    """Each supplied ``product`` is expected to match ``is_thunderbird_seamonkey``.

    :arg product: the ProductName value to throttle (supplied by the caller;
        the parametrization is not visible here)

    """
    throttler = Throttler(ConfigManager.from_dict({}))

    outcome = throttler.throttle({'ProductName': product})

    assert outcome == (ACCEPT, 'is_thunderbird_seamonkey', 100)
def test_bad_value(self):
    """An empty ProductName is expected to match no rule and be deferred."""
    throttler = Throttler(ConfigManager.from_dict({}))

    outcome = throttler.throttle({'ProductName': ''})

    assert outcome == (DEFER, 'NO_MATCH', 0)
def test_comments(self):
    """A crash carrying a Comments annotation is expected to match ``has_comments``."""
    throttler = Throttler(ConfigManager.from_dict({}))
    crash_with_comments = {
        'ProductName': 'Test',
        'Comments': 'foo bar baz',
    }

    outcome = throttler.throttle(crash_with_comments)

    assert outcome == (ACCEPT, 'has_comments', 100)
def test_is_version_alpha_beta_special(self, version):
    """Each supplied ``version`` is expected to match ``is_version_alpha_beta_special``.

    :arg version: the Version value to throttle (supplied by the caller;
        the parametrization is not visible here)

    """
    throttler = Throttler(ConfigManager.from_dict({}))
    versioned_crash = {
        'ProductName': 'Test',
        'Version': version,
    }

    outcome = throttler.throttle(versioned_crash)

    assert outcome == (ACCEPT, 'is_version_alpha_beta_special', 100)
def test_is_nightly(self, channel):
    """Each supplied ``channel`` is expected to match the ``is_nightly`` rule.

    :arg channel: the ReleaseChannel value to throttle (supplied by the
        caller; the parametrization is not visible here)

    """
    throttler = Throttler(ConfigManager.from_dict({}))
    channel_crash = {
        'ProductName': 'Test',
        'ReleaseChannel': channel,
    }

    outcome = throttler.throttle(channel_crash)

    assert outcome == (ACCEPT, 'is_nightly', 100)
def test_infobar(self):
    """This infobar-submitted Firefox crash is expected to be rejected by ``infobar_is_true``."""
    throttler = Throttler(ConfigManager.from_dict({}))
    infobar_crash = {
        'ProductName': 'Firefox',
        'SubmittedFromInfobar': 'true',
        'Version': '52.0.2',
        'BuildID': '20171223222554',
    }

    outcome = throttler.throttle(infobar_crash)

    assert outcome == (REJECT, 'infobar_is_true', None)
def test_email(self, email, expected):
    """Throttle a crash with an optional Email annotation and compare to ``expected``.

    :arg email: the Email value to add, or ``None`` to leave the annotation
        out of the crash entirely
    :arg expected: the throttle result tuple the throttler should return

    """
    annotations = {'ProductName': 'BarTest'}

    # None means "no Email annotation at all", not an empty value
    if email is not None:
        annotations.update({'Email': email})

    throttler = Throttler(ConfigManager.from_dict({}))
    outcome = throttler.throttle(annotations)

    assert outcome == expected
def test_hangid(self):
    """A browser-process crash with a HangID is expected to be rejected."""
    throttler = Throttler(ConfigManager.from_dict({}))
    hang_crash = {
        'ProductName': 'FireSquid',
        'Version': '99',
        'ProcessType': 'browser',
        'HangID': 'xyz',
    }

    outcome = throttler.throttle(hang_crash)

    assert outcome == (REJECT, 'has_hangid_and_browser', None)
def test_productname_no_unsupported_products(self):
    """Verify the productname rule is a no-op when PRODUCTS is ALL_PRODUCTS."""
    settings = {'PRODUCTS': 'antenna.throttler.ALL_PRODUCTS'}
    throttler = Throttler(ConfigManager.from_dict(settings))

    # "testproduct" is not a supported product, but no earlier rule accepts
    # or rejects it, so the final catch-all rule is expected to pick it up
    outcome = throttler.throttle({'ProductName': 'testproduct'})

    assert outcome == (ACCEPT, 'accept_everything', 100)
def test_productname_reject(self, caplogpp, productname, expected):
    """Verify the productname rule blocks unsupported products and logs it.

    :arg caplogpp: log-capturing fixture; records are read via ``record_tuples``
    :arg productname: the ProductName to submit, or ``None`` to omit the
        annotation from the crash
    :arg expected: the throttle result tuple the throttler should return

    """
    # Build the crash up front; None means "no ProductName annotation"
    annotations = {}
    if productname is not None:
        annotations['ProductName'] = productname

    expected_records = [
        ('antenna.throttler', logging.INFO, 'ProductName rejected: %r' % productname)
    ]

    with caplogpp.at_level(logging.INFO, logger='antenna'):
        # The default configuration is required here because it carries the
        # list of supported products
        throttler = Throttler(ConfigManager.from_dict({}))

        assert throttler.throttle(annotations) == expected
        assert caplogpp.record_tuples == expected_records
def test_productname_fakeaccept(self, caplogpp):
    """Verify the special-cased b2g product gets a fake accept and a log line.

    :arg caplogpp: log-capturing fixture; records are read via ``record_tuples``

    """
    # b2g is not in the supported product list; it is the special case
    b2g_crash = {'ProductName': 'b2g'}
    expected_records = [
        ('antenna.throttler', logging.INFO, 'ProductName B2G: fake accept')
    ]

    with caplogpp.at_level(logging.INFO, logger='antenna'):
        # The default configuration is required here because it carries the
        # list of supported products
        throttler = Throttler(ConfigManager.from_dict({}))

        assert throttler.throttle(b2g_crash) == (FAKEACCEPT, 'b2g', 100)
        assert caplogpp.record_tuples == expected_records
def test_percentage(self, randommock):
    """Verify a 50% rule accepts below the percentage line and defers above it.

    :arg randommock: fixture that, used as a context manager, pins the random
        value the throttler sees (presumably by patching ``random`` -- confirm
        against the fixture)

    """
    throttler = Throttler(ConfigManager.from_dict({}))

    # Replace the configured rules with a single 50% rule so only the
    # percentage handling is exercised
    throttler.rule_set = [
        Rule('test', 'ProductName', 'test', 50)
    ]

    # (pinned random value, expected result): 0.45 falls below the 50% line,
    # 0.55 falls above it
    cases = (
        (0.45, ACCEPT),
        (0.55, DEFER),
    )
    for pinned_value, expected_result in cases:
        with randommock(pinned_value):
            outcome = throttler.throttle({'ProductName': 'test'})
            assert outcome == (expected_result, 'test', 50)
def test_is_firefox(self, randommock):
    """Verify Firefox crashes hit ``is_firefox_desktop`` at a rate of 10.

    :arg randommock: fixture that, used as a context manager, pins the random
        value the throttler sees (presumably by patching ``random`` -- confirm
        against the fixture)

    """
    # (pinned random value, expected result): 0.09 is under the 10% line,
    # 0.9 is over it
    cases = (
        (0.09, ACCEPT),
        (0.9, DEFER),
    )
    for pinned_value, expected_result in cases:
        with randommock(pinned_value):
            firefox_crash = {'ProductName': 'Firefox'}
            # A fresh throttler is built for each case
            throttler = Throttler(ConfigManager.from_dict({}))
            outcome = throttler.throttle(firefox_crash)
            assert outcome == (expected_result, 'is_firefox_desktop', 10)
def __init__(self, config):
    """Build the crash storage, throttler, and crashmover machinery.

    :arg config: configuration object; it must provide ``with_options`` and
        ``with_namespace``

    """
    self.config = config.with_options(self)

    # The storage class comes from configuration and is instantiated with
    # the "crashstorage"-namespaced configuration
    storage_class = self.config('crashstorage_class')
    self.crashstorage = storage_class(config.with_namespace('crashstorage'))

    self.throttler = Throttler(config)

    # Gevent pool that bounds the number of concurrent crashmover workers
    worker_count = self.config('concurrent_crashmovers')
    self.crashmover_pool = Pool(size=worker_count)

    # Crash reports waiting to be saved
    self.crashmover_save_queue = deque()

    # Heartbeat callbacks: report queue stats and spawn crashmovers
    register_for_heartbeat(self.hb_report_health_stats)
    register_for_heartbeat(self.hb_run_crashmover)

    # Life callback: reports whether there is still work pending
    register_for_life(self.has_work_to_do)
def __init__(self, config):
    """Build crash storage, crash publishing, throttler, and crashmover machinery.

    :arg config: configuration object; it must provide ``with_options`` and
        ``with_namespace``

    """
    self.config = config.with_options(self)

    # Storage and publish classes come from configuration; each one is
    # instantiated with its own namespaced configuration
    storage_class = self.config('crashstorage_class')
    self.crashstorage = storage_class(config.with_namespace('crashstorage'))

    publish_class = self.config('crashpublish_class')
    self.crashpublish = publish_class(config.with_namespace('crashpublish'))

    self.throttler = Throttler(config)

    # Gevent pool that bounds the number of concurrent crashmover workers
    worker_count = self.config('concurrent_crashmovers')
    self.crashmover_pool = Pool(size=worker_count)

    # Crash reports waiting for crashmover work
    self.crashmover_queue = deque()

    # Heartbeat callbacks: report queue stats and spawn crashmovers
    register_for_heartbeat(self.hb_report_health_stats)
    register_for_heartbeat(self.hb_run_crashmover)

    # Life callback: reports whether there is still work pending
    register_for_life(self.has_work_to_do)
class BreakpadSubmitterResource(RequiredConfigMixin):
    """Handles incoming breakpad-style crash reports.

    This handles incoming HTTP POST requests containing breakpad-style crash
    reports in multipart/form-data format.

    It can handle compressed or uncompressed POST payloads.

    It parses the payload from the HTTP POST request, runs it through the
    throttler with the specified rules, generates a crash_id, returns the
    crash_id to the HTTP client, saves the crash using the configured
    crashstorage class, and publishes it using the configured crashpublish
    class.

    .. Note::

       From when a crash comes in to when it's saved by the crashstorage class,
       the crash is entirely in memory. Keep that in mind when figuring out
       how to scale your Antenna nodes.

    The most important configuration bit here is choosing the crashstorage
    class.

    For example::

        CRASHSTORAGE_CLASS=antenna.ext.s3.crashstorage.S3CrashStorage

    """

    required_config = ConfigOptions()
    required_config.add_option(
        "dump_field",
        default="upload_file_minidump",
        doc="The name of the field in the POST data for dumps.",
    )
    required_config.add_option(
        "dump_id_prefix", default="bp-", doc="The crash type prefix."
    )
    required_config.add_option(
        "concurrent_crashmovers",
        default="2",
        parser=positive_int,
        doc=(
            "The number of crashes concurrently being saved and published. "
            "Each process gets this many concurrent crashmovers, so if you're "
            "running 5 processes on the node, then it's "
            "(5 * concurrent_crashmovers) sharing upload bandwidth."
        ),
    )

    # crashstorage things
    required_config.add_option(
        "crashstorage_class",
        default="antenna.ext.crashstorage_base.NoOpCrashStorage",
        parser=parse_class,
        doc="The class in charge of storing crashes.",
    )

    # crashpublish things
    required_config.add_option(
        "crashpublish_class",
        default="antenna.ext.crashpublish_base.NoOpCrashPublish",
        parser=parse_class,
        doc="The class in charge of publishing crashes.",
    )

    def __init__(self, config):
        """Build crash storage, crash publishing, throttler, and crashmover state.

        :arg config: configuration object; it must provide ``with_options``
            and ``with_namespace``

        """
        self.config = config.with_options(self)
        self.crashstorage = self.config("crashstorage_class")(
            config.with_namespace("crashstorage")
        )
        self.crashpublish = self.config("crashpublish_class")(
            config.with_namespace("crashpublish")
        )
        self.throttler = Throttler(config)

        # Gevent pool for crashmover workers
        self.crashmover_pool = Pool(size=self.config("concurrent_crashmovers"))

        # Queue for crashmover work
        self.crashmover_queue = deque()

        # Register hb functions with heartbeat manager
        register_for_heartbeat(self.hb_report_health_stats)
        register_for_heartbeat(self.hb_run_crashmover)

        # Register life function with heartbeat manager
        register_for_life(self.has_work_to_do)

    def get_runtime_config(self, namespace=None):
        """Return generator of runtime configuration.

        Yields the items for this resource, then the throttler, then the
        crashstorage and crashpublish components.

        :arg namespace: accepted for interface compatibility; not used here

        """
        yield from super().get_runtime_config()
        yield from self.throttler.get_runtime_config()
        yield from self.crashstorage.get_runtime_config(["crashstorage"])
        yield from self.crashpublish.get_runtime_config(["crashpublish"])

    def check_health(self, state):
        """Return health state.

        Delegates to the ``check_health`` method of the crashstorage and
        crashpublish instances when they define one.

        :arg state: the health state object handed to each component

        """
        if hasattr(self.crashstorage, "check_health"):
            self.crashstorage.check_health(state)
        if hasattr(self.crashpublish, "check_health"):
            self.crashpublish.check_health(state)

    def hb_report_health_stats(self):
        """Heartbeat function to report health stats."""
        # The number of crash reports sitting in the work queue; this is a
        # direct measure of the health of this process--a number that's going
        # up means impending doom
        mymetrics.gauge("work_queue_size", value=len(self.crashmover_queue))

    def has_work_to_do(self):
        """Return whether this still has work to do.

        :returns: True if the crashmover pool or the crashmover queue is
            non-empty

        """
        work_to_do = len(self.crashmover_pool) + len(self.crashmover_queue)

        # Pass the value as a logging argument so the message is only
        # formatted when the record is actually emitted
        logger.info("work left to do: %s", work_to_do)

        # Indicates whether or not we're sitting on crashes to save--this helps
        # keep Antenna alive until we're done saving crashes
        return bool(work_to_do)

    def extract_payload(self, req):
        """Parse HTTP POST payload.

        Decompresses the payload if necessary and then walks through the
        FieldStorage converting from multipart/form-data to Python datatypes.

        NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
        has a list attribute that is a list of FieldStorage items--one for each
        key/val in the form. For attached files, the FieldStorage will have a
        name, value and filename and the type should be
        ``application/octet-stream``. Thus we parse it looking for things of type
        ``text/plain``, ``application/json``, and application/octet-stream.

        :arg falcon.request.Request req: a Falcon Request instance

        :returns: (raw_crash dict, dumps dict)

        :raises MalformedCrashReport: if the request has no content type, the
            wrong content type, no boundary, no content, invalid gzip data,
            invalid JSON (including JSON that isn't an object), no
            annotations, or both a JSON blob and key/val pairs

        """
        # If we don't have a content type, raise MalformedCrashReport
        if not req.content_type:
            raise MalformedCrashReport("no_content_type")

        # If it's the wrong content type or there's no boundary section, raise
        # MalformedCrashReport
        content_type = [part.strip() for part in req.content_type.split(";", 1)]
        if content_type[0] != "multipart/form-data":
            raise MalformedCrashReport("wrong_content_type")
        if len(content_type) != 2 or not content_type[1].startswith("boundary="):
            raise MalformedCrashReport("no_boundary")

        content_length = req.content_length or 0

        # If there's no content, raise MalformedCrashReport
        if content_length == 0:
            raise MalformedCrashReport("no_content_length")

        # Decompress payload if it's compressed
        if req.env.get("HTTP_CONTENT_ENCODING") == "gzip":
            mymetrics.incr("gzipped_crash")

            # If the content is gzipped, we pull it out and decompress it. We
            # have to do that here because nginx doesn't have a good way to do
            # that in nginx-land.
            gzip_header = 16 + zlib.MAX_WBITS
            try:
                data = zlib.decompress(req.stream.read(content_length), gzip_header)
            except zlib.error as exc:
                # This indicates this isn't a valid compressed stream. Given
                # that the HTTP request insists it is, we're just going to
                # assume it's junk and not try to process any further.
                raise MalformedCrashReport("bad_gzip") from exc

            # Stomp on the content length to correct it because we've changed
            # the payload size by decompressing it. We save the original value
            # in case we need to debug something later on.
            req.env["ORIG_CONTENT_LENGTH"] = content_length
            content_length = len(data)
            req.env["CONTENT_LENGTH"] = str(content_length)

            data = io.BytesIO(data)
            mymetrics.histogram(
                "crash_size", value=content_length, tags=["payload:compressed"]
            )
        else:
            # NOTE(willkg): At this point, req.stream is either a
            # falcon.request_helper.BoundedStream (in tests) or a
            # gunicorn.http.body.Body (in production).
            #
            # FieldStorage doesn't work with BoundedStream so we pluck out the
            # internal stream from that which works fine.
            #
            # FIXME(willkg): why don't tests work with BoundedStream?
            if isinstance(req.stream, BoundedStream):
                data = req.stream.stream
            else:
                data = req.stream

            mymetrics.histogram(
                "crash_size", value=content_length, tags=["payload:uncompressed"]
            )

        # Stomp on querystring so we don't pull it in
        request_env = dict(req.env)
        request_env["QUERY_STRING"] = ""

        fs = cgi.FieldStorage(fp=data, environ=request_env, keep_blank_values=1)

        raw_crash = {}
        dumps = {}

        has_json = False
        has_kvpairs = False

        for fs_item in fs.list:
            # If the field has no name, then it's probably junk, so let's drop it.
            if not fs_item.name:
                continue

            if fs_item.name == "dump_checksums":
                # We don't want to pick up the dump_checksums from a raw
                # crash that was re-submitted.
                continue

            elif fs_item.type and fs_item.type.startswith("application/json"):
                # This is a JSON blob, so load it and override raw_crash with
                # it.
                has_json = True
                try:
                    raw_crash = json.loads(fs_item.value)
                except ValueError as exc:
                    # ValueError covers json.JSONDecodeError as well as the
                    # UnicodeDecodeError raised for undecodable bytes values.
                    raise MalformedCrashReport("bad_json") from exc

                # Valid JSON that isn't an object (a list, string, number,
                # ...) can't hold annotations and would blow up later when we
                # add keys to it, so treat it as malformed.
                if not isinstance(raw_crash, dict):
                    raise MalformedCrashReport("bad_json")

            elif fs_item.type and (
                fs_item.type.startswith("application/octet-stream")
                or isinstance(fs_item.value, bytes)
            ):
                # This is a dump, so add it to dumps using a sanitized dump
                # name.
                dump_name = sanitize_dump_name(fs_item.name)
                dumps[dump_name] = fs_item.value

            else:
                # This isn't a dump, so it's a key/val pair, so we add that.
                has_kvpairs = True
                raw_crash[fs_item.name] = fs_item.value

        if not raw_crash:
            raise MalformedCrashReport("no_annotations")

        if has_json and has_kvpairs:
            # If the crash payload has both kvpairs and a JSON blob, then it's
            # malformed and we should dump it.
            raise MalformedCrashReport("has_json_and_kv")

        # Add a note about how the annotations were encoded in the crash report.
        # For now, there are two options: json and multipart.
        if has_json:
            raw_crash["payload"] = "json"
        else:
            raw_crash["payload"] = "multipart"

        return raw_crash, dumps

    def get_throttle_result(self, raw_crash):
        """Run raw_crash through throttler for a throttling result.

        The result and throttle rate are also written into ``raw_crash`` as
        ``legacy_processing`` and ``throttle_rate``.

        :arg dict raw_crash: the raw crash to throttle

        :returns tuple: ``(result, rule_name, percentage)``

        """
        # At this stage, nothing has given us a throttle answer, so we
        # throttle the crash.
        result, rule_name, throttle_rate = self.throttler.throttle(raw_crash)

        # Save the results in the raw_crash itself
        raw_crash["legacy_processing"] = result
        raw_crash["throttle_rate"] = throttle_rate

        return result, rule_name, throttle_rate

    def cleanup_crash_report(self, raw_crash):
        """Remove anything from the crash report that shouldn't be there.

        This operates on the raw_crash in-place. This adds notes to
        ``collector_notes``.

        :arg dict raw_crash: the raw crash to clean up

        """
        collector_notes = []

        # Remove bad fields
        for bad_field in BAD_FIELDS:
            if bad_field in raw_crash:
                del raw_crash[bad_field]
                collector_notes.append("Removed %s from raw crash." % bad_field)

        raw_crash["collector_notes"] = collector_notes

    @mymetrics.timer_decorator("on_post.time")
    def on_post(self, req, resp):
        """Handle incoming HTTP POSTs.

        Note: This is executed by the WSGI app, so it and anything it does is
        covered by the Sentry middleware.

        :arg req: the incoming request; its payload is parsed with
            ``extract_payload``
        :arg resp: the response; ``status``, ``content_type`` and ``body`` are
            set on it

        """
        resp.status = falcon.HTTP_200

        start_time = time.time()
        # NOTE(willkg): This has to return text/plain since that's what the
        # breakpad clients expect.
        resp.content_type = "text/plain"

        try:
            raw_crash, dumps = self.extract_payload(req)

        except MalformedCrashReport as exc:
            # If this is malformed, then reject it with malformed error code.
            msg = str(exc)
            mymetrics.incr("malformed", tags=["reason:%s" % msg])

            resp.status = falcon.HTTP_400
            resp.body = "Discarded=malformed_%s" % msg
            return

        mymetrics.incr("incoming_crash")

        # Add timestamps
        current_timestamp = utc_now()
        raw_crash["submitted_timestamp"] = current_timestamp.isoformat()
        raw_crash["timestamp"] = start_time

        # Add checksums and MinidumpSha256Hash
        raw_crash["dump_checksums"] = {
            dump_name: hashlib.sha256(dump).hexdigest()
            for dump_name, dump in dumps.items()
        }
        raw_crash["MinidumpSha256Hash"] = raw_crash["dump_checksums"].get(
            "upload_file_minidump", ""
        )

        # First throttle the crash which gives us the information we need
        # to generate a crash id.
        throttle_result, rule_name, percentage = self.get_throttle_result(raw_crash)

        # Use a uuid if they gave us one and it's valid--otherwise create a new
        # one.
        if "uuid" in raw_crash and validate_crash_id(raw_crash["uuid"]):
            crash_id = raw_crash["uuid"]
            logger.info("%s has existing crash_id", crash_id)

        else:
            crash_id = create_crash_id(
                timestamp=current_timestamp, throttle_result=throttle_result
            )
            raw_crash["uuid"] = crash_id

        raw_crash["type_tag"] = self.config("dump_id_prefix").strip("-")

        # Log the throttle result
        logger.info(
            "%s: matched by %s; returned %s",
            crash_id,
            rule_name,
            RESULT_TO_TEXT[throttle_result],
        )
        mymetrics.incr("throttle_rule", tags=["rule:%s" % rule_name])
        mymetrics.incr(
            "throttle", tags=["result:%s" % RESULT_TO_TEXT[throttle_result].lower()]
        )

        # If the result is REJECT, then discard it
        if throttle_result is REJECT:
            resp.body = "Discarded=rule_%s" % rule_name
            return

        # If the result is a FAKEACCEPT, then we return a crash id, but throw the crash
        # away
        if throttle_result is FAKEACCEPT:
            resp.body = "CrashID=%s%s\n" % (self.config("dump_id_prefix"), crash_id)
            return

        # If we're accepting the crash report, then clean it up, save it and return the
        # CrashID to the client
        self.cleanup_crash_report(raw_crash)
        crash_report = CrashReport(raw_crash, dumps, crash_id)
        crash_report.set_state(STATE_SAVE)
        self.crashmover_queue.append(crash_report)
        self.hb_run_crashmover()
        resp.body = "CrashID=%s%s\n" % (self.config("dump_id_prefix"), crash_id)

    def hb_run_crashmover(self):
        """Spawn a crashmover if there's work to do."""
        # Spawn a new crashmover if there's stuff in the queue and we haven't
        # hit the limit of how many we can run
        if self.crashmover_queue and self.crashmover_pool.free_count() > 0:
            self.crashmover_pool.spawn(self.crashmover_process_queue)

    def crashmover_process_queue(self):
        """Process crashmover work.

        NOTE(willkg): This has to be super careful not to lose crash reports.
        If there's any kind of problem, this must return the crash report to
        the relevant queue.

        A crash report that fails is re-queued until it has accumulated
        ``MAX_ATTEMPTS`` errors, at which point it is dropped.

        """
        while self.crashmover_queue:
            crash_report = self.crashmover_queue.popleft()

            try:
                if crash_report.state == STATE_SAVE:
                    # Save crash and then toss crash_id in the publish queue
                    self.crashmover_save(crash_report)
                    crash_report.set_state(STATE_PUBLISH)
                    self.crashmover_queue.append(crash_report)

                elif crash_report.state == STATE_PUBLISH:
                    # Publish crash and we're done
                    self.crashmover_publish(crash_report)
                    self.crashmover_finish(crash_report)

            except Exception:
                mymetrics.incr("%s_crash_exception.count" % crash_report.state)
                crash_report.errors += 1
                logger.exception(
                    "Exception when processing queue (%s), state: %s; error %d/%d",
                    crash_report.crash_id,
                    crash_report.state,
                    crash_report.errors,
                    MAX_ATTEMPTS,
                )

                # After MAX_ATTEMPTS, we give up on this crash and move on
                if crash_report.errors < MAX_ATTEMPTS:
                    self.crashmover_queue.append(crash_report)
                else:
                    logger.error(
                        "%s: too many errors trying to %s; dropped",
                        crash_report.crash_id,
                        crash_report.state,
                    )
                    mymetrics.incr("%s_crash_dropped.count" % crash_report.state)

    def crashmover_finish(self, crash_report):
        """Finish bookkeeping on crash report.

        :arg crash_report: the crash report that finished processing; its
            ``raw_crash["timestamp"]`` is the time the POST handling started

        """
        # Capture the total time it took for this crash to be handled from
        # being received from breakpad client to saving to s3.
        #
        # NOTE(willkg): time.time returns seconds, but .timing() wants
        # milliseconds, so we multiply!
        delta = (time.time() - crash_report.raw_crash["timestamp"]) * 1000

        mymetrics.timing("crash_handling.time", value=delta)
        mymetrics.incr("save_crash.count")

    @mymetrics.timer("crash_save.time")
    def crashmover_save(self, crash_report):
        """Save crash report to storage.

        Errors raised by the crashstorage propagate to the caller.

        :arg crash_report: the crash report to save

        """
        self.crashstorage.save_crash(crash_report)
        logger.info("%s saved", crash_report.crash_id)

    @mymetrics.timer("crash_publish.time")
    def crashmover_publish(self, crash_report):
        """Publish crash_id in publish queue.

        Errors raised by the crashpublish propagate to the caller.

        :arg crash_report: the crash report to publish

        """
        self.crashpublish.publish_crash(crash_report)
        logger.info("%s published", crash_report.crash_id)

    def join_pool(self):
        """Join the pool.

        NOTE(willkg): Only use this in tests!

        This is helpful for forcing all the coroutines in the pool to complete
        so that we can verify outcomes in the test suite for work that might
        cross coroutines.

        """
        self.crashmover_pool.join()
class BreakpadSubmitterResource(RequiredConfigMixin):
    """Handles incoming breakpad crash reports and saves to crashstorage

    This handles incoming HTTP POST requests containing breakpad-style crash
    reports in multipart/form-data format.

    It can handle compressed or uncompressed POST payloads.

    It parses the payload from the HTTP POST request, runs it through the
    throttler with the specified rules, generates a crash_id, returns the
    crash_id to the HTTP client and then saves the crash using the configured
    crashstorage class.

    .. Note::

       From when a crash comes in to when it's saved by the crashstorage class,
       the crash is entirely in memory. Keep that in mind when figuring out
       how to scale your Antenna nodes.

    The most important configuration bit here is choosing the crashstorage
    class.

    For example::

        CRASHSTORAGE_CLASS=antenna.ext.s3.crashstorage.S3CrashStorage

    """

    required_config = ConfigOptions()
    required_config.add_option(
        'dump_field', default='upload_file_minidump',
        doc='the name of the field in the POST data for dumps'
    )
    required_config.add_option(
        'dump_id_prefix', default='bp-',
        doc='the crash type prefix'
    )
    required_config.add_option(
        'crashstorage_class',
        default='antenna.ext.crashstorage_base.NoOpCrashStorage',
        parser=parse_class,
        doc='the class in charge of storing crashes'
    )

    # Maximum number of concurrent crashmover workers; each process gets this
    # many concurrent crashmovers, so if you're running 5 processes on the node
    # then it's (5 * concurrent_crashmovers) fighting for upload bandwidth
    required_config.add_option(
        'concurrent_crashmovers',
        default='2',
        parser=int,
        doc='the number of crashes concurrently being saved to s3'
    )

    def __init__(self, config):
        """Builds the crash storage, throttler and crashmover state

        :arg config: configuration object; it must provide ``with_options``
            and ``with_namespace``

        """
        self.config = config.with_options(self)
        self.crashstorage = self.config('crashstorage_class')(
            config.with_namespace('crashstorage'))
        self.throttler = Throttler(config)

        # Gevent pool for crashmover workers
        self.crashmover_pool = Pool(size=self.config('concurrent_crashmovers'))

        # Queue for crashmover of crashes to save
        self.crashmover_save_queue = deque()

        # Register hb functions with heartbeat manager
        register_for_heartbeat(self.hb_report_health_stats)
        register_for_heartbeat(self.hb_run_crashmover)

        # Register life function with heartbeat manager
        register_for_life(self.has_work_to_do)

    def get_runtime_config(self, namespace=None):
        """Returns generator of runtime configuration

        Yields the items for this resource, then the throttler, then the
        crashstorage component.

        :arg namespace: accepted for interface compatibility; not used here

        """
        yield from super().get_runtime_config()
        yield from self.throttler.get_runtime_config()
        yield from self.crashstorage.get_runtime_config(['crashstorage'])

    def check_health(self, state):
        """Runs the crashstorage health check if it defines one

        :arg state: the health state object handed to the crashstorage

        """
        if hasattr(self.crashstorage, 'check_health'):
            self.crashstorage.check_health(state)

    def hb_report_health_stats(self):
        """Heartbeat function that reports the size of the save queue"""
        # The number of crash reports sitting in the queue; this is a direct
        # measure of the health of this process--a number that's going up means
        # impending doom
        mymetrics.gauge('save_queue_size', value=len(self.crashmover_save_queue))

    def has_work_to_do(self):
        """Returns whether there are crashes queued or being saved

        :returns: True if the save queue or the crashmover pool is non-empty

        """
        work_to_do = len(self.crashmover_save_queue) + len(self.crashmover_pool)

        # Pass the value as a logging argument so the message is only
        # formatted when the record is actually emitted
        logger.info('work left to do: %s', work_to_do)

        # Indicates whether or not we're sitting on crashes to save--this helps
        # keep Antenna alive until we're done saving crashes
        return bool(work_to_do)

    def extract_payload(self, req):
        """Parses the HTTP POST payload

        Decompresses the payload if necessary and then walks through the
        FieldStorage converting from multipart/form-data to Python datatypes.

        NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
        has a list attribute that is a list of FieldStorage items--one for each
        key/val in the form. For attached files, the FieldStorage will have a
        name, value and filename and the type should be
        application/octet-stream. Thus we parse it looking for things of type
        text/plain and application/octet-stream.

        :arg falcon.request.Request req: a Falcon Request instance

        :returns: (raw_crash dict, dumps dict); both are empty if the request
            has no content type, the wrong content type, no boundary, no
            content, or invalid gzip data

        """
        # If we don't have a content type, return an empty crash
        if not req.content_type:
            return {}, {}

        # If it's the wrong content type or there's no boundary section, return
        # an empty crash
        content_type = [part.strip() for part in req.content_type.split(';', 1)]
        if ((len(content_type) != 2 or
             content_type[0] != 'multipart/form-data' or
             not content_type[1].startswith('boundary='))):
            return {}, {}

        content_length = req.content_length or 0

        # If there's no content, return an empty crash
        if content_length == 0:
            return {}, {}

        # Decompress payload if it's compressed
        if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
            mymetrics.incr('gzipped_crash')

            # If the content is gzipped, we pull it out and decompress it. We
            # have to do that here because nginx doesn't have a good way to do
            # that in nginx-land.
            gzip_header = 16 + zlib.MAX_WBITS
            try:
                data = zlib.decompress(req.stream.read(content_length), gzip_header)
            except zlib.error:
                # This indicates this isn't a valid compressed stream. Given
                # that the HTTP request insists it is, we're just going to
                # assume it's junk and not try to process any further.
                mymetrics.incr('bad_gzipped_crash')
                return {}, {}

            # Stomp on the content length to correct it because we've changed
            # the payload size by decompressing it. We save the original value
            # in case we need to debug something later on.
            req.env['ORIG_CONTENT_LENGTH'] = content_length
            content_length = len(data)
            req.env['CONTENT_LENGTH'] = str(content_length)

            data = io.BytesIO(data)
            mymetrics.histogram('crash_size', value=content_length,
                                tags=['payload:compressed'])
        else:
            # NOTE(willkg): At this point, req.stream is either a
            # falcon.request_helper.BoundedStream (in tests) or a
            # gunicorn.http.body.Body (in production).
            #
            # FieldStorage doesn't work with BoundedStream so we pluck out the
            # internal stream from that which works fine.
            #
            # FIXME(willkg): why don't tests work with BoundedStream?
            if isinstance(req.stream, BoundedStream):
                data = req.stream.stream
            else:
                data = req.stream

            mymetrics.histogram('crash_size', value=content_length,
                                tags=['payload:uncompressed'])

        # NOTE(willkg): In the original collector, this returned request
        # querystring data as well as request body data, but we're not doing
        # that because the query string just duplicates data in the payload.
        #
        # FieldStorage merges QUERY_STRING pairs into the field list for
        # multipart POSTs, so hand it a copy of the environ with the
        # querystring blanked out.
        request_env = dict(req.env)
        request_env['QUERY_STRING'] = ''

        fs = cgi.FieldStorage(fp=data, environ=request_env, keep_blank_values=1)

        raw_crash = {}
        dumps = {}

        for fs_item in fs.list:
            # NOTE(willkg): We saw some crashes come in where the raw crash ends up with
            # a None as a key. Make sure we can't end up with non-strings as keys.
            item_name = de_null(fs_item.name or '')

            if item_name == 'dump_checksums':
                # We don't want to pick up the dump_checksums from a raw
                # crash that was re-submitted.
                continue

            elif fs_item.type and (
                    fs_item.type.startswith('application/octet-stream') or
                    isinstance(fs_item.value, bytes)):
                # This is a dump, so add it to dumps using a sanitized dump
                # name.
                dump_name = sanitize_dump_name(item_name)
                dumps[dump_name] = fs_item.value

            else:
                # This isn't a dump, so it's a key/val pair, so we add that.
                raw_crash[item_name] = de_null(fs_item.value)

        return raw_crash, dumps

    def get_throttle_result(self, raw_crash):
        """Given a raw_crash, figures out the throttling

        If the raw_crash contains throttling information already, it returns
        that. If it doesn't, then this will apply throttling and return the
        results of that.

        A rule name of ``FROM_CRASHID`` indicates that the throttle result was
        taken from the seventh-from-last character of the submitted uuid.

        A rule name of ``ALREADY_THROTTLED`` indicates that the raw_crash was
        previously throttled and we're re-using that data.

        A rule name of ``THROTTLEABLE_0`` indicates that the raw_crash was
        marked to not be throttled.

        :arg dict raw_crash: the raw crash to throttle

        :returns tuple: ``(result, rule_name, percentage)``

        """
        # If the raw_crash has a uuid, then that implies throttling, so return
        # that.
        if 'uuid' in raw_crash:
            crash_id = raw_crash['uuid']

            # The uuid comes straight from the client, so make sure it's long
            # enough before indexing into it--a short value must fall through
            # to regular throttling rather than raise an IndexError.
            if len(crash_id) >= 7 and crash_id[-7] in (str(ACCEPT), str(DEFER)):
                result = int(crash_id[-7])
                throttle_rate = 100

                # Save the results in the raw_crash itself
                raw_crash['legacy_processing'] = result
                raw_crash['throttle_rate'] = throttle_rate

                return result, 'FROM_CRASHID', throttle_rate

        # If we have throttle results for this crash, return those.
        if 'legacy_processing' in raw_crash and 'throttle_rate' in raw_crash:
            try:
                result = int(raw_crash['legacy_processing'])
                if result not in (ACCEPT, DEFER):
                    raise ValueError('Result is not a valid value: %r' % (result,))

                throttle_rate = int(raw_crash['throttle_rate'])
                if not (0 <= throttle_rate <= 100):
                    raise ValueError(
                        'Throttle rate is not a valid value: %r' % (throttle_rate,)
                    )
                return result, 'ALREADY_THROTTLED', throttle_rate

            except ValueError:
                # If we've gotten a ValueError, it means one or both of the
                # values is bad and we should ignore it and move forward.
                mymetrics.incr('throttle.bad_throttle_values')

        # If we have a Throttleable=0, then return that.
        if raw_crash.get('Throttleable', None) == '0':
            # If the raw crash has ``Throttleable=0``, then we accept the
            # crash.
            mymetrics.incr('throttleable_0')
            result = ACCEPT
            rule_name = 'THROTTLEABLE_0'
            throttle_rate = 100

        else:
            # At this stage, nothing has given us a throttle answer, so we
            # throttle the crash.
            result, rule_name, throttle_rate = self.throttler.throttle(raw_crash)

        # Save the results in the raw_crash itself
        raw_crash['legacy_processing'] = result
        raw_crash['throttle_rate'] = throttle_rate

        return result, rule_name, throttle_rate

    @mymetrics.timer_decorator('on_post.time')
    def on_post(self, req, resp):
        """Handles incoming HTTP POSTs

        Note: This is executed by the WSGI app, so it and anything it does is
        covered by the Sentry middleware.

        :arg req: the incoming request; its payload is parsed with
            ``extract_payload``
        :arg resp: the response; ``status``, ``content_type`` and ``body`` are
            set on it

        """
        resp.status = falcon.HTTP_200

        start_time = time.time()
        # NOTE(willkg): This has to return text/plain since that's what the
        # breakpad clients expect.
        resp.content_type = 'text/plain'

        raw_crash, dumps = self.extract_payload(req)

        # If we didn't get any crash data, then just drop it and move on--don't
        # count this as an incoming crash and don't do any more work on it
        if not raw_crash:
            resp.body = 'Discarded=1'
            return

        mymetrics.incr('incoming_crash')

        # Add timestamps
        current_timestamp = utc_now()
        raw_crash['submitted_timestamp'] = current_timestamp.isoformat()
        raw_crash['timestamp'] = start_time

        # Add checksums and MinidumpSha256Hash
        raw_crash['dump_checksums'] = {
            dump_name: hashlib.sha256(dump).hexdigest()
            for dump_name, dump in dumps.items()
        }
        raw_crash['MinidumpSha256Hash'] = raw_crash['dump_checksums'].get(
            'upload_file_minidump', '')

        # First throttle the crash which gives us the information we need
        # to generate a crash id.
        throttle_result, rule_name, percentage = self.get_throttle_result(raw_crash)

        # Use a uuid if they gave us one and it's valid--otherwise create a new
        # one.
        if 'uuid' in raw_crash and validate_crash_id(raw_crash['uuid']):
            crash_id = raw_crash['uuid']
            logger.info('%s has existing crash_id', crash_id)

        else:
            crash_id = create_crash_id(
                timestamp=current_timestamp,
                throttle_result=throttle_result
            )
            raw_crash['uuid'] = crash_id

        raw_crash['type_tag'] = self.config('dump_id_prefix').strip('-')

        # Log the throttle result
        logger.info('%s: matched by %s; returned %s', crash_id, rule_name,
                    RESULT_TO_TEXT[throttle_result])
        mymetrics.incr('throttle_rule', tags=['rule:%s' % rule_name])
        mymetrics.incr(
            'throttle', tags=['result:%s' % RESULT_TO_TEXT[throttle_result].lower()])

        if throttle_result is REJECT:
            # If the result is REJECT, then discard it
            resp.body = 'Discarded=1'

        else:
            # If the result is not REJECT, then save it and return the CrashID to
            # the client
            self.crashmover_save_queue.append(CrashReport(raw_crash, dumps, crash_id))
            self.hb_run_crashmover()
            resp.body = 'CrashID=%s%s\n' % (self.config('dump_id_prefix'), crash_id)

    def hb_run_crashmover(self):
        """Checks to see if it should spawn a crashmover and does if appropriate"""
        # Spawn a new crashmover if there's stuff in the queue and the pool
        # still has a free slot
        if self.crashmover_save_queue and self.crashmover_pool.free_count() > 0:
            self.crashmover_pool.spawn(self.crashmover_process_queue)

    def crashmover_process_queue(self):
        """Processes the queue of crashes to save until it's empty

        Note: This has to be super careful not to lose crash reports. If
        there's any kind of problem, this must return the crash to the queue.

        A crash report that fails to save is re-queued until it has
        accumulated ``MAX_ATTEMPTS`` errors, at which point it is dropped.

        """
        # Process crashes until the queue is empty
        while self.crashmover_save_queue:
            crash_report = self.crashmover_save_queue.popleft()

            try:
                self.crashmover_save(crash_report)

            except Exception:
                mymetrics.incr('save_crash_exception.count')
                crash_report.errors += 1
                logger.exception(
                    'Exception when processing save queue (%s); error %d/%d',
                    crash_report.crash_id,
                    crash_report.errors,
                    MAX_ATTEMPTS
                )

                # After MAX_ATTEMPTS, we give up on this crash and move on
                if crash_report.errors < MAX_ATTEMPTS:
                    self.crashmover_save_queue.append(crash_report)
                else:
                    logger.error('%s: too many errors trying to save; dropped',
                                 crash_report.crash_id)
                    mymetrics.incr('save_crash_dropped.count')

    def crashmover_save(self, crash_report):
        """Saves a crash to storage

        If this raises an error, then that bubbles up and the caller can figure
        out what to do with it and retry again later.

        :arg crash_report: the crash report to save; its ``crash_id``,
            ``dumps`` and ``raw_crash`` attributes are used

        """
        crash_id = crash_report.crash_id
        dumps = crash_report.dumps
        raw_crash = crash_report.raw_crash

        # Capture total time it takes to save the crash
        with mymetrics.timer('crash_save.time'):
            # Save dumps to crashstorage
            self.crashstorage.save_dumps(crash_id, dumps)

            # Save the raw crash metadata to crashstorage
            self.crashstorage.save_raw_crash(crash_id, raw_crash)

        # Capture the total time it took for this crash to be handled from
        # being received from breakpad client to saving to s3.
        #
        # NOTE(willkg): time.time returns seconds, but .timing() wants
        # milliseconds, so we multiply!
        delta = (time.time() - raw_crash['timestamp']) * 1000
        mymetrics.timing('crash_handling.time', value=delta)

        mymetrics.incr('save_crash.count')
        logger.info('%s saved', crash_id)

    def join_pool(self):
        """Joins the pool--use only in tests!

        This is helpful for forcing all the coroutines in the pool to complete
        so that we can verify outcomes in the test suite for work that might
        cross coroutines.

        """
        self.crashmover_pool.join()
def throttler():
    """Build a Throttler whose PRODUCTS option is 'antenna.throttler.ALL_PRODUCTS'."""
    settings = {'PRODUCTS': 'antenna.throttler.ALL_PRODUCTS'}
    config = ConfigManager.from_dict(settings)
    return Throttler(config)
def test_ruleset(self):
    """Throttle with THROTTLE_RULES set and expect the accept_everything rule to match."""
    # NOTE(review): another test_ruleset definition in this file points
    # THROTTLE_RULES at 'antenna.throttler.ACCEPT_ALL' instead--confirm which
    # name antenna.throttler actually exports.
    config = ConfigManager.from_dict({
        'THROTTLE_RULES': 'antenna.throttler.accept_all'
    })
    configured = Throttler(config)
    outcome = configured.throttle({'ProductName': 'Test'})
    assert outcome == (ACCEPT, 'accept_everything', 100)
def test_ruleset(self):
    """Throttle with THROTTLE_RULES set and expect the accept_everything rule to match."""
    config = ConfigManager.from_dict({
        'THROTTLE_RULES': 'antenna.throttler.ACCEPT_ALL'
    })
    configured = Throttler(config)
    outcome = configured.throttle({'ProductName': 'Test'})
    assert outcome == (ACCEPT, 'accept_everything', 100)
class BreakpadSubmitterResource(RequiredConfigMixin):
    """Handles incoming breakpad crash reports and saves to crashstorage.

    This handles incoming HTTP POST requests containing breakpad-style crash
    reports in multipart/form-data format.

    It can handle compressed or uncompressed POST payloads.

    It parses the payload from the HTTP POST request, runs it through the
    throttler with the specified rules, generates a crash_id, returns the
    crash_id to the HTTP client and then saves the crash using the configured
    crashstorage class.

    .. Note::

       From when a crash comes in to when it's saved by the crashstorage class,
       the crash is entirely in memory. Keep that in mind when figuring out
       how to scale your Antenna nodes.


    The most important configuration bit here is choosing the crashstorage
    class.

    For example::

        CRASHSTORAGE_CLASS=antenna.ext.s3.crashstorage.S3CrashStorage

    """

    required_config = ConfigOptions()
    required_config.add_option(
        'dump_field', default='upload_file_minidump',
        doc='The name of the field in the POST data for dumps.'
    )
    required_config.add_option(
        'dump_id_prefix', default='bp-',
        doc='The crash type prefix.'
    )
    required_config.add_option(
        'concurrent_crashmovers',
        default='2',
        parser=positive_int,
        doc=(
            'The number of crashes concurrently being saved and published. '
            'Each process gets this many concurrent crashmovers, so if you\'re '
            'running 5 processes on the node, then it\'s '
            '(5 * concurrent_crashmovers) sharing upload bandwidth.'
        )
    )

    # crashstorage things
    required_config.add_option(
        'crashstorage_class',
        default='antenna.ext.crashstorage_base.NoOpCrashStorage',
        parser=parse_class,
        doc='The class in charge of storing crashes.'
    )

    # crashpublish things
    required_config.add_option(
        'crashpublish_class',
        default='antenna.ext.crashpublish_base.NoOpCrashPublish',
        parser=parse_class,
        doc='The class in charge of publishing crashes.'
    )

    def __init__(self, config):
        """Build the storage, publish and throttler components from config.

        Also creates the crashmover pool and work queue and registers this
        resource's heartbeat and life functions.

        :arg config: the configuration object; namespaced views of it are
            handed to the crashstorage and crashpublish classes

        """
        self.config = config.with_options(self)
        self.crashstorage = self.config('crashstorage_class')(config.with_namespace('crashstorage'))
        self.crashpublish = self.config('crashpublish_class')(config.with_namespace('crashpublish'))
        self.throttler = Throttler(config)

        # Gevent pool for crashmover workers
        self.crashmover_pool = Pool(size=self.config('concurrent_crashmovers'))

        # Queue for crashmover work
        self.crashmover_queue = deque()

        # Register hb functions with heartbeat manager
        register_for_heartbeat(self.hb_report_health_stats)
        register_for_heartbeat(self.hb_run_crashmover)

        # Register life function with heartbeat manager
        register_for_life(self.has_work_to_do)

    def get_runtime_config(self, namespace=None):
        """Return generator of runtime configuration.

        Yields this resource's own items first, then the throttler's, the
        crashstorage's and the crashpublish's. The ``namespace`` argument is
        accepted but not used here.

        """
        yield from super().get_runtime_config()
        yield from self.throttler.get_runtime_config()
        yield from self.crashstorage.get_runtime_config(['crashstorage'])
        yield from self.crashpublish.get_runtime_config(['crashpublish'])

    def check_health(self, state):
        """Return health state.

        Delegates to the crashstorage and crashpublish ``check_health``
        methods when they define one, passing ``state`` through to each.

        """
        if hasattr(self.crashstorage, 'check_health'):
            self.crashstorage.check_health(state)
        if hasattr(self.crashpublish, 'check_health'):
            self.crashpublish.check_health(state)

    def hb_report_health_stats(self):
        """Heartbeat function to report health stats."""
        # The number of crash reports sitting in the work queue; this is a
        # direct measure of the health of this process--a number that's going
        # up means impending doom
        mymetrics.gauge('work_queue_size', value=len(self.crashmover_queue))

    def has_work_to_do(self):
        """Return whether this still has work to do.

        :returns: True if the crashmover pool or the crashmover queue is
            non-empty

        """
        work_to_do = (
            len(self.crashmover_pool) +
            len(self.crashmover_queue)
        )
        # Pass the value as a logger argument so the message is only formatted
        # when the record is actually emitted
        logger.info('work left to do: %s', work_to_do)
        # Indicates whether or not we're sitting on crashes to save--this helps
        # keep Antenna alive until we're done saving crashes
        return bool(work_to_do)

    def extract_payload(self, req):
        """Parse HTTP POST payload.

        Decompresses the payload if necessary and then walks through the
        FieldStorage converting from multipart/form-data to Python datatypes.

        NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
        has a list attribute that is a list of FieldStorage items--one for each
        key/val in the form. For attached files, the FieldStorage will have a
        name, value and filename and the type should be
        application/octet-stream. Thus we parse it looking for things of type
        text/plain and application/octet-stream.

        Malformed requests (missing or wrong content type, no boundary, no
        content, bad gzip data, a JSON field that doesn't decode to an object,
        or a mix of JSON and key/val fields) are counted in the ``malformed``
        metric and yield ``({}, {})``.

        :arg falcon.request.Request req: a Falcon Request instance

        :returns: (raw_crash dict, dumps dict)

        """
        # If we don't have a content type, return an empty crash
        if not req.content_type:
            mymetrics.incr('malformed', tags=['reason:no_content_type'])
            return {}, {}

        # If it's the wrong content type or there's no boundary section, return
        # an empty crash
        content_type = [part.strip() for part in req.content_type.split(';', 1)]
        if ((len(content_type) != 2 or
             content_type[0] != 'multipart/form-data' or
             not content_type[1].startswith('boundary='))):
            if content_type[0] != 'multipart/form-data':
                mymetrics.incr('malformed', tags=['reason:wrong_content_type'])
            else:
                mymetrics.incr('malformed', tags=['reason:no_boundary'])
            return {}, {}

        content_length = req.content_length or 0

        # If there's no content, return an empty crash
        if content_length == 0:
            mymetrics.incr('malformed', tags=['reason:no_content_length'])
            return {}, {}

        # Decompress payload if it's compressed
        if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
            mymetrics.incr('gzipped_crash')

            # If the content is gzipped, we pull it out and decompress it. We
            # have to do that here because nginx doesn't have a good way to do
            # that in nginx-land.
            gzip_header = 16 + zlib.MAX_WBITS
            try:
                data = zlib.decompress(req.stream.read(content_length), gzip_header)
            except zlib.error:
                # This indicates this isn't a valid compressed stream. Given
                # that the HTTP request insists it is, we're just going to
                # assume it's junk and not try to process any further.
                mymetrics.incr('malformed', tags=['reason:bad_gzip'])
                return {}, {}

            # Stomp on the content length to correct it because we've changed
            # the payload size by decompressing it. We save the original value
            # in case we need to debug something later on.
            req.env['ORIG_CONTENT_LENGTH'] = content_length
            content_length = len(data)
            req.env['CONTENT_LENGTH'] = str(content_length)

            data = io.BytesIO(data)
            mymetrics.histogram('crash_size', value=content_length, tags=['payload:compressed'])
        else:
            # NOTE(willkg): At this point, req.stream is either a
            # falcon.request_helper.BoundedStream (in tests) or a
            # gunicorn.http.body.Body (in production).
            #
            # FieldStorage doesn't work with BoundedStream so we pluck out the
            # internal stream from that which works fine.
            #
            # FIXME(willkg): why don't tests work with BoundedStream?
            if isinstance(req.stream, BoundedStream):
                data = req.stream.stream
            else:
                data = req.stream

            mymetrics.histogram('crash_size', value=content_length, tags=['payload:uncompressed'])

        fs = cgi.FieldStorage(fp=data, environ=req.env, keep_blank_values=1)

        # NOTE(willkg): In the original collector, this returned request
        # querystring data as well as request body data, but we're not doing
        # that because the query string just duplicates data in the payload.

        raw_crash = {}
        dumps = {}

        has_json = False
        has_kvpairs = False

        for fs_item in fs.list:
            # NOTE(willkg): We saw some crashes come in where the raw crash ends up with
            # a None as a key. Make sure we can't end up with non-strings as keys.
            item_name = fs_item.name or ''

            if item_name == 'dump_checksums':
                # We don't want to pick up the dump_checksums from a raw
                # crash that was re-submitted.
                continue

            elif fs_item.type and fs_item.type.startswith('application/json'):
                # This is a JSON blob, so load it and override raw_crash with
                # it.
                has_json = True
                try:
                    raw_crash = json.loads(fs_item.value)
                except ValueError:
                    # The blob comes straight from the client, so it may not
                    # be valid JSON; json.JSONDecodeError is a ValueError.
                    # Treat it like any other malformed payload rather than
                    # letting the error escape the request handler.
                    mymetrics.incr('malformed', tags=['reason:bad_json'])
                    return {}, {}

                if not isinstance(raw_crash, dict):
                    # Valid JSON that isn't an object (list, string, number)
                    # can't take the key assignments made on raw_crash later.
                    mymetrics.incr('malformed', tags=['reason:bad_json'])
                    return {}, {}

            elif fs_item.type and (fs_item.type.startswith('application/octet-stream') or
                                   isinstance(fs_item.value, bytes)):
                # This is a dump, so add it to dumps using a sanitized dump
                # name.
                dump_name = sanitize_dump_name(item_name)
                dumps[dump_name] = fs_item.value

            else:
                # This isn't a dump, so it's a key/val pair, so we add that.
                has_kvpairs = True
                raw_crash[item_name] = fs_item.value

        if has_json and has_kvpairs:
            # If the crash payload has both kvpairs and a JSON blob, then it's
            # malformed and we should dump it.
            mymetrics.incr('malformed', tags=['reason:has_json_and_kv'])
            return {}, {}

        return raw_crash, dumps

    def get_throttle_result(self, raw_crash):
        """Run raw_crash through throttler for a throttling result.

        The result and throttle rate are also written into ``raw_crash`` under
        the ``legacy_processing`` and ``throttle_rate`` keys.

        :arg dict raw_crash: the raw crash to throttle

        :returns tuple: ``(result, rule_name, percentage)``

        """
        # At this stage, nothing has given us a throttle answer, so we
        # throttle the crash.
        result, rule_name, throttle_rate = self.throttler.throttle(raw_crash)

        # Save the results in the raw_crash itself
        raw_crash['legacy_processing'] = result
        raw_crash['throttle_rate'] = throttle_rate

        return result, rule_name, throttle_rate

    @mymetrics.timer_decorator('on_post.time')
    def on_post(self, req, resp):
        """Handle incoming HTTP POSTs.

        Note: This is executed by the WSGI app, so it and anything it does is
        covered by the Sentry middleware.

        Sets ``resp.body`` to ``Discarded=1`` when there is no crash data or
        the throttler rejects the crash; otherwise to ``CrashID=<prefix><id>``.
        Crashes that are neither rejected nor fake-accepted are queued for the
        crashmover.

        :arg req: the incoming request
        :arg resp: the response to fill in

        """
        resp.status = falcon.HTTP_200

        start_time = time.time()
        # NOTE(willkg): This has to return text/plain since that's what the
        # breakpad clients expect.
        resp.content_type = 'text/plain'

        raw_crash, dumps = self.extract_payload(req)

        # If we didn't get any crash data, then just drop it and move on--don't
        # count this as an incoming crash and don't do any more work on it
        if not raw_crash:
            resp.body = 'Discarded=1'
            return

        mymetrics.incr('incoming_crash')

        # Add timestamps
        current_timestamp = utc_now()
        raw_crash['submitted_timestamp'] = current_timestamp.isoformat()
        raw_crash['timestamp'] = start_time

        # Add checksums and MinidumpSha256Hash
        raw_crash['dump_checksums'] = {
            dump_name: hashlib.sha256(dump).hexdigest()
            for dump_name, dump in dumps.items()
        }
        raw_crash['MinidumpSha256Hash'] = raw_crash['dump_checksums'].get('upload_file_minidump', '')

        # First throttle the crash which gives us the information we need
        # to generate a crash id. The rate is already stored in raw_crash by
        # get_throttle_result, so it isn't needed here.
        throttle_result, rule_name, _percentage = self.get_throttle_result(raw_crash)

        # Use a uuid if they gave us one and it's valid--otherwise create a new
        # one.
        if 'uuid' in raw_crash and validate_crash_id(raw_crash['uuid']):
            crash_id = raw_crash['uuid']
            logger.info('%s has existing crash_id', crash_id)

        else:
            crash_id = create_crash_id(
                timestamp=current_timestamp,
                throttle_result=throttle_result
            )
            raw_crash['uuid'] = crash_id

        raw_crash['type_tag'] = self.config('dump_id_prefix').strip('-')

        # Log the throttle result
        logger.info('%s: matched by %s; returned %s', crash_id, rule_name,
                    RESULT_TO_TEXT[throttle_result])
        mymetrics.incr('throttle_rule', tags=['rule:%s' % rule_name])
        mymetrics.incr('throttle', tags=['result:%s' % RESULT_TO_TEXT[throttle_result].lower()])

        if throttle_result is REJECT:
            # If the result is REJECT, then discard it
            resp.body = 'Discarded=1'

        elif throttle_result is FAKEACCEPT:
            # If the result is a FAKEACCEPT, then we return a crash id, but throw
            # the crash away
            resp.body = 'CrashID=%s%s\n' % (self.config('dump_id_prefix'), crash_id)

        else:
            # If the result is not REJECT, then save it and return the CrashID to
            # the client
            crash_report = CrashReport(raw_crash, dumps, crash_id)
            crash_report.set_state(STATE_SAVE)
            self.crashmover_queue.append(crash_report)
            self.hb_run_crashmover()
            resp.body = 'CrashID=%s%s\n' % (self.config('dump_id_prefix'), crash_id)

    def hb_run_crashmover(self):
        """Spawn a crashmover if there's work to do."""
        # Spawn a new crashmover if there's stuff in the queue and we haven't
        # hit the limit of how many we can run
        if self.crashmover_queue and self.crashmover_pool.free_count() > 0:
            self.crashmover_pool.spawn(self.crashmover_process_queue)

    def crashmover_process_queue(self):
        """Process crashmover work.

        Reports in the save state are saved and re-queued in the publish
        state; reports in the publish state are published and finished.

        NOTE(willkg): This has to be super careful not to lose crash reports.
        If there's any kind of problem, this must return the crash report to
        the relevant queue.

        A report whose step raised is re-queued until its ``errors`` counter
        reaches ``MAX_ATTEMPTS``, after which it is logged as dropped.

        """
        while self.crashmover_queue:
            crash_report = self.crashmover_queue.popleft()

            try:
                if crash_report.state == STATE_SAVE:
                    # Save crash and then toss crash_id in the publish queue
                    self.crashmover_save(crash_report)
                    crash_report.set_state(STATE_PUBLISH)
                    self.crashmover_queue.append(crash_report)

                elif crash_report.state == STATE_PUBLISH:
                    # Publish crash and we're done
                    self.crashmover_publish(crash_report)
                    self.crashmover_finish(crash_report)

            except Exception:
                mymetrics.incr('%s_crash_exception.count' % crash_report.state)
                crash_report.errors += 1
                logger.exception(
                    'Exception when processing queue (%s), state: %s; error %d/%d',
                    crash_report.crash_id,
                    crash_report.state,
                    crash_report.errors,
                    MAX_ATTEMPTS
                )

                # After MAX_ATTEMPTS, we give up on this crash and move on
                if crash_report.errors < MAX_ATTEMPTS:
                    self.crashmover_queue.append(crash_report)
                else:
                    logger.error(
                        '%s: too many errors trying to %s; dropped',
                        crash_report.crash_id,
                        crash_report.state
                    )
                    mymetrics.incr('%s_crash_dropped.count' % crash_report.state)

    def crashmover_finish(self, crash_report):
        """Finish bookkeeping on crash report."""
        # Capture the total time it took for this crash to be handled from
        # being received from breakpad client to saving to s3.
        #
        # NOTE(willkg): time.time returns seconds, but .timing() wants
        # milliseconds, so we multiply!
        delta = (time.time() - crash_report.raw_crash['timestamp']) * 1000

        mymetrics.timing('crash_handling.time', value=delta)
        mymetrics.incr('save_crash.count')

    @mymetrics.timer('crash_save.time')
    def crashmover_save(self, crash_report):
        """Save crash report to storage."""
        self.crashstorage.save_crash(crash_report)
        logger.info('%s saved', crash_report.crash_id)

    @mymetrics.timer('crash_publish.time')
    def crashmover_publish(self, crash_report):
        """Publish crash_id in publish queue."""
        self.crashpublish.publish_crash(crash_report)
        logger.info('%s published', crash_report.crash_id)

    def join_pool(self):
        """Join the pool.

        NOTE(willkg): Only use this in tests!

        This is helpful for forcing all the coroutines in the pool to complete
        so that we can verify outcomes in the test suite for work that might
        cross coroutines.

        """
        self.crashmover_pool.join()
def test_is_nothing(self):
    """Throttle an empty crash and expect the no-match deferral."""
    throttler = Throttler(ConfigManager.from_dict({}))
    # None of the rules will match an empty crash
    outcome = throttler.throttle({})
    assert outcome == (DEFER, 'NO_MATCH', 0)