def extract_payload(self, req):
    """Parse HTTP POST payload.

    Decompresses the payload if necessary and then walks through the
    FieldStorage converting from multipart/form-data to Python datatypes.

    NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
    has a list attribute that is a list of FieldStorage items--one for each
    key/val in the form. For attached files, the FieldStorage will have a
    name, value and filename and the type should be
    ``application/octet-stream``. Thus we parse it looking for things of type
    ``text/plain``, ``application/json``, and application/octet-stream.

    :arg falcon.request.Request req: a Falcon Request instance

    :returns: (raw_crash dict, dumps dict)

    :raises MalformedCrashReport:

    """
    # If we don't have a content type, raise MalformedCrashReport
    if not req.content_type:
        raise MalformedCrashReport("no_content_type")

    # If it's the wrong content type or there's no boundary section, raise
    # MalformedCrashReport
    content_type = [part.strip() for part in req.content_type.split(";", 1)]
    if (
        len(content_type) != 2
        or content_type[0] != "multipart/form-data"
        or not content_type[1].startswith("boundary=")
    ):
        if content_type[0] != "multipart/form-data":
            raise MalformedCrashReport("wrong_content_type")
        else:
            raise MalformedCrashReport("no_boundary")

    content_length = req.content_length or 0

    # If there's no content, raise MalformedCrashReport
    if content_length == 0:
        raise MalformedCrashReport("no_content_length")

    # Decompress payload if it's compressed
    if req.env.get("HTTP_CONTENT_ENCODING") == "gzip":
        mymetrics.incr("gzipped_crash")

        # If the content is gzipped, we pull it out and decompress it. We
        # have to do that here because nginx doesn't have a good way to do
        # that in nginx-land.
        gzip_header = 16 + zlib.MAX_WBITS
        try:
            data = zlib.decompress(req.stream.read(content_length), gzip_header)
        except zlib.error:
            # This indicates this isn't a valid compressed stream. Given
            # that the HTTP request insists it is, we're just going to
            # assume it's junk and not try to process any further.
            raise MalformedCrashReport("bad_gzip")

        # Stomp on the content length to correct it because we've changed
        # the payload size by decompressing it. We save the original value
        # in case we need to debug something later on.
        req.env["ORIG_CONTENT_LENGTH"] = content_length
        content_length = len(data)
        req.env["CONTENT_LENGTH"] = str(content_length)

        data = io.BytesIO(data)
        mymetrics.histogram(
            "crash_size", value=content_length, tags=["payload:compressed"]
        )
    else:
        # NOTE(willkg): At this point, req.stream is either a
        # falcon.request_helper.BoundedStream (in tests) or a
        # gunicorn.http.body.Body (in production).
        #
        # FieldStorage doesn't work with BoundedStream so we pluck out the
        # internal stream from that which works fine.
        #
        # FIXME(willkg): why don't tests work with BoundedStream?
        if isinstance(req.stream, BoundedStream):
            data = req.stream.stream
        else:
            data = req.stream

        mymetrics.histogram(
            "crash_size", value=content_length, tags=["payload:uncompressed"]
        )

    # Stomp on querystring so we don't pull it in
    request_env = dict(req.env)
    request_env["QUERY_STRING"] = ""

    fs = cgi.FieldStorage(fp=data, environ=request_env, keep_blank_values=1)

    raw_crash = {}
    dumps = {}

    has_json = False
    has_kvpairs = False

    for fs_item in fs.list:
        # If the field has no name, then it's probably junk, so let's drop it.
        if not fs_item.name:
            continue

        if fs_item.name == "dump_checksums":
            # We don't want to pick up the dump_checksums from a raw
            # crash that was re-submitted.
            continue

        elif fs_item.type and fs_item.type.startswith("application/json"):
            # This is a JSON blob, so load it and override raw_crash with
            # it.
            has_json = True
            try:
                raw_crash = json.loads(fs_item.value)
            except json.decoder.JSONDecodeError:
                raise MalformedCrashReport("bad_json")

            # FIX: a syntactically valid JSON document that isn't an object
            # (e.g. a list, string, or number) previously slipped through and
            # crashed later with a TypeError when we annotated raw_crash.
            # Treat it as a malformed report instead.
            if not isinstance(raw_crash, dict):
                raise MalformedCrashReport("bad_json")

        elif fs_item.type and (
            fs_item.type.startswith("application/octet-stream")
            or isinstance(fs_item.value, bytes)
        ):
            # This is a dump, so add it to dumps using a sanitized dump
            # name.
            dump_name = sanitize_dump_name(fs_item.name)
            dumps[dump_name] = fs_item.value

        else:
            # This isn't a dump, so it's a key/val pair, so we add that.
            has_kvpairs = True
            raw_crash[fs_item.name] = fs_item.value

    if not raw_crash:
        raise MalformedCrashReport("no_annotations")

    if has_json and has_kvpairs:
        # If the crash payload has both kvpairs and a JSON blob, then it's
        # malformed and we should dump it.
        raise MalformedCrashReport("has_json_and_kv")

    # Add a note about how the annotations were encoded in the crash report.
    # For now, there are two options: json and multipart.
    if has_json:
        raw_crash["payload"] = "json"
    else:
        raw_crash["payload"] = "multipart"

    return raw_crash, dumps
def test_sanitize_dump_name(data, expected):
    # Parametrized: verify each input maps to its sanitized form.
    result = sanitize_dump_name(data)
    assert result == expected
def extract_payload(self, req):
    """Parses the HTTP POST payload

    Decompresses the payload if necessary and then walks through the
    FieldStorage converting from multipart/form-data to Python datatypes.

    NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
    has a list attribute that is a list of FieldStorage items--one for each
    key/val in the form. For attached files, the FieldStorage will have a
    name, value and filename and the type should be
    application/octet-stream. Thus we parse it looking for things of type
    text/plain and application/octet-stream.

    :arg falcon.request.Request req: a Falcon Request instance

    :returns: (raw_crash dict, dumps dict)

    """
    # No content type at all--nothing we can parse, so return an empty crash.
    if not req.content_type:
        return {}, {}

    # Split "multipart/form-data; boundary=..." into its two pieces and make
    # sure both look right; otherwise return an empty crash.
    parts = [piece.strip() for piece in req.content_type.split(';', 1)]
    well_formed = (
        len(parts) == 2
        and parts[0] == 'multipart/form-data'
        and parts[1].startswith('boundary=')
    )
    if not well_formed:
        return {}, {}

    content_length = req.content_length or 0

    # An empty body also yields an empty crash.
    if not content_length:
        return {}, {}

    # Decompress payload if it's compressed
    if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
        mymetrics.incr('gzipped_crash')

        # nginx doesn't have a good way to decompress for us, so we pull the
        # gzipped body out and decompress it here ourselves.
        wbits = zlib.MAX_WBITS + 16
        try:
            decompressed = zlib.decompress(req.stream.read(content_length), wbits)
        except zlib.error:
            # The HTTP request claims this is a gzip stream, but it isn't a
            # valid one--assume it's junk and stop processing.
            mymetrics.incr('bad_gzipped_crash')
            return {}, {}

        # Correct the content length since decompression changed the payload
        # size; keep the original value around for later debugging.
        req.env['ORIG_CONTENT_LENGTH'] = content_length
        content_length = len(decompressed)
        req.env['CONTENT_LENGTH'] = str(content_length)

        body = io.BytesIO(decompressed)
        mymetrics.histogram(
            'crash_size', value=content_length, tags=['payload:compressed']
        )
    else:
        # NOTE(willkg): At this point, req.stream is either a
        # falcon.request_helper.BoundedStream (in tests) or a
        # gunicorn.http.body.Body (in production).
        #
        # FieldStorage doesn't work with BoundedStream so we pluck out the
        # internal stream from that which works fine.
        #
        # FIXME(willkg): why don't tests work with BoundedStream?
        if isinstance(req.stream, BoundedStream):
            body = req.stream.stream
        else:
            body = req.stream

        mymetrics.histogram(
            'crash_size', value=content_length, tags=['payload:uncompressed']
        )

    form = cgi.FieldStorage(fp=body, environ=req.env, keep_blank_values=1)

    # NOTE(willkg): In the original collector, this returned request
    # querystring data as well as request body data, but we're not doing
    # that because the query string just duplicates data in the payload.

    raw_crash = {}
    dumps = {}

    for field in form.list:
        # NOTE(willkg): We saw some crashes come in where the raw crash ends
        # up with a None as a key. Make sure we can't end up with non-strings
        # as keys.
        name = de_null(field.name or '')

        if name == 'dump_checksums':
            # We don't want to pick up the dump_checksums from a raw
            # crash that was re-submitted.
            continue

        if field.type and (
                field.type.startswith('application/octet-stream')
                or isinstance(field.value, bytes)):
            # This is a dump, so add it to dumps using a sanitized dump
            # name.
            dumps[sanitize_dump_name(name)] = field.value
        else:
            # This isn't a dump, so it's a key/val pair, so we add that.
            raw_crash[name] = de_null(field.value)

    return raw_crash, dumps
def extract_payload(self, req):
    """Parse HTTP POST payload.

    Decompresses the payload if necessary and then walks through the
    FieldStorage converting from multipart/form-data to Python datatypes.

    NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
    has a list attribute that is a list of FieldStorage items--one for each
    key/val in the form. For attached files, the FieldStorage will have a
    name, value and filename and the type should be
    application/octet-stream. Thus we parse it looking for things of type
    text/plain and application/octet-stream.

    :arg falcon.request.Request req: a Falcon Request instance

    :returns: (raw_crash dict, dumps dict)

    """
    # If we don't have a content type, return an empty crash
    if not req.content_type:
        mymetrics.incr('malformed', tags=['reason:no_content_type'])
        return {}, {}

    # If it's the wrong content type or there's no boundary section, return
    # an empty crash
    content_type = [part.strip() for part in req.content_type.split(';', 1)]
    if ((len(content_type) != 2 or
         content_type[0] != 'multipart/form-data' or
         not content_type[1].startswith('boundary='))):
        if content_type[0] != 'multipart/form-data':
            mymetrics.incr('malformed', tags=['reason:wrong_content_type'])
        else:
            mymetrics.incr('malformed', tags=['reason:no_boundary'])
        return {}, {}

    content_length = req.content_length or 0

    # If there's no content, return an empty crash
    if content_length == 0:
        mymetrics.incr('malformed', tags=['reason:no_content_length'])
        return {}, {}

    # Decompress payload if it's compressed
    if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
        mymetrics.incr('gzipped_crash')

        # If the content is gzipped, we pull it out and decompress it. We
        # have to do that here because nginx doesn't have a good way to do
        # that in nginx-land.
        gzip_header = 16 + zlib.MAX_WBITS
        try:
            data = zlib.decompress(req.stream.read(content_length), gzip_header)
        except zlib.error:
            # This indicates this isn't a valid compressed stream. Given
            # that the HTTP request insists it is, we're just going to
            # assume it's junk and not try to process any further.
            mymetrics.incr('malformed', tags=['reason:bad_gzip'])
            return {}, {}

        # Stomp on the content length to correct it because we've changed
        # the payload size by decompressing it. We save the original value
        # in case we need to debug something later on.
        req.env['ORIG_CONTENT_LENGTH'] = content_length
        content_length = len(data)
        req.env['CONTENT_LENGTH'] = str(content_length)

        data = io.BytesIO(data)
        mymetrics.histogram(
            'crash_size', value=content_length, tags=['payload:compressed']
        )
    else:
        # NOTE(willkg): At this point, req.stream is either a
        # falcon.request_helper.BoundedStream (in tests) or a
        # gunicorn.http.body.Body (in production).
        #
        # FieldStorage doesn't work with BoundedStream so we pluck out the
        # internal stream from that which works fine.
        #
        # FIXME(willkg): why don't tests work with BoundedStream?
        if isinstance(req.stream, BoundedStream):
            data = req.stream.stream
        else:
            data = req.stream

        mymetrics.histogram(
            'crash_size', value=content_length, tags=['payload:uncompressed']
        )

    fs = cgi.FieldStorage(fp=data, environ=req.env, keep_blank_values=1)

    # NOTE(willkg): In the original collector, this returned request
    # querystring data as well as request body data, but we're not doing
    # that because the query string just duplicates data in the payload.

    raw_crash = {}
    dumps = {}

    has_json = False
    has_kvpairs = False

    for fs_item in fs.list:
        # Coerce a missing name to '' so keys are always strings.
        item_name = fs_item.name or ''

        if item_name == 'dump_checksums':
            # We don't want to pick up the dump_checksums from a raw
            # crash that was re-submitted.
            continue

        elif fs_item.type and fs_item.type.startswith('application/json'):
            # This is a JSON blob, so load it and override raw_crash with
            # it.
            has_json = True
            try:
                raw_crash = json.loads(fs_item.value)
            except json.decoder.JSONDecodeError:
                # FIX: previously an invalid JSON blob escaped this method
                # as an unhandled exception. Count it and treat it like the
                # other malformed-payload cases instead.
                mymetrics.incr('malformed', tags=['reason:bad_json'])
                return {}, {}

        elif fs_item.type and (
                fs_item.type.startswith('application/octet-stream') or
                isinstance(fs_item.value, bytes)):
            # This is a dump, so add it to dumps using a sanitized dump
            # name.
            dump_name = sanitize_dump_name(item_name)
            dumps[dump_name] = fs_item.value

        else:
            # This isn't a dump, so it's a key/val pair, so we add that.
            has_kvpairs = True
            raw_crash[item_name] = fs_item.value

    if has_json and has_kvpairs:
        # If the crash payload has both kvpairs and a JSON blob, then it's
        # malformed and we should dump it.
        mymetrics.incr('malformed', tags=['reason:has_json_and_kv'])
        return {}, {}

    return raw_crash, dumps