def extract_payload(self, req):
        """Parse HTTP POST payload.

        Decompresses the payload if necessary and then walks through the
        FieldStorage converting from multipart/form-data to Python datatypes.

        NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
        has a list attribute that is a list of FieldStorage items--one for each
        key/val in the form. For attached files, the FieldStorage will have a
        name, value and filename and the type should be
        ``application/octet-stream``. Thus we parse it looking for things of type
        ``text/plain``, ``application/json``, and application/octet-stream.

        :arg falcon.request.Request req: a Falcon Request instance

        :returns: (raw_crash dict, dumps dict)

        :raises MalformedCrashReport: with one of these reasons:

            * ``no_content_type``: the request has no content type
            * ``wrong_content_type``: content type isn't multipart/form-data
            * ``no_boundary``: content type has no ``boundary=`` section
            * ``no_content_length``: content length is missing or 0
            * ``bad_gzip``: payload claims to be gzipped, but doesn't decompress
            * ``bad_json``: JSON part doesn't decode or isn't a JSON object
            * ``no_annotations``: payload yielded no annotations
            * ``has_json_and_kv``: payload has both a JSON part and key/val pairs

        """
        # If we don't have a content type, raise MalformedCrashReport
        if not req.content_type:
            raise MalformedCrashReport("no_content_type")

        # If it's the wrong content type or there's no boundary section, raise
        # MalformedCrashReport
        content_type = [part.strip() for part in req.content_type.split(";", 1)]
        if content_type[0] != "multipart/form-data":
            raise MalformedCrashReport("wrong_content_type")
        if len(content_type) != 2 or not content_type[1].startswith("boundary="):
            raise MalformedCrashReport("no_boundary")

        content_length = req.content_length or 0

        # If there's no content, raise MalformedCrashReport
        if content_length == 0:
            raise MalformedCrashReport("no_content_length")

        # Decompress payload if it's compressed
        if req.env.get("HTTP_CONTENT_ENCODING") == "gzip":
            mymetrics.incr("gzipped_crash")

            # If the content is gzipped, we pull it out and decompress it. We
            # have to do that here because nginx doesn't have a good way to do
            # that in nginx-land.
            gzip_header = 16 + zlib.MAX_WBITS
            try:
                data = zlib.decompress(req.stream.read(content_length), gzip_header)
            except zlib.error as exc:
                # This indicates this isn't a valid compressed stream. Given
                # that the HTTP request insists it is, we're just going to
                # assume it's junk and not try to process any further.
                raise MalformedCrashReport("bad_gzip") from exc

            # Stomp on the content length to correct it because we've changed
            # the payload size by decompressing it. We save the original value
            # in case we need to debug something later on.
            req.env["ORIG_CONTENT_LENGTH"] = content_length
            content_length = len(data)
            req.env["CONTENT_LENGTH"] = str(content_length)

            data = io.BytesIO(data)
            mymetrics.histogram(
                "crash_size", value=content_length, tags=["payload:compressed"]
            )
        else:
            # NOTE(willkg): At this point, req.stream is either a
            # falcon.request_helper.BoundedStream (in tests) or a
            # gunicorn.http.body.Body (in production).
            #
            # FieldStorage doesn't work with BoundedStream so we pluck out the
            # internal stream from that which works fine.
            #
            # FIXME(willkg): why don't tests work with BoundedStream?
            if isinstance(req.stream, BoundedStream):
                data = req.stream.stream
            else:
                data = req.stream

            mymetrics.histogram(
                "crash_size", value=content_length, tags=["payload:uncompressed"]
            )

        # Stomp on querystring so we don't pull it in. This works on a copy so
        # the request's own environ keeps its query string.
        request_env = dict(req.env)
        request_env["QUERY_STRING"] = ""

        fs = cgi.FieldStorage(fp=data, environ=request_env, keep_blank_values=1)

        raw_crash = {}
        dumps = {}

        has_json = False
        has_kvpairs = False

        for fs_item in fs.list:
            # If the field has no name, then it's probably junk, so let's drop it.
            if not fs_item.name:
                continue

            if fs_item.name == "dump_checksums":
                # We don't want to pick up the dump_checksums from a raw
                # crash that was re-submitted.
                continue

            elif fs_item.type and fs_item.type.startswith("application/json"):
                # This is a JSON blob, so load it and override raw_crash with
                # it.
                has_json = True
                try:
                    raw_crash = json.loads(fs_item.value)
                except ValueError as exc:
                    # ValueError covers both json.JSONDecodeError and the
                    # UnicodeDecodeError raised for undecodable bytes.
                    raise MalformedCrashReport("bad_json") from exc

                # Valid JSON that isn't an object (list, string, number, ...)
                # can't hold annotations and would break the item assignments
                # below, so treat it as malformed.
                if not isinstance(raw_crash, dict):
                    raise MalformedCrashReport("bad_json")

            elif fs_item.type and (
                fs_item.type.startswith("application/octet-stream")
                or isinstance(fs_item.value, bytes)
            ):
                # This is a dump, so add it to dumps using a sanitized dump
                # name.
                dump_name = sanitize_dump_name(fs_item.name)
                dumps[dump_name] = fs_item.value

            else:
                # This isn't a dump, so it's a key/val pair, so we add that.
                has_kvpairs = True
                raw_crash[fs_item.name] = fs_item.value

        if not raw_crash:
            raise MalformedCrashReport("no_annotations")

        if has_json and has_kvpairs:
            # If the crash payload has both kvpairs and a JSON blob, then it's
            # malformed and we should dump it.
            raise MalformedCrashReport("has_json_and_kv")

        # Add a note about how the annotations were encoded in the crash report.
        # For now, there are two options: json and multipart.
        raw_crash["payload"] = "json" if has_json else "multipart"

        return raw_crash, dumps
Exemple #2
0
def test_sanitize_dump_name(data, expected):
    """Check that sanitizing ``data`` yields ``expected``."""
    sanitized = sanitize_dump_name(data)
    assert sanitized == expected
Exemple #3
0
def test_sanitize_dump_name(data, expected):
    """Check that sanitizing ``data`` yields ``expected``."""
    sanitized = sanitize_dump_name(data)
    assert sanitized == expected
Exemple #4
0
    def extract_payload(self, req):
        """Parses the HTTP POST payload

        Decompresses the payload if necessary and then walks through the
        FieldStorage converting from multipart/form-data to Python datatypes.

        NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
        has a list attribute that is a list of FieldStorage items--one for each
        key/val in the form. For attached files, the FieldStorage will have a
        name, value and filename and the type should be
        application/octet-stream. Thus we parse it looking for things of type
        text/plain and application/octet-stream.

        An empty crash--``({}, {})``--is returned when the request has no
        content type, the content type isn't multipart/form-data with a
        boundary section, the content length is missing or 0, or the payload
        claims to be gzipped but doesn't decompress.

        :arg falcon.request.Request req: a Falcon Request instance

        :returns: (raw_crash dict, dumps dict)

        """
        # If we don't have a content type, return an empty crash
        if not req.content_type:
            return {}, {}

        # If it's the wrong content type or there's no boundary section, return
        # an empty crash
        content_type = [
            part.strip() for part in req.content_type.split(';', 1)
        ]
        if ((len(content_type) != 2 or content_type[0] != 'multipart/form-data'
             or not content_type[1].startswith('boundary='))):
            return {}, {}

        content_length = req.content_length or 0

        # If there's no content, return an empty crash
        if content_length == 0:
            return {}, {}

        # Decompress payload if it's compressed
        if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
            mymetrics.incr('gzipped_crash')

            # If the content is gzipped, we pull it out and decompress it. We
            # have to do that here because nginx doesn't have a good way to do
            # that in nginx-land.
            gzip_header = 16 + zlib.MAX_WBITS
            try:
                data = zlib.decompress(req.stream.read(content_length),
                                       gzip_header)
            except zlib.error:
                # This indicates this isn't a valid compressed stream. Given
                # that the HTTP request insists it is, we're just going to
                # assume it's junk and not try to process any further.
                mymetrics.incr('bad_gzipped_crash')
                return {}, {}

            # Stomp on the content length to correct it because we've changed
            # the payload size by decompressing it. We save the original value
            # in case we need to debug something later on.
            req.env['ORIG_CONTENT_LENGTH'] = content_length
            content_length = len(data)
            req.env['CONTENT_LENGTH'] = str(content_length)

            data = io.BytesIO(data)
            mymetrics.histogram('crash_size',
                                value=content_length,
                                tags=['payload:compressed'])
        else:
            # NOTE(willkg): At this point, req.stream is either a
            # falcon.request_helper.BoundedStream (in tests) or a
            # gunicorn.http.body.Body (in production).
            #
            # FieldStorage doesn't work with BoundedStream so we pluck out the
            # internal stream from that which works fine.
            #
            # FIXME(willkg): why don't tests work with BoundedStream?
            if isinstance(req.stream, BoundedStream):
                data = req.stream.stream
            else:
                data = req.stream

            mymetrics.histogram('crash_size',
                                value=content_length,
                                tags=['payload:uncompressed'])

        # NOTE(willkg): In the original collector, this returned request
        # querystring data as well as request body data, but we're not doing
        # that because the query string just duplicates data in the payload.
        #
        # FieldStorage merges QUERY_STRING key/val pairs into its list for
        # POST requests, so we blank it out on a copy of the environ to make
        # sure it doesn't get pulled in.
        request_env = dict(req.env)
        request_env['QUERY_STRING'] = ''

        fs = cgi.FieldStorage(fp=data, environ=request_env, keep_blank_values=1)

        raw_crash = {}
        dumps = {}

        for fs_item in fs.list:
            # NOTE(willkg): We saw some crashes come in where the raw crash ends up with
            # a None as a key. Make sure we can't end up with non-strings as keys.
            item_name = de_null(fs_item.name or '')

            if item_name == 'dump_checksums':
                # We don't want to pick up the dump_checksums from a raw
                # crash that was re-submitted.
                continue

            elif fs_item.type and (
                    fs_item.type.startswith('application/octet-stream')
                    or isinstance(fs_item.value, bytes)):
                # This is a dump, so add it to dumps using a sanitized dump
                # name.
                dump_name = sanitize_dump_name(item_name)
                dumps[dump_name] = fs_item.value

            else:
                # This isn't a dump, so it's a key/val pair, so we add that.
                raw_crash[item_name] = de_null(fs_item.value)

        return raw_crash, dumps
Exemple #5
0
    def extract_payload(self, req):
        """Parse HTTP POST payload.

        Decompresses the payload if necessary and then walks through the
        FieldStorage converting from multipart/form-data to Python datatypes.

        NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
        has a list attribute that is a list of FieldStorage items--one for each
        key/val in the form. For attached files, the FieldStorage will have a
        name, value and filename and the type should be
        application/octet-stream. Thus we parse it looking for things of type
        text/plain, application/json, and application/octet-stream.

        Malformed payloads don't raise: the ``malformed`` metric is incremented
        with a ``reason:...`` tag and an empty crash--``({}, {})``--is
        returned. Reasons emitted here are ``no_content_type``,
        ``wrong_content_type``, ``no_boundary``, ``no_content_length``,
        ``bad_gzip``, ``bad_json``, and ``has_json_and_kv``.

        :arg falcon.request.Request req: a Falcon Request instance

        :returns: (raw_crash dict, dumps dict)

        """
        # If we don't have a content type, return an empty crash
        if not req.content_type:
            mymetrics.incr('malformed', tags=['reason:no_content_type'])
            return {}, {}

        # If it's the wrong content type or there's no boundary section, return
        # an empty crash
        content_type = [part.strip() for part in req.content_type.split(';', 1)]
        if content_type[0] != 'multipart/form-data':
            mymetrics.incr('malformed', tags=['reason:wrong_content_type'])
            return {}, {}
        if len(content_type) != 2 or not content_type[1].startswith('boundary='):
            mymetrics.incr('malformed', tags=['reason:no_boundary'])
            return {}, {}

        content_length = req.content_length or 0

        # If there's no content, return an empty crash
        if content_length == 0:
            mymetrics.incr('malformed', tags=['reason:no_content_length'])
            return {}, {}

        # Decompress payload if it's compressed
        if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
            mymetrics.incr('gzipped_crash')

            # If the content is gzipped, we pull it out and decompress it. We
            # have to do that here because nginx doesn't have a good way to do
            # that in nginx-land.
            gzip_header = 16 + zlib.MAX_WBITS
            try:
                data = zlib.decompress(req.stream.read(content_length), gzip_header)
            except zlib.error:
                # This indicates this isn't a valid compressed stream. Given
                # that the HTTP request insists it is, we're just going to
                # assume it's junk and not try to process any further.
                mymetrics.incr('malformed', tags=['reason:bad_gzip'])
                return {}, {}

            # Stomp on the content length to correct it because we've changed
            # the payload size by decompressing it. We save the original value
            # in case we need to debug something later on.
            req.env['ORIG_CONTENT_LENGTH'] = content_length
            content_length = len(data)
            req.env['CONTENT_LENGTH'] = str(content_length)

            data = io.BytesIO(data)
            mymetrics.histogram('crash_size', value=content_length, tags=['payload:compressed'])
        else:
            # NOTE(willkg): At this point, req.stream is either a
            # falcon.request_helper.BoundedStream (in tests) or a
            # gunicorn.http.body.Body (in production).
            #
            # FieldStorage doesn't work with BoundedStream so we pluck out the
            # internal stream from that which works fine.
            #
            # FIXME(willkg): why don't tests work with BoundedStream?
            if isinstance(req.stream, BoundedStream):
                data = req.stream.stream
            else:
                data = req.stream

            mymetrics.histogram('crash_size', value=content_length, tags=['payload:uncompressed'])

        # NOTE(willkg): In the original collector, this returned request
        # querystring data as well as request body data, but we're not doing
        # that because the query string just duplicates data in the payload.
        #
        # FieldStorage merges QUERY_STRING key/val pairs into its list for
        # POST requests, so we blank it out on a copy of the environ to make
        # sure it doesn't get pulled in.
        request_env = dict(req.env)
        request_env['QUERY_STRING'] = ''

        fs = cgi.FieldStorage(fp=data, environ=request_env, keep_blank_values=1)

        raw_crash = {}
        dumps = {}

        has_json = False
        has_kvpairs = False

        for fs_item in fs.list:
            # NOTE(willkg): We saw some crashes come in where the raw crash ends up with
            # a None as a key. Make sure we can't end up with non-strings as keys.
            item_name = fs_item.name or ''

            if item_name == 'dump_checksums':
                # We don't want to pick up the dump_checksums from a raw
                # crash that was re-submitted.
                continue

            elif fs_item.type and fs_item.type.startswith('application/json'):
                # This is a JSON blob, so load it and override raw_crash with
                # it.
                has_json = True
                try:
                    raw_crash = json.loads(fs_item.value)
                except ValueError:
                    # ValueError covers both json.JSONDecodeError and the
                    # UnicodeDecodeError raised for undecodable bytes.
                    mymetrics.incr('malformed', tags=['reason:bad_json'])
                    return {}, {}

                # Valid JSON that isn't an object (list, string, number, ...)
                # can't hold annotations and would break the key/val
                # assignment below, so treat it as malformed.
                if not isinstance(raw_crash, dict):
                    mymetrics.incr('malformed', tags=['reason:bad_json'])
                    return {}, {}

            elif fs_item.type and (
                    fs_item.type.startswith('application/octet-stream')
                    or isinstance(fs_item.value, bytes)):
                # This is a dump, so add it to dumps using a sanitized dump
                # name.
                dump_name = sanitize_dump_name(item_name)
                dumps[dump_name] = fs_item.value

            else:
                # This isn't a dump, so it's a key/val pair, so we add that.
                has_kvpairs = True
                raw_crash[item_name] = fs_item.value

        if has_json and has_kvpairs:
            # If the crash payload has both kvpairs and a JSON blob, then it's
            # malformed and we should dump it.
            mymetrics.incr('malformed', tags=['reason:has_json_and_kv'])
            return {}, {}

        return raw_crash, dumps