def test_prefer():
    [exch1] = load_from_file('funny_prefer')
    assert exch1.request.headers.prefer.value == [
        Parametrized(Parametrized(prefer.handling, u'lenient'), [
            Parametrized(u'param1', u"this is a parameter to 'handling'!"),
            Parametrized(u'param2', None),
        ]),
        Unavailable(b'BWS-is-not-parsed = because-see-errata'),
        Parametrized(Parametrized(prefer.wait, 600), []),
        Parametrized(Parametrized(u'my-pref', None), [
            None,
            None,
            Parametrized(u'foo', None),
            None,
            None,
            Parametrized(u'bar', None),
        ]),
        Parametrized(Parametrized(prefer.respond_async, None), []),
        Parametrized(Parametrized(prefer.wait, 0), []),
        Parametrized(
            Parametrized(prefer.return_, Unavailable(b'something-else')), []),
    ]

    assert exch1.request.headers.prefer.wait == 600
    assert exch1.request.headers.prefer.respond_async
    assert isinstance(exch1.request.headers.prefer.return_, Unavailable)
    assert exch1.request.headers.prefer[u'quux'] is None

    assert isinstance(
        exch1.responses[0].headers.preference_applied.respond_async,
        Unavailable)

    check_exchange(exch1)

    buf = io.BytesIO()
    text_report([exch1], buf)
    assert b'Preference-Applied: respond-async=true was not requested' \
        in buf.getvalue()           # not "respond-async=Unavailable"
def test_hsts():
    [exch1] = load_from_file('funny_hsts.https')
    sts = exch1.responses[0].headers.strict_transport_security
    assert sts.value == [
        Parametrized(hsts.max_age, 15768000),
        Parametrized(hsts.includesubdomains, Unavailable(u'xyzzy')),
        Parametrized(hsts.max_age, Unavailable(u'')),
        Parametrized(u'fooBar', None),
    ]
    assert sts.max_age == 15768000
    assert sts.includesubdomains
def unicode_body(self):
    if not okay(self.decoded_body):
        return self.decoded_body
    if not okay(self.guessed_charset):
        return Unavailable(self.decoded_body)
    # pylint: disable=no-member
    return self.decoded_body.decode(self.guessed_charset)
def _process_pair(self, entry, pair):
    name, argument = pair
    if argument is None:
        if self.knowledge.argument_required(name):
            self.message.complain(1156, entry=entry, name=name)
            argument = Unavailable(u'')
    else:
        syntax = self.knowledge.syntax_for(name)
        if self.knowledge.no_argument(name):
            self.message.complain(1157, entry=entry, name=name)
            argument = Unavailable(argument)
        elif syntax is not None:
            argument = parse(argument, syntax,
                             self.message.complain, 1158,
                             place=entry, name=name, value=argument)
    return Parametrized(name, argument)
def _to_date(complain, d, m, y):
    try:
        return date(y, m, d)
    except ValueError:
        date_s = u'%d %s %d' % (d, _MONTH_NAMES[m - 1], y)
        complain(1222, date=date_s)
        return Unavailable(date_s)
def _process_content_length(msg, stream):
    n = msg.headers.content_length.value
    if not okay(n):
        msg.body = Unavailable()
        stream.sane = False
    elif n > MAX_BODY_SIZE:
        msg.body = Unavailable()
        stream.sane = False
        msg.complain(1298, place=msg.headers.content_length,
                     size=n, max_size=MAX_BODY_SIZE)
    else:
        try:
            msg.body = stream.read(n)
        except ParseError as exc:
            msg.body = Unavailable()
            msg.complain(1004, error=exc)
def xml_data(self):
    if self.headers.content_type.is_okay and \
            known.media_type.is_xml(self.headers.content_type.item) and \
            okay(self.decoded_body) and self.content_is_full:
        try:
            # It's not inconceivable that a message might contain
            # maliciously constructed XML data, so we use `defusedxml`.
            return defusedxml.ElementTree.fromstring(self.decoded_body)
        except defusedxml.EntitiesForbidden:
            self.complain(1275)
            return Unavailable(self.decoded_body)
        # http://bugs.python.org/issue29896
        except (xml.etree.ElementTree.ParseError, UnicodeError) as e:
            self.complain(1039, error=e)
            return Unavailable(self.decoded_body)
    else:
        return None
def _to_time(complain, h, m, s):
    try:
        # This doesn't parse the leap second 23:59:60
        # that is explicitly specified in the RFC.
        # We can ignore this for now.
        return time(h, m, s)
    except ValueError:
        time_s = u'%02d:%02d:%02d' % (h, m, s)
        complain(1223, time=time_s)
        return Unavailable(time_s)
def test_cache_control():
    [exch1] = load_from_file('funny_cache_control')
    headers = exch1.request.headers

    assert headers.cache_control.value == [
        Parametrized(cache.max_age, 3600),
        Parametrized(cache.max_stale, 60),
        Unavailable(b'"foo bar"'),
        Parametrized(u'qux', u'xyzzy 123'),
        Parametrized(cache.no_transform, None),
        Parametrized(u'abcde', None),
        Parametrized(cache.min_fresh, Unavailable(u'')),
        Parametrized(cache.no_store, Unavailable(u'yes')),
    ]
    assert headers.pragma.value == [u'no-cache',
                                    (u'foo', None),
                                    (u'bar', u'baz'),
                                    (u'qux', u'xyzzy'),
                                    Unavailable(b'no-cache=krekfewhrfk')]

    assert cache.max_age in headers.cache_control
    assert headers.cache_control.max_age == 3600

    assert cache.max_stale in headers.cache_control
    assert headers.cache_control.max_stale == 60

    assert headers.cache_control[u'qux'] == u'xyzzy 123'

    assert cache.no_transform in headers.cache_control
    assert headers.cache_control.no_transform is True

    assert headers.cache_control[u'abcde'] is True

    assert headers.cache_control.no_cache is None

    assert cache.min_fresh in headers.cache_control
    assert headers.cache_control.min_fresh == Unavailable(u'')

    assert cache.no_store in headers.cache_control
    assert headers.cache_control.no_store

    assert cache.only_if_cached not in headers.cache_control
def test_unparseable_body():
    [exch1, exch2] = load_from_file('1000_01')
    assert exch1.request.method == m.GET
    assert exch1.request.body == b''
    assert len(exch1.responses) == 1
    assert exch1.responses[0].status == st.ok
    assert not exch1.responses[0].headers.content_length.is_okay
    assert exch1.responses[0].headers.content_length.value == \
        Unavailable(b'<ERROR>')

    # Headers that failed to parse never compare equal or unequal to anything.
    assert not exch1.responses[0].headers.content_length == \
        Unavailable(b'<ERROR>')
    assert not exch1.responses[0].headers.content_length != \
        Unavailable(b'<ERROR>')

    assert isinstance(exch1.responses[0].body, Unavailable)

    # The other one is just a box for no. 1010.
    assert exch2.request is None
    assert exch2.responses == []
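# A minimal sketch of the never-equal comparison semantics that the test
# above relies on for headers that failed to parse. The class name here is
# hypothetical; the real header view is more elaborate, but the technique
# is the same: both ``__eq__`` and ``__ne__`` answer no.
class _NeverEqualSketch(object):
    """Compares neither equal nor unequal to anything."""

    def __eq__(self, other):
        return False

    def __ne__(self, other):
        return False

assert not (_NeverEqualSketch() == _NeverEqualSketch())
assert not (_NeverEqualSketch() != _NeverEqualSketch())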
def url_encoded_data(self):
    if self.headers.content_type == \
            media.application_x_www_form_urlencoded and \
            okay(self.decoded_body) and self.content_is_full:
        for char in iterbytes(self.decoded_body):
            if not URL_ENCODED_GOOD_CHARS[ord(char)]:
                self.complain(1040, char=format_chars([char]))
                return Unavailable(self.decoded_body)
        # pylint: disable=no-member
        return parse_qs(self.decoded_body.decode('ascii'))
    return None
def _parse_chunked(msg, stream):
    data = []
    place = u'chunked framing'
    try:
        while _parse_chunk(stream, data):
            pass
        trailer = parse_header_fields(stream)
        with stream.parsing(chunked_body):
            stream.readlineend()
    except ParseError as e:
        msg.complain(1005, error=e)
        msg.body = Unavailable()
    except BodyTooLongError as e:
        msg.complain(1298, place=place, size=e.size, max_size=e.max_size)
        msg.body = Unavailable()
    else:
        stream.dump_complaints(msg.complain, place=place)
        msg.body = b''.join(data)
        msg.trailer_entries = trailer
        if trailer:
            msg.rebuild_headers()       # Rebuild the `HeadersView` cache.
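# For reference, a self-contained sketch of the chunked framing parsed
# above, assuming well-formed input and ignoring the trailer (the
# stream-based parser above is the authoritative one; this name is
# illustrative only):
import io

def _decode_chunked_sketch(raw):
    buf = io.BytesIO(raw)
    data = []
    while True:
        size_line = buf.readline()
        size = int(size_line.split(b';')[0], 16)    # drop chunk extensions
        if size == 0:
            break                                   # the last-chunk
        data.append(buf.read(size))
        buf.read(2)                         # CRLF that ends the chunk data
    return b''.join(data)

assert _decode_chunked_sketch(b'4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n') == \
    b'Wikipedia'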
def _decode_transfer_coding(msg, coding):
    if coding == tc.chunked:
        # The outermost chunked has already been peeled off at this point.
        msg.complain(1002)
        msg.body = Unavailable(msg.body)
    elif coding == tc.gzip or coding == tc.x_gzip:
        try:
            msg.body = decode_gzip(msg.body)
        except Exception as e:
            msg.complain(1027, coding=coding, error=e)
            msg.body = Unavailable(msg.body)
    elif coding == tc.deflate:
        try:
            msg.body = decode_deflate(msg.body)
        except Exception as e:
            msg.complain(1027, coding=coding, error=e)
            msg.body = Unavailable(msg.body)
    else:
        if okay(coding):
            msg.complain(1003, coding=coding)
        msg.body = Unavailable(msg.body)
def url_encoded_data(self):
    if self.headers.content_type == \
            media.application_x_www_form_urlencoded and \
            okay(self.decoded_body) and self.content_is_full:
        for byte in six.iterbytes(self.decoded_body):
            if not URL_ENCODED_GOOD_BYTES[byte]:
                char = six.int2byte(byte)
                self.complain(1040, char=format_chars([char]))
                return Unavailable(self.decoded_body)
        # pylint: disable=no-member
        return parse_qs(self.decoded_body.decode('ascii'))
    else:
        return None
def decoded_body(self):
    """The payload body with Content-Encoding removed."""
    r = self.body
    codings = self.headers.content_encoding.value[:]
    while codings and okay(r) and r:
        coding = codings.pop()
        decoder = {cc.gzip: decode_gzip,
                   cc.x_gzip: decode_gzip,
                   cc.deflate: decode_deflate,
                   cc.br: decode_brotli}.get(coding)
        if decoder is not None:
            try:
                r = decoder(r)
            except Exception as e:
                self.complain(1037, coding=coding, error=e)
                r = Unavailable(r)
        elif okay(coding):
            self.complain(1036, coding=coding)
            r = Unavailable(r)
        else:
            r = Unavailable(r)
    return r
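# The loop above peels content codings right-to-left, mirroring the order
# in which they were applied. A standalone sketch of the same idea using
# only the standard library (``decode_gzip``/``decode_deflate`` above are
# this codebase's own wrappers; the names here are illustrative):
import gzip
import zlib

def _peel_codings_sketch(body, codings):
    for coding in reversed(codings):    # last-applied coding comes off first
        if coding == 'gzip':
            body = gzip.decompress(body)
        elif coding == 'deflate':
            # In the wild, "deflate" is sometimes a raw DEFLATE stream rather
            # than zlib-wrapped; a lenient decoder would retry with
            # ``zlib.decompress(body, -zlib.MAX_WBITS)``.
            body = zlib.decompress(body)
        else:
            raise ValueError('unknown content coding: %r' % coding)
    return body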
def _process_response(data, req, creator, path):
    if data['status'] == 0:             # Indicates error in Chrome.
        return None
    version, header_entries = _process_message(data, creator)
    status = StatusCode(data['status'])
    reason = data['statusText']

    if creator in FIDDLER and req.method == m.CONNECT and status.successful:
        # Fiddler's HAR export adds extra debug headers to CONNECT responses
        # after the tunnel is closed.
        header_entries = [(name, value)
                          for (name, value) in header_entries
                          if name not in [u'EndTime', u'ClientToServerBytes',
                                          u'ServerToClientBytes']]

    # The logic for body is mostly like that for requests (see above).
    if data['bodySize'] == 0 or data['content']['size'] == 0 or \
            status == st.not_modified:  # Firefox also includes body on 304.
        body = b''
    elif creator in FIREFOX:
        # Firefox seems to export bogus bodySize:
        # see test/har_data/firefox_gif.har
        body = None
    # Browsers may set ``bodySize = -1`` even when ``content.size >= 0``.
    elif data['bodySize'] > 0 or data['content']['size'] > 0:
        body = Unavailable()
    else:
        body = None

    resp = Response(version, status, reason, header_entries, body=body,
                    remark=u'from %s' % path)

    if data['content'].get('text') and status != st.not_modified:
        if data['content'].get('encoding', u'').lower() == u'base64':
            try:
                decoded_body = base64.b64decode(data['content']['text'])
            except ValueError:
                pass
            else:
                if creator in FIDDLER and req.method == m.CONNECT and \
                        status.successful and b'Fiddler' in decoded_body:
                    # Fiddler's HAR export adds a body with debug information
                    # to CONNECT responses.
                    resp.body = b''
                else:
                    resp.decoded_body = decoded_body
        elif 'encoding' not in data['content']:
            resp.unicode_body = data['content']['text']

    return resp
def _process_request(data, creator, path):
    version, header_entries = _process_message(data, creator)
    method = data['method']
    parsed = urlparse(data['url'])
    scheme = parsed.scheme

    if method == m.CONNECT:
        target = parsed.netloc
    elif any(name == h.host for (name, _) in header_entries):
        # With HAR, we can't tell if the request was to a proxy or to a server.
        # So we force most requests into the "origin form" of the target.
        target = parsed.path
        if parsed.query:
            target += u'?' + parsed.query
    else:
        # However, if the request has no ``Host`` header,
        # the user won't be able to see the target host
        # unless we set the full URL ("absolute form") as the target.
        # To prevent this from having an effect on the proxy logic,
        # we explicitly set `Request.is_to_proxy` to `None` later.
        target = data['url']

    if data['bodySize'] == 0:
        # No body, or a body of length 0 (which we do not distinguish).
        body = b''
    elif data['bodySize'] > 0:
        # A message body was present, but we cannot recover it,
        # because message body is the body *with* ``Content-Encoding``,
        # and HAR does not include that.
        body = Unavailable()
    else:
        # Unknown. Maybe there was a body, maybe there wasn't.
        body = None

    text = None
    post = data.get('postData')
    if post and post.get('text'):
        text = post['text']

        if creator in FIDDLER and method == m.CONNECT and u'Fiddler' in text:
            # Fiddler's HAR export adds a body with debug information
            # to CONNECT requests.
            text = None
            body = b''

    req = Request(scheme, method, target, version, header_entries, body,
                  remark=u'from %s' % path)
    if text is not None:
        req.unicode_body = text

    req.is_to_proxy = None              # See above.
    return req
def json_data(self):
    if self.headers.content_type.is_okay and \
            known.media_type.is_json(self.headers.content_type.item) and \
            okay(self.unicode_body) and self.content_is_full:
        try:
            r = json.loads(self.unicode_body)
        except ValueError as e:
            self.complain(1038, error=e)
            r = Unavailable(self.unicode_body)
        else:
            if self.guessed_charset not in ['ascii', 'utf-8', None]:
                self.complain(1281)
        return r
    return None
def test_warning():
    [exch1] = load_from_file('funny_warning')
    warning = exch1.responses[0].headers.warning
    assert warning.value == [
        WarningValue(123, u'-', u'something', None),
        WarningValue(234, u'[::0]:8080', u'something else',
                     datetime(2016, 1, 28, 8, 22, 4)),
        Unavailable(b'345 - forgot to quote this one'),
        WarningValue(456, u'baz', u'qux', None),
        WarningValue(567, u'-', u'xyzzy', None),
    ]
    assert repr(warning.value[0].code) == 'WarnCode(123)'
    assert 123 in warning
    assert 567 in warning
    assert 199 not in warning
def test_www_authenticate():
    [exch1] = load_from_file('funny_www_authenticate')
    assert exch1.responses[0].headers.www_authenticate.value == [
        Parametrized(u'Basic', MultiDict([(u'realm', u'my "magical" realm')])),
        Parametrized(u'Foo', MultiDict()),
        Parametrized(u'Bar', u'jgfCGSU8u=='),
        Parametrized(u'Baz', MultiDict()),
        Unavailable(b'Wrong=bad, Better'),
        Parametrized(u'Scheme1',
                     MultiDict([(u'foo', u'bar'), (u'baz', u'qux')])),
        Parametrized(u'Scheme2', MultiDict()),
        Parametrized(
            u'Newauth',
            MultiDict([(u'realm', u'apps'), (u'type', u'1'),
                       (u'title', u'Login to "apps"')])),
        Parametrized(auth.basic, MultiDict([(u'realm', u'simple')])),
    ]
def _parse_request_body(req, stream):
    # RFC 7230 section 3.3.3.
    if req.headers.transfer_encoding:
        codings = req.headers.transfer_encoding.value[:]
        if codings.pop() == tc.chunked:
            _parse_chunked(req, stream)
        else:
            req.body = Unavailable()
            req.complain(1001)
            stream.sane = False
        while codings and okay(req.body):
            _decode_transfer_coding(req, codings.pop())
    elif req.headers.content_length:
        _process_content_length(req, stream)
    else:
        req.body = b''
def multipart_data(self):
    ctype = self.headers.content_type
    if ctype.is_okay and \
            known.media_type.is_multipart(ctype.value.item) and \
            okay(self.decoded_body) and self.content_is_full:
        # All multipart media types obey the same general syntax
        # specified in RFC 2046 Section 5.1,
        # and should be parseable as email message payloads.
        multipart_code = (b'Content-Type: ' + ctype.entries[0].value +
                          b'\r\n\r\n' + self.decoded_body)
        parsed = parse_email_message(multipart_code)
        for d in parsed.defects:
            if isinstance(d, email.errors.NoBoundaryInMultipartDefect):
                self.complain(1139)
            elif isinstance(d, email.errors.StartBoundaryNotFoundDefect):
                self.complain(1140)
        if not parsed.is_multipart():
            return Unavailable(self.decoded_body)
        return parsed
    return None
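# The "prepend a Content-Type field and parse as an email payload" trick,
# shown self-contained with the standard library's ``email`` package
# (``parse_email_message`` above is this codebase's own helper; the demo
# data here is made up):
import email.parser

_demo = (b'Content-Type: multipart/mixed; boundary=sep\r\n\r\n'
         b'--sep\r\nContent-Type: text/plain\r\n\r\nhello\r\n--sep--\r\n')
assert email.parser.BytesParser().parsebytes(_demo).is_multipart()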
def _parse(self):
    entries, values = self._pre_parse()
    # Some headers, such as ``Vary``, permit both a comma-separated list
    # (which can be spread over multiple entries) as well as
    # a singular value (which cannot be combined with any other).
    singular = [v for v in values if not isinstance(v, (list, Unavailable))]
    if singular:
        if len(singular) == len(entries) == 1:
            self._value = singular[0]
        else:
            self._value = [Unavailable()]
            self.message.complain(1013, header=self, entries=entries)
    else:
        self._value = []
        for v in values:
            if isinstance(v, Unavailable):
                self._value.append(v)
            else:
                self._value.extend(v)
    self._entries = entries
def parse(data, symbol, complain=None, fail_notice_id=None,
          annotate_classes=None, **extra_context):
    """(Try to) parse a string as a grammar symbol.

    Uses memoization internally,
    so parsing the same strings many times isn't expensive.

    :param data: The bytestring or Unicode string to parse.
        Unicode will be encoded to ISO-8859-1 first;
        encoding failure is treated as a parse failure.

    :param symbol: The :class:`Symbol` to parse as.

    :param complain: If not `None`, this function will be called
        with any complaints produced while parsing
        (only if parsing was successful), like `Blackboard.complain`.

    :param fail_notice_id: If not `None`, failure to parse will be reported
        as this notice ID instead of raising `ParseError`. The complaint
        will have an ``error`` key with the `ParseError` as value.

    :param annotate_classes: If not `None`,
        these classes will be annotated in the input `data`.

    Any `extra_context` will be passed to `complain` with every complaint.

    :return: If `annotate_classes` is `None`, then the result of parsing
        (`Unavailable` if parse failed). Otherwise, a pair: the same result
        + the annotated input string as a list of bytestrings
        and instances of `annotate_classes`.

    :raises: If `fail_notice_id` is `None`, raises :exc:`ParseError`
        on parse failure, or :exc:`UnicodeError`
        if `data` cannot be encoded to bytes.
    """
    annotate_classes = tuple(annotate_classes or ())    # for `isinstance`

    if not isinstance(data, bytes):
        try:
            data = data.encode('iso-8859-1')
        except UnicodeError as e:
            if fail_notice_id is None:          # pragma: no cover
                raise
            complain(fail_notice_id, error=e, **extra_context)
            r = Unavailable(data)
            return (r, None) if annotate_classes else r

    # Check if we have already memoized this.
    key = (data, symbol, annotate_classes)
    parse_result = _memo.pop(key, None)
    if parse_result is not None:
        _memo[key] = parse_result       # Reinsertion maintains LRU order.
    else:
        try:
            parse_result = _inner_parse(data, symbol.as_nonterminal(),
                                        annotate_classes)
        except ParseError as e:
            if fail_notice_id is None:
                raise
            complaint = (fail_notice_id, {'error': e})
            parse_result = (Unavailable(data), [complaint], [])
        else:
            _memo[key] = parse_result
            while len(_memo) > MEMO_LIMIT:
                _memo.popitem(last=False)

    (r, complaints, annotations) = parse_result
    if complain is not None:
        for (notice_id, context) in complaints:
            context = dict(extra_context, **context)
            complain(notice_id, **context)

    if not annotate_classes:
        return r
    return (r, _splice_annotations(data, annotations))
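# The memo above is an LRU cache built on an ``OrderedDict``: hits are
# popped and reinserted to refresh their position, and the oldest entries
# are evicted from the front. A self-contained sketch of the technique
# (the names and limit here are illustrative; note that `parse` itself
# deliberately skips memoizing failed parses):
from collections import OrderedDict

_MEMO_LIMIT_SKETCH = 1000
_memo_sketch = OrderedDict()

def _memoized_sketch(key, compute):
    result = _memo_sketch.pop(key, None)    # pop so reinsertion reorders
    if result is None:
        result = compute()
    _memo_sketch[key] = result              # newest entries live at the end
    while len(_memo_sketch) > _MEMO_LIMIT_SKETCH:
        _memo_sketch.popitem(last=False)    # evict the least recently used
    return result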
def _process_request(data, creator, path):
    (version, header_entries, pseudo_headers) = \
        _process_message(data, creator)
    if creator.is_chrome and version == http11 and u':host' in pseudo_headers:
        # SPDY exported from Chrome.
        version = None

    # Firefox exports "Connection: keep-alive" on HTTP/2 requests
    # (which triggers notice 1244)
    # even though it does not actually send it
    # (this can be verified with SSLKEYLOGFILE + Wireshark).
    if creator.is_firefox and version == http2:
        header_entries = [(name, value)
                          for (name, value) in header_entries
                          if (name, value) != (h.connection, u'keep-alive')]

    method = data['method']
    header_names = {name for (name, _) in header_entries}

    parsed = urlparse(data['url'])
    scheme = parsed.scheme

    if creator.is_insomnia:
        # https://github.com/getinsomnia/insomnia/issues/840
        if h.host not in header_names:
            header_entries.insert(0, (h.host, parsed.netloc))
        if h.user_agent not in header_names:
            # The actual version can probably be extracted from
            ua_string = u'insomnia/%s' % creator.reconstruct_insomnia_version()
            header_entries.append((h.user_agent, ua_string))
        if h.accept not in header_names:
            header_entries.append((h.accept, u'*/*'))
        header_names = {name for (name, _) in header_entries}

    if method == m.CONNECT:
        target = parsed.netloc
    elif h.host in header_names:
        # With HAR, we can't tell if the request was to a proxy or to a server.
        # So we force most requests into the "origin form" of the target.
        target = parsed.path
        if parsed.query:
            target += u'?' + parsed.query
    else:
        # However, if the request has no ``Host`` header,
        # the user won't be able to see the target host
        # unless we set the full URL ("absolute form") as the target.
        # To prevent this from having an effect on the proxy logic,
        # we explicitly set `Request.is_to_proxy` to `None` later.
        target = data['url']

    if data['bodySize'] == 0:
        # No body, or a body of length 0 (which we do not distinguish).
        body = b''
    elif data['bodySize'] > 0:
        # A message body was present, but we cannot recover it,
        # because message body is the body *with* ``Content-Encoding``,
        # and HAR does not include that.
        body = Unavailable()
    else:
        # Unknown. Maybe there was a body, maybe there wasn't.
        body = None

    text = None
    post = data.get('postData')
    if post and post.get('text'):
        text = post['text']

        if creator.is_firefox and \
                post['mimeType'] == media.application_x_www_form_urlencoded \
                and u'\r\n' in text:
            # Yes, Firefox actually outputs this stuff. Go figure.
            (wtf, actual_text) = text.rsplit(u'\r\n', 1)
            try:
                buf = io.BufferedReader(io.BytesIO(wtf.encode('iso-8859-1')))
                more_entries = framing1.parse_header_fields(Stream(buf))
            except (UnicodeError, ParseError):      # pragma: no cover
                pass
            else:
                header_entries.extend(more_entries)
                text = actual_text

        if creator.is_fiddler and method == m.CONNECT and u'Fiddler' in text:
            # Fiddler's HAR export adds a body with debug information
            # to CONNECT requests.
            text = None
            body = b''

    req = Request(scheme, method, target, version, header_entries, body,
                  remark=u'from %s' % path)
    if text is not None:
        req.unicode_body = text

    req.is_to_proxy = None              # See above.
    return req
def _process_response(data, req, creator, path):
    if data['status'] == 0:             # Indicates error in Chrome.
        return None
    (version, header_entries, _) = _process_message(data, creator)
    status = StatusCode(data['status'])
    reason = data['statusText']

    if creator.is_firefox:
        # Firefox joins all ``Set-Cookie`` response fields with newlines.
        # (It also joins other fields with commas,
        # but that is permitted by RFC 7230 Section 3.2.2.)
        header_entries = [
            (name, value)
            for (name, joined_value) in header_entries
            for value in (joined_value.split(u'\n')
                          if name == h.set_cookie else [joined_value])
        ]

    if creator.is_fiddler and req.method == m.CONNECT and status.successful:
        # Fiddler's HAR export adds extra debug headers to CONNECT responses
        # after the tunnel is closed.
        header_entries = [
            (name, value) for (name, value) in header_entries
            if name not in [u'EndTime', u'ClientToServerBytes',
                            u'ServerToClientBytes']
        ]

    # The logic for body is similar to that for requests (see above),
    # except that
    # (1) Firefox also includes a body with 304 responses;
    # (2) browsers may set ``bodySize = -1`` even when ``content.size >= 0``.
    if data['bodySize'] == 0 or data['content']['size'] == 0 or \
            status == st.not_modified:
        body = b''
    elif data['bodySize'] > 0 or data['content']['size'] > 0:
        body = Unavailable()
    else:
        body = None

    if version == http11 and creator.is_firefox and \
            any(name == u'x-firefox-spdy' for (name, _) in header_entries):
        # Helps with SPDY in Firefox.
        version = None
    if creator.is_chrome and version != req.version:
        # Helps with SPDY in Chrome.
        version = None

    resp = Response(version, status, reason, header_entries, body=body,
                    remark=u'from %s' % path)

    if data['content'].get('text') and status != st.not_modified:
        if data['content'].get('encoding', u'').lower() == u'base64':
            try:
                decoded_body = base64.b64decode(data['content']['text'])
            except ValueError:
                # Firefox sometimes marks normal, unencoded text as "base64"
                # (see ``test/har_data/firefox_gif.har``).
                # But let's not try to guess.
                pass
            else:
                if creator.is_fiddler and req.method == m.CONNECT and \
                        status.successful and b'Fiddler' in decoded_body:
                    # Fiddler's HAR export adds a body with debug information
                    # to CONNECT responses.
                    resp.body = b''
                else:
                    resp.decoded_body = decoded_body
        elif 'encoding' not in data['content']:
            resp.unicode_body = data['content']['text']

    return resp
def _to_datetime(dow, d, t):
    if not okay(d) or not okay(t):
        return (dow, Unavailable(u'%s %s' % (d, t)))
    else:
        return (dow, datetime(d.year, d.month, d.day,
                              t.hour, t.minute, t.second))
def test_bad_content_encoding():
    [exch1] = load_from_file('bad_content_encoding')
    assert exch1.responses[0].decoded_body == Unavailable(b'Hello world!\r\n')