Beispiel #1
0
 def test_implicit_utf8(self):
     """Check that UTF-8 is attempted although no candidate names it.

     The only candidate offered is ASCII, which cannot represent the
     emoji in the input. The call must log at WARNING level or above
     and report UTF-8 as the encoding that was used.
     """
     raw = b'smile \xf0\x9f\x98\x83'
     candidates = (('ascii', 'bad header'), )
     with self.assertLogs(logger, logging.WARNING):
         decoded, used = decode_and_report(raw, candidates, logger)
     self.assertEqual(decoded, 'smile \U0001f603')
     self.assertEqual(used, 'utf-8')
Beispiel #2
0
    def test_nonstandard(self):
        """Check that a non-standard encoding name is handled.

        The candidate is spelled 'ascii'; the call must log at INFO level
        or above and report 'us-ascii' as the encoding that was used.
        """
        # A one-shot iterator, so the candidates can only be consumed once.
        candidates = iter((('ascii', 'header'), ))
        with self.assertLogs(logger, logging.INFO):
            decoded, used = decode_and_report(b'Hello', candidates, logger)
        self.assertEqual(decoded, 'Hello')
        self.assertEqual(used, 'us-ascii')
Beispiel #3
0
    def test_trivial(self):
        """Check that a plain ASCII input decodes without any logging.

        The candidate already uses the name 'us-ascii', so the decoded
        text and the reported encoding are expected to match the input
        as-is.
        """
        # A one-shot iterator, so the candidates can only be consumed once.
        candidates = iter((('us-ascii', 'header'), ))
        # presumably no_log() fails the test if anything is logged -- confirm
        with no_log(logger):
            decoded, used = decode_and_report(b'Hello', candidates, logger)
        self.assertEqual(decoded, 'Hello')
        self.assertEqual(used, 'us-ascii')
Beispiel #4
0
 def test_invalid(self):
     """Check that ValueError is raised when no decoding is possible.

     The input ends in a truncated multi-byte UTF-8 sequence, so it is
     valid in neither of the named candidate encodings.
     """
     truncated = b'cut-off smile \xf0\x9f\x98'
     candidates = (
         ('us-ascii', 'HTTP header'),
         (None, 'Unicode BOM'),
         ('utf-8', 'XML declaration'),
     )
     with self.assertRaises(ValueError):
         decode_and_report(truncated, candidates, logger)
Beispiel #5
0
 def test_none(self):
     """Check that candidates without an encoding name are skipped.

     Two of the three candidates carry None instead of a name; the
     remaining UTF-8 candidate must be used, without any logging.
     """
     raw = b'smile \xf0\x9f\x98\x83'
     candidates = (
         (None, 'HTTP header'),
         ('utf-8', 'XML declaration'),
         (None, 'Unicode BOM'),
     )
     with no_log(logger):
         decoded, used = decode_and_report(raw, candidates, logger)
     self.assertEqual(decoded, 'smile \U0001f603')
     self.assertEqual(used, 'utf-8')
Beispiel #6
0
def load_text(
    url: str,
    accept_header: str = 'text/plain'
) -> Tuple[Report, Optional[addinfourl], Optional[List[str]]]:
    """Fetch a text document and split it into lines.

    Responses with status 301, 302, 303 or 307 are followed, at most
    10 times in a row; on the next redirect a warning is logged to
    the report and loading is abandoned.

    @param url:
        The URL of the document to fetch.
    @param accept_header:
        Value to send in the HTTP C{Accept} header of the request.
    @return: C{(report, response, contents)}

        C{report} is the L{Report} instance from the last page load;
        it may already have messages logged to it.

        C{response} is the response object from the last page load,
        or C{None} if no response was received.

        C{contents} is the decoded document as a list of lines,
        or C{None} if loading or decoding failed.
    """
    redirects_seen = 0
    while True:
        report, response, content_bytes = load_page(
            url, accept_header=accept_header)
        if response is None:
            return report, response, None

        status = response.code
        if status in (200, None):
            # We have a document; go decode it.
            break
        if status not in (301, 302, 303, 307):
            # Neither success nor a redirect that we follow.
            return report, response, None

        redirects_seen += 1
        if redirects_seen > 10:
            report.warning('Redirect limit exceeded')
            return report, response, None

        # Note: The new URL could be outside our crawl root,
        #       but since this function is not used for the
        #       actual crawling, that is fine.
        url = response.url

    assert content_bytes is not None
    # Candidate encodings in order of precedence: BOM first, then HTTP.
    candidates = (
        (encoding_from_bom(content_bytes), 'Byte Order Mark'),
        (response.headers.get_content_charset(), 'HTTP header'),
    )
    try:
        content, _used_encoding = decode_and_report(
            content_bytes, candidates, report)
    except ValueError as ex:
        report.error('Failed to decode text document: %s', ex)
        return report, response, None
    return report, response, _RE_EOLN.split(content)
Beispiel #7
0
    def _check_response(self, req_url: str, report: Report,
                        response: addinfourl,
                        content_bytes: bytes) -> Iterator[Referrer]:
        """Check the server's response to a request.

        This is a generator: the checks only run while the returned
        iterator is being consumed.

        Processing steps:

          1. Responses with a status other than 200 or C{None} are not
             content-checked: an info message is logged, C{report.checked}
             is set to C{Checked.HTTP_STATUS_SKIP} and nothing is yielded.
          2. A response without a C{Content-Type} header is reported as
             an error and nothing is yielded.
          3. The content type decides whether the document is treated as
             HTML and/or XML; a leading XML declaration can also mark it
             as XML.
          4. Text documents are decoded, trying the encodings suggested by
             the Byte Order Mark, the XML declaration and the HTTP header,
             in that order.
          5. HTML and XML documents are parsed and, if possible, repaired;
             the links found in the tree are yielded.
          6. The plugins are notified through
             C{self.plugins.resource_loaded()}.

        Steps 1 and 2 return before the plugins are notified.

        @param req_url:
            The URL that was requested. URLs starting with C{file:} get
            their content type and charset corrected silently.
        @param report:
            The L{Report} to which all findings are logged.
        @param response:
            The response to check; its C{code} and C{headers} are read.
        @param content_bytes:
            The undecoded body of the response.
        @return:
            Yields the referrers produced by C{find_referrers_in_xml()}
            and, for HTML documents, C{find_referrers_in_html()}.
        """

        # presumably a status of None belongs to non-HTTP responses such
        # as local files -- confirm against the page loader
        if response.code not in (200, None):
            # TODO: This should probably be user-selectable.
            #       A lot of web servers produce error and redirection
            #       notices that are not HTML5 compliant. Checking the
            #       content is likely only useful if the application
            #       under test is producing the content instead.
            report.info('Skipping content check because of HTTP status %d',
                        response.code)
            report.checked = Checked.HTTP_STATUS_SKIP
            return

        headers = response.headers
        content_type_header = headers['Content-Type']
        if content_type_header is None:
            # The problem is logged to both the module log and the report.
            message = 'Missing Content-Type header'
            _LOG.error(message)
            report.error(message)
            return
        else:
            # Convert Header to plain string.
            content_type_header = str(content_type_header)

        content_type = headers.get_content_type()
        is_html = content_type in ('text/html', 'application/xhtml+xml')
        is_xml = content_type.endswith('/xml') or content_type.endswith('+xml')
        http_encoding = headers.get_content_charset()

        # Speculatively decode the first 1024 bytes, so we can look inside
        # the document for encoding clues.
        # Undecodable bytes are replaced rather than raising, and ASCII is
        # assumed when there is no Byte Order Mark.
        # NOTE(review): if the codec named by encoding_from_bom() keeps the
        #               BOM in the decoded text, the '<?xml' prefix check
        #               below cannot match -- confirm.
        bom_encoding = encoding_from_bom(content_bytes)
        content_head = content_bytes[:1024].decode(bom_encoding or 'ascii',
                                                   'replace')

        # An XML declaration overrides a content type that does not
        # announce XML.
        if not is_xml and content_head.startswith('<?xml'):
            is_xml = True
            if req_url.startswith('file:'):
                # Silently correct content-type detection for local files.
                # This is not something the user can easily fix, so issuing
                # a warning would not be helpful.
                if content_type == 'text/html':
                    content_type = 'application/xhtml+xml'
            else:
                report.warning(
                    'Document is served with content type "%s" '
                    'but starts with an XML declaration', content_type)

        # We received XML-serialized HTML although self.accept is
        # Accept.HTML.
        if is_html and is_xml and self.accept is Accept.HTML:
            report.warning(
                'HTML document is serialized as XML, while the HTTP Accept '
                'header did not include "application/xhtml+xml"')

        if is_xml or content_type.startswith('text/'):
            # This looks like a text document, now figure out the encoding.

            # Look for encoding in XML declaration (if any).
            decl_encoding = encoding_from_xml_decl(content_head)

            # TODO: Also look at HTML <meta> tags.

            # Try possible encodings in order of precedence.
            # W3C recommends giving the BOM, if present, precedence over HTTP.
            #   http://www.w3.org/International/questions/qa-byte-order-mark
            try:
                content, used_encoding = decode_and_report(
                    content_bytes, ((bom_encoding, 'Byte Order Mark'),
                                    (decl_encoding, 'XML declaration'),
                                    (http_encoding, 'HTTP header')), report)
            except ValueError as ex:
                # All likely encodings failed.
                # The plugins are still notified below, with the original
                # bytes.
                report.error('Failed to decode contents: %s', ex)
            else:
                if req_url.startswith('file:'):
                    # Construct a new header that is likely more accurate.
                    content_type_header = \
                            f'{content_type}; charset={used_encoding}'

                if is_html or is_xml:
                    tree = parse_document(content, is_xml, report)
                    if tree is not None:
                        if repair_tree(tree, content_type, report):
                            # Offer the repaired tree to plugins, so they
                            # are more likely to be able to do their work.
                            repaired = etree.tostring(tree, encoding='utf-8')
                            assert isinstance(repaired, bytes)
                            content_bytes = repaired

                        # Find links to other documents.
                        yield from self.find_referrers_in_xml(
                            tree, req_url, report)
                        if is_html:
                            yield from self.find_referrers_in_html(
                                tree, req_url)

        # Hand the document to the plugins. At this point content_bytes may
        # hold the re-serialized repaired tree and content_type_header may
        # have been rebuilt for a local file.
        self.plugins.resource_loaded(content_bytes, content_type_header,
                                     report)