Ejemplo n.º 1
0
    def capture(self, url):
        """
        Capture all stylesheets at given URL's HTML document.

        url
            to capture CSS from

        Resets ``self._nonparsed`` and ``self.stylesheetlist``, stores the
        basename of the URL path in ``self._filename`` and the encoding
        used to decode the document in ``self.docencoding``.

        If ``self._doRequest`` yields no response the process is terminated
        via ``sys.exit(1)``.
        NOTE(review): this docstring used to state that any HTTPError is
        raised to the caller; whether that holds depends on ``_doRequest``
        -- confirm.

        Returns ``cssutils.stylesheets.StyleSheetList``.
        """
        self._log.info('\nCapturing CSS from URL:\n    %s\n', url)
        self._nonparsed = {}
        self.stylesheetlist = cssutils.stylesheets.StyleSheetList()

        # used to save inline styles; only the path component is needed
        self._filename = os.path.basename(urllib.parse.urlsplit(url).path)

        # get url content; from here on use the URL handed back by _doRequest
        url, res = self._doRequest(url)
        if not res:
            sys.exit(1)

        rawdoc = res.read()

        self.docencoding = encutils.getEncodingInfo(
            res, rawdoc, log=self._log).encoding
        if not self.docencoding:
            # getEncodingInfo may report no encoding (None); decoding with
            # None would raise TypeError, so fall back to a sane default
            self.docencoding = 'utf-8'
            self._log.info('\nNo encoding detected, falling back to: %s\n',
                           self.docencoding)
        self._log.info('\nUsing Encoding: %s\n', self.docencoding)

        doctext = rawdoc.decode(self.docencoding)

        # fill list of stylesheets and list of raw css
        self._findStyleSheets(url, doctext)

        return self.stylesheetlist
Ejemplo n.º 2
0
    def capture(self, url):
        """
        Capture all stylesheets at given URL's HTML document.

        url
            to capture CSS from

        Resets ``self._nonparsed`` and ``self.stylesheetlist``, stores the
        basename of the URL path in ``self._filename`` and the encoding
        used to decode the document in ``self.docencoding``.

        If ``self._doRequest`` yields no response the process is terminated
        via ``sys.exit(1)``.
        NOTE(review): this docstring used to state that any HTTPError is
        raised to the caller; whether that holds depends on ``_doRequest``
        -- confirm.

        Returns ``cssutils.stylesheets.StyleSheetList``.
        """
        self._log.info('\nCapturing CSS from URL:\n    %s\n', url)
        self._nonparsed = {}
        self.stylesheetlist = cssutils.stylesheets.StyleSheetList()

        # used to save inline styles; only the path component is needed
        self._filename = os.path.basename(urllib.parse.urlsplit(url).path)

        # get url content; from here on use the URL handed back by _doRequest
        url, res = self._doRequest(url)
        if not res:
            sys.exit(1)

        rawdoc = res.read()

        self.docencoding = encutils.getEncodingInfo(
            res, rawdoc, log=self._log).encoding
        if not self.docencoding:
            # getEncodingInfo may report no encoding (None); decoding with
            # None would raise TypeError, so fall back to a sane default
            self.docencoding = 'utf-8'
            self._log.info('\nNo encoding detected, falling back to: %s\n',
                           self.docencoding)
        self._log.info('\nUsing Encoding: %s\n', self.docencoding)

        doctext = rawdoc.decode(self.docencoding)

        # fill list of stylesheets and list of raw css
        self._findStyleSheets(url, doctext)

        return self.stylesheetlist
Ejemplo n.º 3
0
    def test_getEncodingInfo(self):
        """encutils.getEncodingInfo

        Table-driven check: each row pairs the expected
        ``(encoding, mismatch)`` result with the ``(httpheader, filecontent)``
        input. Rows with a truthy header are passed through
        ``self._fakeRes(header)``; rows without one call
        ``getEncodingInfo(text=...)`` only.
        """
        # (expectedencoding, expectedmismatch): (httpheader, filecontent)
        tests = [

            # --- application/xhtml+xml ---

            # header default and XML default
            (('utf-8', False), (
                '''Content-Type: application/xhtml+xml''',
                '''<?xml version="1.0" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="application/xhtml+xml"/>
                    </example>''')),
            # XML default
            (('utf-8', False), (
                None,
                '''<?xml version="1.0" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="application/xhtml+xml"/>
                    </example>''')),
            # meta is ignored!
            (('utf-8', False), (
                '''Content-Type: application/xhtml+xml''',
                '''<?xml version="1.0" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="application/xhtml+xml;charset=iso_M"/>
                    </example>''')),

            # header enc and XML default
            (('iso-h', True), (
                '''Content-Type: application/xhtml+xml;charset=iso-H''',
                '''<?xml version="1.0" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="application/xhtml+xml"/>
                    </example>''')),

            # mismatch header and XML explicit, header wins
            (('iso-h', True), (
                '''Content-Type: application/xhtml+xml;charset=iso-H''',
                '''<?xml version="1.0" encoding="iso-X" ?>
                    <example/>''')),

            # header == XML, meta ignored!
            (('iso-h', False), (
                '''Content-Type: application/xhtml+xml;charset=iso-H''',
                '''<?xml version="1.0" encoding="iso-h" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="application/xhtml+xml;charset=iso_M"/>
                    </example>''')),

            # XML only, meta ignored!
            (('iso-x', False), (
                '''Content-Type: application/xhtml+xml''',
                '''<?xml version="1.0" encoding="iso-X" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="application/xhtml+xml;charset=iso_M"/>
                    </example>''')),


            # no text or not enough text:
            (('iso-h', False), ('Content-Type: application/xml;charset=iso-h',
                             '1')),
            (('utf-8', False), ('Content-Type: application/xml',
                                None)),
            ((None, False), ('Content-Type: application/xml',
                             '1')),


            # --- text/xml ---

            # default enc
            (('ascii', False), (
                '''Content-Type: text/xml''',
                '''<?xml version="1.0" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="text/xml"/>
                    </example>''')),
            # default as XML ignored and meta completely ignored
            (('ascii', False), (
                '''Content-Type: text/xml''',
                '''<?xml version="1.0" encoding="iso-X" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="text/xml;charset=iso_M"/>
                    </example>''')),
            (('ascii', False), ('Content-Type: text/xml',
                                '1')),
            (('ascii', False), ('Content-Type: text/xml',
                                None)),

            # header enc
            (('iso-h', False), (
                '''Content-Type: text/xml;charset=iso-H''',
                '''<?xml version="1.0" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="text/xml"/>
                    </example>''')),

            # header only, XML and meta ignored!
            (('iso-h', False), (
                '''Content-Type: text/xml;charset=iso-H''',
                '''<?xml version="1.0" encoding="iso-X" ?>
                    <example/>''')),
            (('iso-h', False), (
                '''Content-Type: text/xml;charset=iso-H''',
                '''<?xml version="1.0"  encoding="iso-h" ?>
                    <example>
                        <meta http-equiv="Content-Type"
                            content="text/xml;charset=iso_M"/>
                    </example>''')),


            # --- text/html ---

            # default enc
            (('iso-8859-1', False), ('Content-Type: text/html;',
                                     '''<meta http-equiv="Content-Type"
                                        content="text/html">''')),
            (('iso-8859-1', False), ('Content-Type: text/html;',
                                     None)),

            # header enc
            (('iso-h', False), ('Content-Type: text/html;charset=iso-H',
                                '''<meta http-equiv="Content-Type"
                                    content="text/html">''')),
            # meta enc
            (('iso-m', False), ('Content-Type: text/html',
                                '''<meta http-equiv="Content-Type"
                                    content="text/html;charset=iso-m">''')),

            # mismatch header and meta, header wins
            (('iso-h', True), ('Content-Type: text/html;charset=iso-H',
                               '''<meta http-equiv="Content-Type"
                                    content="text/html;charset=iso-m">''')),

            # no header:
            ((None, False), (None,
                             '''<meta http-equiv="Content-Type"
                                content="text/html;charset=iso-m">''')),
            # no encoding at all
            ((None, False), (None,
                             '''<meta http-equiv="Content-Type"
                                content="text/html">''')),


            ((None, False), (None,
                             '''text''')),


            # --- no header ---

            ((None, False), (None, '')),
            (('iso-8859-1', False), ('''NoContentType''',
                                     '''OnlyText''')),
            (('iso-8859-1', False), ('Content-Type: text/html;',
                                     None)),
            (('iso-8859-1', False), ('Content-Type: text/html;',
                                     '1')),

            # XML
            (('utf-8', False), (None,
                                '''<?xml version=''')),
            (('iso-x', False), (None,
                                '''<?xml version="1.0" encoding="iso-X"?>''')),
            # meta ignored
            (('utf-8', False), (None,
                                '''<?xml version="1.0" ?>
                                    <html><meta http-equiv="Content-Type"
                                    content="text/html;charset=iso-m"></html>''')),

            (('utf-8', False), ('Content-Type: text/css;',
                                '1')),
            (('iso-h', False), ('Content-Type: text/css;charset=iso-h',
                                '1')),
            # only header is used by encutils
            (('utf-8', False), ('Content-Type: text/css',
                                '@charset "ascii";')),

        ]
        for expected, (header, text) in tests:
            if header:
                info = encutils.getEncodingInfo(self._fakeRes(header), text)
            else:
                # no (or empty) header: detection has to rely on the text
                info = encutils.getEncodingInfo(text=text)

            # name the failing row; a bare tuple mismatch does not say
            # which of the many table entries broke
            self.assertEqual(
                expected, (info.encoding, info.mismatch),
                'header=%r, text=%r' % (header, text))