def capture(self, url):
    """
    Collect the stylesheets of the HTML document found at ``url``.

    The document is requested through ``self._doRequest``. When that call
    yields no response object, the process is ended with ``sys.exit(1)``.
    The original contract also states that any HTTPError is raised to the
    caller; that depends on ``_doRequest``, which is not visible here.

    url
        address of the HTML document to capture CSS from

    Returns the ``cssutils.stylesheets.StyleSheetList`` that is also kept
    in ``self.stylesheetlist``.
    """
    self._log.info('\nCapturing CSS from URL:\n %s\n', url)

    # start every capture from a clean slate
    self._nonparsed = {}
    self.stylesheetlist = cssutils.stylesheets.StyleSheetList()

    # base name of the URL path, used to save inline styles
    self._filename = os.path.basename(urllib.parse.urlsplit(url).path)

    # fetch the document; the URL handed back replaces the one passed in
    url, response = self._doRequest(url)
    if not response:
        # NOTE(review): this terminates the whole process, not just the call
        sys.exit(1)
    raw = response.read()

    # work out the document encoding from the response and its raw bytes
    encoding_info = encutils.getEncodingInfo(response, raw, log=self._log)
    self.docencoding = encoding_info.encoding
    self._log.info('\nUsing Encoding: %s\n', self.docencoding)

    # fill list of stylesheets and list of raw css from the decoded text
    self._findStyleSheets(url, raw.decode(self.docencoding))
    return self.stylesheetlist
def test_getEncodingInfo(self):
    "encutils.getEncodingInfo"
    # Table of cases, each shaped as
    #   ((expected encoding, expected mismatch), (HTTP header, file content))
    # A header of None means getEncodingInfo is called with the text only.
    cases = [
        # --- application/xhtml+xml ---
        # header default and XML default
        (('utf-8', False), (
            'Content-Type: application/xhtml+xml',
            '<?xml version="1.0" ?> <example> '
            '<meta http-equiv="Content-Type" content="application/xhtml+xml"/> '
            '</example>',
        )),
        # XML default
        (('utf-8', False), (
            None,
            '<?xml version="1.0" ?> <example> '
            '<meta http-equiv="Content-Type" content="application/xhtml+xml"/> '
            '</example>',
        )),
        # meta is ignored!
        (('utf-8', False), (
            'Content-Type: application/xhtml+xml',
            '<?xml version="1.0" ?> <example> '
            '<meta http-equiv="Content-Type" content="application/xhtml+xml;charset=iso_M"/> '
            '</example>',
        )),
        # header enc and XML default
        (('iso-h', True), (
            'Content-Type: application/xhtml+xml;charset=iso-H',
            '<?xml version="1.0" ?> <example> '
            '<meta http-equiv="Content-Type" content="application/xhtml+xml"/> '
            '</example>',
        )),
        # mismatch header and XML explicit, header wins
        (('iso-h', True), (
            'Content-Type: application/xhtml+xml;charset=iso-H',
            '<?xml version="1.0" encoding="iso-X" ?> <example/>',
        )),
        # header == XML, meta ignored!
        (('iso-h', False), (
            'Content-Type: application/xhtml+xml;charset=iso-H',
            '<?xml version="1.0" encoding="iso-h" ?> <example> '
            '<meta http-equiv="Content-Type" content="application/xhtml+xml;charset=iso_M"/> '
            '</example>',
        )),
        # XML only, meta ignored!
        (('iso-x', False), (
            'Content-Type: application/xhtml+xml',
            '<?xml version="1.0" encoding="iso-X" ?> <example> '
            '<meta http-equiv="Content-Type" content="application/xhtml+xml;charset=iso_M"/> '
            '</example>',
        )),
        # no text or not enough text:
        (('iso-h', False), ('Content-Type: application/xml;charset=iso-h', '1')),
        (('utf-8', False), ('Content-Type: application/xml', None)),
        ((None, False), ('Content-Type: application/xml', '1')),

        # --- text/xml ---
        # default enc
        (('ascii', False), (
            'Content-Type: text/xml',
            '<?xml version="1.0" ?> <example> '
            '<meta http-equiv="Content-Type" content="text/xml"/> '
            '</example>',
        )),
        # default as XML ignored and meta completely ignored
        (('ascii', False), (
            'Content-Type: text/xml',
            '<?xml version="1.0" encoding="iso-X" ?> <example> '
            '<meta http-equiv="Content-Type" content="text/xml;charset=iso_M"/> '
            '</example>',
        )),
        (('ascii', False), ('Content-Type: text/xml', '1')),
        (('ascii', False), ('Content-Type: text/xml', None)),
        # header enc
        (('iso-h', False), (
            'Content-Type: text/xml;charset=iso-H',
            '<?xml version="1.0" ?> <example> '
            '<meta http-equiv="Content-Type" content="text/xml"/> '
            '</example>',
        )),
        # header only, XML and meta ignored!
        (('iso-h', False), (
            'Content-Type: text/xml;charset=iso-H',
            '<?xml version="1.0" encoding="iso-X" ?> <example/>',
        )),
        (('iso-h', False), (
            'Content-Type: text/xml;charset=iso-H',
            '<?xml version="1.0" encoding="iso-h" ?> <example> '
            '<meta http-equiv="Content-Type" content="text/xml;charset=iso_M"/> '
            '</example>',
        )),

        # --- text/html ---
        # default enc
        (('iso-8859-1', False), (
            'Content-Type: text/html;',
            '<meta http-equiv="Content-Type" content="text/html">',
        )),
        (('iso-8859-1', False), ('Content-Type: text/html;', None)),
        # header enc
        (('iso-h', False), (
            'Content-Type: text/html;charset=iso-H',
            '<meta http-equiv="Content-Type" content="text/html">',
        )),
        # meta enc
        (('iso-m', False), (
            'Content-Type: text/html',
            '<meta http-equiv="Content-Type" content="text/html;charset=iso-m">',
        )),
        # mismatch header and meta, header wins
        (('iso-h', True), (
            'Content-Type: text/html;charset=iso-H',
            '<meta http-equiv="Content-Type" content="text/html;charset=iso-m">',
        )),
        # no header:
        ((None, False), (
            None,
            '<meta http-equiv="Content-Type" content="text/html;charset=iso-m">',
        )),
        # no encoding at all
        ((None, False), (
            None,
            '<meta http-equiv="Content-Type" content="text/html">',
        )),
        ((None, False), (None, 'text')),

        # --- no header ---
        ((None, False), (None, '')),
        (('iso-8859-1', False), ('NoContentType', 'OnlyText')),
        (('iso-8859-1', False), ('Content-Type: text/html;', None)),
        (('iso-8859-1', False), ('Content-Type: text/html;', '1')),
        # XML
        (('utf-8', False), (None, '<?xml version=')),
        (('iso-x', False), (None, '<?xml version="1.0" encoding="iso-X"?>')),
        # meta ignored
        (('utf-8', False), (
            None,
            '<?xml version="1.0" ?> '
            '<html><meta http-equiv="Content-Type" content="text/html;charset=iso-m"></html>',
        )),
        (('utf-8', False), ('Content-Type: text/css;', '1')),
        (('iso-h', False), ('Content-Type: text/css;charset=iso-h', '1')),
        # only header is used by encutils
        (('utf-8', False), ('Content-Type: text/css', '@charset "ascii";')),
    ]

    for expected, (header, text) in cases:
        if header:
            # a header is given: wrap it in a fake response object
            info = encutils.getEncodingInfo(self._fakeRes(header), text)
        else:
            # no header: only the text is passed in
            info = encutils.getEncodingInfo(text=text)
        self.assertEqual(expected, (info.encoding, info.mismatch))