def pisaParser(src, c, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse HTML source into a miniDOM document and process it into ``c``.

    Steps, in order:
    - parse ``src`` with html5lib (DOM tree builder)
    - optionally write the pretty-printed DOM to ``xml_output``
    - add ``default_css`` to the context (if given) and parse the CSS
    - hand the document to ``pisaLoop`` to build the reportlab story

    src         -- HTML source; a unicode string is encoded as UTF-8 first,
                   anything else is passed to the parser unchanged
    c           -- context object; it is updated and returned
    default_css -- CSS text registered through ``c.addCSS`` when truthy
    xhtml       -- use ``html5lib.XHTMLParser`` instead of ``HTMLParser``
    encoding    -- encoding label for the parser; replaced by "utf8" when
                   ``src`` is a unicode string
    xml_output  -- optional object with ``write()`` that receives the DOM
                   dump, always encoded as "utf8"

    An unknown encoding is only reported through ``log.error``; parsing
    is still attempted with it.

    Returns the context object ``c``.
    """
    parser_class = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    parser = parser_class(tree=treebuilders.getTreeBuilder("dom"))

    source_type = type(src)
    if source_type in types.StringTypes:
        if source_type is types.UnicodeType:
            # Unicode input is always handed to the parser as UTF-8 bytes.
            encoding = "utf8"
            src = src.encode(encoding)

    # Test for the restrictions of html5lib
    if encoding:
        if hasattr(inputstream, "isValidEncoding"):
            # Workaround for html5lib<0.11.1, which rejects the "utf8" label.
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        elif inputstream.codecName(encoding) is None:
            log.error("%r is not a valid encoding", encoding)

    document = parser.parse(src, encoding=encoding)

    if xml_output:
        pretty_xml = document.toprettyxml(encoding="utf8")
        xml_output.write(pretty_xml)

    if default_css:
        c.addCSS(default_css)

    c.parseCSS()
    pisaLoop(document, c)
    return c
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse HTML source into a miniDOM document and process it into ``context``.

    Steps, in order:
    - reset the module-level ``CSSAttrCache``
    - parse ``src`` with html5lib (DOM tree builder)
    - optionally write the pretty-printed DOM to ``xml_output``
    - add ``default_css`` to the context (if given), run ``pisaPreLoop``
      and parse the CSS
    - hand the document to ``pisaLoop`` to build the reportlab story

    src         -- HTML source; string input is wrapped in a ``pisaTempFile``
                   (unicode is encoded first), anything else is passed to
                   the parser unchanged
    context     -- context object; it is updated and returned
    default_css -- CSS text registered through ``context.addDefaultCSS``
                   when truthy
    xhtml       -- use ``html5lib.XHTMLParser`` instead of ``HTMLParser``
    encoding    -- encoding label for the parser; defaults to "utf-8" for
                   unicode input when not supplied
    xml_output  -- optional object with ``write()`` that receives the DOM
                   dump, encoded with ``encoding`` or "utf8" as a fallback

    An unknown encoding is only reported through ``log.error``; parsing
    is still attempted with it.

    Returns the context object ``context``.
    """
    global CSSAttrCache
    CSSAttrCache = {}

    # TODO: XHTMLParser doesn't seem to exist... (original author's note)
    parser_class = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    parser = parser_class(tree=treebuilders.getTreeBuilder("dom"))

    source_type = type(src)
    if source_type in StringTypes:
        if source_type is unicode:
            # If an encoding was provided, do not change it.
            encoding = encoding or "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Test for the restrictions of html5lib
    if encoding:
        if hasattr(inputstream, "isValidEncoding"):
            # Workaround for html5lib<0.11.1, which rejects the "utf8" label.
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        elif inputstream.codecName(encoding) is None:
            log.error("%r is not a valid encoding", encoding)

    document = parser.parse(src, encoding=encoding)

    if xml_output:
        # Dump with the parser encoding when there is one, else UTF-8.
        pretty_xml = document.toprettyxml(encoding=encoding or "utf8")
        xml_output.write(pretty_xml)

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()
    pisaLoop(document, context)
    return context
def pisaParser(src, c, default_css="", xhtml=False, encoding=None):
    """
    Parse HTML source into a miniDOM document and process it into ``c``.

    Steps, in order:
    - parse ``src`` with html5lib (DOM tree builder)
    - add ``default_css`` to the context (if given), run ``pisaPreLoop``
      and parse the CSS
    - hand the document to ``pisaLoop`` to build the reportlab story

    src         -- HTML source; string input is wrapped in a ``StringIO``
                   (unicode is encoded as UTF-8 first), anything else is
                   passed to the parser unchanged
    c           -- context object; it is updated and returned
    default_css -- CSS text registered through ``c.addCSS`` when truthy
    xhtml       -- use ``html5lib.XHTMLParser`` instead of ``HTMLParser``
    encoding    -- encoding label for the parser; replaced by "utf8" when
                   ``src`` is a unicode string

    An unknown encoding is only reported through ``log.error``; parsing
    is still attempted with it.

    Returns the context object ``c``.
    """
    parser_class = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    parser = parser_class(tree=treebuilders.getTreeBuilder("dom"))

    source_type = type(src)
    if source_type in types.StringTypes:
        if source_type is types.UnicodeType:
            # Unicode input is always handed to the parser as UTF-8 bytes.
            encoding = "utf8"
            src = src.encode(encoding)
        src = StringIO.StringIO(src)

    # Test for the restrictions of html5lib
    if encoding:
        if hasattr(inputstream, "isValidEncoding"):
            # Workaround for html5lib<0.11.1, which rejects the "utf8" label.
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        elif inputstream.codecName(encoding) is None:
            log.error("%r is not a valid encoding", encoding)

    document = parser.parse(src, encoding=encoding)

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    c.parseCSS()
    pisaLoop(document, c)
    return c
def pisaParser(src, c, default_css="", xhtml=False, encoding=None):
    """
    Build a miniDOM document from HTML source and render it into ``c``.

    Processing order:
    - html5lib parses ``src`` using the DOM tree builder
    - ``default_css`` (if given) is added to the context
    - ``pisaPreLoop`` runs over the document, then the CSS is parsed
    - ``pisaLoop`` walks the document to build the reportlab story

    src         -- HTML source; string input is wrapped in a ``StringIO``
                   (unicode is encoded as UTF-8 first), anything else is
                   passed to the parser unchanged
    c           -- context object; it is updated and returned
    default_css -- CSS text registered through ``c.addCSS`` when truthy
    xhtml       -- use ``html5lib.XHTMLParser`` instead of ``HTMLParser``
    encoding    -- encoding label for the parser; replaced by "utf8" when
                   ``src`` is a unicode string

    An unknown encoding is only reported through ``log.error``; parsing
    is still attempted with it.

    Returns the context object ``c``.
    """
    dom_parser_type = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    parser = dom_parser_type(tree=treebuilders.getTreeBuilder("dom"))

    kind = type(src)
    if kind in types.StringTypes:
        if kind is types.UnicodeType:
            # Unicode input is always handed to the parser as UTF-8 bytes.
            encoding = "utf8"
            src = src.encode(encoding)
        src = StringIO.StringIO(src)

    # Test for the restrictions of html5lib
    if encoding:
        if hasattr(inputstream, "isValidEncoding"):
            # Workaround for html5lib<0.11.1, which rejects the "utf8" label.
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        elif inputstream.codecName(encoding) is None:
            log.error("%r is not a valid encoding", encoding)

    document = parser.parse(src, encoding=encoding)

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    c.parseCSS()
    pisaLoop(document, c)
    return c
def test_codec_name(self):
    """Check that ``inputstream.codecName`` resolves encoding labels.

    Covers a canonical label ("utf-8"), an alias without the hyphen
    ("utf8"), a label padded with whitespace (" utf8 ") and a label with
    irregular punctuation ("ISO_8859--1", expected to map to
    "windows-1252").
    """
    # assertEqual is used instead of assertEquals: the latter is a
    # deprecated alias that newer Python versions no longer provide.
    self.assertEqual(inputstream.codecName("utf-8"), "utf-8")
    self.assertEqual(inputstream.codecName("utf8"), "utf-8")
    self.assertEqual(inputstream.codecName(" utf8 "), "utf-8")
    self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
def test_codec_name_d(self):
    """``codecName`` is expected to map "ISO_8859--1" to "windows-1252"."""
    resolved = inputstream.codecName("ISO_8859--1")
    self.assertEqual(resolved, "windows-1252")
def test_codec_name_c(self):
    """``codecName`` is expected to map the padded label " utf8 " to "utf-8"."""
    resolved = inputstream.codecName(" utf8 ")
    self.assertEqual(resolved, "utf-8")
def test_codec_name_d(self):
    """The label 'ISO_8859--1' should be reported as 'windows-1252'."""
    label = 'ISO_8859--1'
    expected = 'windows-1252'
    self.assertEqual(inputstream.codecName(label), expected)
def test_codec_name_c(self):
    """The whitespace-padded label ' utf8 ' should be reported as 'utf-8'."""
    label = ' utf8 '
    expected = 'utf-8'
    self.assertEqual(inputstream.codecName(label), expected)
def test_codec_name_a(self):
    """``codecName`` is expected to return the unicode label "utf-8" unchanged."""
    resolved = inputstream.codecName(u"utf-8")
    self.assertEqual(resolved, u"utf-8")