def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse an HTML source and feed the resulting document into ``context``.

    Steps, in order:
    - reset the module-level ``CSSAttrCache``
    - parse ``src`` with html5lib using the "dom" tree builder
    - optionally write a pretty-printed XML dump to ``xml_output``
    - register ``default_css`` on the context, run ``pisaPreLoop``,
      then ``context.parseCSS()``
    - walk the document with ``pisaLoop``

    ``src`` may be a string (it is then wrapped in a ``pisaTempFile``) or
    anything else ``parser.parse`` accepts.  A unicode string is encoded
    with ``encoding``, falling back to "utf-8" when none was supplied.

    Returns the same ``context`` object that was passed in.
    """
    global CSSAttrCache
    CSSAttrCache = {}

    # TODO: XHTMLParser doesn't seem to exist...
    parser_class = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    parser = parser_class(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in StringTypes:
        if type(src) is unicode:
            # A caller-supplied encoding is kept; UTF-8 is only the fallback.
            encoding = encoding or "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Check the encoding against what html5lib accepts; problems are only
    # logged, parsing goes ahead regardless.
    if encoding:
        if not hasattr(inputstream, "isValidEncoding"):
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
        else:
            # Workaround for html5lib<0.11.1
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)

    document = parser.parse(src, encoding=encoding)

    if xml_output:
        # Dump with the working encoding when there is one, else "utf8".
        xml_output.write(document.toprettyxml(encoding=encoding or "utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()
    pisaLoop(document, context)

    return context
def pisaParser(src, c, default_css="", xhtml=False, encoding=None):
    """
    Parse an HTML source and feed the resulting document into context ``c``.

    Steps, in order:
    - parse ``src`` with html5lib using the "dom" tree builder
    - add ``default_css`` to the context, run ``pisaPreLoop``,
      then ``c.parseCSS()``
    - walk the document with ``pisaLoop``

    ``src`` may be a string (it is then wrapped in a ``StringIO``) or
    anything else ``parser.parse`` accepts.  A unicode string is always
    encoded as "utf8", which replaces any ``encoding`` given by the caller.

    Returns the same context object ``c`` that was passed in.
    """
    parser_class = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    parser = parser_class(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in types.StringTypes:
        if type(src) is types.UnicodeType:
            # Unicode input: the caller's encoding is discarded on purpose here.
            encoding = "utf8"
            src = src.encode(encoding)
        src = StringIO.StringIO(src)

    # Check the encoding against what html5lib accepts; problems are only
    # logged, parsing goes ahead regardless.
    if encoding:
        if not hasattr(inputstream, "isValidEncoding"):
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
        else:
            # Workaround for html5lib<0.11.1
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error(
                    "%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!",
                    encoding)

    document = parser.parse(src, encoding=encoding)

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    c.parseCSS()
    pisaLoop(document, c)

    return c
def pisaParser(src, c, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse an HTML source and feed the resulting document into context ``c``.

    Steps, in order:
    - parse ``src`` with html5lib using the "dom" tree builder
    - optionally write a pretty-printed XML dump (always encoded as "utf8")
      to ``xml_output``
    - add ``default_css`` to the context, run ``pisaPreLoop``,
      then ``c.parseCSS()``
    - walk the document with ``pisaLoop``

    ``src`` may be a string (it is then wrapped in a ``pisaTempFile``) or
    anything else ``parser.parse`` accepts.  A unicode string is always
    encoded as UTF-8, which replaces any ``encoding`` given by the caller.

    The encoding handed to html5lib is spelled "utf-8" rather than "utf8":
    some html5lib releases accept only the hyphenated name, while the bytes
    produced by Python's codec are the same under either spelling.

    Returns the same context object ``c`` that was passed in.
    """
    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in types.StringTypes:
        if type(src) is types.UnicodeType:
            # Unicode input: the caller's encoding is discarded and UTF-8 used.
            encoding = "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=c.capacity)

    # A caller may still pass the unhyphenated alias for byte/file input;
    # canonicalise it so html5lib does not reject the encoding.
    if encoding and encoding.strip().lower() == "utf8":
        encoding = "utf-8"

    document = parser.parse(src, encoding=encoding)

    if xml_output:
        xml_output.write(document.toprettyxml(encoding="utf8"))

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    c.parseCSS()
    pisaLoop(document, c)

    return c
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse an HTML source and feed the resulting document into ``context``.

    Steps, in order:
    - reset the module-level ``CSSAttrCache``
    - parse ``src`` with html5lib using the "dom" tree builder
    - optionally write a pretty-printed XML dump to ``xml_output``
    - register ``default_css`` on the context, run ``pisaPreLoop``,
      then ``context.parseCSS()``
    - walk the document with ``pisaLoop``

    A text (``six.text_type``) ``src`` is encoded with ``encoding``
    (falling back to "utf-8"), wrapped in a ``pisaTempFile``, and the same
    encoding is passed to the parser as ``transport_encoding``.  Any other
    ``src`` is handed to the parser untouched and without an encoding hint.

    Returns the same ``context`` object that was passed in.
    """
    global CSSAttrCache
    CSSAttrCache = {}

    # TODO: XHTMLParser doesn't seem to exist...
    parser_class = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    parser = parser_class(tree=treebuilders.getTreeBuilder("dom"))

    parse_options = {}
    if isinstance(src, six.text_type):
        # A caller-supplied encoding is kept; UTF-8 is only the fallback.
        encoding = encoding or "utf-8"
        src = pisaTempFile(src.encode(encoding), capacity=context.capacity)
        # Tell html5lib which encoding produced the bytes so that it
        # decodes them the same way.
        parse_options['transport_encoding'] = encoding

    document = parser.parse(src, **parse_options)

    if xml_output:
        # Dump with the working encoding when there is one, else "utf8".
        xml_output.write(document.toprettyxml(encoding=encoding or "utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()
    pisaLoop(document, context)

    return context