Example #1
0
def pisaParser(src, c, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse ``src`` with html5lib into a DOM and run ``pisaLoop`` over it.

    - Parse HTML and get miniDOM
    - Add default CSS, parse CSS
    - Handle the document DOM itself (``pisaLoop``)
    - Return Context object

    :param src: markup to parse. A unicode string is encoded to UTF-8 first;
        anything else is handed to ``parser.parse`` unchanged.
    :param c: context object; ``addCSS`` and ``parseCSS`` are called on it.
    :param default_css: CSS text passed to ``c.addCSS`` when non-empty.
    :param xhtml: if true use ``html5lib.XHTMLParser``, otherwise
        ``html5lib.HTMLParser``.
    :param encoding: encoding name passed to the parser. It is replaced by
        UTF-8 when ``src`` is unicode. An invalid name is only logged;
        parsing is still attempted.
    :param xml_output: optional object with a ``write`` method; receives the
        parsed document pretty-printed with ``encoding="utf8"``.
    :return: the context ``c`` that was passed in.
    """

    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # isinstance() instead of an exact type() match, so that subclasses of
    # unicode are encoded as well.
    if isinstance(src, types.UnicodeType):
        # The text is encoded to UTF-8 here, so the encoding handed to the
        # parser is set to match.
        encoding = "utf8"
        src = src.encode(encoding)

    # Test for the restrictions of html5lib
    if encoding:
        if hasattr(inputstream, "isValidEncoding"):
            # Workaround for html5lib<0.11.1: that API rejects the "utf8"
            # spelling, so normalise it before validating.
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        elif inputstream.codecName(encoding) is None:
            log.error("%r is not a valid encoding", encoding)

    document = parser.parse(
        src,
        encoding=encoding)

    if xml_output:
        xml_output.write(document.toprettyxml(encoding="utf8"))

    if default_css:
        c.addCSS(default_css)

    # NOTE(review): the pisaPreLoop(document, c) pass is disabled in this
    # variant - confirm that this is intentional.
    c.parseCSS()
    pisaLoop(document, c)
    return c
Example #2
0
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Build a DOM from ``src`` with html5lib and run the pisa passes over it.

    Steps, in order:
    - reset the module-level ``CSSAttrCache``
    - parse the markup into a "dom" tree
    - optionally dump the tree as pretty-printed XML to ``xml_output``
    - register ``default_css`` on the context, run ``pisaPreLoop``,
      ``context.parseCSS`` and ``pisaLoop``

    Returns the ``context`` object that was passed in.
    """

    # Every call starts with an empty attribute cache.
    global CSSAttrCache
    CSSAttrCache = {}

    # TODO: XHTMLParser doesn't seem to exist...
    parser_class = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    dom_parser = parser_class(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in StringTypes:
        if type(src) is unicode:
            # A caller-supplied encoding wins; UTF-8 is only the fallback.
            encoding = encoding or "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Check the encoding name against what html5lib accepts. A bad name is
    # only logged; parsing is attempted regardless.
    if encoding:
        if not hasattr(inputstream, "isValidEncoding"):
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
        else:
            # Workaround for html5lib<0.11.1
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)

    dom = dom_parser.parse(src, encoding=encoding)

    if xml_output:
        # Fall back to "utf8" when no encoding is known.
        xml_output.write(dom.toprettyxml(encoding=encoding or "utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(dom, context)
    context.parseCSS()
    pisaLoop(dom, context)

    return context
Example #3
0
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse ``src`` into a DOM via html5lib, then process it for ``context``.

    Sequence:
    - the module-level ``CSSAttrCache`` is reset to an empty dict
    - string input is (encoded and) wrapped with ``pisaTempFile``
    - the encoding name is checked; a bad name is logged, not raised
    - the parsed tree is optionally written to ``xml_output`` as pretty XML
    - ``default_css`` is added, then ``pisaPreLoop``, ``context.parseCSS``
      and ``pisaLoop`` are run

    The same ``context`` object is returned.
    """

    global CSSAttrCache
    CSSAttrCache = {}  # fresh cache for this run

    tree_builder_name = "dom"
    if not xhtml:
        html_parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder(tree_builder_name))
    else:
        # TODO: XHTMLParser doesn't seem to exist...
        html_parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder(tree_builder_name))

    if type(src) in StringTypes:
        if type(src) is unicode:
            # Keep an explicitly requested encoding; default to UTF-8.
            encoding = encoding or "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Test for the restrictions of html5lib
    if encoding:
        has_legacy_check = hasattr(inputstream, "isValidEncoding")
        if has_legacy_check:
            # Workaround for html5lib<0.11.1
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        elif inputstream.codecName(encoding) is None:
            log.error("%r is not a valid encoding", encoding)

    tree = html_parser.parse(src, encoding=encoding)

    if xml_output:
        dump_encoding = encoding if encoding else "utf8"
        xml_output.write(tree.toprettyxml(encoding=dump_encoding))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(tree, context)
    context.parseCSS()
    pisaLoop(tree, context)
    return context
def pisaParser(src, c, default_css="", xhtml=False, encoding=None):
    """
    Parse ``src`` with html5lib into a DOM and run the pisa passes over it.

    - Parse HTML and get miniDOM
    - Add default CSS, run ``pisaPreLoop``, parse CSS
    - Handle the document DOM itself (``pisaLoop``)
    - Return Context object

    :param src: markup to parse. Byte/unicode strings are wrapped in a
        ``StringIO`` (unicode is encoded to UTF-8 first); anything else is
        handed to ``parser.parse`` unchanged.
    :param c: context object; ``addCSS`` and ``parseCSS`` are called on it.
    :param default_css: CSS text passed to ``c.addCSS`` when non-empty.
    :param xhtml: if true use ``html5lib.XHTMLParser``, otherwise
        ``html5lib.HTMLParser``.
    :param encoding: encoding name passed to the parser. It is replaced by
        UTF-8 when ``src`` is unicode. An invalid name is only logged;
        parsing is still attempted.
    :return: the context ``c`` that was passed in.
    """

    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # isinstance() instead of an exact type() match, so that subclasses of
    # str/unicode are recognised as strings too.
    if isinstance(src, types.StringTypes):
        if isinstance(src, types.UnicodeType):
            # The text is encoded to UTF-8 here, so the encoding handed to
            # the parser is set to match.
            encoding = "utf8"
            src = src.encode(encoding)
        src = StringIO.StringIO(src)

    # Test for the restrictions of html5lib
    if encoding:
        if hasattr(inputstream, "isValidEncoding"):
            # Workaround for html5lib<0.11.1: that API rejects the "utf8"
            # spelling, so normalise it before validating.
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        elif inputstream.codecName(encoding) is None:
            log.error("%r is not a valid encoding", encoding)

    document = parser.parse(
        src,
        encoding=encoding)

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    c.parseCSS()
    pisaLoop(document, c)
    return c
Example #5
0
def pisaParser(src, c, default_css="", xhtml=False, encoding=None):
    """
    Build a DOM from ``src`` with html5lib and process it for context ``c``.

    - Parse HTML and get miniDOM
    - Add default CSS, run ``pisaPreLoop``, parse CSS
    - Handle the document DOM itself (``pisaLoop``)
    - Return Context object

    :param src: markup to parse. Byte/unicode strings are wrapped in a
        ``StringIO`` (unicode is encoded to UTF-8 first); any other object
        is passed to ``parser.parse`` as given.
    :param c: context object; ``addCSS`` and ``parseCSS`` are called on it.
    :param default_css: CSS text passed to ``c.addCSS`` when non-empty.
    :param xhtml: select ``html5lib.XHTMLParser`` instead of
        ``html5lib.HTMLParser``.
    :param encoding: encoding name passed to the parser; forced to UTF-8
        when ``src`` is unicode. An invalid name is logged via ``log.error``
        and parsing continues.
    :return: the context ``c`` that was passed in.
    """

    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # Use isinstance() so str/unicode subclasses are handled like plain
    # strings; an exact type() comparison would skip them.
    if isinstance(src, types.StringTypes):
        if isinstance(src, types.UnicodeType):
            # Encoded to UTF-8 here, so the parser is told the same encoding.
            encoding = "utf8"
            src = src.encode(encoding)
        src = StringIO.StringIO(src)

    # Test for the restrictions of html5lib
    if encoding:
        if hasattr(inputstream, "isValidEncoding"):
            # Workaround for html5lib<0.11.1: normalise "utf8" to "utf-8"
            # before asking the old validation API.
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error(
                    "%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!",
                    encoding)
        elif inputstream.codecName(encoding) is None:
            log.error("%r is not a valid encoding", encoding)

    document = parser.parse(src, encoding=encoding)

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    c.parseCSS()
    pisaLoop(document, c)
    return c
Example #6
0
 def test_codec_name(self):
     # Each sample label must resolve to the expected codec name.
     # assertEqual is used because assertEquals is a deprecated alias that
     # newer unittest versions no longer provide.
     self.assertEqual(inputstream.codecName("utf-8"), "utf-8")
     self.assertEqual(inputstream.codecName("utf8"), "utf-8")
     self.assertEqual(inputstream.codecName("  utf8  "), "utf-8")
     self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
 def test_codec_name_d(self):
     # The label "ISO_8859--1" is expected to resolve to "windows-1252".
     resolved = inputstream.codecName("ISO_8859--1")
     self.assertEqual(resolved, "windows-1252")
 def test_codec_name_c(self):
     # Surrounding whitespace in the label must not affect the result.
     resolved = inputstream.codecName("  utf8  ")
     self.assertEqual(resolved, "utf-8")
Example #9
0
 def test_codec_name_d(self):
     # "ISO_8859--1" should be reported as "windows-1252".
     label = "ISO_8859--1"
     self.assertEqual(inputstream.codecName(label), "windows-1252")
Example #10
0
 def test_codec_name_c(self):
     # A whitespace-padded "utf8" label should be reported as "utf-8".
     label = "  utf8  "
     self.assertEqual(inputstream.codecName(label), "utf-8")
Example #11
0
 def test_codec_name(self):
     # Verify the codec name returned for several label spellings.
     # assertEquals is a deprecated unittest alias; assertEqual is the
     # supported spelling.
     self.assertEqual(inputstream.codecName("utf-8"), "utf-8")
     self.assertEqual(inputstream.codecName("utf8"), "utf-8")
     self.assertEqual(inputstream.codecName("  utf8  "), "utf-8")
     self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
Example #12
0
 def test_codec_name_d(self):
     # Expect 'windows-1252' for the label 'ISO_8859--1'.
     expected = 'windows-1252'
     actual = inputstream.codecName('ISO_8859--1')
     self.assertEqual(actual, expected)
Example #13
0
 def test_codec_name_c(self):
     # Expect 'utf-8' for the whitespace-padded label '  utf8  '.
     expected = 'utf-8'
     actual = inputstream.codecName('  utf8  ')
     self.assertEqual(actual, expected)
Example #14
0
 def test_codec_name_a(self):
     # The label u"utf-8" is expected to come back unchanged.
     resolved = inputstream.codecName(u"utf-8")
     self.assertEqual(resolved, u"utf-8")