Beispiel #1
0
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse an HTML source and process the resulting DOM against ``context``.

    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object

    :param src: HTML source. A byte or unicode string (or a subclass of
        either) is wrapped in a ``pisaTempFile``; anything else is handed to
        the html5lib parser unchanged.
    :param context: object providing ``capacity``, ``addDefaultCSS`` and
        ``parseCSS``; it is passed to ``pisaPreLoop``/``pisaLoop`` and
        returned.
    :param default_css: CSS text added through ``context.addDefaultCSS``
        when non-empty.
    :param xhtml: use ``html5lib.XHTMLParser`` instead of ``HTMLParser``.
    :param encoding: encoding of ``src``; for unicode input it defaults to
        ``"utf-8"`` when not supplied.
    :param xml_output: optional object with a ``write`` method that receives
        the pretty-printed DOM.
    :return: the ``context`` that was passed in.
    """

    # The attribute cache is module-global and is reset for every document.
    global CSSAttrCache
    CSSAttrCache = {}

    if xhtml:
        # TODO: XHTMLParser doesn't seem to exist...
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # isinstance (not an exact type() match) so that str/unicode subclasses
    # are encoded and wrapped exactly like plain strings.
    if isinstance(src, StringTypes):
        if isinstance(src, unicode):
            # If an encoding was provided, do not change it.
            if not encoding:
                encoding = "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Test for the restrictions of html5lib. An invalid encoding is only
    # logged; parsing is still attempted with the value as given.
    if encoding:
        # Workaround for html5lib<0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
    document = parser.parse(
        src,
        encoding=encoding)

    if xml_output:
        # Dump the DOM in the document's encoding, falling back to UTF-8.
        xml_output.write(document.toprettyxml(encoding=encoding or "utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()

    pisaLoop(document, context)
    return context
Beispiel #2
0
def pisaParser(src, c, default_css="", xhtml=False, encoding=None):
    """
    Parse an HTML source and process the resulting DOM against ``c``.

    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object

    :param src: HTML source. A byte or unicode string (or a subclass of
        either) is wrapped in a ``StringIO``; anything else is handed to the
        html5lib parser unchanged.
    :param c: context object providing ``addCSS`` and ``parseCSS``; it is
        passed to ``pisaPreLoop``/``pisaLoop`` and returned.
    :param default_css: CSS text added through ``c.addCSS`` when non-empty.
    :param xhtml: use ``html5lib.XHTMLParser`` instead of ``HTMLParser``.
    :param encoding: encoding of ``src``. For unicode input a supplied value
        is used to encode the text (``UnicodeEncodeError`` propagates if the
        text cannot be represented); ``"utf-8"`` is used when none is given.
    :return: the context ``c`` that was passed in.
    """

    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # isinstance (not an exact type() match) so that str/unicode subclasses
    # are encoded and wrapped exactly like plain strings.
    if isinstance(src, types.StringTypes):
        if isinstance(src, types.UnicodeType):
            # Keep a caller-supplied encoding; only default when none given.
            if not encoding:
                encoding = "utf-8"
            src = src.encode(encoding)
        src = StringIO.StringIO(src)

    # Test for the restrictions of html5lib. An invalid encoding is only
    # logged; parsing is still attempted with the value as given.
    if encoding:
        # Workaround for html5lib<0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error(
                    "%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!",
                    encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)

    document = parser.parse(src, encoding=encoding)

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    c.parseCSS()
    pisaLoop(document, c)
    return c
Beispiel #3
0
def pisaParser(src,
               c,
               default_css="",
               xhtml=False,
               encoding=None,
               xml_output=None):
    """
    Parse an HTML source and process the resulting DOM against ``c``.

    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object

    :param src: HTML source. A byte or unicode string (or a subclass of
        either) is wrapped in a ``pisaTempFile``; anything else is handed to
        the html5lib parser unchanged.
    :param c: context object providing ``capacity``, ``addCSS`` and
        ``parseCSS``; it is passed to ``pisaPreLoop``/``pisaLoop`` and
        returned.
    :param default_css: CSS text added through ``c.addCSS`` when non-empty.
    :param xhtml: use ``html5lib.XHTMLParser`` instead of ``HTMLParser``.
    :param encoding: encoding of ``src``. For unicode input a supplied value
        is used to encode the text (``UnicodeEncodeError`` propagates if the
        text cannot be represented); ``"utf-8"`` is used when none is given.
    :param xml_output: optional object with a ``write`` method that receives
        the pretty-printed DOM, always serialised as UTF-8.
    :return: the context ``c`` that was passed in.
    """

    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # isinstance (not an exact type() match) so that str/unicode subclasses
    # are encoded and wrapped exactly like plain strings.
    if isinstance(src, types.StringTypes):
        if isinstance(src, types.UnicodeType):
            # Keep a caller-supplied encoding; only default when none given.
            # The hyphenated spelling is used because the value goes to
            # html5lib as-is, with no normalisation step in this function.
            if not encoding:
                encoding = "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=c.capacity)

    document = parser.parse(src, encoding=encoding)

    if xml_output:
        xml_output.write(document.toprettyxml(encoding="utf8"))

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    c.parseCSS()
    pisaLoop(document, c)
    return c
Beispiel #4
0
def pisaParser(src,
               context,
               default_css="",
               xhtml=False,
               encoding=None,
               xml_output=None):
    """
    Parse an HTML source and process the resulting DOM against ``context``.

    Steps, in order:

    - reset the module-level ``CSSAttrCache``
    - parse the HTML with html5lib into a miniDOM
    - optionally dump the DOM to ``xml_output``
    - add the default CSS, run ``pisaPreLoop`` and parse the CSS
    - run ``pisaLoop`` over the DOM (builds the reportlab story)

    :param src: HTML source. Text (``six.text_type``) is encoded and wrapped
        in a ``pisaTempFile``; anything else is handed to the html5lib parser
        unchanged.
    :param context: object providing ``capacity``, ``addDefaultCSS`` and
        ``parseCSS``; it is returned.
    :param default_css: CSS text added through ``context.addDefaultCSS``
        when non-empty.
    :param xhtml: use ``html5lib.XHTMLParser`` instead of ``HTMLParser``.
    :param encoding: encoding used for text input (``"utf-8"`` when not
        supplied) and for the ``xml_output`` dump.
    :param xml_output: optional object with a ``write`` method that receives
        the pretty-printed DOM.
    :return: the ``context`` that was passed in.
    """
    global CSSAttrCache
    CSSAttrCache = {}

    # TODO: XHTMLParser doesn't seem to exist...
    parser_class = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    parser = parser_class(tree=treebuilders.getTreeBuilder("dom"))

    parse_options = {}
    if isinstance(src, six.text_type):
        # If an encoding was provided, do not change it.
        encoding = encoding or "utf-8"
        encoded = src.encode(encoding)
        src = pisaTempFile(encoded, capacity=context.capacity)
        # Tell html5lib which encoding was used to turn the text into bytes
        # so that it decodes the stream the same way.
        parse_options['transport_encoding'] = encoding

    # The explicit html5lib encoding validation is disabled here; the
    # encoding reaches the parser only through ``transport_encoding`` above.
    document = parser.parse(src, **parse_options)

    if xml_output:
        xml_output.write(document.toprettyxml(encoding=encoding or "utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()
    pisaLoop(document, context)

    return context