예제 #1
0
 def __init__(self, builder=None, encoding=None):
     """Prepare the parser state before handing over to HTMLParser.

     @param builder Optional tree builder. When None, a new
         ElementTree.TreeBuilder is created and used instead.
     @param encoding Optional encoding name. Any false value (None,
         empty string) falls back to "iso-8859-1".
     """
     self.__stack = []
     # Only a missing builder (None) triggers the default; any other
     # object passed in is used as given.
     self.__builder = ElementTree.TreeBuilder() if builder is None else builder
     self.encoding = encoding if encoding else "iso-8859-1"
     # Run the HTMLParser initialiser last, once our own state exists.
     HTMLParser.__init__(self)
예제 #2
0
 def __init__(self, html=0):
     """Create the tree builder and an sgmlop XML parser bound to self.

     @param html If true, the entity table from htmlentitydefs is
         merged into self.entitydefs.
     Raises RuntimeError when the sgmlop module cannot be imported.
     """
     # sgmlop is imported lazily so that its absence is reported as a
     # RuntimeError at construction time.
     try:
         import sgmlop
     except ImportError:
         raise RuntimeError("sgmlop parser not available")
     self.__builder = ElementTree.TreeBuilder()
     if html:
         from htmlentitydefs import entitydefs as html_entities
         self.entitydefs.update(html_entities)
     # Store the parser on the instance first, then register this object
     # with it.
     sgml_parser = self.__parser = sgmlop.XMLParser()
     sgml_parser.register(self)
예제 #3
0
def parse(file, builder=None):
    """Loads an XHTML or HTML file into an Element structure, using
    Leonard Richardson's tolerant BeautifulSoup parser.

    @param file Source file (a file object). Even on Python 2, this
        should be a filehandle that returns Unicode (see the codecs
        module), such as the one returned by codecs or a StringIO
        constructed from a Unicode string. The type of the text read
        is not verified by this function.
    @param builder Optional tree builder. If omitted (None), a new
        <b>ET.TreeBuilder</b> is created.
    @return The element produced by the builder's close(): its only
        child when that child is tagged "html", otherwise the root
        element itself, retagged as "html"."""

    # Identity test: only a missing builder selects the default, and a
    # caller-supplied builder never has its __eq__ invoked.
    tree_builder = builder
    if tree_builder is None:
        tree_builder = ET.TreeBuilder()

    def emit(soup):
        """Feed one soup node, and recursively its children, to the builder."""
        if isinstance(soup, BS.NavigableString):
            # Text node: nodes of the ignorable_soup types are dropped,
            # everything else becomes character data.
            if isinstance(soup, ignorable_soup):
                return
            tree_builder.data(unescape(soup))
        else:
            # Tag node: soup.attrs is iterated as (name, value) pairs.
            attrib = dict((k, unescape(v)) for k, v in soup.attrs)
            tree_builder.start(soup.name, attrib)
            for child in soup:
                emit(child)
            tree_builder.end(soup.name)

    # NOTE: the text is expected to be Unicode, but this is not checked.
    text = file.read()
    soup = BS.BeautifulSoup(text)

    # build the tree
    emit(soup)
    root = tree_builder.close()

    # wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]

    root.tag = "html"
    return root
예제 #4
0
 def __init__(self, html=0):
     """Create the tree builder, then run the xmllib.XMLParser initialiser.

     @param html If true, the entity table from htmlentitydefs is
         merged into self.entitydefs before the parser is initialised.
     """
     self.__builder = ElementTree.TreeBuilder()
     if html:
         from htmlentitydefs import entitydefs as html_entities
         self.entitydefs.update(html_entities)
     xmllib.XMLParser.__init__(self)
예제 #5
0
 def __init__(self):
     """Set up a new tree builder and an empty stack, then run the
     HTMLParser initialiser on this instance."""
     self.__builder = ElementTree.TreeBuilder()
     self.__stack = []
     HTMLParser.__init__(self)