def __init__(self, builder=None, encoding=None):
    """Initialise the parser state.

    @param builder Optional tree builder.  When None, a new
        <b>ElementTree.TreeBuilder</b> instance is created.
    @param encoding Optional encoding name.  Any false value (None or
        an empty string) selects "iso-8859-1".
    """
    # Use the caller's builder if one was given, else make a fresh one.
    self.__builder = (
        ElementTree.TreeBuilder() if builder is None else builder
    )
    self.__stack = []
    self.encoding = encoding if encoding else "iso-8859-1"
    # Base-class initialisation runs once our own attributes exist.
    HTMLParser.__init__(self)
def __init__(self, html=0):
    """Create the tree builder and an sgmlop parser bound to this object.

    @param html If true, the definitions from <b>htmlentitydefs</b> are
        merged into <b>self.entitydefs</b>.
    @exception RuntimeError If the sgmlop module cannot be imported.
    """
    try:
        import sgmlop
    except ImportError:
        raise RuntimeError("sgmlop parser not available")

    self.__builder = ElementTree.TreeBuilder()

    if html:
        from htmlentitydefs import entitydefs as html_entities
        self.entitydefs.update(html_entities)

    # Store the parser before registering, so the attribute is already
    # set when register() is handed this object.
    parser = sgmlop.XMLParser()
    self.__parser = parser
    parser.register(self)
def parse(file, builder=None):
    """Loads an XHTML or HTML file into an Element structure, using
    Leonard Richardson's tolerant BeautifulSoup parser.

    @param file Source file (a file object).  Even on Python 2, this is
        expected to be a filehandle that returns Unicode (see the codecs
        module), such as the one returned by codecs or a StringIO
        constructed from a Unicode string.  The type of the text is not
        checked here; whatever <b>file.read()</b> returns is handed to
        BeautifulSoup unchanged.
    @param builder Optional tree builder.  If omitted, a new
        <b>ET.TreeBuilder</b> instance is used.
    @return An Element instance representing the HTML root element.
    """
    # Identity test against None: "== None" would go through any custom
    # __eq__ defined on the builder object.
    bob = ET.TreeBuilder() if builder is None else builder

    def emit(soup):
        # Replay one soup node, and recursively its children, as
        # start/data/end events on the tree builder.
        if isinstance(soup, BS.NavigableString):
            if isinstance(soup, ignorable_soup):
                # String node types listed in ignorable_soup are skipped.
                return
            bob.data(unescape(soup))
        else:
            attrib = dict((k, unescape(v)) for k, v in soup.attrs)
            bob.start(soup.name, attrib)
            for child in soup:
                emit(child)
            bob.end(soup.name)

    text = file.read()
    # NOTE: the Unicode requirement described above is not enforced.
    soup = BS.BeautifulSoup(text)

    # build the tree
    emit(soup)
    root = bob.close()

    # wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root
def __init__(self, html=0):
    """Create the tree builder and initialise the xmllib parser base.

    @param html If true, the definitions from <b>htmlentitydefs</b> are
        added to this parser's entity table.
    """
    self.__builder = ElementTree.TreeBuilder()
    if html:
        import htmlentitydefs
        # Extend a private copy rather than the inherited mapping:
        # updating it in place would also change the table seen by
        # every other parser that shares it.
        # NOTE(review): xmllib.XMLParser appears to keep entitydefs on
        # the class -- confirm nothing relies on the shared update.
        entities = dict(self.entitydefs)
        entities.update(htmlentitydefs.entitydefs)
        self.entitydefs = entities
    # Base-class initialisation runs after the entity table is ready.
    xmllib.XMLParser.__init__(self)
def __init__(self):
    """Initialise the parser with a new tree builder and an empty stack."""
    self.__builder = ElementTree.TreeBuilder()
    self.__stack = []
    # Base-class initialisation runs once our own attributes exist.
    HTMLParser.__init__(self)