def html(self):
    """Parse ``self.content`` into a fragment tree with html5lib.

    The fragment is produced by ``self.html5lib.parseFragment`` using the
    ``"etree"`` tree builder and is returned unchanged.

    Raises:
        ImproperlyConfigured: an ``ImportError`` surfaced while parsing.
        ParserError: any other exception was raised while parsing.
    """
    try:
        fragment = self.html5lib.parseFragment(
            self.content, treebuilder="etree")
    except ImportError as exc:
        message = "Error while importing html5lib: %s" % exc
        raise ImproperlyConfigured(message)
    except Exception as exc:
        message = "Error while initializing Parser: %s" % exc
        raise ParserError(message)
    return fragment
class Html5LibParser(ParserBase):
    """Parser backend that delegates HTML parsing to html5lib."""

    def __init__(self, content):
        """Store *content* via the base class and bind the html5lib module.

        html5lib is imported at instantiation time rather than at module
        import time.
        """
        super(Html5LibParser, self).__init__(content)
        # NOTE(review): an ImportError raised by this import propagates
        # as-is; the ImproperlyConfigured translation in ``html`` only
        # covers ImportErrors raised while parsing.
        import html5lib
        self.html5lib = html5lib

    def _serialize(self, elem):
        """Return *elem* serialized to markup via ``html5lib.serialize``.

        The element is first appended to a fresh simpletree
        ``DocumentFragment``; attribute values are always quoted and
        optional tags are kept.
        """
        fragment = self.html5lib.treebuilders.simpletree.DocumentFragment()
        fragment.appendChild(elem)
        return self.html5lib.serialize(
            fragment, quote_attr_values=True, omit_optional_tags=False)

    def _find(self, *names):
        """Yield the direct children of the parsed fragment named in *names*."""
        for node in self.html.childNodes:
            # NOTE(review): 5 is presumably the simpletree node type for
            # elements -- confirm against html5lib.
            if node.type == 5 and node.name in names:
                yield node

    @cached_property
    def html(self):
        """Parsed fragment for ``self.content``.

        Raises:
            ImproperlyConfigured: an ``ImportError`` surfaced while parsing.
            ParserError: any other exception was raised while parsing.
        """
        try:
            return self.html5lib.parseFragment(self.content)
        # ``except X as err`` is valid on Python 2.6+ and Python 3; the
        # comma form is a SyntaxError on Python 3.
        except ImportError as err:
            raise ImproperlyConfigured(
                "Error while importing html5lib: %s" % err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)
class BeautifulSoupParser(ParserBase):
    """Parser backend that delegates HTML parsing to BeautifulSoup."""

    @cached_property
    def soup(self):
        """BeautifulSoup document built from ``self.content``.

        BeautifulSoup is imported on first access.

        Raises:
            ImproperlyConfigured: BeautifulSoup could not be imported.
            ParserError: any other exception was raised while parsing.
        """
        try:
            from BeautifulSoup import BeautifulSoup
            return BeautifulSoup(self.content)
        # ``except X as err`` is valid on Python 2.6+ and Python 3; the
        # comma form is a SyntaxError on Python 3.
        except ImportError as err:
            raise ImproperlyConfigured(
                "Error while importing BeautifulSoup: %s" % err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)
def __init__(self, content):
    """Initialise the parser and immediately parse *content*.

    The base ``HTMLParser`` is initialised, the bookkeeping attributes
    ``_css_elems``, ``_js_elems`` and ``_current_tag`` are reset, and
    *content* is fed through the parser in one go.

    Raises:
        ParserError: feeding or closing the parser raised any exception.
    """
    HTMLParser.__init__(self)
    self.content = content
    self._css_elems = []
    self._js_elems = []
    self._current_tag = None
    try:
        self.feed(self.content)
        self.close()
    # ``except X as err`` is valid on Python 2.6+ and Python 3; the comma
    # form is a SyntaxError on Python 3.
    except Exception as err:
        raise ParserError("Error while initializing HtmlParser: %s" % err)
def soup(self):
    """Build a BeautifulSoup document from ``self.content``.

    ``bs4`` is used when ``six.PY3`` is true, the legacy ``BeautifulSoup``
    package otherwise.

    Raises:
        ImproperlyConfigured: the selected package could not be imported.
        ParserError: any other exception was raised while parsing.
    """
    try:
        if not six.PY3:
            from BeautifulSoup import BeautifulSoup as soup_factory
        else:
            from bs4 import BeautifulSoup as soup_factory
        document = soup_factory(self.content)
    except ImportError as exc:
        raise ImproperlyConfigured(
            "Error while importing BeautifulSoup: %s" % exc)
    except Exception as exc:
        raise ParserError("Error while initializing Parser: %s" % exc)
    return document
class LxmlParser(ParserBase):
    """Parser backend that delegates HTML parsing to lxml."""

    def __init__(self, content):
        """Bind the lxml helpers and hand *content* to the base class.

        ``fromstring``, ``soupparser`` and ``tostring`` are imported at
        instantiation time and stored on the instance.

        Raises:
            ImproperlyConfigured: lxml could not be imported.
            ParserError: any other exception was raised while importing.
        """
        # Only the imports are guarded; the comma form of ``except`` is a
        # SyntaxError on Python 3, so the ``as`` form is used.
        try:
            from lxml.html import fromstring, soupparser
            from lxml.etree import tostring
        except ImportError as err:
            raise ImproperlyConfigured("Error while importing lxml: %s" % err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)
        self.fromstring = fromstring
        self.soupparser = soupparser
        self.tostring = tostring
        # Forward the content to the base class; previously the argument
        # was accepted but never used.
        super(LxmlParser, self).__init__(content)
def __init__(self, content):
    """Bind lxml's ``fromstring``/``tostring`` and initialise the base class.

    Raises:
        ImproperlyConfigured: lxml could not be imported.
        ParserError: any other exception was raised while importing.
    """
    try:
        from lxml.html import fromstring as parse_html
        from lxml.etree import tostring as serialize
    except ImportError as exc:
        message = "Error while importing lxml: %s" % exc
        raise ImproperlyConfigured(message)
    except Exception as exc:
        message = "Error while initializing parser: %s" % exc
        raise ParserError(message)

    # Keep the helpers on the instance before delegating to the base class.
    self.fromstring = parse_html
    self.tostring = serialize
    super(LxmlParser, self).__init__(content)
def __init__(self, content):
    """Initialise the parser and immediately parse *content*.

    The base ``HTMLParser`` is initialised with ``HTML_PARSER_ARGS``, the
    bookkeeping attributes are reset, and *content* is fed through the
    parser in one go.

    Raises:
        ParserError: feeding or closing the parser raised any exception.
            When the exception carries a usable ``lineno``, the offending
            source line is appended to the message.
    """
    six.moves.html_parser.HTMLParser.__init__(self, **HTML_PARSER_ARGS)
    self.content = content
    self._css_elems = []
    self._js_elems = []
    self._current_tag = None
    try:
        self.feed(self.content)
        self.close()
    except Exception as err:
        message = "Error while initializing HtmlParser: %s" % err
        # Not every exception has a line number; reading ``err.lineno``
        # unconditionally would raise AttributeError and hide the real
        # failure.
        lineno = getattr(err, 'lineno', None)
        if isinstance(lineno, int):
            lines = self.content.splitlines()
            # HTMLParser line numbers start at 1, list indices at 0.
            if 1 <= lineno <= len(lines):
                message = "%s (line: %s)" % (message, repr(lines[lineno - 1]))
        raise ParserError(message)
def __init__(self, content):
    """Bind the lxml helpers and initialise the base class with *content*.

    ``self.soupparser`` is ``None`` when ``six.PY3`` is true or when
    ``lxml.html.soupparser`` cannot be imported.

    Raises:
        ImproperlyConfigured: lxml itself could not be imported.
        ParserError: any non-import exception was raised while importing.
    """
    try:
        from lxml.html import fromstring as parse_html
        from lxml.etree import tostring as serialize
    except ImportError as exc:
        raise ImproperlyConfigured("Error while importing lxml: %s" % exc)
    except Exception as exc:
        raise ParserError("Error while initializing parser: %s" % exc)

    soup_fallback = None
    if not six.PY3:
        # soupparser uses Beautiful Soup 3 which does not run on python 3.x
        try:
            from lxml.html import soupparser as soup_fallback
        except ImportError:
            soup_fallback = None
        except Exception as exc:
            raise ParserError("Error while initializing parser: %s" % exc)

    self.soupparser = soup_fallback
    self.fromstring = parse_html
    self.tostring = serialize
    super(LxmlParser, self).__init__(content)
class LxmlParser(ParserBase):
    """Parser backend that delegates HTML parsing to lxml."""

    @cached_property
    def tree(self):
        """lxml tree for ``self.content`` wrapped in a ``<root>`` element.

        The content is parsed with ``lxml.html.fromstring``. If serializing
        the result to text raises ``UnicodeDecodeError``, the content is
        parsed again with ``lxml.html.soupparser``. ``lxml.etree.tostring``
        is stored on the instance as ``self.tostring``.

        Raises:
            ImproperlyConfigured: lxml could not be imported.
            ParserError: any other exception was raised while parsing.
        """
        content = '<root>%s</root>' % self.content
        try:
            from lxml.html import fromstring, soupparser
            from lxml.etree import tostring
            self.tostring = tostring
            # ``unicode`` only exists on Python 2; fall back to ``str``.
            try:
                text_type = unicode  # noqa: F821
            except NameError:
                text_type = str
            tree = fromstring(content)
            try:
                # Serialization is only a probe; the result is discarded.
                tostring(tree, encoding=text_type)
            except UnicodeDecodeError:
                tree = soupparser.fromstring(content)
            # Return the parsed tree; previously the property fell off the
            # end and evaluated to None.
            return tree
        # ``except X as err`` is valid on Python 2.6+ and Python 3; the
        # comma form is a SyntaxError on Python 3.
        except ImportError as err:
            raise ImproperlyConfigured("Error while importing lxml: %s" % err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)
def tree(self):
    """Import the lxml helpers, reporting a missing lxml as ``ParserError``.

    NOTE(review): only the import guard is visible here; the imported
    names are not used and nothing is returned -- confirm against the full
    definition.

    Raises:
        ParserError: lxml could not be imported.
    """
    try:
        from lxml import html
        from lxml.etree import tostring
    # ``except X as e`` is valid on Python 2.6+ and Python 3; the comma
    # form is a SyntaxError on Python 3.
    except ImportError as e:
        raise ParserError("Error while initializing Parser: %s" % e)
def soup(self):
    """Import BeautifulSoup, reporting a missing package as ``ParserError``.

    NOTE(review): only the import guard is visible here; the imported
    class is not used and nothing is returned -- confirm against the full
    definition.

    Raises:
        ParserError: BeautifulSoup could not be imported.
    """
    try:
        from BeautifulSoup import BeautifulSoup
    # ``except X as e`` is valid on Python 2.6+ and Python 3; the comma
    # form is a SyntaxError on Python 3.
    except ImportError as e:
        raise ParserError("Error while initializing Parser: %s" % e)