def is_valid_html(html):
    """Report whether ``html`` parses without error as an HTML fragment.

    The text is encoded to UTF-8 and passed to ``fragments_fromstring``
    together with an ``HTMLParser`` created with ``strict=True``.

    Returns:
        ``False`` if a ``ParseError`` is raised, ``True`` otherwise.
    """
    try:
        payload = html.encode('utf-8')
        strict_parser = HTMLParser(strict=True)
        # Only the outcome matters here; the parsed fragments are discarded.
        fragments_fromstring(payload, parser=strict_parser)
    except ParseError:
        return False
    else:
        return True
def _validate_htmloutput(self, htmloutput, field, value):
    """Check that ``value`` is a valid HTML template for ``field``.

    The value is encoded to UTF-8 and parsed with ``fragments_fromstring``
    using an ``HTMLParser`` created with ``strict=True``. When
    ``htmloutput`` is a dict with a truthy ``'template_vars_required'``
    entry, the template must also contain at least one ``$name``
    placeholder.

    Args:
        htmloutput: Rule configuration. Only consulted when it is a dict.
        field: Name of the field being validated; forwarded to
            ``self._error``.
        value: The HTML template text.

    Returns:
        The result of ``self._error(...)`` when a check fails, otherwise
        ``None``.
    """
    try:
        fragments_fromstring(
            value.encode('utf-8'), parser=HTMLParser(strict=True))
    except ParseError as e:
        return self._error(
            field,
            'The provided HTML template is not valid: {}'.format(e))

    if not isinstance(htmloutput, dict):
        return None
    if not htmloutput.get('template_vars_required'):
        return None

    # Raw string so that ``\$`` and ``\w`` reach the regex engine untouched
    # instead of being flagged as invalid string escape sequences. A single
    # match is enough, so ``re.search`` replaces collecting every match.
    if re.search(r'\$\w+', value) is None:
        return self._error(
            field,
            "The provided HTML template is not valid: no vars available.")
    return None
def make_one(self, **kwargs):
    """Create an ``lxml.html.html5parser.HTMLParser`` instance.

    The import is performed inside the function, at call time. All keyword
    arguments are forwarded unchanged to the ``HTMLParser`` constructor.

    Returns:
        The newly constructed parser.
    """
    from lxml.html.html5parser import HTMLParser as parser_cls

    parser = parser_cls(**kwargs)
    return parser
# Constants VALID_NON_UNICODE_IDENTIFIER_RE = re.compile(r'^[_a-zA-Z][_a-zA-Z0-9]*$') PY_KEYWORDS = { # from: https://docs.python.org/3.3/reference/lexical_analysis.html#keywords 'False', 'class', 'finally', 'is', 'return', 'None', 'continue', 'for', 'lambda', 'try', 'True', 'def', 'from', 'nonlocal', 'while', 'and', 'del', 'global', 'not', 'with', 'as', 'elif', 'if', 'or', 'yield', 'assert', 'else', 'import', 'pass', 'break', 'except', 'in', 'raise', } _html5Parser = HTMLParser( # tree=TreeBuilder -> done by lxml.html.html5parser.HTMLParser.__init__ strict=False, # default namespaceHTMLElements=False, # non-default debug=False # default ) _xmlParser = XMLParser( encoding='utf-8', remove_blank_text=True, huge_tree=True, recover=True, # ATTENTION: recover=True should *never* be needed at this point, but html5lib is broken in it's namespace-support (reading namespaced stuff correctly). Disable this and see HtmlToTagTest.test_full_cicle fail horribly. # Default from LXML: # attribute_defaults=False, # dtd_validation=False, # load_dtd=False, # no_network=True, # ns_clean=False,