def test(xhtml_file: Path, dtd: DTD, schematron: Schematron) -> bool:
    """
    Check a single XHTML file against a DTD and a Schematron.

    The file is parsed with a DTD-validating XHTML parser, then validated
    against ``dtd``, then against ``schematron``, and finally handed to the
    link and image checks. The first stage that fails reports its errors
    (an unreadable file is reported on stderr) and ends the test.

    :param xhtml_file: the XHTML file to test
    :param dtd: the DTD the document must validate against
    :param schematron: the Schematron the document must validate against
    :return: True if the file passes every stage
    """
    if settings.verbose:
        print(xhtml_file)

    clear_error_log()
    xhtml_parser = XHTMLParser(dtd_validation=True, ns_clean=True)

    try:
        root = parse(source=str(xhtml_file), parser=xhtml_parser).getroot()
    except IOError as io_error:
        print(f"{xhtml_file}: {io_error.strerror}", file=stderr)
        return False
    except XMLSyntaxError:
        print_error_log(xhtml_parser.error_log)
        return False

    if not dtd.validate(root):
        print_error_log(dtd.error_log)
    elif not schematron.validate(root):
        print_schematron_error_log(root, schematron)
    else:
        # Images are only checked when the link check succeeded.
        links_ok = test_links(xhtml_file, root)
        return links_ok and test_images(xhtml_file, root)
    return False
def run(xhtml_files: List[Path], dtd_file: Path, images: bool, links: bool) -> bool:
    """
    Test every XHTML file in ``xhtml_files`` against the DTD in ``dtd_file``.

    All files are tested even after one has failed, so every problem is
    reported in a single run.

    :param xhtml_files: the XHTML files to test
    :param dtd_file: path of the DTD file
    :param images: forwarded unchanged to ``test``
    :param links: forwarded unchanged to ``test``
    :return: True if the DTD could be loaded and every file passed
    """
    try:
        # Load the DTD once up front so that a broken DTD is reported a
        # single time and no file is tested against it.
        DTD(str(dtd_file))
    except DTDParseError as e:
        print(e.error_log, file=stderr)
        clear_error_log()
        return False

    success = True
    for xhtml_file in xhtml_files:
        # if you reuse the parser on too many documents it gets confused
        parser = XHTMLParser(dtd_validation=True, ns_clean=True)
        dtd = DTD(str(dtd_file))
        if settings.verbose:
            # The loop variable used to be called ``file`` while this line
            # printed ``xhtml_file``, which raised NameError in verbose mode.
            print(xhtml_file)
        if not test(xhtml_file, parser, dtd, images, links):
            success = False
    return success
def clean_html(code, base_url=None):
    """
    Sanitize an XHTML fragment and return it serialized as a string.

    The fragment is parsed under a temporary parent element, then:

    * processing instructions, ``script`` and ``audio`` elements are dropped;
    * every attribute whose name starts with ``on`` is removed;
    * every ``video`` element gets ``controls="controls"``;
    * if ``base_url`` is given, the ``src`` of ``img``, ``video`` and
      ``source`` elements is joined onto it;
    * ``a`` elements whose ``href`` has no network location have their tag
      dropped, all other ``a`` elements get ``target="_blank"``.

    :param code: the XHTML fragment to clean
    :param base_url: optional base URL used to resolve media ``src`` values
    :return: the cleaned fragment, without the temporary parent element
    """
    parser = XHTMLParser()
    doc = fragment_fromstring(code, create_parent=True, parser=parser)

    # Remove processing instructions
    for pi_element in doc.xpath('//processing-instruction()'):
        pi_element.drop_tree()

    # Remove scripts
    for script_element in doc.xpath('//script'):
        script_element.drop_tree()

    # Remove events
    # NOTE(review): starts-with() is case-sensitive, so an attribute spelled
    # e.g. ``onClick`` is not matched here -- confirm whether that matters
    # for the way the result is rendered.
    for event_attrib in doc.xpath("//@*[starts-with(name(), 'on')]"):
        event_attrib.getparent().attrib.pop(event_attrib.attrname)

    # Remove audio elements
    for audio_element in doc.xpath('//audio'):
        audio_element.drop_tree()

    # Force controls on videos
    for element in doc.xpath('//video'):
        element.attrib['controls'] = 'controls'

    if base_url is not None:
        # Fix relative media urls. Every branch needs the ``//`` prefix: a
        # bare ``source[@src]`` only selects direct children of the wrapper
        # element and misses <source> elements nested inside <video>.
        for element in doc.xpath('//img[@src]|//video[@src]|//source[@src]'):
            element.attrib['src'] = urljoin(base_url, element.attrib['src'])

    # Fix links
    for link_element in doc.xpath('//a[@href]'):
        # Remove server relative links
        if not urlparse(link_element.attrib['href']).netloc:
            link_element.drop_tag()
            continue
        # Add target="_blank" to general links
        link_element.attrib['target'] = '_blank'

    # Serialize the wrapper's leading text and children, not the wrapper.
    return (doc.text or '') + ''.join(
        etree.tostring(child, method='xml').decode('utf-8')
        for child in doc.iterchildren()
    )
def _header_version(header):
    """Return the version named by a changelog header, or None if it has none."""
    text = header.text
    if not text:
        # A header whose content is a child element (or is empty) has no
        # direct text to parse; treat it as a non-version header.
        return None
    title = text[1:] if text.startswith('v') else text
    try:
        return Version(VERSION_HEADER_RE.split(title, 1)[0])
    except Exception:
        # Not a version header. ``Exception`` rather than a bare ``except``
        # so KeyboardInterrupt and SystemExit still propagate.
        return None


def filter_changelog(code, from_version):
    """
    Cut a changelog fragment down to the entries newer than ``from_version``.

    The first top-level ``h1``/``h2``/``h3`` whose text parses as a version
    marks the start of the changelog; everything before it is removed. Among
    the headers of that same tag under the same parent, the first one whose
    version is ``<= from_version`` is removed together with everything that
    follows it. If no version header is found the fragment is returned as
    parsed.

    :param code: the XHTML changelog fragment
    :param from_version: version to compare header versions against
    :return: the filtered fragment, without the temporary parent element
    """
    parser = XHTMLParser()
    doc = fragment_fromstring(code, create_parent=True, parser=parser)

    parentelement = None
    headerelement = None
    for header in doc.xpath('/div/h1|/div/h2|/div/h3'):
        if _header_version(header) is None:
            continue
        parentelement = header.getparent()
        headerelement = header.tag
        # Materialize the siblings first: the tree is modified in the loop.
        for elem in list(header.itersiblings(preceding=True)):
            # Clear the tail so drop_tree() does not re-attach it elsewhere.
            elem.tail = ""
            elem.drop_tree()
        break

    if parentelement is not None:
        for header in parentelement.xpath(headerelement):
            version = _header_version(header)
            if version is None:
                continue
            if version <= from_version:
                for elem in list(header.itersiblings()):
                    elem.drop_tree()
                header.tail = ""
                header.drop_tree()
                break

    # Serialize the wrapper's leading text and children, not the wrapper.
    return (doc.text or '') + ''.join(
        etree.tostring(child, method='xml').decode('utf-8')
        for child in doc.iterchildren()
    )
def parse_html(filename: str) -> Element:
    """
    Read a UTF-8 HTML file and parse it with a recovering XHTML parser.

    Every literal ``<br>`` in the file is rewritten to the self-closing
    ``<br/>`` before parsing.

    :param filename: path of the file to read
    :return: the element returned by ``fromstring`` for the file's content
    """
    with open(filename, "r", encoding="utf-8") as source:
        raw_markup = source.read()
    markup = raw_markup.replace("<br>", "<br/>")
    return fromstring(
        markup,
        parser=XHTMLParser(recover=True, huge_tree=True),
    )
from lxml.html import XHTMLParser, XHTML_NAMESPACE

# Module-level XHTML parser created with resolve_entities=False.
xhtml_parser = XHTMLParser(resolve_entities=False)


def get_content_by_xpath(element, query, prefix="x", *args, **kwargs):
    """
    Get document elements by xpath.

    The xpath query should use the appropriate prefix, e.g.

    >>> asset = Statutecomponent.objects.all()[0]
    >>> asset.xpath("/x:html/x:body")
    [<Element {http://www.w3.org/1999/xhtml}body at 0x2b3df4be2878>]

    The ``prefix`` to XHTML namespace mapping is only supplied when the
    caller did not pass a ``namespaces`` keyword argument of their own.

    :param element: a document Element
    :param query: the XPath query statement
    :param prefix: the namespace prefix
    :param args: additional arguments to pass to the element's xpath method
    :param kwargs: additional keyword-arguments to pass to the element's
        xpath method
    :returns: a list of matching elements
    :rtype: list
    """
    # setdefault keeps a caller-provided "namespaces" mapping untouched.
    kwargs.setdefault("namespaces", dict([(prefix, XHTML_NAMESPACE)]))
    return element.xpath(query, *args, **kwargs)


def replace_amp(stream):
    # namedentities does not convert the ampersand intentionally, so we
    # do it manually (SSR-1185)