Esempio n. 1
0
def test(xhtml_file: Path, dtd: DTD, schematron: Schematron) -> bool:
    """
    Validate a single XHTML file.

    The file is parsed with a DTD-validating parser, then checked against
    the given DTD and Schematron, and finally handed to ``test_links`` and
    ``test_images``. Each check only runs if the previous one succeeded.
    Failures are reported through the error-log printing helpers (I/O
    errors are printed to stderr directly).

    :param xhtml_file: the XHTML file to test
    :param dtd: the DTD to validate against
    :param schematron: the Schematron to validate against
    :return: True if every check passes, False otherwise
    """
    if settings.verbose:
        print(xhtml_file)

    clear_error_log()

    validating_parser = XHTMLParser(dtd_validation=True, ns_clean=True)
    try:
        root = parse(source=str(xhtml_file), parser=validating_parser).getroot()
    except IOError as io_error:
        # The file could not be read at all.
        print(f"{xhtml_file}: {io_error.strerror}", file=stderr)
        return False
    except XMLSyntaxError:
        # The parser collected the details in its own error log.
        print_error_log(validating_parser.error_log)
        return False

    if not dtd.validate(root):
        print_error_log(dtd.error_log)
    elif not schematron.validate(root):
        print_schematron_error_log(root, schematron)
    else:
        # Structural validation passed; images are only checked when the
        # link check succeeds.
        return test_links(xhtml_file, root) and test_images(xhtml_file, root)
    return False
Esempio n. 2
0
def run(xhtml_files: List[Path], dtd_file: Path, images: bool,
        links: bool) -> bool:
    """
    Run ``test`` on every XHTML file and report overall success.

    All files are tested even after a failure, so that every problem gets
    reported in one run.

    :param xhtml_files: the XHTML files to test
    :param dtd_file: path of the DTD to validate against
    :param images: forwarded unchanged to ``test``
    :param links: forwarded unchanged to ``test``
    :return: True if the DTD could be parsed and every file passed
    """
    try:
        # Parse the DTD once up front so that a broken DTD is reported a
        # single time and no file is processed.
        DTD(str(dtd_file))
    except DTDParseError as e:
        print(e.error_log, file=stderr)
        clear_error_log()
        return False

    success = True
    for xhtml_file in xhtml_files:
        # if you reuse the parser on too many documents it gets confused
        parser = XHTMLParser(dtd_validation=True, ns_clean=True)
        dtd = DTD(str(dtd_file))
        if settings.verbose:
            # Previously printed an undefined name, raising NameError
            # whenever verbose mode was enabled.
            print(xhtml_file)
        if not test(xhtml_file, parser, dtd, images, links):
            success = False
    return success
Esempio n. 3
0
def clean_html(code, base_url=None):
    """
    Strip active content from an HTML fragment and normalise its links.

    Processing instructions, ``<script>`` and ``<audio>`` elements and all
    ``on*`` attributes are removed, ``<video>`` elements get the
    ``controls`` attribute, and links without a network location are
    unwrapped (their content is kept, the ``<a>`` tag is dropped). Remaining
    links get ``target="_blank"``.

    :param code: the HTML fragment as a string
    :param base_url: if given, ``src`` attributes of ``<img>``, ``<video>``
        and ``<source>`` elements are resolved against this URL
    :return: the cleaned fragment serialised as an XML string, without the
        wrapper element created for parsing
    """
    parser = XHTMLParser()
    # create_parent=True wraps the fragment so that leading text and several
    # top-level elements can be handled uniformly.
    doc = fragment_fromstring(code, create_parent=True, parser=parser)

    # Remove processing instructions
    for pi_element in doc.xpath('//processing-instruction()'):
        pi_element.drop_tree()

    # Remove scripts
    for script_element in doc.xpath('//script'):
        script_element.drop_tree()

    # Remove events
    for event_attrib in doc.xpath("//@*[starts-with(name(), 'on')]"):
        event_attrib.getparent().attrib.pop(event_attrib.attrname)

    # Remove audio elements
    for audio_element in doc.xpath('//audio'):
        audio_element.drop_tree()

    # Force controls on videos
    for element in doc.xpath('//video'):
        element.attrib['controls'] = 'controls'

    if base_url is not None:
        # Fix relative media urls. '//source' (not 'source') is required so
        # that <source> elements nested inside <video> are matched, not only
        # direct children of the wrapper element.
        for element in doc.xpath('//img[@src]|//video[@src]|//source[@src]'):
            element.attrib['src'] = urljoin(base_url, element.attrib['src'])

    # Fix links
    for link_element in doc.xpath('//a[@href]'):
        # Remove server relative links
        if not urlparse(link_element.attrib['href']).netloc:
            link_element.drop_tag()
            continue

        # Add target="_blank" to general links
        link_element.attrib['target'] = '_blank'

    # Serialise the wrapper's content only: its leading text plus each child.
    return (doc.text or '') + ''.join(
        etree.tostring(child, method='xml').decode('utf-8')
        for child in doc.iterchildren()
    )
Esempio n. 4
0
def _header_version(header):
    """Return the version named by a changelog header, or None if it has none."""
    text = header.text
    if text is None:
        # Header without leading text (e.g. it starts with a child element).
        return None
    title = text[1:] if text.startswith('v') else text
    try:
        return Version(VERSION_HEADER_RE.split(title, 1)[0])
    except Exception:
        # Not a version header. Exception (rather than a bare except) keeps
        # KeyboardInterrupt and SystemExit propagating.
        return None


def filter_changelog(code, from_version):
    """
    Trim an HTML changelog to the entries newer than ``from_version``.

    Everything before the first heading (h1/h2/h3 directly inside the
    fragment) that parses as a version is removed. Then, among headings of
    that same tag, the first one whose version is ``<= from_version`` is
    removed together with everything after it.

    :param code: the changelog as an HTML fragment string
    :param from_version: the version to compare heading versions against
    :return: the remaining fragment serialised as an XML string, without
        the wrapper element created for parsing
    """
    parser = XHTMLParser()
    doc = fragment_fromstring(code, create_parent=True, parser=parser)

    parentelement = None
    headerelement = None
    for header in doc.xpath('/div/h1|/div/h2|/div/h3'):
        if _header_version(header) is None:
            continue

        # First real version heading: remember its level and drop whatever
        # precedes it.
        parentelement = header.getparent()
        headerelement = header.tag
        for elem in header.itersiblings(preceding=True):
            # Clear the tail first so drop_tree() does not re-attach it.
            elem.tail = ""
            elem.drop_tree()
        break

    if parentelement is not None:

        for header in parentelement.xpath(headerelement):
            version = _header_version(header)
            if version is None:
                continue

            if version <= from_version:
                # This entry and everything after it is not newer than
                # from_version: cut from here to the end.
                for elem in header.itersiblings():
                    elem.drop_tree()
                header.tail = ""
                header.drop_tree()
                break

    return (doc.text or '') + ''.join(
        etree.tostring(child, method='xml').decode('utf-8')
        for child in doc.iterchildren()
    )
Esempio n. 5
0
def parse_html(filename: str) -> Element:
    """
    Parse a UTF-8 encoded HTML file into an element tree.

    Every literal ``<br>`` is rewritten to the self-closing ``<br/>`` before
    parsing, and the parser runs with ``recover=True`` and
    ``huge_tree=True``.

    :param filename: path of the file to read
    :return: the parsed root element
    """
    with open(filename, "r", encoding="utf-8") as source:
        markup = source.read()
    markup = markup.replace("<br>", "<br/>")
    lenient_parser = XHTMLParser(recover=True, huge_tree=True)
    return fromstring(markup, parser=lenient_parser)
Esempio n. 6
0
from lxml.html import XHTMLParser, XHTML_NAMESPACE

# Module-level XHTML parser. resolve_entities=False tells lxml to keep entity
# references in the tree instead of replacing them with their text value.
xhtml_parser = XHTMLParser(resolve_entities=False)


def get_content_by_xpath(element, query, prefix="x", *args, **kwargs):
    """
    Get document elements by xpath.

    Unless the caller supplies its own ``namespaces`` keyword argument,
    ``prefix`` is bound to the XHTML namespace, so the query should use that
    prefix, e.g.

        >>> get_content_by_xpath(root, "/x:html/x:body")
        [<Element {http://www.w3.org/1999/xhtml}body at 0x2b3df4be2878>]

    :param element: a document Element
    :param query: the XPath query statement
    :param prefix: the namespace prefix bound to the XHTML namespace
    :param args: additional arguments to pass to the element's xpath method
    :param kwargs: additional keyword-arguments to pass to the element's
        xpath method; an explicit ``namespaces`` entry takes precedence
        over the default prefix mapping
    :returns: whatever the element's xpath method returns for the query
        (a list of matching elements for node-selecting queries)
    :rtype: list
    """
    kwargs.setdefault("namespaces", {prefix: XHTML_NAMESPACE})
    return element.xpath(query, *args, **kwargs)


def replace_amp(stream):
    # namedentities does not convert the ampersand intentionally, so we
    # do it manually (SSR-1185)