Example 1
def test_lxml_integration(self):
    capsule = html_parser.parse(b'<p id=1>xxx')
    root = etree.adopt_external_document(capsule).getroot()
    # self.ae is the test case's assertEqual-style shorthand
    self.ae(list(root.iterchildren('body')), list(root.xpath('./body')))
    self.ae(root.find('body/p').text, 'xxx')
    self.ae(root.xpath('//@id'), ['1'])
    # Test that lxml is not copying the doc internally
    root.set('attr', 'abc')
    cap2 = html_parser.clone_doc(capsule)
    root2 = etree.adopt_external_document(cap2).getroot()
    self.ae(tostring(root), tostring(root2))
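For reference, a minimal standalone sketch of the capsule hand-off this test exercises. It assumes the C extension is importable as html5_parser.html_parser and that the installed lxml is recent enough to provide etree.adopt_external_document:

from lxml import etree
from html5_parser import html_parser  # private C extension (assumed import path)

capsule = html_parser.parse(b'<p id=1>hello')             # PyCapsule wrapping the parsed libxml2 document
root = etree.adopt_external_document(capsule).getroot()   # lxml adopts the document without copying it
print(etree.tostring(root))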
Example 2
def parse_html(self, path):
    '''Return head and content elements of the document.'''
    capsule = html_parser.parse(path.read(), maybe_xhtml=True)
    doc = etree.adopt_external_document(capsule).getroot()
    selectors = {
        'head': 'head',
        'main_content': '.main-column .section',  # descendant selector
    }
    return {k: doc.cssselect(sel)[0] for k, sel in selectors.items()}
Example 3
    def parse_html(self, fh: IO) -> Dict[str, Any]:
        '''Return head and content elements of the document.'''
        capsule = html_parser.parse(fh.read(), maybe_xhtml=True)
        doc = etree.adopt_external_document(capsule).getroot()

        result = {}
        result['head'] = doc.cssselect('head')[0]

        for candidate in ('.main-column .section', '.main__content'):
            elements = doc.cssselect(candidate)
            if elements:
                result['main_content'] = elements[0]
                break

        if 'main_content' not in result:
            raise ValueError('No main content element found')

        return result
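Both parse_html() variants above rely on doc.cssselect(), which on lxml elements is backed by the third-party cssselect package. As a rough sketch, the same lookup done explicitly through that package (doc stands for the parsed root from either example):

from cssselect import GenericTranslator

# Translate the descendant selector used above into XPath, then run it on the lxml tree
xpath = GenericTranslator().css_to_xpath('.main-column .section')
elements = doc.xpath(xpath)   # equivalent to doc.cssselect('.main-column .section')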
Example 4
def parse(html,
          transport_encoding=None,
          namespace_elements=False,
          treebuilder='lxml',
          fallback_encoding=None,
          keep_doctype=True,
          maybe_xhtml=False,
          return_root=True,
          line_number_attr=None,
          sanitize_names=True,
          stack_size=16 * 1024):
    '''
    Parse the specified :attr:`html` and return the parsed representation.

    :param html: The HTML to be parsed. Can be either bytes or a unicode string.

    :param transport_encoding: If specified, assume the passed in bytes are in this encoding.
        Ignored if :attr:`html` is unicode.

    :param namespace_elements:
        Add XML namespaces when parsing so that the resulting tree is XHTML.

    :param treebuilder:
        The type of tree to return. Note that only the lxml treebuilder is fast, as all
        other treebuilders are implemented in Python, not C. Supported values are:
          * `lxml <https://lxml.de>`_  -- the default, and fastest
          * `lxml_html <https://lxml.de>`_  -- tree of lxml.html.HtmlElement, same speed as lxml
          * etree (the python stdlib :mod:`xml.etree.ElementTree`)
          * dom (the python stdlib :mod:`xml.dom.minidom`)
          * `soup <https://www.crummy.com/software/BeautifulSoup>`_ -- BeautifulSoup,
            which must be installed or it will raise an :class:`ImportError`

    :param fallback_encoding: If no encoding could be detected, then use this encoding.
        Defaults to an encoding based on system locale.

    :param keep_doctype: Keep the <DOCTYPE> (if any).

    :param maybe_xhtml: Useful when it is unknown if the HTML to be parsed is
        actually XHTML. Changes the HTML 5 parsing algorithm to be more
        suitable for XHTML. In particular, it handles self-closed CDATA elements,
        so a ``<title/>`` or ``<style/>`` in the HTML will not completely break
        parsing. Also preserves namespaced tags and attributes even for namespaces
        not supported by HTML 5 (this works only with the ``lxml`` and ``lxml_html``
        treebuilders).
        Note that setting this also implicitly sets ``namespace_elements``.

    :param return_root: If True, return the root node of the document, otherwise
        return the tree object for the document.

    :param line_number_attr: The optional name of an attribute used to store the line number
        of every element. If set, this attribute will be added to each element with the
        element's line number.

    :param sanitize_names: Ensure tag and attribute names contain only ASCII alphanumeric
        characters, underscores, hyphens and periods. This ensures that the resulting
        tree is also valid XML. Any characters outside this set are replaced by
        underscores. Note that this is not strictly HTML 5 spec compliant, so turn it
        off if you need strict spec compliance.

    :param stack_size: The initial size (number of items) in the stack. The
        default is sufficient to avoid memory allocations for all but the
        largest documents.

    '''
    data = as_utf8(html or b'', transport_encoding, fallback_encoding)
    treebuilder = normalize_treebuilder(treebuilder)
    if treebuilder == 'soup':
        from .soup import parse
        return parse(data,
                     return_root=return_root,
                     keep_doctype=keep_doctype,
                     stack_size=stack_size)
    if treebuilder not in NAMESPACE_SUPPORTING_BUILDERS:
        # namespaced output is only supported by some builders; disable it for the rest
        namespace_elements = False

    capsule = html_parser.parse(data,
                                namespace_elements=namespace_elements
                                or maybe_xhtml,
                                keep_doctype=keep_doctype,
                                maybe_xhtml=maybe_xhtml,
                                line_number_attr=line_number_attr,
                                sanitize_names=sanitize_names,
                                stack_size=stack_size)

    interpreter = None
    if treebuilder == 'lxml_html':
        from lxml.html import HTMLParser
        interpreter = HTMLParser()
    # lxml adopts the libxml2 document produced by the C parser, without copying it
    ans = etree.adopt_external_document(capsule, parser=interpreter)
    if treebuilder in ('lxml', 'lxml_html'):
        return ans.getroot() if return_root else ans
    # other builders: convert the lxml tree via the matching adapter module
    m = importlib.import_module('html5_parser.' + treebuilder)
    return m.adapt(ans, return_root=return_root)
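A short usage sketch of the public parse() API documented above; the file name is hypothetical and the keyword values simply mirror the docstring:

from html5_parser import parse

# Default: fast lxml treebuilder, returns the root element of the document
root = parse('<p>hello</p>')

# Record each element's source line number and return the tree object instead of the root
with open('page.html', 'rb') as f:          # hypothetical input file
    tree = parse(f.read(), line_number_attr='data-lnum', return_root=False)

# Markup that may really be XHTML: tolerates <title/> etc. and implies namespace_elements
root = parse(b'<title/><p>text', maybe_xhtml=True)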