Example #1
0
 def runTest(self):
     """Check that one CSS selector matches the expected number of elements.

     Reads the HTML file named by ``doc_fn``, translates the selector stored
     at ``self.selectors[self.index]`` to XPath (using the ``re`` regex
     prefix option) and evaluates it against the document's ``<body>``.

     Raises:
         AssertionError: if the XPath evaluates to a string instead of
             elements, if an element is returned more than once, or if the
             number of results differs from the expected count.
         Exception: any error raised while evaluating the XPath is re-raised
             with the offending XPath appended to its message.
     """
     # try/finally guarantees the handle is released even if read() fails.
     f = open(doc_fn, 'rb')
     try:
         c = f.read()
     finally:
         f.close()
     doc = html.document_fromstring(c)
     body = doc.xpath('//body')[0]
     selector, count = self.selectors[self.index]
     options = dict(regex_prefix='re')
     xpath = cssselect.css_to_xpath(cssselect.parse(selector, options),
                                    **options)
     try:
         results = body.xpath(xpath, namespaces=namespaces)
     except Exception:
         # Decorate the in-flight exception with the XPath, then re-raise it
         # unchanged so the original type and traceback are preserved.
         e = sys.exc_info()[1]
         e.args = ("%s for xpath %r" % (e, xpath), )
         raise
     # Reject a string result before iterating: iterating a string walks its
     # characters and could misreport it as a duplicated element.
     if isinstance(results, basestring):
         raise AssertionError(
             "Got string result (%r), not element, for xpath %r" %
             (results[:20], str(xpath)))
     found = set()
     for item in results:
         if item in found:
             raise AssertionError(
                 "Element shows up multiple times: %r" % item)
         found.add(item)
     if len(results) != count:
         raise AssertionError(
             "Did not get expected results (%s) instead %s for xpath %r" %
             (count, len(results), str(xpath)))
 def runTest(self):
     """Check that one CSS selector matches the expected number of elements.

     Reads the HTML file named by ``doc_fn``, translates the selector stored
     at ``self.selectors[self.index]`` to XPath and evaluates it against the
     document's ``<body>``.

     Raises:
         AssertionError: if the XPath evaluates to a string instead of
             elements, if an element is returned more than once, or if the
             number of results differs from the expected count.
         Exception: any error raised while evaluating the XPath is re-raised
             with the offending XPath appended to its message.
     """
     # try/finally guarantees the handle is released even if read() fails.
     f = open(doc_fn, "rb")
     try:
         c = f.read()
     finally:
         f.close()
     doc = html.document_fromstring(c)
     body = doc.xpath("//body")[0]
     selector, count = self.selectors[self.index]
     xpath = cssselect.css_to_xpath(cssselect.parse(selector))
     try:
         results = body.xpath(xpath)
     except Exception:
         e = sys.exc_info()[1]
         # args must be a tuple: assigning a bare string would be converted
         # into a tuple of single characters and garble the message.
         e.args = ("%s for xpath %r" % (e, xpath),)
         raise
     # Reject a string result before iterating: iterating a string walks its
     # characters and could misreport it as a duplicated element.
     if isinstance(results, basestring):
         raise AssertionError(
             "Got string result (%r), not element, for xpath %r"
             % (results[:20], str(xpath))
         )
     found = set()
     for item in results:
         if item in found:
             raise AssertionError("Element shows up multiple times: %r" % item)
         found.add(item)
     if len(results) != count:
         raise AssertionError(
             "Did not get expected results (%s) instead %s for xpath %r"
             % (count, len(results), str(xpath))
         )
Example #3
0
def match_selectors_against_html_root_element(selectors, html_element):
    '''
    Find the selectors that match with the DOM from the given HTML.

    A selector string is reported as found when at least one of the
    selectors it parses to matches an element. Pseudo elements are ignored
    while matching. A selector string that raises during parsing,
    translation or evaluation is logged and does not abort the scan.

    @param selectors set of CSS selectors (strings)
    @param html_element lxml.etree.Element object

    @return set of found selectors
    '''
    found_selectors = set()
    css_to_xpath_translator = CssDeadwoodHtmlTranslator()
    for selector_str in selectors:
        try:
            # Instead of just calling css_to_xpath(selector_str), parse the
            # string into cssselect.Selector instances first, so the pseudo
            # element can be cleared before translating to XPath.
            # cssselect.parse() always returns a list, hence the loop.
            for selector in cssselect.parse(selector_str):
                selector.pseudo_element = None
                xpath_expr = css_to_xpath_translator.selector_to_xpath(
                    selector)
                if len(html_element.xpath(xpath_expr)) > 0:
                    found_selectors.add(selector_str)
        except Exception:
            # Best effort: record the failure and move on to the next
            # selector. The value is passed as a logging argument so that
            # formatting cannot itself raise inside this handler.
            _log.exception('lxml css select failed on selector %r',
                           selector_str)
    return found_selectors
Example #4
0
def selector_to_xpath(selector):
    """Translate a cssutils ``selector`` into ``(pseudo_type, callable)``.

    ``pseudo_type`` is the name (a string) of a pseudo-element listed in
    ``PSEUDO_ELEMENTS`` found at the end of the selector, or ``None`` when
    there is none. The second item is the :class:`lxml.cssselect` XPath
    callable built from the selector with that pseudo-element removed.

    The pair is stored on the ``selector`` object, so later calls with the
    same object return the stored result without parsing again.

    """
    try:
        return selector._x_weasyprint_parsed_cssselect
    except AttributeError:
        tree = cssselect.parse(selector.selectorText)
        # cssutils guarantees `selector` is a single selector, not a group
        # (`rule.selectorList` is the group), so `tree` is never a
        # `cssselect.Or`. Three situations remain:
        # - A pseudo-element ends the selector. Parsing is left-to-right, so
        #   it shows up as the outermost `cssselect.Pseudo` (or as the
        #   subselector of a combined selector) and can be unwrapped. CSS
        #   only allows pseudo-elements in that position.
        # - A pseudo-element appears elsewhere. That is invalid and
        #   `cssselect.CSSSelector()` raises `cssselect.ExpressionError`,
        #   so the whole ruleset should be ignored.
        # - There is no pseudo-element and `cssselect.CSSSelector` handles
        #   the selector as is.
        is_combined = isinstance(tree, cssselect.CombinedSelector)
        if is_combined:
            tail = tree.subselector
        else:
            tail = tree

        # Stays None when there is no pseudo-element or the selector is
        # invalid.
        pseudo_type = None
        if (isinstance(tail, cssselect.Pseudo)
                and tail.ident in PSEUDO_ELEMENTS):
            pseudo_type = str(tail.ident)
            # Drop the pseudo-element, keeping only the element it wrapped.
            stripped = tail.element
            if is_combined:
                tree.subselector = stripped
            else:
                tree = stripped

        result = (pseudo_type, cssselect.CSSSelector(tree))

        # Remember the result for the next use of the same stylesheet.
        selector._x_weasyprint_parsed_cssselect = result
        return result