Ejemplo n.º 1
0
def calculate_neo4j_query(css_selector):
    """Build and return a ``Query`` for the first selector in ``css_selector``.

    Only the first selector of the parsed list is used.  Its parsed tree is
    handed to ``_calculate_neo4j_query`` together with the freshly created
    query object, which is then returned.
    """
    name_source = var_name_generator()
    result = Query(name_source, where_clause=[], match_clause=[])
    first_selector = cssselect.parse(css_selector)[0]
    _calculate_neo4j_query(
        first_selector.parsed_tree,
        result,
        last_var=result.last_created_var,
    )
    return result
Ejemplo n.º 2
0
 def series(css):
     """Parse ``css`` as the argument of an ``:nth-child()`` pseudo-class.

     Returns the result of ``parse_series`` for the argument tokens, or
     ``None`` when ``parse_series`` rejects them with ``ValueError``.
     """
     (only_selector,) = parse(':nth-child(%s)' % css)
     tokens = only_selector.parsed_tree.arguments
     try:
         result = parse_series(tokens)
     except ValueError:
         result = None
     return result
Ejemplo n.º 3
0
 def series(css):
     """Parse ``css`` as the argument of an ``:nth-child()`` pseudo-class.

     Returns the result of ``parse_series`` for the argument tokens, or
     ``None`` when ``parse_series`` rejects them with ``ValueError``.
     """
     (only_selector,) = parse(':nth-child(%s)' % css)
     tokens = only_selector.parsed_tree.arguments
     try:
         result = parse_series(tokens)
     except ValueError:
         result = None
     return result
Ejemplo n.º 4
0
def check_selector_list(sel, doc):
    """Return True as soon as any selector in ``sel`` matches in ``doc``.

    Each selector string is rewritten so that a lone ``:`` becomes ``::``,
    parsed, translated to XPath and evaluated with ``doc.xpath``.  Selectors
    with a syntax error are skipped silently; any other failure is reported
    on stderr and the next selector is tried.
    """
    translator = cs.HTMLTranslator()

    # convert e.g. a:hover to a::hover (css3); longer colon runs are kept
    def double_single_colon(match):
        colons = match.group(0)
        return '::' if colons == ':' else colons

    for raw in sel:
        s = re.sub(':+', double_single_colon, raw)
        try:
            if any(doc.xpath(translator.selector_to_xpath(parsed))
                   for parsed in cs.parse(s)):
                return True
        except cs.parser.SelectorSyntaxError:
            # probably unsupported @media selector
            # may still be matched by subrules' selectors
            # so just skip this selector
            continue
        except Exception as e:
            print(e, "; sel='{}'".format(s), file=sys.stderr)

    return False
Ejemplo n.º 5
0
    def process_rule(self, rule, is_ancestor, maximum_specificities):
        """Replace ``rule['properties']`` with ``Property`` objects.

        The specificity list starts with ``[0, a, b, c]`` taken from the
        first parsed selector (``[0, 0, 0, 0]`` when that fails with
        AttributeError/TypeError) or ``[1, 0, 0, 0]`` when the rule has no
        selector (style attribute), followed by the sheet index and the rule
        address.  Each property is prefixed with the ancestor flag and the
        ``important`` flag, and ``maximum_specificities`` is updated in place
        with the largest specificity seen per property name.  A ``file://``
        href is finally rewritten through
        ``current_container().abspath_to_name``.
        """
        selector = rule['selector']
        sheet_index = rule['sheet_index']
        rule_address = rule['rule_address'] or ()

        if selector is None:
            # style attribute
            specificity = [1, 0, 0, 0]
        else:
            try:
                specificity = [0] + list(parse(selector)[0].specificity())
            except (AttributeError, TypeError):
                specificity = [0, 0, 0, 0]
        specificity.extend((sheet_index, tuple(rule_address)))

        ancestor_specificity = 0 if is_ancestor else 1
        converted = []
        for raw_prop in rule['properties']:
            important = 1 if raw_prop[-1] == 'important' else 0
            item = Property(
                raw_prop, [ancestor_specificity, important] + specificity)
            converted.append(item)
            best_so_far = maximum_specificities.get(
                item.name, (0,0,0,0,0,0))
            if item.specificity > best_so_far:
                maximum_specificities[item.name] = item.specificity
        rule['properties'] = converted

        href = rule['href']
        if not (hasattr(href, 'startswith') and href.startswith('file://')):
            return
        href = href[len('file://'):]
        if iswindows and href.startswith('/'):
            href = href[1:]
        if href:
            rule['href'] = current_container().abspath_to_name(
                href, root=self.preview.current_root)
Ejemplo n.º 6
0
 def isapplicable(cls, selector, node, enable_debug=False):
     """Return True when any selector parsed from ``selector`` applies to ``node``.

     ``enable_debug`` is stored on the class as ``_enable_debug`` before the
     selectors are parsed; each one is then tested with ``cls.walk`` and the
     search stops at the first match.
     """
     cls._enable_debug = enable_debug
     return any(cls.walk(candidate, node) for candidate in parse(selector))
Ejemplo n.º 7
0
    def tag(self, selector, **attrs):
        """Create a tag from a single compound CSS selector.

        The parsed tree is walked from the outermost node inwards: an id
        selector sets ``id``, class selectors are space-joined into
        ``class_``, attribute selectors become keyword arguments named after
        the attribute, and the element node provides the tag name and ends
        the walk.  ``attrs`` override the values taken from the selector.

        Raises ``ValueError`` for a selector group (more than one selector)
        and for any other kind of selector node.
        """
        selectors = cssselect.parse(selector)
        if len(selectors) > 1:
            raise ValueError('Cannot specify more than 1 tag.')

        tag_name = None
        kwargs = {}
        node = selectors[0].parsed_tree
        while node:
            node_type = node.__class__
            if node_type is cssselect.parser.Element:
                tag_name = node.element
                break
            if node_type is cssselect.parser.Hash:
                kwargs['id'] = node.id
            elif node_type is cssselect.parser.Class:
                previous = kwargs.get('class_', '')
                kwargs['class_'] = ' '.join(
                    [previous, node.class_name]).strip()
            elif node_type is cssselect.parser.Attrib:
                kwargs[node.attrib] = node.value
            else:
                raise ValueError('Unsupported selector: %s.' % selector)
            # Hash, Class and Attrib nodes all wrap an inner selector.
            node = node.selector
        kwargs.update(attrs)

        return self.settings.tag_class(tag_name, self, **kwargs)
Ejemplo n.º 8
0
 def parse_pseudo(css):
     """Return ``(tree repr, pseudo_element)`` pairs for each selector in ``css``.

     The ``u`` prefix of string reprs is stripped from the tree repr.
     """
     return [
         (repr(parsed._tree).replace("(u'", "('"), parsed.pseudo_element)
         for parsed in parse(css)
     ]
Ejemplo n.º 9
0
def check_selector_list(sel, doc):
    """Return True as soon as any selector in ``sel`` matches in ``doc``.

    Each selector string is rewritten so that a lone ``:`` becomes ``::``,
    parsed, translated to XPath and evaluated with ``doc.xpath``.  Selectors
    with a syntax error are skipped silently; any other failure is reported
    on stderr and the next selector is tried.
    """
    translator = cs.HTMLTranslator()

    # convert e.g. a:hover to a::hover (css3); longer colon runs are kept
    def double_single_colon(match):
        colons = match.group(0)
        return '::' if colons == ':' else colons

    for raw in sel:
        s = re.sub(':+', double_single_colon, raw)
        try:
            if any(doc.xpath(translator.selector_to_xpath(parsed))
                   for parsed in cs.parse(s)):
                return True
        except cs.parser.SelectorSyntaxError:
            # probably unsupported @media selector
            # may still be matched by subrules' selectors
            # so just skip this selector
            continue
        except Exception as e:
            print(e, "; sel='{}'".format(s), file=sys.stderr)

    return False
Ejemplo n.º 10
0
 def _get_rule_specificity(self, rule):
     """
     Return the specificity of a CSSRule as a single number.

     Every selector of the rule is converted with
     ``_get_specificity_from_list`` and the results are summed.
     """
     import cssselect
     parsed = cssselect.parse(rule.selector.as_css())
     return sum(
         self._get_specificity_from_list(item.specificity())
         for item in parsed)
Ejemplo n.º 11
0
    def collect_for_nodes(self):
        """Collect, per matched node, the CSS declarations that apply to it.

        For every ruleset in ``self.rules`` each selector is translated to
        XPath and evaluated against ``self.tree``.  ``self.nodes`` maps each
        matched node to ``{property name: (css value, specificity)}``; an
        existing entry is kept only when its specificity is strictly higher
        than the new one.  Rulesets whose selector cannot be obtained or
        parsed, and selectors that cannot be translated, are skipped.
        Declarations from a node's inline ``style`` attribute are written
        last with specificity ``(1, 0, 0, 0)``.
        """
        # One translator serves every selector; there is no need to build a
        # new one per selector.
        translator = cssselect.HTMLTranslator()

        for ruleset in self.rules:
            try:
                selectors = cssselect.parse(ruleset.selector.as_css())
            except Exception:
                # Best effort: skip anything whose selector is unusable, but
                # let KeyboardInterrupt / SystemExit propagate.
                continue

            for selector in selectors:
                try:
                    xpath = translator.selector_to_xpath(selector)
                except cssselect.xpath.ExpressionError:
                    continue

                # The specificity depends on the selector only.
                new_specificity = selector.specificity()

                # builds a dictionary for each node addressed by css, and
                # collects the associated declarations and priorities
                for node in self.tree.xpath(xpath):
                    node_styles = self.nodes.setdefault(node, {})

                    for declaration in ruleset.declarations:
                        # replace when the new specificity is equal or higher
                        current = node_styles.get(declaration.name)
                        if current is not None and current[1] > new_specificity:
                            continue

                        node_styles[declaration.name] = (
                            declaration.value.as_css(), new_specificity)

                    style_attr = node.get('style')
                    if style_attr:
                        declarations = HTMLBaker.css_parser.parse_style_attr(
                            style_attr)[0]
                        for declaration in declarations:
                            node_styles[declaration.name] = (
                                declaration.value.as_css(), (1, 0, 0, 0))
Ejemplo n.º 12
0
def classes_in_selector(text):
    """Return the set filled by ``_classes_in_selector`` for ``text``.

    Every selector parsed from ``text`` is passed to
    ``_classes_in_selector`` along with the shared result set.  A
    ``SelectorSyntaxError`` stops the collection and the set gathered so far
    is returned.
    """
    found = set()
    try:
        parsed = parse(text)
        for item in parsed:
            _classes_in_selector(item, found)
    except SelectorSyntaxError:
        return found
    return found
Ejemplo n.º 13
0
def classes_in_selector(text):
    """Return the set filled by ``_classes_in_selector`` for ``text``.

    Every selector parsed from ``text`` is passed to
    ``_classes_in_selector`` along with the shared result set.  A
    ``SelectorSyntaxError`` stops the collection and the set gathered so far
    is returned.
    """
    found = set()
    try:
        parsed = parse(text)
        for item in parsed:
            _classes_in_selector(item, found)
    except SelectorSyntaxError:
        return found
    return found
Ejemplo n.º 14
0
 def repr_parse(css):
     """Return the repr of each parsed tree in ``css``.

     Asserts that no selector carries a pseudo-element; the ``u`` prefix of
     string reprs is stripped from the output.
     """
     parsed = parse(css)
     assert all(item.pseudo_element is None for item in parsed)
     reprs = []
     for item in parsed:
         reprs.append(repr(item.parsed_tree).replace("(u'", "('"))
     return reprs
Ejemplo n.º 15
0
def match_selector(rule, tree):
    """Yield ``(element, specificity)`` for each element of ``tree`` matched.

    Selectors of ``rule`` that carry a pseudo-element are ignored.
    """
    parsed = cssselect.parse(rule.selector.as_css())
    to_xpath = cssselect.GenericTranslator().selector_to_xpath
    for item in parsed:
        if item.pseudo_element:
            continue
        weight = item.specificity()
        for element in tree.xpath(to_xpath(item)):
            yield element, weight
Ejemplo n.º 16
0
 def parse_pseudo(css):
     """Return ``(tree repr, pseudo_element)`` pairs for each selector in ``css``.

     Asserts that every pseudo-element is either ``None`` or a ``_unicode``
     string; the ``u`` prefix of string reprs is stripped from the tree repr.
     """
     pairs = []
     for parsed in parse(css):
         pseudo_element = parsed.pseudo_element
         # No Symbol here
         assert pseudo_element is None or type(pseudo_element) is _unicode
         tree_repr = repr(parsed.parsed_tree).replace("(u'", "('")
         pairs.append((tree_repr, pseudo_element))
     return pairs
Ejemplo n.º 17
0
def is_selector(string):
    """Check to see if string represents valid HTML selector.

    Returns False when the string does not parse as a CSS selector;
    otherwise the parsed selectors must pass
    ``_do_elements_have_standard_tags`` and must not be flagged by
    ``_is_file_extension``.
    """
    try:
        # cssselect doesn't like links, so we replace them.
        sanitized = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
        tree = cssselect.parse(sanitized)
    except SelectorSyntaxError:
        return False
    has_standard_tags = _do_elements_have_standard_tags(tree)
    return has_standard_tags and not _is_file_extension(tree)
Ejemplo n.º 18
0
 def _css(cls, current, css_selector):
     """Return the union of the XPath forms of all selectors in ``css_selector``."""
     # A group selector (selectors separated by commas) is split into its
     # individual selectors; each is converted on its own and prefixed with
     # ``current``, then the union of all of them is returned.
     xpath_selectors = []
     for selector in parse(css_selector):
         xpath_selectors.append("{current}//{selector}".format(
             current=current, selector=_selector_to_xpath(selector)))
     return cls._union(*xpath_selectors)
Ejemplo n.º 19
0
 def parse_pseudo(css):
     """Return ``(tree repr, pseudo_element)`` pairs for each selector in ``css``.

     Asserts that every pseudo-element is either ``None`` or a ``_unicode``
     string; the ``u`` prefix of string reprs is stripped from the tree repr.
     """
     pairs = []
     for parsed in parse(css):
         pseudo_element = parsed.pseudo_element
         # No Symbol here
         assert pseudo_element is None or type(pseudo_element) is _unicode
         tree_repr = repr(parsed.parsed_tree).replace("(u'", "('")
         pairs.append((tree_repr, pseudo_element))
     return pairs
Ejemplo n.º 20
0
def match_selector(rule, tree):
    """Yield ``(element, specificity)`` for each element of ``tree`` matched.

    Selectors of ``rule`` that carry a pseudo-element are ignored.
    """
    parsed = cssselect.parse(rule.selector.as_css())
    to_xpath = cssselect.GenericTranslator().selector_to_xpath
    for item in parsed:
        if item.pseudo_element:
            continue
        weight = item.specificity()
        for element in tree.xpath(to_xpath(item)):
            yield element, weight
Ejemplo n.º 21
0
def get_css_nodes(string):
    """Return the descendants of the parsed selector(s) in ``string``.

    REUSE: _is_selector in Tutorons server code.
    An empty list is returned when the string is not a valid selector.
    """
    try:
        # cssselect doesn't like links, so we replace them.
        sanitized = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
        return get_descendants(cssselect.parse(sanitized))
    except SelectorSyntaxError:
        return []
Ejemplo n.º 22
0
def is_selector(string):
    """Check to see if string represents valid HTML selector.

    Returns False when the string does not parse as a CSS selector;
    otherwise the parsed selectors must pass
    ``_do_elements_have_standard_tags`` and must not be flagged by
    ``_is_file_extension``.
    """
    try:
        # cssselect doesn't like links, so we replace them.
        sanitized = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
        tree = cssselect.parse(sanitized)
    except SelectorSyntaxError:
        return False
    has_standard_tags = _do_elements_have_standard_tags(tree)
    return has_standard_tags and not _is_file_extension(tree)
Ejemplo n.º 23
0
 def _css(self, current, css_selector):
     """Return the union of the XPath forms of all selectors in ``css_selector``."""
     # A group selector (selectors separated by commas) is split into its
     # individual selectors; each is converted on its own and prefixed with
     # ``current``, then the union of all of them is returned.
     xpath_selectors = []
     for selector in parse(css_selector):
         converted = _selector_to_xpath(selector)
         xpath_selectors.append("{0}//{1}".format(current, converted))
     return self._union(*xpath_selectors)
Ejemplo n.º 24
0
def _parse_locator(css_or_xpath: str) -> tuple:
    if not isinstance(css_or_xpath, str):
        raise TypeError("Locator {!r} is not a string.".format(css_or_xpath))

    try:
        cssselect.parse(css_or_xpath)
    except cssselect.SelectorSyntaxError:
        pass
    else:
        return "css", css_or_xpath

    try:
        etree.XPath(css_or_xpath)
    except etree.XPathSyntaxError:
        pass
    else:
        return "xpath", css_or_xpath

    raise ValueError(
        "Locator {!r} neither a css nor an xpath string.".format(css_or_xpath))
Ejemplo n.º 25
0
def parse_qualified_rule(rule):
    """Process every selector of a qualified rule.

    The rule prelude is serialized with tinycss2 and parsed with cssselect;
    each parsed tree is passed to ``process_cssselect_comp`` with the rule.
    An empty prelude is ignored.  A ``cssselect.SelectorError`` is printed
    and re-raised.
    """
    prelude_text = tinycss2.serializer.serialize(rule.prelude)
    if prelude_text:
        try:
            parsed = cssselect.parse(prelude_text)
        except cssselect.SelectorError as ex:
            # Reported with print(); a log.error call was left disabled here.
            print('Error: parsing css select: %s %s' % (prelude_text, ex))
            raise
        for item in parsed:
            process_cssselect_comp(item.parsed_tree, rule)
Ejemplo n.º 26
0
def parse_qualified_rule(rule):
    """Process every selector of a qualified rule.

    The rule prelude is serialized with tinycss2 and parsed with cssselect;
    each parsed tree is passed to ``process_cssselect_comp`` with the rule.
    An empty prelude is ignored.  A ``cssselect.SelectorError`` is printed
    and re-raised.
    """
    prelude_text = tinycss2.serializer.serialize(rule.prelude)
    if prelude_text:
        try:
            parsed = cssselect.parse(prelude_text)
        except cssselect.SelectorError as ex:
            # Reported with print(); a log.error call was left disabled here.
            print('Error: parsing css select: %s %s' % (prelude_text, ex))
            raise
        for item in parsed:
            process_cssselect_comp(item.parsed_tree, rule)
Ejemplo n.º 27
0
def is_selector(string):
    """Check to see if string represents valid HTML selector.

    The string must parse as a CSS selector, and every ``Element`` node
    among its descendants must name a tag listed in ``HTML_TAGS``.
    """
    try:
        # cssselect doesn't play well with links, so we replace them for now.
        sanitized = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
        nodes = get_descendants(cssselect.parse(sanitized))
        return not any(
            isinstance(node, Element) and node.element not in HTML_TAGS
            for node in nodes)
    except SelectorSyntaxError:
        return False
Ejemplo n.º 28
0
def is_selector(string):
    """Check to see if string represents valid HTML selector.

    The string must parse as a CSS selector, and every ``Element`` node
    among its descendants must name a tag listed in ``HTML_TAGS``.
    """
    try:
        # cssselect doesn't play well with links, so we replace them for now.
        sanitized = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
        nodes = get_descendants(cssselect.parse(sanitized))
        return not any(
            isinstance(node, Element) and node.element not in HTML_TAGS
            for node in nodes)
    except SelectorSyntaxError:
        return False
Ejemplo n.º 29
0
    def _token_list_matches_tree(self, token_list):
        """
        Returns whether the token list matches the HTML tree

        :param selector: A Token list to check
        :type selector: list of Token objects
        :returns: True if the token list has matches in self.tree
        :rtype: bool
        """
        try:
            parsed_selector = cssselect.parse("".join(token.as_css() for token in token_list))[0]

            return bool(self.tree.xpath(self.xpath_translator.selector_to_xpath(parsed_selector)))
        except:
            # On error, assume the selector matches the tree
            return True
Ejemplo n.º 30
0
    def _token_list_matches_tree(self, token_list):
        """
        Returns whether the token list matches the HTML tree

        :param selector: A Token list to check
        :type selector: list of Token objects
        :returns: True if the token list has matches in self.tree
        :rtype: bool
        """
        try:
            parsed_selector = cssselect.parse(''.join(
                token.as_css() for token in token_list))[0]

            return bool(
                self.tree.xpath(
                    self.xpath_translator.selector_to_xpath(parsed_selector)))
        except:
            # On error, assume the selector matches the tree
            return True
Ejemplo n.º 31
0
def get_metrics(rule):
    """Return the analyze-css metrics of one CSS rule plus its specificity.

    The rule text is piped to the external ``analyze-css -`` command and the
    ``"metrics"`` object of its JSON output is used as the result.  Missing
    ``declarations`` / ``selectors`` counts default to 0 and a fixed set of
    keys is dropped.  Two keys are added:

    * ``specificity``: the largest ``a*100 + b*10 + c`` over the selectors
      found before the first ``{`` of the rule.
    * ``specificity_category``: ``'high'`` (>= 100), ``'medium'`` (>= 10)
      or ``'low'``.
    """
    # Send the rule through stdin rather than splicing it into a shell
    # command: quotes, ``$`` or backticks inside the CSS would otherwise be
    # interpreted (or executed) by the shell.
    proc = subprocess.Popen(
        ['analyze-css', '-'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    # The trailing newline mirrors what ``echo`` used to append.
    output, _ = proc.communicate(rule + "\n")
    metrics = json.loads(output)["metrics"]

    metrics.setdefault("declarations", 0)
    metrics.setdefault("selectors", 0)

    # "rules" is one of the dropped keys, so it needs no default value.
    dropped_keys = (
        "imports", "rules", "comments", "commentsLength",
        "duplicatedSelectors", "emptyRules", "base64Length",
        "redundantBodySelectors", "redundantChildNodesSelectors",
    )
    for key in dropped_keys:
        metrics.pop(key, None)

    # Everything before the first "{" is the selector list.  partition()
    # keeps the whole text when there is no brace, whereas slicing with
    # find() == -1 would silently drop the last character.
    selectors = rule.partition("{")[0]
    specificity = 0
    for selector in cssselect.parse(selectors):
        parts = selector.specificity()
        value = parts[2] + parts[1] * 10 + parts[0] * 100
        specificity = max(specificity, value)

    metrics["specificity"] = specificity
    if specificity >= 100:
        metrics["specificity_category"] = 'high'
    elif specificity >= 10:
        metrics["specificity_category"] = 'medium'
    else:
        metrics["specificity_category"] = 'low'

    return metrics
Ejemplo n.º 32
0
    def convert(self):
        """Remove HTML and PGDP marker from the text.

        The transformation CSS in ``self.mycss`` is parsed with tinycss.
        Each supported declaration of a rule is turned into an action, and
        after every declaration the actions accumulated so far for that rule
        are applied to all elements of ``self.myfile.tree`` matched by the
        rule's selectors.

        Supported properties: ``content``, ``text-transform``,
        ``_replace_with_attr``, ``text-replace``, ``display`` and ``_graft``.
        Anything else is recorded as a property error.

        :returns: an HTML fragment listing the stylesheet and property
            errors, or an empty string when there were none.
        """

        escaped_unicode_re = re.compile(r"\\u[0-9a-fA-F]{4}")

        def escaped_unicode(m):
            # Decode one ``\uXXXX`` escape; keep the raw text on failure.
            try:
                newstr = bytes(m.group(0), 'utf8').decode('unicode-escape')
            except Exception:
                newstr = m.group(0)

            return newstr

        def new_content(element):
            """Process the "content:" property

            Reads the current declaration through the enclosing ``val``
            loop variable, so it must only be called inside that loop.
            """
            retstr = ""
            for token in val.value:
                if token.type == "STRING":
                    # e.g. { content: "xyz" }
                    retstr += escaped_unicode_re.sub(escaped_unicode,
                                                     token.value)
                elif token.type == "FUNCTION":
                    if token.function_name == 'attr':
                        # e.g. { content: attr(title) }
                        retstr += element.attrib.get(token.content[0].value,
                                                     "")
                elif token.type == "IDENT":
                    if token.value == "content":
                        # Identity, e.g. { content: content }
                        # lxml reports a missing text node as None.
                        retstr += element.text or ""

            return retstr

        # Process each rule from our transformation CSS
        stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
        property_errors = []
        for rule in stylesheet.rules:

            # Extract values we care about
            f_transform = None
            f_replace_with_attr = None
            f_text_replace = None
            f_element_func = None
            f_move = None

            for val in rule.declarations:

                if val.name == 'content':
                    # result depends on element and pseudo elements.
                    pass

                elif val.name == "text-transform":
                    if len(val.value) != 1:
                        property_errors.append(
                            (val.line, val.column,
                             val.name + " takes 1 argument"))
                    else:
                        v = val.value[0].value
                        if v == "uppercase":
                            f_transform = lambda x: x.upper()
                        elif v == "lowercase":
                            f_transform = lambda x: x.lower()
                        elif v == "capitalize":
                            f_transform = lambda x: x.title()
                        else:
                            property_errors.append((
                                val.line, val.column, val.name +
                                " accepts only 'uppercase', 'lowercase' or 'capitalize'"
                            ))

                elif val.name == "_replace_with_attr":
                    # Bind this declaration now: the action stays active for
                    # the later declarations of the rule, when ``val``
                    # already refers to another declaration.
                    f_replace_with_attr = (
                        lambda el, val=val: el.attrib[val.value[0].value])

                elif val.name == "text-replace":
                    # Skip S (spaces) tokens.
                    values = [v for v in val.value if v.type != "S"]
                    if len(values) != 2:
                        property_errors.append(
                            (val.line, val.column,
                             val.name + " takes 2 string arguments"))
                    else:
                        v1 = values[0].value
                        v2 = values[1].value
                        f_text_replace = lambda x: x.replace(v1, v2)

                elif val.name == "display":
                    # Support display none only. So ignore "none" argument.
                    f_element_func = clear_element

                elif val.name == "_graft":
                    values = [v for v in val.value if v.type != "S"]
                    if len(values) < 1:
                        property_errors.append(
                            (val.line, val.column,
                             val.name + " takes at least one argument"))
                        continue
                    f_move = []
                    for v in values:
                        print("[", v.value, "]")
                        if v.value == 'parent':
                            f_move.append(lambda el: el.getparent())
                        elif v.value == 'prev-sib':
                            f_move.append(lambda el: el.getprevious())
                        elif v.value == 'next-sib':
                            f_move.append(lambda el: el.getnext())
                        else:
                            property_errors.append(
                                (val.line, val.column,
                                 val.name + " invalid value " + v.value))
                            f_move = None
                            break

                    if not f_move:
                        continue

                # NOTE: a "_replace_regex" property was sketched here but is
                # disabled.

                else:
                    property_errors.append(
                        (val.line, val.column,
                         "Unsupported property " + val.name))
                    continue

                # Iterate through each selectors in the rule
                for selector in cssselect.parse(rule.selector.as_css()):

                    pseudo_element = selector.pseudo_element

                    xpath = cssselect.HTMLTranslator().selector_to_xpath(
                        selector)
                    find = etree.XPath(xpath)

                    # Find each matching element in the HTML/XHTML document
                    for element in find(self.myfile.tree):

                        # Replace text with content of an attribute.
                        if f_replace_with_attr:
                            element.text = f_replace_with_attr(element)

                        if val.name == 'content':
                            v_content = new_content(element)
                            if pseudo_element == "before":
                                # opening tag
                                element.text = v_content + (element.text or '')
                            elif pseudo_element == "after":
                                # closing tag
                                element.tail = v_content + (element.tail or '')
                            else:
                                # Replace all content
                                element.text = v_content

                        if f_transform:
                            self.text_apply(element, f_transform)

                        if f_text_replace:
                            self.text_apply(element, f_text_replace)

                        if f_element_func:
                            f_element_func(element)

                        if f_move:
                            parent = element.getparent()
                            new = element
                            for f in f_move:
                                new = f(new)

                            # Move the tail to the sibling or the parent
                            if element.tail:
                                sibling = element.getprevious()
                                # An lxml element without children is falsy,
                                # so test for presence explicitly.
                                if sibling is not None:
                                    sibling.tail = (sibling.tail
                                                    or "") + element.tail
                                else:
                                    parent.text = (parent.text
                                                   or "") + element.tail
                                element.tail = None

                            # Prune and graft
                            parent.remove(element)
                            new.append(element)

        css_errors = ""
        if stylesheet.errors or property_errors:
            # There is transformation CSS errors. If the default css
            # is included, take the offset into account.
            i = 0
            if self.args.css_no_default is False:
                i = DEFAULT_TRANSFORM_CSS.count('\n')
            css_errors = "<div class='error-border bbox'><p>Error(s) in the transformation CSS:</p><ul>"
            for err in stylesheet.errors:
                css_errors += "<li>{0},{1}: {2}</li>".format(
                    err.line - i, err.column, err.reason)
            for err in property_errors:
                css_errors += "<li>{0},{1}: {2}</li>".format(
                    err[0] - i, err[1], err[2])
            css_errors += "</ul>"

        return css_errors
Ejemplo n.º 33
0
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, rules, fonts, font_config):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Walks ``stylesheet_rules`` and appends ``(rule, selector_list,
    declarations)`` tuples to ``rules`` (mutated in place).  Font files
    registered for ``@font-face`` rules are appended to ``fonts``.  Nothing
    is returned.

    Handled rule kinds: ordinary rulesets, ``@import``, ``@media``,
    ``@page`` (with its margin rules) and ``@font-face``.

    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in stylesheet_rules:
        # Ordinary ruleset (no at-keyword).
        if not rule.at_keyword:
            declarations = list(preprocess_declarations(
                base_url, rule.declarations))
            # Rulesets without any usable declaration are dropped.
            if declarations:
                selector_string = rule.selector.as_css()
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            # Re-raised as SelectorError so the handler below
                            # skips the whole rule.
                            raise cssselect.SelectorError(str(exc))
                        # Specificity is the cssselect tuple prefixed with 0.
                        selector_list.append(Selector(
                            (0,) + selector.specificity(),
                            selector.pseudo_element, lxml_xpath))
                    # One unknown pseudo-element invalidates the whole rule.
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   selector_string, exc)
                    continue
                rules.append((rule, selector_list, declarations))

        elif rule.at_keyword == '@import':
            # Imports whose media query does not apply are ignored.
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            url = url_join(base_url, rule.uri, '@import at %s:%s',
                           rule.line, rule.column)
            if url is not None:
                try:
                    stylesheet = CSS(
                        url=url, url_fetcher=url_fetcher,
                        media_type=device_media_type, font_config=font_config)
                except URLFetchingError as exc:
                    LOGGER.warning('Failed to load stylesheet at %s : %s',
                                   url, exc)
                else:
                    # Splice the imported stylesheet's rules into ours.
                    for result in stylesheet.rules:
                        rules.append(result)

        elif rule.at_keyword == '@media':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            # Recurse into the nested rules, sharing the same output lists.
            preprocess_stylesheet(
                device_media_type, base_url, rule.rules, url_fetcher, rules,
                fonts, font_config)

        elif rule.at_keyword == '@page':
            page_name, pseudo_class = rule.selector
            # TODO: support named pages (see CSS3 Paged Media)
            if page_name is not None:
                LOGGER.warning('Named pages are not supported yet, the whole '
                               '@page %s rule was ignored.', page_name + (
                                   ':' + pseudo_class if pseudo_class else ''))
                continue
            declarations = list(preprocess_declarations(
                base_url, rule.declarations))

            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            specificity = rule.specificity

            if declarations:
                selector_list = [Selector(specificity, None, match)]
                rules.append((rule, selector_list, declarations))

            # Margin rules reuse the page rule's specificity and matcher;
            # their at-keyword is stored in the pseudo-element slot.
            for margin_rule in rule.at_rules:
                declarations = list(preprocess_declarations(
                    base_url, margin_rule.declarations))
                if declarations:
                    selector_list = [Selector(
                        specificity, margin_rule.at_keyword, match)]
                    rules.append((margin_rule, selector_list, declarations))

        elif rule.at_keyword == '@font-face':
            rule_descriptors = dict(list(preprocess_descriptors(
                base_url, rule.declarations)))
            # Both descriptors are mandatory; warn about the first missing one.
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'), rule.line, rule.column)
                    break
            else:
                # Runs only when no descriptor was missing (loop not broken).
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)
Ejemplo n.º 34
0
	def _parseStylesheet(self, fileName, htmlBody):
		"""
		Reads in a stylesheet and parses it.

		Parameters
		----------
		fileName : string
			File name to parse.
		htmlBody : lxml.etree.Element
			Root element of the body.

		Modifies
		--------
		htmlBody
			Some of the more complex selectors can't be stored as a simple lookup,
			so these values are added to the inline "style" attribute of the
			appropriate tags.

		Returns
		-------
		out : [ dictionary ] * 3
			List of three dictionaries: [tagLookup idLookup classLookup]
			Each is a lookup table for its respective selector type and
			is of the form:
				{ 'name' : {dictionary} }, 	where the linked dictionaries
											contain that selector's parsed
											declarations.
		"""

		cssparser = tinycss.make_parser()
		stylesheet = cssparser.parse_stylesheet_file(fileName)

		out = [{}, {}, {}]

		for rule in stylesheet.rules:
			# Skip at keywords
			if rule.at_keyword is not None:
				continue

			thisDecl = self._declaration2dict(rule.declarations)

			## Decode selector types
			for thisSel in cssselect.parse(rule.selector.as_css()):

				## Do the easy selectors as a dictionary
				if sum(thisSel.specificity()) < 2:
					if hasattr(thisSel.parsed_tree, 'id'):
						# We have a single ID selector
						out[1][thisSel.parsed_tree.id] = thisDecl
						continue
					elif hasattr(thisSel.parsed_tree, 'class_name'):
						# We have a single CLASS selector
						out[2][thisSel.parsed_tree.class_name] = thisDecl
						continue
					elif hasattr(thisSel.parsed_tree, 'element'):
						# We have a single TAG selector
						out[0][thisSel.parsed_tree.element] = thisDecl
						continue

				## Do the hard selectors as in-line style
				for elt in htmlBody.cssselect(thisSel):
					declStr = self._decl2str(rule.declarations)
					# Append to an existing style attribute, or create it.
					# An explicit lookup replaces the former bare "except:",
					# which hid every error, not just the missing key.
					existing = elt.attrib.get('style')
					if existing is None:
						elt.attrib['style'] = declStr
					else:
						elt.attrib['style'] = existing + "; " + declStr

		return out
Ejemplo n.º 35
0
def parse_rule(rule):
    """Yield one ``Rule`` per selector found in the prelude of *rule*.

    The prelude tokens are serialized back into a single selector string,
    which is then split into individual selectors by ``cssselect.parse``.
    Every yielded ``Rule`` pairs a stringified selector with the properties
    parsed from ``rule.content``.
    """
    prelude_text = "".join(token.serialize() for token in rule.prelude)
    for parsed_selector in cssselect.parse(prelude_text):
        selector_text = stringify_selector(parsed_selector)
        yield Rule(selector_text, parse_properties(rule.content))
Ejemplo n.º 36
0
Archivo: css.py Proyecto: Rothera/bpm2
def parse_rule(rule):
    """Yield one ``Rule`` per selector found in the prelude of *rule*.

    The prelude tokens are serialized and concatenated into one selector
    string; ``cssselect.parse`` then returns the individual selectors.
    The properties are re-parsed from ``rule.content`` for every selector.
    """
    serialized_tokens = [token.serialize() for token in rule.prelude]
    selector_source = "".join(serialized_tokens)
    for parsed_selector in cssselect.parse(selector_source):
        yield Rule(
            stringify_selector(parsed_selector),
            parse_properties(rule.content),
        )
Ejemplo n.º 37
0
from tinycss.css21 import CSS21Parser
import cssselect
import sys

# Script body (Python 2: uses the ``print`` statement).
# Reads the CSS file named by the last command-line argument, parses it with
# tinycss' CSS 2.1 parser and prints the head of a ``rule(...)`` clause for
# every declaration of every rule.
# NOTE(review): the snippet is cut off -- the final ``for`` loop has no body.
css = open(sys.argv[-1],'r').read()
styles = CSS21Parser().parse_stylesheet(css)

for ruleIdx in range(len(styles.rules)):
	rule = styles.rules[ruleIdx]
	selectors = rule.selector
	selector_string = selectors.as_css()
	# One specificity triple per selector of the (comma-separated) selector list.
	specs = [s.specificity() for s in cssselect.parse(selector_string)]
	for i in range(len(specs)):
		# Collapse the triple into one integer (weights 100 / 10 / 1) ...
		spec = specs[i][0] * 100 + specs[i][1] * 10 + specs[i][2]
		# ... then append the rule index so later rules get a larger number.
		spec = spec * 1000 + ruleIdx
		specs[i] = spec

	for decl in rule.declarations:

		# A single token is emitted as a quoted atom, several tokens as a
		# bracketed list of quoted atoms (tokens of type 'S' are skipped).
		if len(decl.value) == 1:
			decl_value = "'" + str(decl.value[0].value) + "'"
		else:
			decl_value = '[' + ','.join(["'"+str(x.value)+"'" for x in decl.value if x.type != 'S']) + ']'
		
		# A0,...,A(N-1) each selector
		# AN final node
		# NOTE(review): N is the length of rule.selector itself (presumably a
		# token list), not the number of parsed selectors -- confirm intent.
		N = len(selectors)
		# Only the specificity of the first selector is used here.
		specIdx = 0
		print "rule(A%d,'%s',%s,%d):-" % (N,decl.name,decl_value,specs[specIdx])
		for i in range(N):
Ejemplo n.º 38
0
 def specificity(css):
     """Return the specificity of *css*, which must hold exactly one selector."""
     parsed = parse(css)
     assert len(parsed) == 1
     only_selector = parsed[0]
     return only_selector.specificity()
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, rules, fonts, font_config):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Nothing is returned; results are accumulated in place:

    * ``rules`` receives ``(rule, selector_list, declarations)`` tuples for
      qualified rules, ``@page`` rules and their margin rules, plus the
      rules of every successfully loaded ``@import``-ed stylesheet.
    * ``fonts`` receives the truthy values returned by
      ``font_config.add_font_face`` for complete ``@font-face`` rules.

    Rules nested in an ``@media`` block whose query matches
    ``device_media_type`` are handled by a recursive call.  Invalid or
    unsupported rules are logged with ``LOGGER.warning`` and skipped.

    :param device_media_type: media type given to ``evaluate_media_query``.
    :param base_url: base used to resolve ``@import`` URLs and given to the
        declaration/descriptor preprocessors.
    :param stylesheet_rules: iterable of parsed rules to process.
    :param url_fetcher: passed through to ``CSS`` and ``add_font_face``.
    :param rules: list extended in place with the processed rules.
    :param fonts: list extended in place with font file names.
    :param font_config: font configuration, or ``None`` to ignore
        ``@font-face`` rules.
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in stylesheet_rules:
        if rule.type == 'qualified-rule':
            declarations = list(
                preprocess_declarations(
                    base_url, tinycss2.parse_declaration_list(rule.content)))
            if declarations:
                selector_string = tinycss2.serialize(rule.prelude)
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(
                            Selector((0, ) + selector.specificity(),
                                     selector.pseudo_element, lxml_xpath))
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s' %
                                selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   selector_string, exc)
                    continue
                rules.append((rule, selector_list, declarations))

        elif rule.type == 'at-rule' and rule.at_keyword == 'import':
            tokens = remove_whitespace(rule.prelude)
            if tokens and tokens[0].type in ('url', 'string'):
                url = tokens[0].value
            else:
                continue
            media = parse_media_query(tokens[1:])
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @import rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                # Actually ignore the rule, as the message says and as the
                # @media branch below does, instead of evaluating ``None``.
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            url = url_join(base_url,
                           url,
                           allow_relative=False,
                           context='@import at %s:%s',
                           context_args=(rule.source_line, rule.source_column))
            if url is not None:
                try:
                    stylesheet = CSS(url=url,
                                     url_fetcher=url_fetcher,
                                     media_type=device_media_type,
                                     font_config=font_config)
                except URLFetchingError as exc:
                    LOGGER.warning('Failed to load stylesheet at %s : %s', url,
                                   exc)
                else:
                    for result in stylesheet.rules:
                        rules.append(result)

        elif rule.type == 'at-rule' and rule.at_keyword == 'media':
            media = parse_media_query(rule.prelude)
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @media rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            content_rules = tinycss2.parse_rule_list(rule.content)
            preprocess_stylesheet(device_media_type, base_url, content_rules,
                                  url_fetcher, rules, fonts, font_config)

        elif rule.type == 'at-rule' and rule.at_keyword == 'page':
            tokens = remove_whitespace(rule.prelude)
            # TODO: support named pages (see CSS3 Paged Media)
            if not tokens:
                pseudo_class = None
                specificity = (0, 0)
            elif (len(tokens) == 2 and tokens[0].type == 'literal'
                  and tokens[0].value == ':' and tokens[1].type == 'ident'):
                pseudo_class = tokens[1].lower_value
                specificity = {
                    'first': (1, 0),
                    'blank': (1, 0),
                    'left': (0, 1),
                    'right': (0, 1),
                }.get(pseudo_class)
                if not specificity:
                    LOGGER.warning(
                        'Unknown @page pseudo-class "%s", '
                        'the whole @page rule was ignored '
                        'at %s:%s.', pseudo_class, rule.source_line,
                        rule.source_column)
                    continue
            else:
                LOGGER.warning(
                    'Unsupported @page selector "%s", '
                    'the whole @page rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude), rule.source_line,
                    rule.source_column)
                continue
            content = tinycss2.parse_declaration_list(rule.content)
            declarations = list(preprocess_declarations(base_url, content))

            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])

            if declarations:
                selector_list = [Selector(specificity, None, match)]
                rules.append((rule, selector_list, declarations))

            # Margin rules are the at-rules nested in the @page content; they
            # reuse the page specificity and matcher.
            for margin_rule in content:
                if margin_rule.type != 'at-rule':
                    continue
                declarations = list(
                    preprocess_declarations(
                        base_url,
                        tinycss2.parse_declaration_list(margin_rule.content)))
                if declarations:
                    selector_list = [
                        Selector(specificity, '@' + margin_rule.at_keyword,
                                 match)
                    ]
                    rules.append((margin_rule, selector_list, declarations))

        elif rule.type == 'at-rule' and rule.at_keyword == 'font-face':
            content = tinycss2.parse_declaration_list(rule.content)
            rule_descriptors = dict(preprocess_descriptors(base_url, content))
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'), rule.source_line,
                        rule.source_column)
                    break
            else:
                # Only reached when both mandatory descriptors are present.
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)
Ejemplo n.º 40
0
def preprocess_stylesheet(device_media_type, base_url, rules, url_fetcher):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    This is a generator.  It yields ``(rule, selector_list, declarations)``
    tuples for rule sets, ``@page`` rules and their margin rules, and
    re-yields the rules of ``@import``-ed stylesheets and of ``@media``
    blocks whose media query matches ``device_media_type``.  Rules with an
    invalid or unsupported selector, stylesheets that fail to load and named
    ``@page`` rules are logged as warnings and skipped.

    :param device_media_type: media type given to ``evaluate_media_query``.
    :param base_url: base used to resolve ``@import`` URLs and given to
        ``preprocess_declarations``.
    :param rules: iterable of parsed stylesheet rules.
    :param url_fetcher: passed through to ``CSS`` for imported stylesheets.
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in rules:
        if not rule.at_keyword:
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))
            if declarations:
                selector_string = rule.selector.as_css()
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(
                            Selector((0, ) + selector.specificity(),
                                     selector.pseudo_element, lxml_xpath))
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s' %
                                selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    # ``warning`` rather than the deprecated ``warn`` alias.
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   selector_string, exc)
                    continue
                yield rule, selector_list, declarations

        elif rule.at_keyword == '@import':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            url = url_join(base_url, rule.uri, '@import at %s:%s', rule.line,
                           rule.column)
            if url is not None:
                try:
                    stylesheet = CSS(url=url,
                                     url_fetcher=url_fetcher,
                                     media_type=device_media_type)
                except URLFetchingError as exc:
                    LOGGER.warning('Failed to load stylesheet at %s : %s',
                                   url, exc)
                else:
                    for result in stylesheet.rules:
                        yield result

        elif rule.at_keyword == '@media':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            for result in preprocess_stylesheet(device_media_type, base_url,
                                                rule.rules, url_fetcher):
                yield result

        elif rule.at_keyword == '@page':
            page_name, pseudo_class = rule.selector
            # TODO: support named pages (see CSS3 Paged Media)
            if page_name is not None:
                LOGGER.warning(
                    'Named pages are not supported yet, the whole '
                    '@page %s rule was ignored.',
                    page_name + (':' + pseudo_class if pseudo_class else ''))
                continue
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))

            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            specificity = rule.specificity

            if declarations:
                selector_list = [Selector(specificity, None, match)]
                yield rule, selector_list, declarations

            # Margin rules reuse the page specificity and matcher.
            for margin_rule in rule.at_rules:
                declarations = list(
                    preprocess_declarations(base_url,
                                            margin_rule.declarations))
                if declarations:
                    selector_list = [
                        Selector(specificity, margin_rule.at_keyword, match)
                    ]
                    yield margin_rule, selector_list, declarations
Ejemplo n.º 41
0
    def __init__(self, html, css, width, load_resourcefn, text_extents, font_extents, user_data):

        self.text_extents    = text_extents
        self.font_extents    = font_extents
        self.load_resourcefn = load_resourcefn
        self.user_data       = user_data

        if VERBOSE:
            start = time.clock()
            end   = time.clock()
            print "robinson: %8.3fs lxml parsing..." % (end-start)

            pr = cProfile.Profile()

        root = etree.fromstring(html)
        document = etree.ElementTree(root)

        if VERBOSE:
            end   = time.clock()

            print repr(root), root.__class__
            print document, repr(document), document.__class__
            print etree.tostring(document.getroot())

            print "robinson: %8.3fs tinycss.css21.CSS21Parser()..." % (end-start)

        cssparser = tinycss.css21.CSS21Parser()

        stylesheet = cssparser.parse_stylesheet(css)
        
        if VERBOSE:
            end   = time.clock()
            print "robinson: %8.3fs style mapping..." % (end-start)

        style_map = {}

        sel_to_xpath = cssselect.xpath.HTMLTranslator().selector_to_xpath
        for rule in stylesheet.rules:
            if not isinstance (rule, tinycss.css21.RuleSet):
                continue

            sel_css = rule.selector.as_css()
            sels    = cssselect.parse (sel_css)

            #print "CSS Ruleset: %s" % (rule.selector.as_css())

            for sel in sels:
                speci = sel.specificity()
                prio  = speci2prio (speci)
                #print "   selector: %s, specificity: %s (%06d)" % (repr(sel), sel.specificity(), prio)

                xpath = sel_to_xpath (sel)
                #print "   xpath: %s" % repr(xpath)

                for item in document.xpath(xpath):
                    #print "     matched item: %s" % repr(item.tag)

                    if not item in style_map:
                        style_map[item] = {}

                    for decl in rule.declarations:
                        #print "       declaration: %s: %s" % (decl.name, decl.value)

                        if not decl.name in style_map[item]:
                            style_map[item][decl.name] = (prio, Value.from_token(decl.value))
                        else:
                            if prio > style_map[item][decl.name][0]:
                                style_map[item][decl.name] = (prio, Value.from_token(decl.value))
         
        #print "Style map done."
        #print repr(style_map)

        if VERBOSE:
            end   = time.clock()
            print "robinson: %8.3fs building layout tree..." % (end-start)
            pr.enable()

        viewport = Dimensions ()
        viewport.content.width  = width
        self.ltree = self._layout_tree (document.getroot(), style_map, viewport)

        if VERBOSE:
            end   = time.clock()
            print "robinson: %8.3fs __init__ done." % (end-start)
Ejemplo n.º 42
0
 def search(self, selector):
     """Run ``self._search`` once per selector parsed from *selector*.

     Returns the list of results, in the order of the parsed selectors.
     """
     results = []
     for parsed in cs.parse(selector):
         results.append(self._search(selector, parsed.parsed_tree))
     return results
Ejemplo n.º 43
0
    def convert(self):
        """Remove HTML and PGDP marker from the text."""

        # Process each rule from our transformation CSS
        stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
        for rule in stylesheet.rules:

            # Extract values we care about
            v_content = None
            f_transform = None
            f_replace_with_attr = None
            f_replace_regex = None
            f_text_replace = None
            f_element_func = None

            for val in rule.declarations:

                if val.name == 'content':
                    v_content = val.value[0].value

                elif val.name == "text-transform":
                    v = val.value[0].value
                    if v == "uppercase":
                        f_transform = lambda x: x.upper()
                    elif v == "lowercase":
                        f_transform = lambda x: x.lower()
                    elif v == "capitalize":
                        f_transform = lambda x: x.title()

                elif val.name == "_replace_with_attr":
                    f_replace_with_attr = lambda el: el.attrib[val.value[0].value]

                elif val.name == "text-replace":
                    v1 = val.value[0].value
                    v2 = val.value[2].value
                    f_text_replace = lambda x: x.replace(v1, v2)

                elif val.name == "display":
                    # Support display none only. So ignore "none" argument.
                    f_element_func = clear_element

#                elif val.name == "_replace_regex":
#                    f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
#                    f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)

                # Iterate through each selectors in the rule
                for selector in cssselect.parse(rule.selector.as_css()):

                    pseudo_element = selector.pseudo_element

                    xpath = cssselect.HTMLTranslator().selector_to_xpath(selector)
                    find = etree.XPath(xpath)

                    # Find each matching element in the HTML/XHTML document
                    for element in find(self.myfile.tree):

                        # Replace text with content of an attribute.
                        if f_replace_with_attr:
                            element.text = f_replace_with_attr(element)

                        if pseudo_element == "before":
                            element.text = v_content + (element.text or '') # opening tag
                        elif pseudo_element == "after":
                            element.tail = v_content + (element.tail or '') # closing tag

                        if f_transform:
                            self.text_apply(element, f_transform)

                        if f_text_replace:
                            self.text_apply(element, f_text_replace)

                        if f_element_func:
                            f_element_func(element)

                       # if f_replace_regex and element.text:
                       #     element.text = f_replace_regex(element.text)

        return

        # Transform footnote anchors to [..]
        find = etree.XPath("//a")
        for element in find(self.myfile.tree):
            href = element.attrib.get('href', None)
            if not href or not href.startswith("#Footnote_"):
                continue

            if element.text and not element.text.startswith('['):
                # Some PP have [xx], other have just xx for a page
                # number. Do not add [ ] if they are already there.
                element.text = '[' + (element.text or '') # opening tag
                element.tail = ']' + (element.tail or '') # closing tag

        # Add illustration tag, wherever we find it
        for figclass in [ 'figcenter', 'figleft', 'figright', 'caption' ]:
            find = etree.XPath("//div[contains(concat(' ', normalize-space(@class), ' '), ' " + figclass + " ')]")
            for element in find(self.myfile.tree):
                if element.text and len(element.text) > 1:
                    element.text = '[Illustration:' + element.text # opening tag
                else:
                    element.text = '[Illustration' + (element.text or '') # opening tag
                element.tail = ']' + (element.tail or '') # closing tag

#        for figclass in [ 'caption' ]:
#            find = etree.XPath("//p[contains(concat(' ', normalize-space(@class), ' '), ' " + figclass + " ')]")
#            for element in find(self.myfile.tree):
#                element.text = '[Illustration:' + (element.text or '')  # opening tag
#                element.tail = ']' + (element.tail or '') # closing tag

        # Add sidenote tag
        if args.with_sidenote_tags:
            for sntag in [ 'sidenote' ]:
                for find in [ "//p[contains(concat(' ', normalize-space(@class), ' '), ' " + sntag + " ')]",
                              "//div[starts-with(@class, 'sidenote')]" ]:
                    for element in etree.XPath(find)(self.myfile.tree):
                        element.text = '[Sidenote:' + (element.text or '') # opening tag
                        element.tail = ']' + (element.tail or '') # closing tag
Ejemplo n.º 44
0
    def convert(self):
        """Remove HTML and PGDP marker from the text."""

        escaped_unicode_re = re.compile(r"\\u[0-9a-fA-F]{4}")
        def escaped_unicode(m):
            try:
                newstr = bytes(m.group(0), 'utf8').decode('unicode-escape')
            except Exception:
                newstr = m.group(0)

            return newstr

        def new_content(element):
            """Process the "content:" property
            """
            retstr = ""
            for token in val.value:
                if token.type == "STRING":
                    # e.g. { content: "xyz" }
                    retstr += escaped_unicode_re.sub(escaped_unicode, token.value)
                elif token.type == "FUNCTION":
                    if token.function_name == 'attr':
                        # e.g. { content: attr(title) }
                        retstr += element.attrib.get(token.content[0].value, "")
                elif token.type == "IDENT":
                    if token.value == "content":
                        # Identity, e.g. { content: content }
                        retstr += element.text

            return retstr


        # Process each rule from our transformation CSS
        stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
        property_errors = []
        for rule in stylesheet.rules:

            # Extract values we care about
            f_transform = None
            f_replace_with_attr = None
            #f_replace_regex = None
            f_text_replace = None
            f_element_func = None
            f_move = None

            for val in rule.declarations:

                if val.name == 'content':
                    # result depends on element and pseudo elements.
                    pass

                elif val.name == "text-transform":
                    if len(val.value) != 1:
                        property_errors += [(val.line, val.column, val.name + " takes 1 argument")]
                    else:
                        v = val.value[0].value
                        if v == "uppercase":
                            f_transform = lambda x: x.upper()
                        elif v == "lowercase":
                            f_transform = lambda x: x.lower()
                        elif v == "capitalize":
                            f_transform = lambda x: x.title()
                        else:
                            property_errors += [(val.line, val.column, val.name + " accepts only 'uppercase', 'lowercase' or 'capitalize'")]

                elif val.name == "_replace_with_attr":
                    f_replace_with_attr = lambda el: el.attrib[val.value[0].value]

                elif val.name == "text-replace":
                    # Skip S (spaces) tokens.
                    values = [v for v in val.value if v.type != "S"]
                    if len(values) != 2:
                        property_errors += [(val.line, val.column, val.name + " takes 2 string arguments")]
                    else:
                        v1 = values[0].value
                        v2 = values[1].value
                        f_text_replace = lambda x: x.replace(v1, v2)

                elif val.name == "display":
                    # Support display none only. So ignore "none" argument.
                    f_element_func = clear_element

                elif val.name == "_graft":
                    values = [v for v in val.value if v.type != "S"]
                    if len(values) < 1:
                        property_errors += [(val.line, val.column, val.name + " takes at least one argument")]
                        continue
                    f_move = []
                    for v in values:
                        print("[", v.value, "]")
                        if v.value == 'parent':
                            f_move.append(lambda el: el.getparent())
                        elif v.value == 'prev-sib':
                            f_move.append(lambda el: el.getprevious())
                        elif v.value == 'next-sib':
                            f_move.append(lambda el: el.getnext())
                        else:
                            property_errors += [(val.line, val.column, val.name + " invalid value " + v.value)]
                            f_move = None
                            break

                    if not f_move:
                        continue

#                elif val.name == "_replace_regex":
#                    f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
#                    f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)

                else:
                    property_errors += [(val.line, val.column, "Unsupported property " + val.name)]
                    continue

                # Iterate through each selectors in the rule
                for selector in cssselect.parse(rule.selector.as_css()):

                    pseudo_element = selector.pseudo_element

                    xpath = cssselect.HTMLTranslator().selector_to_xpath(selector)
                    find = etree.XPath(xpath)

                    # Find each matching element in the HTML/XHTML document
                    for element in find(self.myfile.tree):

                        # Replace text with content of an attribute.
                        if f_replace_with_attr:
                            element.text = f_replace_with_attr(element)

                        if val.name == 'content':
                            v_content = new_content(element)
                            if pseudo_element == "before":
                                element.text = v_content + (element.text or '') # opening tag
                            elif pseudo_element == "after":
                                element.tail = v_content + (element.tail or '') # closing tag
                            else:
                                # Replace all content
                                element.text = new_content(element)

                        if f_transform:
                            self.text_apply(element, f_transform)

                        if f_text_replace:
                            self.text_apply(element, f_text_replace)

                        if f_element_func:
                            f_element_func(element)

                        if f_move:
                            parent = element.getparent()
                            new = element
                            for f in f_move:
                                new = f(new)

                            # Move the tail to the sibling or the parent
                            if element.tail:
                                sibling = element.getprevious()
                                if sibling:
                                    sibling.tail = (sibling.tail or "") + element.tail
                                else:
                                    parent.text = (parent.text or "") + element.tail
                                element.tail = None

                            # Prune and graft
                            parent.remove(element)
                            new.append(element)

                       # if f_replace_regex and element.text:
                       #     element.text = f_replace_regex(element.text)


        css_errors = ""
        if stylesheet.errors or property_errors:
            # There is transformation CSS errors. If the default css
            # is included, take the offset into account.
            i = 0
            if self.args.css_no_default is False:
                i = DEFAULT_TRANSFORM_CSS.count('\n')
            css_errors = "<div class='error-border bbox'><p>Error(s) in the transformation CSS:</p><ul>"
            for err in stylesheet.errors:
                css_errors += "<li>{0},{1}: {2}</li>".format(err.line-i, err.column, err.reason)
            for err in property_errors:
                css_errors += "<li>{0},{1}: {2}</li>".format(err[0]-i, err[1], err[2])
            css_errors += "</ul>"

        return css_errors
Ejemplo n.º 45
0
    def __init__(self, html, css, width, load_resourcefn, text_extents,
                 font_extents, user_data):
        """Parse *html* and *css*, resolve per-element styles, build the layout tree.

        For every rule set of the stylesheet, each selector is translated to
        XPath and evaluated against the document.  For each matched element
        and each declaration, the value coming from the selector with the
        highest priority (as computed by ``speci2prio``) is kept; on equal
        priority the earlier value stays.  The resulting style map is handed
        to ``self._layout_tree`` together with a viewport of the requested
        *width*, and the result is stored in ``self.ltree``.

        When the module-level ``VERBOSE`` flag is set, timing and debugging
        information is printed along the way.  Timings use
        ``time.perf_counter()`` because ``time.clock()`` was removed in
        Python 3.8.
        """

        self.text_extents = text_extents
        self.font_extents = font_extents
        self.load_resourcefn = load_resourcefn
        self.user_data = user_data

        if VERBOSE:
            start = time.perf_counter()
            end = time.perf_counter()
            print("robinson: %8.3fs lxml parsing..." % (end - start))

            pr = cProfile.Profile()

        root = etree.fromstring(html)
        document = etree.ElementTree(root)

        if VERBOSE:
            end = time.perf_counter()

            print(repr(root), root.__class__)
            print(document, repr(document), document.__class__)
            print(etree.tostring(document.getroot()))

            print("robinson: %8.3fs tinycss.css21.CSS21Parser()..." %
                  (end - start))

        cssparser = tinycss.css21.CSS21Parser()

        stylesheet = cssparser.parse_stylesheet(css)

        if VERBOSE:
            end = time.perf_counter()
            print("robinson: %8.3fs style mapping..." % (end - start))

        # element -> {declaration name: (priority, value)}
        style_map = {}

        sel_to_xpath = cssselect.xpath.HTMLTranslator().selector_to_xpath
        for rule in stylesheet.rules:
            # Only plain rule sets carry selectors and declarations.
            if not isinstance(rule, tinycss.css21.RuleSet):
                continue

            sel_css = rule.selector.as_css()
            sels = cssselect.parse(sel_css)

            for sel in sels:
                speci = sel.specificity()
                prio = speci2prio(speci)

                xpath = sel_to_xpath(sel)

                for item in document.xpath(xpath):

                    if item not in style_map:
                        style_map[item] = {}

                    for decl in rule.declarations:

                        if decl.name not in style_map[item]:
                            style_map[item][decl.name] = (prio,
                                                          Value.from_token(
                                                              decl.value))
                        else:
                            # Strictly greater: ties keep the earlier value.
                            if prio > style_map[item][decl.name][0]:
                                style_map[item][decl.name] = (prio,
                                                              Value.from_token(
                                                                  decl.value))

        if VERBOSE:
            end = time.perf_counter()
            print("robinson: %8.3fs building layout tree..." % (end - start))
            pr.enable()

        viewport = Dimensions()
        viewport.content.width = width
        self.ltree = self._layout_tree(document.getroot(), style_map, viewport)

        if VERBOSE:
            end = time.perf_counter()
            print("robinson: %8.3fs __init__ done." % (end - start))
Ejemplo n.º 46
0
 def get_error(css):
     """Return the message of the ``SelectorSyntaxError`` raised by parsing *css*.

     Returns ``None`` implicitly when parsing succeeds.
     """
     try:
         parse(css)
     except SelectorSyntaxError:
         # Py2, Py3, ...: read the active exception through sys.exc_info()
         message = str(sys.exc_info()[1])
         return message.replace("(u'", "('")
Ejemplo n.º 47
0
 def __init__(self, renderer_dict):
     """Build ``self._map`` from a mapping of CSS selector strings to renderers.

     Each key is parsed with ``cssselect.parse`` and stored, together with
     the value it maps to, as one ``RendererMapping`` entry.
     """
     self._map = []
     for css_key in renderer_dict:
         parsed = cssselect.parse(css_key)
         renderer = renderer_dict[css_key]
         self._map.append(RendererMapping(parsed, renderer))
Ejemplo n.º 48
0
 def repr_parse(css):
     """Return the ``repr`` of the parsed tree of every selector in *css*.

     Asserts that no selector carries a pseudo-element.  The ``u`` prefix of
     Python 2 unicode literals is stripped from the representations.
     """
     parsed = parse(css)
     assert all(sel.pseudo_element is None for sel in parsed)
     tree_reprs = []
     for sel in parsed:
         tree_reprs.append(repr(sel._tree).replace("(u'", "('"))
     return tree_reprs
Ejemplo n.º 49
0
 def get_error(css):
     """Parse *css* and return the text of the resulting syntax error.

     When no ``SelectorSyntaxError`` is raised, the function falls through
     and returns ``None``.
     """
     try:
         parse(css)
     except SelectorSyntaxError:
         # Py2, Py3, ...: read the active exception through sys.exc_info()
         error = sys.exc_info()[1]
         return str(error).replace("(u'", "('")
Ejemplo n.º 50
0
 def specificity(css):
     """Return the specificity of the single selector contained in *css*."""
     parsed = parse(css)
     assert len(parsed) == 1
     return parsed[0].specificity()
Ejemplo n.º 51
0
def preprocess_stylesheet(device_media_type, base_url, rules, url_fetcher):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    This is a generator. For every usable rule in ``rules`` it yields a
    ``(rule, selector_list, declarations)`` tuple, where ``selector_list``
    is a list of ``Selector`` objects and ``declarations`` is the non-empty
    list produced by ``preprocess_declarations``.

    :param device_media_type: media type used to evaluate ``@import`` and
        ``@media`` queries.
    :param base_url: base URL used to resolve declaration values and
        ``@import`` URLs.
    :param rules: iterable of parsed stylesheet rules.
    :param url_fetcher: passed through to ``CSS`` when following
        ``@import`` rules.

    Rules with an invalid or unsupported selector, and ``@page`` rules
    with a page name, are logged as warnings and skipped.

    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in rules:
        if not rule.at_keyword:
            # Ordinary rule set: selectors + declarations.
            declarations = list(preprocess_declarations(
                base_url, rule.declarations))
            if declarations:
                selector_string = rule.selector.as_css()
                try:
                    # One Selector per comma-separated selector. The leading
                    # 0 extends the specificity tuple from cssselect.
                    selector_list = [
                        Selector(
                            (0,) + selector.specificity(),
                            selector.pseudo_element,
                            lxml.etree.XPath(selector_to_xpath(selector)))
                        for selector in cssselect.parse(selector_string)
                    ]
                    # Reject the whole rule if any selector uses a
                    # pseudo-element that is not in PSEUDO_ELEMENTS.
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    # Logger.warn is a deprecated alias; use warning().
                    LOGGER.warning(
                        "Invalid or unsupported selector '%s', %s",
                        selector_string, exc)
                    continue
                yield rule, selector_list, declarations

        elif rule.at_keyword == '@import':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            url = url_join(base_url, rule.uri, '@import at %s:%s',
                           rule.line, rule.column)
            if url is not None:
                # Items of the imported stylesheet's ``rules`` are yielded
                # unchanged (presumably already preprocessed -- see CSS).
                for result in CSS(url=url,
                                  url_fetcher=url_fetcher,
                                  media_type=device_media_type).rules:
                    yield result

        elif rule.at_keyword == '@media':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            # Recurse into the rules nested in the @media block.
            for result in preprocess_stylesheet(
                    device_media_type, base_url, rule.rules, url_fetcher):
                yield result

        elif rule.at_keyword == '@page':
            page_name, pseudo_class = rule.selector
            # TODO: support named pages (see CSS3 Paged Media)
            if page_name is not None:
                LOGGER.warning(
                    'Named pages are not supported yet, the whole '
                    '@page %s rule was ignored.', page_name + (
                        ':' + pseudo_class if pseudo_class else ''))
                continue
            declarations = list(preprocess_declarations(
                base_url, rule.declarations))

            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            specificity = rule.specificity

            if declarations:
                selector_list = [Selector(specificity, None, match)]
                yield rule, selector_list, declarations

            # Margin at-rules nested in @page reuse the page's specificity
            # and matcher; their at-keyword fills the pseudo-element slot.
            for margin_rule in rule.at_rules:
                declarations = list(preprocess_declarations(
                    base_url, margin_rule.declarations))
                if declarations:
                    selector_list = [Selector(
                        specificity, margin_rule.at_keyword, match)]
                    yield margin_rule, selector_list, declarations
Ejemplo n.º 52
0
    def convert(self):
        """Remove HTML and PGDP marker from the text.

        Parses the transformation stylesheet held in ``self.mycss`` with
        tinycss and, for each element of ``self.myfile.tree`` matched by a
        rule's selectors, applies the actions derived from that rule's
        declarations (``content``, ``text-transform``,
        ``_replace_with_attr``, ``text-replace``, ``display``). Elements
        are modified in place; the method returns ``None``.

        NOTE(review): the bare ``return`` after the rules loop makes the
        footnote, illustration and sidenote passes that follow it
        unreachable -- confirm whether that is intentional.
        """

        # Process each rule from our transformation CSS
        stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
        for rule in stylesheet.rules:

            # Extract values we care about. Each starts unset and is filled
            # in by the matching declaration below.
            v_content = None
            f_transform = None
            f_replace_with_attr = None
            # Only referenced by the commented-out code further down.
            f_replace_regex = None
            f_text_replace = None
            f_element_func = None

            for val in rule.declarations:

                # Text inserted for ::before / ::after pseudo-elements.
                if val.name == 'content':
                    v_content = val.value[0].value

                # Case transformation applied through self.text_apply.
                elif val.name == "text-transform":
                    v = val.value[0].value
                    if v == "uppercase":
                        f_transform = lambda x: x.upper()
                    elif v == "lowercase":
                        f_transform = lambda x: x.lower()
                    elif v == "capitalize":
                        f_transform = lambda x: x.title()

                # Replace the element text with the value of the named
                # attribute. The lambda reads ``val`` when it is called,
                # not when it is defined.
                elif val.name == "_replace_with_attr":
                    f_replace_with_attr = lambda el: el.attrib[val.value[0].
                                                               value]

                # Literal search/replace; uses value tokens 0 and 2
                # (presumably token 1 is the separator -- confirm).
                elif val.name == "text-replace":
                    v1 = val.value[0].value
                    v2 = val.value[2].value
                    f_text_replace = lambda x: x.replace(v1, v2)

                elif val.name == "display":
                    # Support display none only. So ignore "none" argument.
                    f_element_func = clear_element

#                elif val.name == "_replace_regex":
#                    f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
#                    f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)

                # Iterate through each selector in the rule.
                # NOTE(review): this loop is indented inside the
                # ``for val in rule.declarations`` loop, so it runs once per
                # declaration with the actions collected so far -- confirm
                # this nesting is intended.
                for selector in cssselect.parse(rule.selector.as_css()):

                    pseudo_element = selector.pseudo_element

                    # Translate the CSS selector to a compiled XPath.
                    xpath = cssselect.HTMLTranslator().selector_to_xpath(
                        selector)
                    find = etree.XPath(xpath)

                    # Find each matching element in the HTML/XHTML document
                    for element in find(self.myfile.tree):

                        # Replace text with content of an attribute.
                        if f_replace_with_attr:
                            element.text = f_replace_with_attr(element)

                        # ::before prepends to the element text, ::after
                        # prepends to the text following the element.
                        if pseudo_element == "before":
                            element.text = v_content + (element.text or ''
                                                        )  # opening tag
                        elif pseudo_element == "after":
                            element.tail = v_content + (element.tail or ''
                                                        )  # closing tag

                        if f_transform:
                            self.text_apply(element, f_transform)

                        if f_text_replace:
                            self.text_apply(element, f_text_replace)

                        if f_element_func:
                            f_element_func(element)

                    # if f_replace_regex and element.text:
                    #     element.text = f_replace_regex(element.text)

        # NOTE(review): everything below this return is unreachable.
        return

        # Transform footnote anchors to [..]
        find = etree.XPath("//a")
        for element in find(self.myfile.tree):
            href = element.attrib.get('href', None)
            # Only anchors whose href starts with "#Footnote_" are handled.
            if not href or not href.startswith("#Footnote_"):
                continue

            if element.text and not element.text.startswith('['):
                # Some PP have [xx], other have just xx for a page
                # number. Do not add [ ] if they are already there.
                element.text = '[' + (element.text or '')  # opening tag
                element.tail = ']' + (element.tail or '')  # closing tag

        # Add illustration tag, wherever we find it
        for figclass in ['figcenter', 'figleft', 'figright', 'caption']:
            # Match <div> elements whose class list contains figclass.
            find = etree.XPath(
                "//div[contains(concat(' ', normalize-space(@class), ' '), ' "
                + figclass + " ')]")
            for element in find(self.myfile.tree):
                # Use "[Illustration:" when the div has more than one
                # character of text, plain "[Illustration" otherwise.
                if element.text and len(element.text) > 1:
                    element.text = '[Illustration:' + element.text  # opening tag
                else:
                    element.text = '[Illustration' + (element.text or ''
                                                      )  # opening tag
                element.tail = ']' + (element.tail or '')  # closing tag


#        for figclass in [ 'caption' ]:
#            find = etree.XPath("//p[contains(concat(' ', normalize-space(@class), ' '), ' " + figclass + " ')]")
#            for element in find(self.myfile.tree):
#                element.text = '[Illustration:' + (element.text or '')  # opening tag
#                element.tail = ']' + (element.tail or '') # closing tag

        # Add sidenote tag. ``args`` is not defined in this method; it is
        # presumably a module-level options object -- confirm.
        if args.with_sidenote_tags:
            for sntag in ['sidenote']:
                # Two XPath queries: <p> with the class in its class list,
                # and <div> whose class attribute starts with 'sidenote'.
                for find in [
                        "//p[contains(concat(' ', normalize-space(@class), ' '), ' "
                        + sntag + " ')]",
                        "//div[starts-with(@class, 'sidenote')]"
                ]:
                    for element in etree.XPath(find)(self.myfile.tree):
                        element.text = '[Sidenote:' + (element.text or ''
                                                       )  # opening tag
                        element.tail = ']' + (element.tail or ''
                                              )  # closing tag