def calculate_neo4j_query(css_selector):
    """Translate a CSS selector string into a Neo4j ``Query`` object."""
    name_gen = var_name_generator()
    query = Query(name_gen, where_clause=[], match_clause=[])
    # Only the first selector of the (possibly comma-separated) group is used.
    selector_tree = cssselect.parse(css_selector)[0].parsed_tree
    _calculate_neo4j_query(selector_tree, query, last_var=query.last_created_var)
    return query
def series(css):
    """Parse the ``an+b`` arguments of ``:nth-child(<css>)``.

    Returns the (a, b) series, or None when the arguments are invalid.
    """
    # Exactly one selector is expected; unpacking enforces that.
    selector, = parse(':nth-child(%s)' % css)
    arguments = selector.parsed_tree.arguments
    try:
        return parse_series(arguments)
    except ValueError:
        # Not a valid an+b expression.
        return None
def check_selector_list(sel, doc):
    """Return True if any selector in ``sel`` matches an element of ``doc``."""
    translator = cs.HTMLTranslator()

    def fix_colons(match):
        # Rewrite a lone ':' as '::' (e.g. a:hover -> a::hover, CSS3 form);
        # leave longer colon runs untouched.
        token = match.group(0)
        return '::' if token == ':' else token

    for raw in sel:
        candidate = re.sub(':+', fix_colons, raw)
        try:
            for parsed in cs.parse(candidate):
                if doc.xpath(translator.selector_to_xpath(parsed)):
                    return True
        except cs.parser.SelectorSyntaxError:
            # probably unsupported @media selector
            # may still be matched by subrules' selectors
            # so just skip this selector
            pass
        except Exception as e:
            print(e, "; sel='{}'".format(candidate), file=sys.stderr)
    return False
def process_rule(self, rule, is_ancestor, maximum_specificities):
    # Attach a sortable specificity to every property of ``rule`` and keep
    # track of the per-property maximum seen so far in
    # ``maximum_specificities`` (a dict: property name -> specificity tuple).
    selector = rule['selector']
    sheet_index = rule['sheet_index']
    rule_address = rule['rule_address'] or ()
    if selector is not None:
        try:
            # Specificity of the first selector in the group, prefixed with 0
            # so style-attribute rules ([1, ...]) always outrank it.
            specificity = [0] + list(parse(selector)[0].specificity())
        except (AttributeError, TypeError):
            # Unparsable selector: fall back to the lowest specificity.
            specificity = [0, 0, 0, 0]
    else:
        # style attribute
        specificity = [1, 0, 0, 0]
    # Tie-breakers: stylesheet order, then position of the rule in the sheet.
    specificity.extend((sheet_index, tuple(rule_address)))
    # Rules from ancestor elements lose to rules on the element itself.
    ancestor_specificity = 0 if is_ancestor else 1
    properties = []
    for prop in rule['properties']:
        important = 1 if prop[-1] == 'important' else 0
        p = Property(prop, [ancestor_specificity] + [important] + specificity)
        properties.append(p)
        if p.specificity > maximum_specificities.get(p.name, (0,0,0,0,0,0)):
            maximum_specificities[p.name] = p.specificity
    rule['properties'] = properties
    href = rule['href']
    if hasattr(href, 'startswith') and href.startswith('file://'):
        # Strip the file:// scheme to get a filesystem path.
        href = href[len('file://'):]
        # assumes windows drive paths arrive as /C:/... — TODO confirm nesting
        if iswindows and href.startswith('/'):
            href = href[1:]
    if href:
        rule['href'] = current_container().abspath_to_name(href, root=self.preview.current_root)
def isapplicable(cls, selector, node, enable_debug=False):
    """Return True when ``node`` matches any selector of the group ``selector``."""
    cls._enable_debug = enable_debug
    # Short-circuits on the first matching selector, like the original loop.
    return any(cls.walk(parsed, node) for parsed in parse(selector))
def tag(self, selector, **attrs):
    """Build a tag object from a single CSS selector plus extra attributes."""
    parsed = cssselect.parse(selector)
    if len(parsed) > 1:
        raise ValueError('Cannot specify more than 1 tag.')
    tag_name = None
    kwargs = {}
    node = parsed[0].parsed_tree
    # Walk the selector chain (hash/class/attrib wrappers) down to the element.
    while node:
        node_type = node.__class__
        if node_type is cssselect.parser.Hash:
            kwargs['id'] = node.id
            node = node.selector
        elif node_type is cssselect.parser.Class:
            # Accumulate class names, space-separated.
            previous = kwargs.get('class_', '')
            kwargs['class_'] = ' '.join([previous, node.class_name]).strip()
            node = node.selector
        elif node_type is cssselect.parser.Attrib:
            kwargs[node.attrib] = node.value
            node = node.selector
        elif node_type is cssselect.parser.Element:
            tag_name = node.element
            break
        else:
            raise ValueError('Unsupported selector: %s.' % selector)
    # Explicit keyword attributes override the ones derived from the selector.
    kwargs.update(attrs)
    return self.settings.tag_class(tag_name, self, **kwargs)
def parse_pseudo(css):
    """Return ``(repr, pseudo_element)`` pairs for each selector in ``css``."""
    return [
        (repr(sel._tree).replace("(u'", "('"), sel.pseudo_element)
        for sel in parse(css)
    ]
def check_selector_list(sel, doc):
    """True when at least one selector in ``sel`` matches something in ``doc``."""
    translator = cs.HTMLTranslator()

    def widen(m):
        # convert e.g. a:hover to a::hover (css3); keep '::' runs as-is
        return '::' if m.group(0) == ':' else m.group(0)

    for original in sel:
        selector = re.sub(':+', widen, original)
        try:
            parsed_list = cs.parse(selector)
        except cs.parser.SelectorSyntaxError:
            # probably unsupported @media selector
            # may still be matched by subrules' selectors
            # so just skip this selector
            continue
        except Exception as e:
            print(e, "; sel='{}'".format(selector), file=sys.stderr)
            continue
        try:
            for xpath in (translator.selector_to_xpath(p) for p in parsed_list):
                if doc.xpath(xpath):
                    return True
        except cs.parser.SelectorSyntaxError:
            continue
        except Exception as e:
            print(e, "; sel='{}'".format(selector), file=sys.stderr)
    return False
def _get_rule_specificity(self, rule):
    """ For a given CSSRule get its selector specificity in base 10 """
    import cssselect
    total = 0
    for sel in cssselect.parse(rule.selector.as_css()):
        total += self._get_specificity_from_list(sel.specificity())
    return total
def collect_for_nodes(self):
    # For every CSS rule, find the nodes it addresses and record, per node,
    # the (value, specificity) pair of each declared property.
    for ruleset in self.rules:
        try:
            selectors = cssselect.parse(ruleset.selector.as_css())
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt /
            # SystemExit — consider narrowing to `except Exception`.
            continue
        for selector in selectors:
            try:
                xpath = cssselect.HTMLTranslator().selector_to_xpath(selector)
            except cssselect.xpath.ExpressionError:
                # Selector has no XPath equivalent (e.g. pseudo-class): skip.
                continue
            # constructs a dictionnary for each node addressed by css, and
            # collects the associated declarations and priorities
            for node in self.tree.xpath(xpath):
                if node not in self.nodes:
                    self.nodes[node] = {}
                for declaration in ruleset.declarations:
                    # replaces if priority is equal of higher and if not important
                    new_specificity = selector.specificity()
                    if declaration.name in self.nodes[node]:
                        if self.nodes[node][declaration.name][1] > new_specificity:
                            continue
                    self.nodes[node][declaration.name] = (declaration.value.as_css(), selector.specificity())
                # Inline style attribute recorded with top specificity
                # (1, 0, 0, 0), so it overrides stylesheet declarations.
                # NOTE(review): placement inside the node loop reconstructed
                # from a collapsed source line — confirm against upstream.
                style_attr = node.get('style')
                if style_attr:
                    declarations = HTMLBaker.css_parser.parse_style_attr(style_attr)[0]
                    for declaration in declarations:
                        self.nodes[node][declaration.name] = (declaration.value.as_css(), (1, 0, 0, 0))
def classes_in_selector(text):
    """Collect every class name referenced by the selector group ``text``."""
    found = set()
    try:
        for sel in parse(text):
            _classes_in_selector(sel, found)
    except SelectorSyntaxError:
        # Unparsable selector: report whatever was collected so far.
        pass
    return found
def repr_parse(css):
    """Repr each parsed selector; none may carry a pseudo-element."""
    selectors = parse(css)
    for sel in selectors:
        assert sel.pseudo_element is None
    reprs = []
    for sel in selectors:
        reprs.append(repr(sel.parsed_tree).replace("(u'", "('"))
    return reprs
def match_selector(rule, tree):
    """Yield the ``(element, specificity)`` in ``tree`` matching ``rule``."""
    translator = cssselect.GenericTranslator()
    for selector in cssselect.parse(rule.selector.as_css()):
        if selector.pseudo_element:
            # Pseudo-elements cannot be matched against the document tree.
            continue
        specificity = selector.specificity()
        xpath = translator.selector_to_xpath(selector)
        for element in tree.xpath(xpath):
            yield element, specificity
def parse_pseudo(css):
    """Return ``(selector repr, pseudo_element)`` for each selector in ``css``."""
    out = []
    for sel in parse(css):
        pseudo = sel.pseudo_element
        # No Symbol here
        assert pseudo is None or type(pseudo) is _unicode
        out.append((repr(sel.parsed_tree).replace("(u'", "('"), pseudo))
    return out
def is_selector(string):
    ''' Check to see if string represents valid HTML selector. '''
    # cssselect doesn't like links, so we replace them.
    sanitized = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
    try:
        tree = cssselect.parse(sanitized)
    except SelectorSyntaxError:
        return False
    if not _do_elements_have_standard_tags(tree):
        return False
    return not _is_file_extension(tree)
def _css(cls, current, css_selector):
    # The given CSS selector may be a group selector (multiple selectors
    # delimited by commas), so we must parse out and convert the individual
    # selectors, then return their union.
    xpath_selectors = []
    for selector in parse(css_selector):
        xpath_selectors.append("{current}//{selector}".format(
            current=current, selector=_selector_to_xpath(selector)))
    return cls._union(*xpath_selectors)
def get_css_nodes(string):
    ''' REUSE: _is_selector in Tutorons server code. '''
    # cssselect doesn't like links, so we replace them.
    sanitized = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
    try:
        return get_descendants(cssselect.parse(sanitized))
    except SelectorSyntaxError:
        return []
def is_selector(string):
    '''Check to see if string represents valid HTML selector.'''
    # cssselect doesn't like links, so we replace them.
    cleaned = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
    try:
        parsed = cssselect.parse(cleaned)
    except SelectorSyntaxError:
        return False
    return _do_elements_have_standard_tags(parsed) and not _is_file_extension(parsed)
def _css(self, current, css_selector):
    # The given CSS selector may be a group selector (multiple selectors
    # delimited by commas), so we must parse out and convert the individual
    # selectors, then return their union.
    converted = [
        "{0}//{1}".format(current, _selector_to_xpath(selector))
        for selector in parse(css_selector)
    ]
    return self._union(*converted)
def _parse_locator(css_or_xpath: str) -> tuple:
    """Classify a locator string, returning ("css", s) or ("xpath", s).

    Raises TypeError for non-strings and ValueError when the string parses
    as neither a CSS selector nor an XPath expression.
    """
    if not isinstance(css_or_xpath, str):
        raise TypeError("Locator {!r} is not a string.".format(css_or_xpath))
    try:
        cssselect.parse(css_or_xpath)
        return "css", css_or_xpath
    except cssselect.SelectorSyntaxError:
        pass
    try:
        etree.XPath(css_or_xpath)
        return "xpath", css_or_xpath
    except etree.XPathSyntaxError:
        pass
    raise ValueError(
        "Locator {!r} neither a css nor an xpath string.".format(css_or_xpath))
def parse_qualified_rule(rule):
    """Parse the prelude of a qualified rule and process each selector."""
    strselector = tinycss2.serializer.serialize(rule.prelude)
    if not strselector:
        # Empty prelude: nothing to process.
        return
    try:
        parsed = cssselect.parse(strselector)
    except cssselect.SelectorError as ex:
        #log.error('Error: parsing css select: %s %s' % (strselector, ex))
        print('Error: parsing css select: %s %s' % (strselector, ex))
        raise
    for sel in parsed:
        process_cssselect_comp(sel.parsed_tree, rule)
def is_selector(string):
    '''Check to see if string represents valid HTML selector.'''
    try:
        # cssselect doesn't play well with links, so we replace them for now.
        sanitized = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
        parts = get_descendants(cssselect.parse(sanitized))
        # Every Element node must use a known HTML tag name.
        return all(part.element in HTML_TAGS
                   for part in parts if isinstance(part, Element))
    except SelectorSyntaxError:
        return False
def is_selector(string):
    """Check to see if string represents valid HTML selector."""
    try:
        # cssselect doesn't play well with links, so we replace them for now.
        cleaned = re.sub(r"(href.=)([^\]]*)\]", r"\1fakelink]", string)
        tree = cssselect.parse(cleaned)
        for part in get_descendants(tree):
            # Reject any element whose tag is not a known HTML tag.
            if isinstance(part, Element) and part.element not in HTML_TAGS:
                return False
        return True
    except SelectorSyntaxError:
        return False
def _token_list_matches_tree(self, token_list):
    """
    Returns whether the token list matches the HTML tree

    :param token_list: A Token list to check
    :type token_list: list of Token objects
    :returns: True if the token list has matches in self.tree
    :rtype: bool
    """
    try:
        css = "".join(token.as_css() for token in token_list)
        parsed_selector = cssselect.parse(css)[0]
        xpath = self.xpath_translator.selector_to_xpath(parsed_selector)
        return bool(self.tree.xpath(xpath))
    except Exception:
        # On error, assume the selector matches the tree (keeping a rule is
        # safer than wrongly dropping it). Narrowed from a bare ``except:``,
        # which would also swallow KeyboardInterrupt/SystemExit.
        return True
def _token_list_matches_tree(self, token_list):
    """
    Returns whether the token list matches the HTML tree

    :param token_list: A Token list to check
    :type token_list: list of Token objects
    :returns: True if the token list has matches in self.tree
    :rtype: bool
    """
    try:
        selector_css = ''.join(token.as_css() for token in token_list)
        parsed_selector = cssselect.parse(selector_css)[0]
        return bool(
            self.tree.xpath(
                self.xpath_translator.selector_to_xpath(parsed_selector)))
    except Exception:
        # On error, assume the selector matches the tree. Narrowed from a
        # bare ``except:`` so KeyboardInterrupt/SystemExit still propagate.
        return True
def get_metrics(rule):
    """Run ``analyze-css`` on one CSS rule and return a pruned metrics dict.

    Adds a base-10 ``specificity`` value (a*100 + b*10 + c, maximum over the
    rule's selectors) and a ``specificity_category`` of 'high'/'medium'/'low'.
    """
    # Feed the rule to analyze-css on stdin instead of interpolating it into
    # a shell command line: the old `echo "<rule>" | analyze-css -` form
    # broke on embedded quotes and allowed shell injection.
    proc = subprocess.Popen(['analyze-css', '-'],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    # echo appended a trailing newline; keep that for identical input.
    (o, err) = proc.communicate(input=(rule + "\n").encode('utf-8'))
    result = json.loads(o)
    metrics = result["metrics"]
    # Guarantee the counters we report on are present.
    for key in ("rules", "declarations", "selectors"):
        metrics.setdefault(key, 0)
    remove = ["imports","rules","comments","commentsLength","duplicatedSelectors","emptyRules","base64Length","redundantBodySelectors","redundantChildNodesSelectors"]
    for key in remove:
        metrics.pop(key, None)
    # Specificity of the selector part only (text before the '{').
    selectors = rule[:rule.find("{")]
    specificity = 0
    for sel in cssselect.parse(selectors):
        a, b, c = sel.specificity()
        specificity = max(specificity, a * 100 + b * 10 + c)
    metrics["specificity"] = specificity
    if specificity >= 100:
        metrics["specificity_category"] = 'high'
    elif specificity >= 10:
        metrics["specificity_category"] = 'medium'
    else:
        metrics["specificity_category"] = 'low'
    return metrics
def convert(self):
    """Remove HTML and PGDP marker from the text."""
    escaped_unicode_re = re.compile(r"\\u[0-9a-fA-F]{4}")

    def escaped_unicode(m):
        # Decode a literal \uXXXX escape into the corresponding character;
        # on failure, leave the text untouched.
        try:
            newstr = bytes(m.group(0), 'utf8').decode('unicode-escape')
        except Exception:
            newstr = m.group(0)
        return newstr

    def new_content(element):
        """Process the "content:" property """
        # NOTE: reads ``val`` from the enclosing loop — this is the last
        # declaration visited for the current rule (late binding).
        retstr = ""
        for token in val.value:
            if token.type == "STRING":
                # e.g. { content: "xyz" }
                retstr += escaped_unicode_re.sub(escaped_unicode, token.value)
            elif token.type == "FUNCTION":
                if token.function_name == 'attr':
                    # e.g. { content: attr(title) }
                    retstr += element.attrib.get(token.content[0].value, "")
            elif token.type == "IDENT":
                if token.value == "content":
                    # Identity, e.g. { content: content }
                    retstr += element.text
        return retstr

    # Process each rule from our transformation CSS
    stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
    property_errors = []
    for rule in stylesheet.rules:
        # Extract values we care about
        f_transform = None
        f_replace_with_attr = None
        #f_replace_regex = None
        f_text_replace = None
        f_element_func = None
        f_move = None
        for val in rule.declarations:
            if val.name == 'content':
                # result depends on element and pseudo elements.
                pass
            elif val.name == "text-transform":
                if len(val.value) != 1:
                    property_errors += [(val.line, val.column,
                                         val.name + " takes 1 argument")]
                else:
                    v = val.value[0].value
                    if v == "uppercase":
                        f_transform = lambda x: x.upper()
                    elif v == "lowercase":
                        f_transform = lambda x: x.lower()
                    elif v == "capitalize":
                        f_transform = lambda x: x.title()
                    else:
                        property_errors += [(
                            val.line, val.column, val.name +
                            " accepts only 'uppercase', 'lowercase' or 'capitalize'"
                        )]
            elif val.name == "_replace_with_attr":
                # NOTE: closes over ``val`` late — applied later with the
                # loop's final value of ``val``.
                f_replace_with_attr = lambda el: el.attrib[val.value[0].value]
            elif val.name == "text-replace":
                # Skip S (spaces) tokens.
                values = [v for v in val.value if v.type != "S"]
                if len(values) != 2:
                    property_errors += [
                        (val.line, val.column,
                         val.name + " takes 2 string arguments")
                    ]
                else:
                    v1 = values[0].value
                    v2 = values[1].value
                    f_text_replace = lambda x: x.replace(v1, v2)
            elif val.name == "display":
                # Support display none only. So ignore "none" argument.
                f_element_func = clear_element
            elif val.name == "_graft":
                values = [v for v in val.value if v.type != "S"]
                if len(values) < 1:
                    property_errors += [
                        (val.line, val.column,
                         val.name + " takes at least one argument")
                    ]
                    continue
                f_move = []
                for v in values:
                    print("[", v.value, "]")
                    if v.value == 'parent':
                        f_move.append(lambda el: el.getparent())
                    elif v.value == 'prev-sib':
                        f_move.append(lambda el: el.getprevious())
                    elif v.value == 'next-sib':
                        f_move.append(lambda el: el.getnext())
                    else:
                        property_errors += [
                            (val.line, val.column,
                             val.name + " invalid value " + v.value)
                        ]
                        f_move = None
                        break
                if not f_move:
                    continue
            # elif val.name == "_replace_regex":
            #     f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
            #     f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)
            else:
                property_errors += [(val.line, val.column,
                                     "Unsupported property " + val.name)]
                continue
        # Iterate through each selectors in the rule
        for selector in cssselect.parse(rule.selector.as_css()):
            pseudo_element = selector.pseudo_element
            xpath = cssselect.HTMLTranslator().selector_to_xpath(selector)
            find = etree.XPath(xpath)
            # Find each matching element in the HTML/XHTML document
            for element in find(self.myfile.tree):
                # Replace text with content of an attribute.
                if f_replace_with_attr:
                    element.text = f_replace_with_attr(element)
                # NOTE(review): ``val`` here is whatever declaration the
                # inner loop ended on — confirm this leaky-variable use.
                if val.name == 'content':
                    v_content = new_content(element)
                    if pseudo_element == "before":
                        element.text = v_content + (element.text or '')  # opening tag
                    elif pseudo_element == "after":
                        element.tail = v_content + (element.tail or '')  # closing tag
                    else:
                        # Replace all content
                        element.text = new_content(element)
                if f_transform:
                    self.text_apply(element, f_transform)
                if f_text_replace:
                    self.text_apply(element, f_text_replace)
                if f_element_func:
                    f_element_func(element)
                if f_move:
                    parent = element.getparent()
                    new = element
                    for f in f_move:
                        new = f(new)
                    # Move the tail to the sibling or the parent
                    if element.tail:
                        sibling = element.getprevious()
                        if sibling:
                            sibling.tail = (sibling.tail or "") + element.tail
                        else:
                            parent.text = (parent.text or "") + element.tail
                        element.tail = None
                    # Prune and graft
                    parent.remove(element)
                    new.append(element)
                # if f_replace_regex and element.text:
                #     element.text = f_replace_regex(element.text)
    css_errors = ""
    if stylesheet.errors or property_errors:
        # There is transformation CSS errors. If the default css
        # is included, take the offset into account.
        i = 0
        if self.args.css_no_default is False:
            i = DEFAULT_TRANSFORM_CSS.count('\n')
        css_errors = "<div class='error-border bbox'><p>Error(s) in the transformation CSS:</p><ul>"
        for err in stylesheet.errors:
            css_errors += "<li>{0},{1}: {2}</li>".format(
                err.line - i, err.column, err.reason)
        for err in property_errors:
            css_errors += "<li>{0},{1}: {2}</li>".format(
                err[0] - i, err[1], err[2])
        css_errors += "</ul>"
    return css_errors
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, rules, fonts, font_config):
    """Do the work that can be done early on stylesheet, before they are
    in a document.
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in stylesheet_rules:
        if not rule.at_keyword:
            # Plain ruleset: compile selectors to XPath and keep the
            # preprocessed declarations.
            declarations = list(preprocess_declarations(
                base_url, rule.declarations))
            if declarations:
                selector_string = rule.selector.as_css()
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(Selector(
                            (0,) + selector.specificity(),
                            selector.pseudo_element, lxml_xpath))
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    # A single bad selector drops the whole ruleset.
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   selector_string, exc)
                    continue
                rules.append((rule, selector_list, declarations))
        elif rule.at_keyword == '@import':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            url = url_join(base_url, rule.uri, '@import at %s:%s',
                           rule.line, rule.column)
            if url is not None:
                try:
                    stylesheet = CSS(
                        url=url, url_fetcher=url_fetcher,
                        media_type=device_media_type, font_config=font_config)
                except URLFetchingError as exc:
                    LOGGER.warning('Failed to load stylesheet at %s : %s',
                                   url, exc)
                else:
                    # Imported sheet's rules are already preprocessed.
                    for result in stylesheet.rules:
                        rules.append(result)
        elif rule.at_keyword == '@media':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            # Recurse into the nested rules of the matching media block.
            preprocess_stylesheet(
                device_media_type, base_url, rule.rules, url_fetcher,
                rules, fonts, font_config)
        elif rule.at_keyword == '@page':
            page_name, pseudo_class = rule.selector
            # TODO: support named pages (see CSS3 Paged Media)
            if page_name is not None:
                LOGGER.warning(
                    'Named pages are not supported yet, the whole '
                    '@page %s rule was ignored.', page_name + (
                        ':' + pseudo_class if pseudo_class else ''))
                continue
            declarations = list(preprocess_declarations(
                base_url, rule.declarations))
            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            specificity = rule.specificity
            if declarations:
                selector_list = [Selector(specificity, None, match)]
                rules.append((rule, selector_list, declarations))
            # Margin rules (@top-left, ...) inherit the page's specificity.
            for margin_rule in rule.at_rules:
                declarations = list(preprocess_declarations(
                    base_url, margin_rule.declarations))
                if declarations:
                    selector_list = [Selector(
                        specificity, margin_rule.at_keyword, match)]
                    rules.append((margin_rule, selector_list, declarations))
        elif rule.at_keyword == '@font-face':
            rule_descriptors = dict(list(preprocess_descriptors(
                base_url, rule.declarations)))
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'), rule.line, rule.column)
                    break
            else:
                # Both mandatory descriptors present: register the font.
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)
def _parseStylesheet(self, fileName, htmlBody):
    """
    Reads in a stylesheet and parses it.

    Parameters
    ----------
    fileName : string
        File name to parse.
    htmlBody : lxml.etree.Element
        Root element of the body.

    Modifies
    --------
    htmlBody
        Some of the more complex selectors can't be stored as a simple
        lookup, so these values are added to the inline "style" attribute
        of the appropriate tags.

    Returns
    -------
    out : [ dictionary ] * 3
        List of three dictionaries: [tagLookup idLookup classLookup]
        Each is a lookup table for its respective selector type and is of
        the form: { 'name' : {dictionary} }, where the linked dictionaries
        contain that selector's parsed declarations.
    """
    cssparser = tinycss.make_parser()
    stylesheet = cssparser.parse_stylesheet_file(fileName)
    out = [{}, {}, {}]
    for rule in stylesheet.rules:
        # Skip at keywords (was `!= None`; identity test is the Python idiom)
        if rule.at_keyword is not None:
            continue
        thisDecl = self._declaration2dict(rule.declarations)
        ## Decode selector types
        parsedSelectors = cssselect.parse(rule.selector.as_css())
        for thisSel in parsedSelectors:  # index from enumerate() was unused
            ## Do the easy selectors as a dictionary
            if sum(thisSel.specificity()) < 2:
                if hasattr(thisSel.parsed_tree, 'id'):
                    # We have a single ID selector
                    out[1][thisSel.parsed_tree.id] = thisDecl
                    continue
                elif hasattr(thisSel.parsed_tree, 'class_name'):
                    # We have a single CLASS selector
                    out[2][thisSel.parsed_tree.class_name] = thisDecl
                    continue
                elif hasattr(thisSel.parsed_tree, 'element'):
                    # We have a single TAG selector
                    out[0][thisSel.parsed_tree.element] = thisDecl
                    continue
            ## Do the hard selectors as in-line style
            # NOTE(review): lxml's cssselect() documents a *string* selector;
            # passing the parsed Selector object here may need converting —
            # confirm against lxml usage elsewhere in the project.
            for elt in htmlBody.cssselect(thisSel):
                declStr = self._decl2str(rule.declarations)
                try:
                    elt.attrib['style'] += "; " + declStr
                except KeyError:
                    # No existing style attribute (was a bare except:)
                    elt.attrib['style'] = declStr
    return out
def parse_rule(rule):
    """Yield one Rule per selector found in the qualified rule's prelude."""
    prelude_css = "".join(part.serialize() for part in rule.prelude)
    for sel in cssselect.parse(prelude_css):
        # Properties are parsed per selector, matching the original behaviour.
        yield Rule(stringify_selector(sel), parse_properties(rule.content))
from tinycss.css21 import CSS21Parser import cssselect import sys css = open(sys.argv[-1],'r').read() styles = CSS21Parser().parse_stylesheet(css) for ruleIdx in range(len(styles.rules)): rule = styles.rules[ruleIdx] selectors = rule.selector selector_string = selectors.as_css() specs = [s.specificity() for s in cssselect.parse(selector_string)] for i in range(len(specs)): spec = specs[i][0] * 100 + specs[i][1] * 10 + specs[i][2] spec = spec * 1000 + ruleIdx specs[i] = spec for decl in rule.declarations: if len(decl.value) == 1: decl_value = "'" + str(decl.value[0].value) + "'" else: decl_value = '[' + ','.join(["'"+str(x.value)+"'" for x in decl.value if x.type != 'S']) + ']' # A0,...,A(N-1) each selector # AN final node N = len(selectors) specIdx = 0 print "rule(A%d,'%s',%s,%d):-" % (N,decl.name,decl_value,specs[specIdx]) for i in range(N):
def specificity(css):
    """Return the specificity tuple of a single-selector CSS string."""
    selectors = parse(css)
    assert len(selectors) == 1
    only = selectors[0]
    return only.specificity()
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, rules, fonts, font_config):
    """Do the work that can be done early on stylesheet, before they are
    in a document.
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in stylesheet_rules:
        if rule.type == 'qualified-rule':
            # Plain ruleset: compile selectors to XPath and keep the
            # preprocessed declarations.
            declarations = list(
                preprocess_declarations(
                    base_url, tinycss2.parse_declaration_list(rule.content)))
            if declarations:
                selector_string = tinycss2.serialize(rule.prelude)
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(
                            Selector((0, ) + selector.specificity(),
                                     selector.pseudo_element, lxml_xpath))
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    # A single bad selector drops the whole ruleset.
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   selector_string, exc)
                    continue
                rules.append((rule, selector_list, declarations))
        elif rule.type == 'at-rule' and rule.at_keyword == 'import':
            tokens = remove_whitespace(rule.prelude)
            if tokens and tokens[0].type in ('url', 'string'):
                url = tokens[0].value
            else:
                continue
            media = parse_media_query(tokens[1:])
            if media is None:
                # NOTE(review): no `continue` after this warning — the code
                # falls through to evaluate_media_query(None, ...); confirm
                # whether a `continue` was lost in transcription.
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @import rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
            if not evaluate_media_query(media, device_media_type):
                continue
            url = url_join(base_url, url, allow_relative=False,
                           context='@import at %s:%s',
                           context_args=(rule.source_line, rule.source_column))
            if url is not None:
                try:
                    stylesheet = CSS(url=url,
                                     url_fetcher=url_fetcher,
                                     media_type=device_media_type,
                                     font_config=font_config)
                except URLFetchingError as exc:
                    LOGGER.warning('Failed to load stylesheet at %s : %s',
                                   url, exc)
                else:
                    # Imported sheet's rules are already preprocessed.
                    for result in stylesheet.rules:
                        rules.append(result)
        elif rule.type == 'at-rule' and rule.at_keyword == 'media':
            media = parse_media_query(rule.prelude)
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @media rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            # Recurse into the nested rules of the matching media block.
            content_rules = tinycss2.parse_rule_list(rule.content)
            preprocess_stylesheet(device_media_type, base_url, content_rules,
                                  url_fetcher, rules, fonts, font_config)
        elif rule.type == 'at-rule' and rule.at_keyword == 'page':
            tokens = remove_whitespace(rule.prelude)
            # TODO: support named pages (see CSS3 Paged Media)
            if not tokens:
                # Bare @page: applies to every page.
                pseudo_class = None
                specificity = (0, 0)
            elif (len(tokens) == 2 and tokens[0].type == 'literal'
                    and tokens[0].value == ':' and tokens[1].type == 'ident'):
                pseudo_class = tokens[1].lower_value
                specificity = {
                    'first': (1, 0), 'blank': (1, 0),
                    'left': (0, 1), 'right': (0, 1),
                }.get(pseudo_class)
                if not specificity:
                    LOGGER.warning(
                        'Unknown @page pseudo-class "%s", '
                        'the whole @page rule was ignored '
                        'at %s:%s.', pseudo_class,
                        rule.source_line, rule.source_column)
                    continue
            else:
                LOGGER.warning(
                    'Unsupported @page selector "%s", '
                    'the whole @page rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                continue
            content = tinycss2.parse_declaration_list(rule.content)
            declarations = list(preprocess_declarations(base_url, content))
            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            if declarations:
                selector_list = [Selector(specificity, None, match)]
                rules.append((rule, selector_list, declarations))
            # Nested margin rules (@top-left, ...) inherit the page's
            # specificity; their at-keyword is prefixed with '@'.
            for margin_rule in content:
                if margin_rule.type != 'at-rule':
                    continue
                declarations = list(
                    preprocess_declarations(
                        base_url,
                        tinycss2.parse_declaration_list(margin_rule.content)))
                if declarations:
                    selector_list = [
                        Selector(specificity, '@' + margin_rule.at_keyword,
                                 match)
                    ]
                    rules.append((margin_rule, selector_list, declarations))
        elif rule.type == 'at-rule' and rule.at_keyword == 'font-face':
            content = tinycss2.parse_declaration_list(rule.content)
            rule_descriptors = dict(preprocess_descriptors(base_url, content))
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'),
                        rule.source_line, rule.source_column)
                    break
            else:
                # Both mandatory descriptors present: register the font.
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)
def preprocess_stylesheet(device_media_type, base_url, rules, url_fetcher):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Generator: yields ``(rule, selector_list, declarations)`` triples.
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in rules:
        if not rule.at_keyword:
            # Plain ruleset: compile selectors to XPath and keep the
            # preprocessed declarations.
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))
            if declarations:
                selector_string = rule.selector.as_css()
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(
                            Selector((0, ) + selector.specificity(),
                                     selector.pseudo_element, lxml_xpath))
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    # A single bad selector drops the whole ruleset.
                    LOGGER.warn("Invalid or unsupported selector '%s', %s",
                                selector_string, exc)
                    continue
                yield rule, selector_list, declarations
        elif rule.at_keyword == '@import':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            url = url_join(base_url, rule.uri, '@import at %s:%s',
                           rule.line, rule.column)
            if url is not None:
                try:
                    stylesheet = CSS(url=url, url_fetcher=url_fetcher,
                                     media_type=device_media_type)
                except URLFetchingError as exc:
                    LOGGER.warn('Failed to load stylesheet at %s : %s',
                                url, exc)
                else:
                    # Imported sheet's rules are already preprocessed.
                    for result in stylesheet.rules:
                        yield result
        elif rule.at_keyword == '@media':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            # Recurse into the nested rules of the matching media block.
            for result in preprocess_stylesheet(device_media_type, base_url,
                                                rule.rules, url_fetcher):
                yield result
        elif rule.at_keyword == '@page':
            page_name, pseudo_class = rule.selector
            # TODO: support named pages (see CSS3 Paged Media)
            if page_name is not None:
                LOGGER.warn(
                    'Named pages are not supported yet, the whole '
                    '@page %s rule was ignored.',
                    page_name + (':' + pseudo_class if pseudo_class else ''))
                continue
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))
            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            specificity = rule.specificity
            if declarations:
                selector_list = [Selector(specificity, None, match)]
                yield rule, selector_list, declarations
            # Margin rules (@top-left, ...) inherit the page's specificity.
            for margin_rule in rule.at_rules:
                declarations = list(
                    preprocess_declarations(base_url,
                                            margin_rule.declarations))
                if declarations:
                    selector_list = [
                        Selector(specificity, margin_rule.at_keyword, match)
                    ]
                    yield margin_rule, selector_list, declarations
def __init__(self, html, css, width, load_resourcefn, text_extents, font_extents, user_data):
    """Parse *html* and *css*, build a per-element style map, then the
    layout tree (Python 2 version).

    load_resourcefn / text_extents / font_extents / user_data are stored
    as callbacks and opaque state for later layout/rendering stages.
    """
    self.text_extents = text_extents
    self.font_extents = font_extents
    self.load_resourcefn = load_resourcefn
    self.user_data = user_data
    if VERBOSE:
        start = time.clock()
        end = time.clock()
        print "robinson: %8.3fs lxml parsing..." % (end-start)
    # NOTE(review): the profiler is created here and enabled below, but
    # its data is never dumped anywhere in this function — confirm intent.
    pr = cProfile.Profile()
    root = etree.fromstring(html)
    document = etree.ElementTree(root)
    if VERBOSE:
        end = time.clock()
        print repr(root), root.__class__
        print document, repr(document), document.__class__
        print etree.tostring(document.getroot())
        print "robinson: %8.3fs tinycss.css21.CSS21Parser()..." % (end-start)
    cssparser = tinycss.css21.CSS21Parser()
    stylesheet = cssparser.parse_stylesheet(css)
    if VERBOSE:
        end = time.clock()
        print "robinson: %8.3fs style mapping..." % (end-start)
    style_map = {}
    sel_to_xpath = cssselect.xpath.HTMLTranslator().selector_to_xpath
    for rule in stylesheet.rules:
        # Only plain rulesets participate; at-rules are skipped.
        if not isinstance (rule, tinycss.css21.RuleSet):
            continue
        sel_css = rule.selector.as_css()
        sels = cssselect.parse (sel_css)
        #print "CSS Ruleset: %s" % (rule.selector.as_css())
        for sel in sels:
            # Collapse the specificity tuple into a single sortable int.
            speci = sel.specificity()
            prio = speci2prio (speci)
            #print "  selector: %s, specificity: %s (%06d)" % (repr(sel), sel.specificity(), prio)
            xpath = sel_to_xpath (sel)
            #print "    xpath: %s" % repr(xpath)
            for item in document.xpath(xpath):
                #print "      matched item: %s" % repr(item.tag)
                if not item in style_map:
                    style_map[item] = {}
                # Per property name, the highest-priority declaration wins.
                for decl in rule.declarations:
                    #print "      declaration: %s: %s" % (decl.name, decl.value)
                    if not decl.name in style_map[item]:
                        style_map[item][decl.name] = (prio, Value.from_token(decl.value))
                    else:
                        if prio > style_map[item][decl.name][0]:
                            style_map[item][decl.name] = (prio, Value.from_token(decl.value))
    #print "Style map done."
    #print repr(style_map)
    if VERBOSE:
        end = time.clock()
        print "robinson: %8.3fs building layout tree..." % (end-start)
    pr.enable()
    viewport = Dimensions ()
    viewport.content.width = width
    self.ltree = self._layout_tree (document.getroot(), style_map, viewport)
    if VERBOSE:
        end = time.clock()
        print "robinson: %8.3fs __init__ done." % (end-start)
def search(self, selector): return [self._search(selector, sel.parsed_tree) for sel in cs.parse(selector)]
def convert(self):
    """Remove HTML and PGDP marker from the text.

    Applies each rule of the transformation CSS (self.mycss) to the
    matching elements of self.myfile.tree.
    """
    # Process each rule from our transformation CSS
    stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
    for rule in stylesheet.rules:
        # Extract values we care about
        v_content = None
        f_transform = None
        f_replace_with_attr = None
        f_replace_regex = None
        f_text_replace = None
        f_element_func = None
        for val in rule.declarations:
            if val.name == 'content':
                v_content = val.value[0].value
            elif val.name == "text-transform":
                v = val.value[0].value
                if v == "uppercase":
                    f_transform = lambda x: x.upper()
                elif v == "lowercase":
                    f_transform = lambda x: x.lower()
                elif v == "capitalize":
                    f_transform = lambda x: x.title()
            elif val.name == "_replace_with_attr":
                # Freeze the attribute name as a default argument: a plain
                # closure over `val` late-binds and would read whatever
                # declaration this loop ends on, not this one.
                f_replace_with_attr = (
                    lambda el, _attr=val.value[0].value: el.attrib[_attr])
            elif val.name == "text-replace":
                # Same late-binding hazard for v1/v2: bind both now.
                v1 = val.value[0].value
                v2 = val.value[2].value
                f_text_replace = (
                    lambda x, _old=v1, _new=v2: x.replace(_old, _new))
            elif val.name == "display":
                # Support display none only. So ignore "none" argument.
                f_element_func = clear_element
            # elif val.name == "_replace_regex":
            #     f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
            #     f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)

        # Iterate through each selectors in the rule
        for selector in cssselect.parse(rule.selector.as_css()):
            pseudo_element = selector.pseudo_element
            xpath = cssselect.HTMLTranslator().selector_to_xpath(selector)
            find = etree.XPath(xpath)
            # Find each matching element in the HTML/XHTML document
            for element in find(self.myfile.tree):
                # Replace text with content of an attribute.
                if f_replace_with_attr:
                    element.text = f_replace_with_attr(element)
                if pseudo_element == "before":
                    element.text = v_content + (element.text or '')  # opening tag
                elif pseudo_element == "after":
                    element.tail = v_content + (element.tail or '')  # closing tag
                if f_transform:
                    self.text_apply(element, f_transform)
                if f_text_replace:
                    self.text_apply(element, f_text_replace)
                if f_element_func:
                    f_element_func(element)
                # if f_replace_regex and element.text:
                #     element.text = f_replace_regex(element.text)

    # NOTE(review): this unconditional return makes every pass below
    # unreachable; it looks like the footnote/illustration/sidenote
    # passes were deliberately disabled — confirm before deleting or
    # re-enabling them.
    return

    # Transform footnote anchors to [..]
    find = etree.XPath("//a")
    for element in find(self.myfile.tree):
        href = element.attrib.get('href', None)
        if not href or not href.startswith("#Footnote_"):
            continue
        if element.text and not element.text.startswith('['):
            # Some PP have [xx], other have just xx for a page
            # number. Do not add [ ] if they are already there.
            element.text = '[' + (element.text or '')  # opening tag
            element.tail = ']' + (element.tail or '')  # closing tag

    # Add illustration tag, wherever we find it
    for figclass in ['figcenter', 'figleft', 'figright', 'caption']:
        find = etree.XPath("//div[contains(concat(' ', normalize-space(@class), ' '), ' " + figclass + " ')]")
        for element in find(self.myfile.tree):
            if element.text and len(element.text) > 1:
                element.text = '[Illustration:' + element.text  # opening tag
            else:
                element.text = '[Illustration' + (element.text or '')  # opening tag
            element.tail = ']' + (element.tail or '')  # closing tag

    # for figclass in [ 'caption' ]:
    #     find = etree.XPath("//p[contains(concat(' ', normalize-space(@class), ' '), ' " + figclass + " ')]")
    #     for element in find(self.myfile.tree):
    #         element.text = '[Illustration:' + (element.text or '')  # opening tag
    #         element.tail = ']' + (element.tail or '')  # closing tag

    # Add sidenote tag
    if args.with_sidenote_tags:
        for sntag in ['sidenote']:
            for find in ["//p[contains(concat(' ', normalize-space(@class), ' '), ' " + sntag + " ')]",
                         "//div[starts-with(@class, 'sidenote')]"]:
                for element in etree.XPath(find)(self.myfile.tree):
                    element.text = '[Sidenote:' + (element.text or '')  # opening tag
                    element.tail = ']' + (element.tail or '')  # closing tag
def convert(self):
    """Remove HTML and PGDP marker from the text.

    Applies each rule of the transformation CSS (self.mycss) to the
    matching elements of self.myfile.tree, and returns an HTML fragment
    describing any CSS/property errors (empty string when clean).
    """
    escaped_unicode_re = re.compile(r"\\u[0-9a-fA-F]{4}")

    def escaped_unicode(m):
        # Decode a literal \uXXXX escape; keep the text as-is on failure.
        try:
            newstr = bytes(m.group(0), 'utf8').decode('unicode-escape')
        except Exception:
            newstr = m.group(0)
        return newstr

    def new_content(element):
        """Process the "content:" property """
        # NOTE(review): reads `val` leaked from the declarations loop
        # below, so it only sees the rule's *last* declaration — confirm
        # 'content' is expected to be last in each rule.
        retstr = ""
        for token in val.value:
            if token.type == "STRING":
                # e.g. { content: "xyz" }
                retstr += escaped_unicode_re.sub(escaped_unicode, token.value)
            elif token.type == "FUNCTION":
                if token.function_name == 'attr':
                    # e.g. { content: attr(title) }
                    retstr += element.attrib.get(token.content[0].value, "")
            elif token.type == "IDENT":
                if token.value == "content":
                    # Identity, e.g. { content: content }
                    retstr += element.text
        return retstr

    # Process each rule from our transformation CSS
    stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
    property_errors = []
    for rule in stylesheet.rules:
        # Extract values we care about
        f_transform = None
        f_replace_with_attr = None
        #f_replace_regex = None
        f_text_replace = None
        f_element_func = None
        f_move = None
        for val in rule.declarations:
            if val.name == 'content':
                # result depends on element and pseudo elements.
                pass
            elif val.name == "text-transform":
                if len(val.value) != 1:
                    property_errors += [(val.line, val.column,
                                         val.name + " takes 1 argument")]
                else:
                    v = val.value[0].value
                    if v == "uppercase":
                        f_transform = lambda x: x.upper()
                    elif v == "lowercase":
                        f_transform = lambda x: x.lower()
                    elif v == "capitalize":
                        f_transform = lambda x: x.title()
                    else:
                        property_errors += [(val.line, val.column, val.name +
                                             " accepts only 'uppercase', 'lowercase' or 'capitalize'")]
            elif val.name == "_replace_with_attr":
                # Freeze the attribute name as a default argument: a plain
                # closure over `val` late-binds and would read whatever
                # declaration this loop ends on, not this one.
                f_replace_with_attr = (
                    lambda el, _attr=val.value[0].value: el.attrib[_attr])
            elif val.name == "text-replace":
                # Skip S (spaces) tokens.
                values = [v for v in val.value if v.type != "S"]
                if len(values) != 2:
                    property_errors += [(val.line, val.column,
                                         val.name + " takes 2 string arguments")]
                else:
                    # Same late-binding hazard: bind both strings now.
                    v1 = values[0].value
                    v2 = values[1].value
                    f_text_replace = (
                        lambda x, _old=v1, _new=v2: x.replace(_old, _new))
            elif val.name == "display":
                # Support display none only. So ignore "none" argument.
                f_element_func = clear_element
            elif val.name == "_graft":
                values = [v for v in val.value if v.type != "S"]
                if len(values) < 1:
                    property_errors += [(val.line, val.column,
                                         val.name + " takes at least one argument")]
                    continue
                f_move = []
                for v in values:
                    if v.value == 'parent':
                        f_move.append(lambda el: el.getparent())
                    elif v.value == 'prev-sib':
                        f_move.append(lambda el: el.getprevious())
                    elif v.value == 'next-sib':
                        f_move.append(lambda el: el.getnext())
                    else:
                        property_errors += [(val.line, val.column,
                                             val.name + " invalid value " + v.value)]
                        f_move = None
                        break
                if not f_move:
                    continue
            # elif val.name == "_replace_regex":
            #     f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
            #     f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)
            else:
                property_errors += [(val.line, val.column,
                                     "Unsupported property " + val.name)]
                continue

        # Iterate through each selectors in the rule
        for selector in cssselect.parse(rule.selector.as_css()):
            pseudo_element = selector.pseudo_element
            xpath = cssselect.HTMLTranslator().selector_to_xpath(selector)
            find = etree.XPath(xpath)
            # Find each matching element in the HTML/XHTML document
            for element in find(self.myfile.tree):
                # Replace text with content of an attribute.
                if f_replace_with_attr:
                    element.text = f_replace_with_attr(element)
                if val.name == 'content':
                    v_content = new_content(element)
                    if pseudo_element == "before":
                        element.text = v_content + (element.text or '')  # opening tag
                    elif pseudo_element == "after":
                        element.tail = v_content + (element.tail or '')  # closing tag
                    else:
                        # Replace all content
                        element.text = new_content(element)
                if f_transform:
                    self.text_apply(element, f_transform)
                if f_text_replace:
                    self.text_apply(element, f_text_replace)
                if f_element_func:
                    f_element_func(element)
                if f_move:
                    parent = element.getparent()
                    new = element
                    for f in f_move:
                        new = f(new)
                    # Move the tail to the sibling or the parent
                    if element.tail:
                        sibling = element.getprevious()
                        # Compare against None explicitly: lxml element
                        # truthiness depends on child count, so a childless
                        # previous sibling would wrongly be treated as absent.
                        if sibling is not None:
                            sibling.tail = (sibling.tail or "") + element.tail
                        else:
                            parent.text = (parent.text or "") + element.tail
                        element.tail = None
                    # Prune and graft
                    parent.remove(element)
                    new.append(element)
                # if f_replace_regex and element.text:
                #     element.text = f_replace_regex(element.text)

    css_errors = ""
    if stylesheet.errors or property_errors:
        # There is transformation CSS errors. If the default css
        # is included, take the offset into account.
        i = 0
        if self.args.css_no_default is False:
            i = DEFAULT_TRANSFORM_CSS.count('\n')
        css_errors = "<div class='error-border bbox'><p>Error(s) in the transformation CSS:</p><ul>"
        for err in stylesheet.errors:
            css_errors += "<li>{0},{1}: {2}</li>".format(err.line - i, err.column, err.reason)
        for err in property_errors:
            css_errors += "<li>{0},{1}: {2}</li>".format(err[0] - i, err[1], err[2])
        css_errors += "</ul>"
    return css_errors
def __init__(self, html, css, width, load_resourcefn, text_extents, font_extents, user_data):
    """Parse *html* and *css*, build a per-element style map, then the
    layout tree.

    load_resourcefn / text_extents / font_extents / user_data are stored
    as callbacks and opaque state for later layout/rendering stages.
    """
    self.text_extents = text_extents
    self.font_extents = font_extents
    self.load_resourcefn = load_resourcefn
    self.user_data = user_data
    if VERBOSE:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for interval timing.
        start = time.perf_counter()
        end = time.perf_counter()
        print("robinson: %8.3fs lxml parsing..." % (end - start))
    # NOTE(review): the profiler is created here and enabled below, but
    # its data is never dumped anywhere in this function — confirm intent.
    pr = cProfile.Profile()
    root = etree.fromstring(html)
    document = etree.ElementTree(root)
    if VERBOSE:
        end = time.perf_counter()
        print(repr(root), root.__class__)
        print(document, repr(document), document.__class__)
        print(etree.tostring(document.getroot()))
        print("robinson: %8.3fs tinycss.css21.CSS21Parser()..." % (end - start))
    cssparser = tinycss.css21.CSS21Parser()
    stylesheet = cssparser.parse_stylesheet(css)
    if VERBOSE:
        end = time.perf_counter()
        print("robinson: %8.3fs style mapping..." % (end - start))
    style_map = {}
    sel_to_xpath = cssselect.xpath.HTMLTranslator().selector_to_xpath
    for rule in stylesheet.rules:
        # Only plain rulesets participate; at-rules are skipped.
        if not isinstance(rule, tinycss.css21.RuleSet):
            continue
        sel_css = rule.selector.as_css()
        sels = cssselect.parse(sel_css)
        for sel in sels:
            # Collapse the specificity tuple into a single sortable int.
            speci = sel.specificity()
            prio = speci2prio(speci)
            xpath = sel_to_xpath(sel)
            for item in document.xpath(xpath):
                if item not in style_map:
                    style_map[item] = {}
                # Per property name, the highest-priority declaration wins.
                for decl in rule.declarations:
                    if decl.name not in style_map[item]:
                        style_map[item][decl.name] = (prio, Value.from_token(decl.value))
                    elif prio > style_map[item][decl.name][0]:
                        style_map[item][decl.name] = (prio, Value.from_token(decl.value))
    if VERBOSE:
        end = time.perf_counter()
        print("robinson: %8.3fs building layout tree..." % (end - start))
    pr.enable()
    viewport = Dimensions()
    viewport.content.width = width
    self.ltree = self._layout_tree(document.getroot(), style_map, viewport)
    if VERBOSE:
        end = time.perf_counter()
        print("robinson: %8.3fs __init__ done." % (end - start))
def get_error(css): try: parse(css) except SelectorSyntaxError: # Py2, Py3, ... return str(sys.exc_info()[1]).replace("(u'", "('")
def __init__(self, renderer_dict): self._map = [] for key in renderer_dict: selector = cssselect.parse(key) self._map.append(RendererMapping(selector, renderer_dict[key]))
def repr_parse(css): selectors = parse(css) for selector in selectors: assert selector.pseudo_element is None return [repr(selector._tree).replace("(u'", "('") for selector in selectors]
def preprocess_stylesheet(device_media_type, base_url, rules, url_fetcher):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Yields ``(rule, selector_list, declarations)`` triples for plain
    rulesets, ``@page`` rules and their margin rules; ``@media`` blocks
    are recursed into and ``@import``-ed stylesheets are fetched inline.
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in rules:
        if not rule.at_keyword:
            # Plain ruleset: compile each selector to an lxml XPath matcher.
            declarations = list(preprocess_declarations(
                base_url, rule.declarations))
            if declarations:
                selector_string = rule.selector.as_css()
                try:
                    # (0,) prefix keeps author-origin specificity below
                    # any higher-origin entries sorted elsewhere.
                    selector_list = [
                        Selector(
                            (0,) + selector.specificity(),
                            selector.pseudo_element,
                            lxml.etree.XPath(selector_to_xpath(selector)))
                        for selector in cssselect.parse(selector_string)
                    ]
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    # Any bad selector invalidates the whole ruleset.
                    LOGGER.warn("Invalid or unsupported selector '%s', %s",
                                selector_string, exc)
                    continue
                yield rule, selector_list, declarations
        elif rule.at_keyword == '@import':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            url = url_join(base_url, rule.uri, '@import at %s:%s',
                           rule.line, rule.column)
            if url is not None:
                # Re-yield the imported stylesheet's (already processed) rules.
                for result in CSS(url=url, url_fetcher=url_fetcher,
                                  media_type=device_media_type).rules:
                    yield result
        elif rule.at_keyword == '@media':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            for result in preprocess_stylesheet(
                    device_media_type, base_url, rule.rules, url_fetcher):
                yield result
        elif rule.at_keyword == '@page':
            page_name, pseudo_class = rule.selector
            # TODO: support named pages (see CSS3 Paged Media)
            if page_name is not None:
                LOGGER.warn('Named pages are not supported yet, the whole '
                            '@page %s rule was ignored.', page_name + (
                                ':' + pseudo_class if pseudo_class else ''))
                continue
            declarations = list(preprocess_declarations(
                base_url, rule.declarations))
            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            specificity = rule.specificity
            if declarations:
                selector_list = [Selector(specificity, None, match)]
                yield rule, selector_list, declarations
            # Margin rules (@top-left, ...) reuse the page rule's match.
            for margin_rule in rule.at_rules:
                declarations = list(preprocess_declarations(
                    base_url, margin_rule.declarations))
                if declarations:
                    selector_list = [Selector(
                        specificity, margin_rule.at_keyword, match)]
                    yield margin_rule, selector_list, declarations
def convert(self):
    """Remove HTML and PGDP marker from the text.

    Applies each rule of the transformation CSS (self.mycss) to the
    matching elements of self.myfile.tree.
    """
    # Process each rule from our transformation CSS
    stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
    for rule in stylesheet.rules:
        # Extract values we care about
        v_content = None
        f_transform = None
        f_replace_with_attr = None
        f_replace_regex = None
        f_text_replace = None
        f_element_func = None
        for val in rule.declarations:
            if val.name == 'content':
                v_content = val.value[0].value
            elif val.name == "text-transform":
                v = val.value[0].value
                if v == "uppercase":
                    f_transform = lambda x: x.upper()
                elif v == "lowercase":
                    f_transform = lambda x: x.lower()
                elif v == "capitalize":
                    f_transform = lambda x: x.title()
            elif val.name == "_replace_with_attr":
                # Freeze the attribute name as a default argument: a plain
                # closure over `val` late-binds and would read whatever
                # declaration this loop ends on, not this one.
                f_replace_with_attr = (
                    lambda el, _attr=val.value[0].value: el.attrib[_attr])
            elif val.name == "text-replace":
                # Same late-binding hazard for v1/v2: bind both now.
                v1 = val.value[0].value
                v2 = val.value[2].value
                f_text_replace = (
                    lambda x, _old=v1, _new=v2: x.replace(_old, _new))
            elif val.name == "display":
                # Support display none only. So ignore "none" argument.
                f_element_func = clear_element
            # elif val.name == "_replace_regex":
            #     f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
            #     f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)

        # Iterate through each selectors in the rule
        for selector in cssselect.parse(rule.selector.as_css()):
            pseudo_element = selector.pseudo_element
            xpath = cssselect.HTMLTranslator().selector_to_xpath(selector)
            find = etree.XPath(xpath)
            # Find each matching element in the HTML/XHTML document
            for element in find(self.myfile.tree):
                # Replace text with content of an attribute.
                if f_replace_with_attr:
                    element.text = f_replace_with_attr(element)
                if pseudo_element == "before":
                    element.text = v_content + (element.text or '')  # opening tag
                elif pseudo_element == "after":
                    element.tail = v_content + (element.tail or '')  # closing tag
                if f_transform:
                    self.text_apply(element, f_transform)
                if f_text_replace:
                    self.text_apply(element, f_text_replace)
                if f_element_func:
                    f_element_func(element)
                # if f_replace_regex and element.text:
                #     element.text = f_replace_regex(element.text)

    # NOTE(review): this unconditional return makes every pass below
    # unreachable; it looks like the footnote/illustration/sidenote
    # passes were deliberately disabled — confirm before deleting or
    # re-enabling them.
    return

    # Transform footnote anchors to [..]
    find = etree.XPath("//a")
    for element in find(self.myfile.tree):
        href = element.attrib.get('href', None)
        if not href or not href.startswith("#Footnote_"):
            continue
        if element.text and not element.text.startswith('['):
            # Some PP have [xx], other have just xx for a page
            # number. Do not add [ ] if they are already there.
            element.text = '[' + (element.text or '')  # opening tag
            element.tail = ']' + (element.tail or '')  # closing tag

    # Add illustration tag, wherever we find it
    for figclass in ['figcenter', 'figleft', 'figright', 'caption']:
        find = etree.XPath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), ' " +
            figclass + " ')]")
        for element in find(self.myfile.tree):
            if element.text and len(element.text) > 1:
                element.text = '[Illustration:' + element.text  # opening tag
            else:
                element.text = '[Illustration' + (element.text or '')  # opening tag
            element.tail = ']' + (element.tail or '')  # closing tag

    # for figclass in [ 'caption' ]:
    #     find = etree.XPath("//p[contains(concat(' ', normalize-space(@class), ' '), ' " + figclass + " ')]")
    #     for element in find(self.myfile.tree):
    #         element.text = '[Illustration:' + (element.text or '')  # opening tag
    #         element.tail = ']' + (element.tail or '')  # closing tag

    # Add sidenote tag
    if args.with_sidenote_tags:
        for sntag in ['sidenote']:
            for find in ["//p[contains(concat(' ', normalize-space(@class), ' '), ' " + sntag + " ')]",
                         "//div[starts-with(@class, 'sidenote')]"]:
                for element in etree.XPath(find)(self.myfile.tree):
                    element.text = '[Sidenote:' + (element.text or '')  # opening tag
                    element.tail = ']' + (element.tail or '')  # closing tag