def preprocess_stylesheet(device_media_type, base_url, rules, url_fetcher):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    This is a generator. For every usable rule it yields a
    ``(rule, selector_list, declarations)`` tuple where ``selector_list``
    is a list of ``Selector`` objects and ``declarations`` is the
    non-empty list produced by ``preprocess_declarations``.

    :param device_media_type: media type passed to
        ``evaluate_media_query`` to filter ``@import`` and ``@media``
        rules.
    :param base_url: base URL used to preprocess declarations and to
        resolve ``@import`` URIs.
    :param rules: iterable of parsed rules; each one exposes
        ``at_keyword`` and the attributes read by the matching branch
        below.
    :param url_fetcher: forwarded to ``CSS`` when loading imported
        stylesheets.

    Rules with an invalid or unsupported selector, imports that fail to
    load and named ``@page`` rules are logged as warnings and skipped.

    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in rules:
        if not rule.at_keyword:
            # Regular style rule: compile each selector to an lxml XPath.
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))
            if declarations:
                selector_string = rule.selector.as_css()
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by
                            # lxml's XPath implementation (including
                            # control characters), but these characters
                            # are valid in the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(Selector(
                            (0, ) + selector.specificity(),
                            selector.pseudo_element, lxml_xpath))
                    # One unknown pseudo-element rejects the whole rule.
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    LOGGER.warning(
                        "Invalid or unsupported selector '%s', %s",
                        selector_string, exc)
                    continue
                yield rule, selector_list, declarations

        elif rule.at_keyword == '@import':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            url = url_join(base_url, rule.uri, '@import at %s:%s',
                           rule.line, rule.column)
            if url is not None:
                try:
                    stylesheet = CSS(url=url, url_fetcher=url_fetcher,
                                     media_type=device_media_type)
                except URLFetchingError as exc:
                    LOGGER.warning(
                        'Failed to load stylesheet at %s : %s', url, exc)
                else:
                    # Re-yield the rules of the imported stylesheet.
                    for result in stylesheet.rules:
                        yield result

        elif rule.at_keyword == '@media':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            # Nested rules go through the same preprocessing.
            for result in preprocess_stylesheet(
                    device_media_type, base_url, rule.rules, url_fetcher):
                yield result

        elif rule.at_keyword == '@page':
            page_name, pseudo_class = rule.selector
            # TODO: support named pages (see CSS3 Paged Media)
            if page_name is not None:
                LOGGER.warning(
                    'Named pages are not supported yet, the whole '
                    '@page %s rule was ignored.',
                    page_name + (':' + pseudo_class if pseudo_class else ''))
                continue
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))

            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            specificity = rule.specificity

            if declarations:
                selector_list = [Selector(specificity, None, match)]
                yield rule, selector_list, declarations

            # Margin rules reuse the page's specificity and matcher; their
            # at-keyword is stored in the pseudo-element slot of Selector.
            for margin_rule in rule.at_rules:
                declarations = list(preprocess_declarations(
                    base_url, margin_rule.declarations))
                if declarations:
                    selector_list = [
                        Selector(specificity, margin_rule.at_keyword, match)
                    ]
                    yield margin_rule, selector_list, declarations
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, rules, fonts, font_config):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Results are accumulated in the ``rules`` and ``fonts`` arguments; the
    function itself returns ``None``.

    :param device_media_type: media type passed to
        ``evaluate_media_query`` to filter ``@import`` and ``@media``
        rules.
    :param base_url: base URL used to preprocess declarations and
        descriptors and to resolve ``@import`` URLs.
    :param stylesheet_rules: iterable of parsed rules exposing ``type``
        and, for at-rules, ``at_keyword``, ``prelude`` and ``content``.
    :param url_fetcher: forwarded to ``CSS`` for imported stylesheets and
        to ``font_config.add_font_face``.
    :param rules: output container; ``(rule, selector_list,
        declarations)`` tuples are appended to it.
    :param fonts: output container; truthy values returned by
        ``font_config.add_font_face`` are appended to it.
    :param font_config: object providing ``add_font_face``, or ``None``
        to leave ``@font-face`` rules unregistered.

    Invalid selectors, invalid media queries, unsupported ``@page``
    selectors, failed imports and incomplete ``@font-face`` rules are
    logged as warnings and skipped.

    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in stylesheet_rules:
        if rule.type == 'qualified-rule':
            # Regular style rule: compile each selector to an lxml XPath.
            declarations = list(preprocess_declarations(
                base_url, tinycss2.parse_declaration_list(rule.content)))
            if declarations:
                selector_string = tinycss2.serialize(rule.prelude)
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by
                            # lxml's XPath implementation (including
                            # control characters), but these characters
                            # are valid in the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(Selector(
                            (0, ) + selector.specificity(),
                            selector.pseudo_element, lxml_xpath))
                    # One unknown pseudo-element rejects the whole rule.
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    LOGGER.warning(
                        "Invalid or unsupported selector '%s', %s",
                        selector_string, exc)
                    continue
                rules.append((rule, selector_list, declarations))

        elif rule.type == 'at-rule' and rule.at_keyword == 'import':
            tokens = remove_whitespace(rule.prelude)
            if tokens and tokens[0].type in ('url', 'string'):
                url = tokens[0].value
            else:
                continue
            media = parse_media_query(tokens[1:])
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @import rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                # Actually skip the rule, as the message says and as the
                # @media branch does; otherwise None would be handed to
                # evaluate_media_query below.
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            url = url_join(
                base_url, url, allow_relative=False,
                context='@import at %s:%s',
                context_args=(rule.source_line, rule.source_column))
            if url is not None:
                try:
                    stylesheet = CSS(
                        url=url, url_fetcher=url_fetcher,
                        media_type=device_media_type,
                        font_config=font_config)
                except URLFetchingError as exc:
                    LOGGER.warning(
                        'Failed to load stylesheet at %s : %s', url, exc)
                else:
                    # Add the rules of the imported stylesheet.
                    for result in stylesheet.rules:
                        rules.append(result)

        elif rule.type == 'at-rule' and rule.at_keyword == 'media':
            media = parse_media_query(rule.prelude)
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @media rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            # Nested rules go through the same preprocessing and land in
            # the same output containers.
            content_rules = tinycss2.parse_rule_list(rule.content)
            preprocess_stylesheet(
                device_media_type, base_url, content_rules, url_fetcher,
                rules, fonts, font_config)

        elif rule.type == 'at-rule' and rule.at_keyword == 'page':
            tokens = remove_whitespace(rule.prelude)
            # TODO: support named pages (see CSS3 Paged Media)
            if not tokens:
                pseudo_class = None
                specificity = (0, 0)
            elif (len(tokens) == 2 and
                    tokens[0].type == 'literal' and
                    tokens[0].value == ':' and
                    tokens[1].type == 'ident'):
                pseudo_class = tokens[1].lower_value
                specificity = {
                    'first': (1, 0), 'blank': (1, 0),
                    'left': (0, 1), 'right': (0, 1),
                }.get(pseudo_class)
                if not specificity:
                    LOGGER.warning(
                        'Unknown @page pseudo-class "%s", '
                        'the whole @page rule was ignored '
                        'at %s:%s.',
                        pseudo_class,
                        rule.source_line, rule.source_column)
                    continue
            else:
                LOGGER.warning(
                    'Unsupported @page selector "%s", '
                    'the whole @page rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                continue
            content = tinycss2.parse_declaration_list(rule.content)
            declarations = list(preprocess_declarations(base_url, content))

            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])

            if declarations:
                selector_list = [Selector(specificity, None, match)]
                rules.append((rule, selector_list, declarations))

            # Nested at-rules (margin rules) reuse the page's specificity
            # and matcher; '@' + at_keyword is stored in the
            # pseudo-element slot of Selector.
            for margin_rule in content:
                if margin_rule.type != 'at-rule':
                    continue
                declarations = list(preprocess_declarations(
                    base_url,
                    tinycss2.parse_declaration_list(margin_rule.content)))
                if declarations:
                    selector_list = [Selector(
                        specificity, '@' + margin_rule.at_keyword, match)]
                    rules.append((margin_rule, selector_list, declarations))

        elif rule.type == 'at-rule' and rule.at_keyword == 'font-face':
            content = tinycss2.parse_declaration_list(rule.content)
            rule_descriptors = dict(preprocess_descriptors(base_url, content))
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'),
                        rule.source_line, rule.source_column)
                    break
            else:
                # Both required descriptors are present.
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)