def test_cssselect(self):
    """Exercise Element.cssselect() under lxml's default, xhtml and xml
    translators, and contrast with the upstream cssselect translator."""
    div, = lxml.html.fromstring(HTML).xpath('//div')

    def check(selector, expected_count, **kwargs):
        # Select within the lone <div> and verify the number of matches.
        matches = div.cssselect(selector, **kwargs)
        self.assertEqual(len(matches), expected_count)

    check('div', 1)
    check('a', 2)
    check('em', 0)
    # Element names are case-insensitive in HTML
    check('DIV', 1)
    # ... but not in XHTML or XML
    check('DIV', 0, translator='xhtml')
    check('DIV', 0, translator='xml')
    # :contains() is case-insensitive in lxml
    check(':contains("link")', 2)  # div, a
    check(':contains("LInk")', 2)
    # Whatever the document language
    check(':contains("LInk")', 2, translator='xhtml')
    check(':contains("LInk")', 2, translator='xml')
    # ... but not in upstream cssselect
    import cssselect
    check(':contains("link")', 2, translator=cssselect.HTMLTranslator())
    check(':contains("LInk")', 0, translator=cssselect.HTMLTranslator())
def main(argv):
    """Filter a CSS file, keeping only rules that match a given HTML document.

    argv: [prog, css_path, html_path].  Kept rules are printed to stdout,
    rejected rules and statistics to stderr.  Returns 1 on bad usage;
    otherwise terminates the process via sys.exit().
    """
    if len(argv) < 3:
        return 1
    css = argv[1]
    html = argv[2]
    # Fix: close both input files deterministically instead of leaking the
    # handles returned by bare open(...).read().
    with open(html) as f:
        doc = ht.document_fromstring(f.read())
    with open(css) as f:
        css_text = f.read()
    rules = parser().parseString(css_text)
    # (removed: an unused `tr = cs.HTMLTranslator()` local)
    result_rules = []
    rejected_rules = []
    for r in rules:
        if check_rule(r, doc):
            result_rules.append(r)
            print(r.text(), end='')
        else:
            print('rejected:', r.text(exclude=False), file=sys.stderr)
            rejected_rules.append(r)
    print()
    print("rules before:\t", len(rules), file=sys.stderr)
    print("rules after:\t", len(result_rules), file=sys.stderr)
    #print ("rejected rules:", file=sys.stderr)
    #for r in rejected_rules:
    #    print(r.text(exclude=False), file=sys.stderr, end='')
    sys.exit()
def check_selector_list(sel, doc):
    """Return True as soon as any selector in *sel* matches an element of *doc*."""
    tr = cs.HTMLTranslator()

    # convert e.g. a:hover to a::hover (css3): a lone ':' widens to '::',
    # while runs that are already '::' (or longer) stay untouched.
    def widen_colon(match):
        return '::' if match.group(0) == ':' else match.group(0)

    for raw in sel:
        candidate = re.sub(':+', widen_colon, raw)
        try:
            for parsed in cs.parse(candidate):
                if doc.xpath(tr.selector_to_xpath(parsed)):
                    return True
        except cs.parser.SelectorSyntaxError:
            # probably unsupported @media selector
            # may still be matched by subrules' selectors
            # so just skip this selector
            pass
        except Exception as e:
            print(e, "; sel='{}'".format(candidate), file=sys.stderr)
    return False
def download_assets(page, directory=None):
    """ Downloads assets from the given page into a temporary directory. """
    print_(EMPTY, "Compiling list of assets to download...")
    try:
        expression = cssselect.HTMLTranslator().css_to_xpath(ASSET_SELECTOR)
    except cssselect.SelectorError:
        print_(FAIL)
        print_(INFO, "Invalid ASSET_SELECTOR configured.")
        exit(1)
        return

    # Collect the href of every matching anchor, skipping source archives.
    asset_list = []
    for anchor in lxml.html.fromstring(page).xpath(expression):
        if "Source code" in anchor[0].text:
            continue
        asset_list.append("https://github.com" + anchor.get('href'))
    print_(OK)

    # An explicit (truthy) directory wins; otherwise create a temp dir.
    tmp_dir = directory or tempfile.mkdtemp(prefix='intel-opencl-neo-')
    if directory is None:
        print_(DBUG, f"Temporary directory: {tmp_dir}")
    for asset in asset_list:
        download_asset(asset, tmp_dir)
    return tmp_dir
def __init__(self, *rules_files):
    """Load adblock elemhide rules and pre-build per-thread XPath queries.

    :param rules_files: path to rules files (relative to the ``adList``
        directory).  When omitted, default lists are downloaded into
        ``adList/`` first.
    """
    if not rules_files:
        rule_urls = [
            'https://filters.adtidy.org/extension/chromium/filters/2.txt',
            'https://easylist.to/easylist/easylist.txt'
        ]
        rules_files = [url.rpartition('/')[-1] for url in rule_urls]
        if not os.path.isdir("adList"):
            os.mkdir("adList")
        # Fix: download only when using the default lists.  Previously this
        # loop ran unconditionally and raised NameError (rule_urls unbound)
        # whenever the caller supplied its own rules_files.
        for rule_url, rule_file in zip(rule_urls, rules_files):
            r = requests.get(rule_url)
            with open("adList/" + rule_file, 'w', encoding='utf-8') as f:
                f.write(r.text)
    translator = cssselect.HTMLTranslator()
    self.rules = []
    for rules_file in rules_files:
        with open("adList/" + rules_file, 'r', encoding="utf-8") as f:
            for line in f:
                # elemhide rules are prefixed by ## in the adblock filter syntax
                # Fix: match '##' explicitly; the old `!= '@@'` test fed every
                # non-exception line (comments, network rules) through
                # css_to_xpath, and e.g. "! comment" parsed as a selector for
                # <comment> elements.
                if line[:2] == '##':
                    try:
                        self.rules.append(translator.css_to_xpath(line[2:]))
                    except cssselect.SelectorError:
                        # just skip bad selectors
                        pass
    n_thread = mp.cpu_count() * 2
    l_query = len(self.rules)
    # create one large query per thread by joining rule slices with the
    # xpath | (or) operator
    self.xpath_query_list = []
    for idx in range(n_thread):
        start = int(idx * l_query / n_thread)
        stop = int((idx + 1) * l_query / n_thread)
        self.xpath_query_list.append('|'.join(self.rules[start:stop]))
def __init__(self, *rules_files):
    """Compile adblock elemhide rules files into one combined XPath query."""
    if not rules_files:
        raise ValueError("one or more rules_files required")

    translator = cssselect.HTMLTranslator()
    xpath_rules = []
    for path in rules_files:
        with open(path, 'r') as handle:
            for line in handle:
                # elemhide rules are prefixed by ## in the adblock filter syntax
                if not line.startswith('##'):
                    continue
                try:
                    xpath_rules.append(translator.css_to_xpath(line[2:]))
                except cssselect.SelectorError:
                    # just skip bad selectors
                    continue
    # create one large query by joining them the xpath | (or) operator
    self.xpath_query = '|'.join(xpath_rules)
def advertisements(data, store):
    """Count elements of data['doc'] hit by EasyList elemhide rules and
    record the total under store['advertisements']."""
    translator = cssselect.HTMLTranslator()
    xpath_rules = []
    with open('webcred/data/easylist.txt') as f:
        for line in f:
            # elemhide rules are prefixed by ## in the filter syntax
            if line[:2] != "##":
                continue
            # convert css rules from filter list to xpath rules
            try:
                xpath_rules.append(translator.css_to_xpath(line[2:]))
            except cssselect.SelectorError:
                # skip bad selectors
                pass
    # create one large query by joining the rules using the xpath OR operator
    xpath_query = '|'.join(xpath_rules)
    store['advertisements'] = len(data['doc'].xpath(xpath_query))
def preprocess_stylesheet(device_media_type, base_url, rules, url_fetcher):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Generator: yields (rule, selector_list, declarations) triples for
    style rules; @import and @media recurse/delegate and re-yield their
    children's triples.
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in rules:
        if not rule.at_keyword:
            # Plain style rule (no at-keyword): translate its selectors.
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))
            if declarations:
                selector_string = rule.selector.as_css()
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(
                            Selector((0, ) + selector.specificity(),
                                     selector.pseudo_element, lxml_xpath))
                    # Reject the whole rule if any selector targets an
                    # unsupported pseudo-element.
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    LOGGER.warn("Invalid or unsupported selector '%s', %s",
                                selector_string, exc)
                    continue
                yield rule, selector_list, declarations
        elif rule.at_keyword == '@import':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            url = url_join(base_url, rule.uri, '@import at %s:%s',
                           rule.line, rule.column)
            if url is not None:
                try:
                    stylesheet = CSS(url=url, url_fetcher=url_fetcher,
                                     media_type=device_media_type)
                except URLFetchingError as exc:
                    LOGGER.warn('Failed to load stylesheet at %s : %s',
                                url, exc)
                else:
                    # Re-yield the imported stylesheet's already-processed rules.
                    for result in stylesheet.rules:
                        yield result
        elif rule.at_keyword == '@media':
            if not evaluate_media_query(rule.media, device_media_type):
                continue
            # Recurse into the @media block's nested rules.
            for result in preprocess_stylesheet(device_media_type, base_url,
                                                rule.rules, url_fetcher):
                yield result
        elif rule.at_keyword == '@page':
            page_name, pseudo_class = rule.selector
            # TODO: support named pages (see CSS3 Paged Media)
            if page_name is not None:
                LOGGER.warn(
                    'Named pages are not supported yet, the whole '
                    '@page %s rule was ignored.', page_name + (
                        ':' + pseudo_class if pseudo_class else ''))
                continue
            declarations = list(
                preprocess_declarations(base_url, rule.declarations))
            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            specificity = rule.specificity
            if declarations:
                selector_list = [Selector(specificity, None, match)]
                yield rule, selector_list, declarations
            # Margin-box at-rules nested inside @page get the page's
            # specificity and match function.
            for margin_rule in rule.at_rules:
                declarations = list(
                    preprocess_declarations(base_url,
                                            margin_rule.declarations))
                if declarations:
                    selector_list = [
                        Selector(specificity, margin_rule.at_keyword, match)
                    ]
                    yield margin_rule, selector_list, declarations
def css(self, selector):
    """Run a CSS *selector* against this element by translating it to XPath."""
    translator = cssselect.HTMLTranslator()
    return self.xpath(translator.css_to_xpath(selector))
def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
                          url_fetcher, rules, fonts, font_config):
    """Do the work that can be done early on stylesheet, before they are
    in a document.

    Appends (rule, selector_list, declarations) triples to *rules* and
    font filenames to *fonts* (both mutated in place); recurses for
    @media blocks.
    """
    selector_to_xpath = cssselect.HTMLTranslator().selector_to_xpath
    for rule in stylesheet_rules:
        if rule.type == 'qualified-rule':
            # Ordinary style rule: translate each selector to a compiled
            # lxml XPath, keeping cssselect's specificity.
            declarations = list(
                preprocess_declarations(
                    base_url, tinycss2.parse_declaration_list(rule.content)))
            if declarations:
                selector_string = tinycss2.serialize(rule.prelude)
                try:
                    selector_list = []
                    for selector in cssselect.parse(selector_string):
                        xpath = selector_to_xpath(selector)
                        try:
                            lxml_xpath = lxml.etree.XPath(xpath)
                        except ValueError as exc:
                            # TODO: Some characters are not supported by lxml's
                            # XPath implementation (including control
                            # characters), but these characters are valid in
                            # the CSS2.1 specification.
                            raise cssselect.SelectorError(str(exc))
                        selector_list.append(
                            Selector((0, ) + selector.specificity(),
                                     selector.pseudo_element, lxml_xpath))
                    for selector in selector_list:
                        if selector.pseudo_element not in PSEUDO_ELEMENTS:
                            raise cssselect.ExpressionError(
                                'Unknown pseudo-element: %s'
                                % selector.pseudo_element)
                except cssselect.SelectorError as exc:
                    LOGGER.warning("Invalid or unsupported selector '%s', %s",
                                   selector_string, exc)
                    continue
                rules.append((rule, selector_list, declarations))
        elif rule.type == 'at-rule' and rule.at_keyword == 'import':
            tokens = remove_whitespace(rule.prelude)
            if tokens and tokens[0].type in ('url', 'string'):
                url = tokens[0].value
            else:
                continue
            media = parse_media_query(tokens[1:])
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @import rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                # NOTE(review): no `continue` here, unlike the @media branch
                # below — evaluate_media_query(None, ...) is reached; confirm
                # whether that is intentional.
            if not evaluate_media_query(media, device_media_type):
                continue
            url = url_join(
                base_url, url, allow_relative=False,
                context='@import at %s:%s',
                context_args=(rule.source_line, rule.source_column))
            if url is not None:
                try:
                    stylesheet = CSS(url=url, url_fetcher=url_fetcher,
                                     media_type=device_media_type,
                                     font_config=font_config)
                except URLFetchingError as exc:
                    LOGGER.warning('Failed to load stylesheet at %s : %s',
                                   url, exc)
                else:
                    # Splice the imported stylesheet's processed rules in.
                    for result in stylesheet.rules:
                        rules.append(result)
        elif rule.type == 'at-rule' and rule.at_keyword == 'media':
            media = parse_media_query(rule.prelude)
            if media is None:
                LOGGER.warning(
                    'Invalid media type "%s" '
                    'the whole @media rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                continue
            if not evaluate_media_query(media, device_media_type):
                continue
            # Recurse into the nested rule list with the same accumulators.
            content_rules = tinycss2.parse_rule_list(rule.content)
            preprocess_stylesheet(
                device_media_type, base_url, content_rules, url_fetcher,
                rules, fonts, font_config)
        elif rule.type == 'at-rule' and rule.at_keyword == 'page':
            tokens = remove_whitespace(rule.prelude)
            # TODO: support named pages (see CSS3 Paged Media)
            if not tokens:
                # Bare "@page { ... }"
                pseudo_class = None
                specificity = (0, 0)
            elif (len(tokens) == 2 and tokens[0].type == 'literal' and
                    tokens[0].value == ':' and tokens[1].type == 'ident'):
                # "@page :first/:blank/:left/:right { ... }"
                pseudo_class = tokens[1].lower_value
                specificity = {
                    'first': (1, 0), 'blank': (1, 0),
                    'left': (0, 1), 'right': (0, 1),
                }.get(pseudo_class)
                if not specificity:
                    LOGGER.warning(
                        'Unknown @page pseudo-class "%s", '
                        'the whole @page rule was ignored '
                        'at %s:%s.',
                        pseudo_class, rule.source_line, rule.source_column)
                    continue
            else:
                LOGGER.warning(
                    'Unsupported @page selector "%s", '
                    'the whole @page rule was ignored at %s:%s.',
                    tinycss2.serialize(rule.prelude),
                    rule.source_line, rule.source_column)
                continue
            content = tinycss2.parse_declaration_list(rule.content)
            declarations = list(preprocess_declarations(base_url, content))
            # Use a double lambda to have a closure that holds page_types
            match = (lambda page_types: lambda _document: page_types)(
                PAGE_PSEUDOCLASS_TARGETS[pseudo_class])
            if declarations:
                selector_list = [Selector(specificity, None, match)]
                rules.append((rule, selector_list, declarations))
            # Margin-box at-rules nested inside @page share the page's
            # specificity and match function.
            for margin_rule in content:
                if margin_rule.type != 'at-rule':
                    continue
                declarations = list(
                    preprocess_declarations(
                        base_url,
                        tinycss2.parse_declaration_list(margin_rule.content)))
                if declarations:
                    selector_list = [
                        Selector(specificity, '@' + margin_rule.at_keyword,
                                 match)
                    ]
                    rules.append((margin_rule, selector_list, declarations))
        elif rule.type == 'at-rule' and rule.at_keyword == 'font-face':
            content = tinycss2.parse_declaration_list(rule.content)
            rule_descriptors = dict(preprocess_descriptors(base_url, content))
            # Both 'src' and 'font_family' are mandatory for a usable face.
            for key in ('src', 'font_family'):
                if key not in rule_descriptors:
                    LOGGER.warning(
                        "Missing %s descriptor in '@font-face' rule at %s:%s",
                        key.replace('_', '-'),
                        rule.source_line, rule.source_column)
                    break
            else:
                # for/else: only register the face when no descriptor was missing.
                if font_config is not None:
                    font_filename = font_config.add_font_face(
                        rule_descriptors, url_fetcher)
                    if font_filename:
                        fonts.append(font_filename)
def css_to_xpath(css):
    """Translate the CSS selector string *css* into an XPath expression."""
    translator = cssselect.HTMLTranslator()
    return translator.css_to_xpath(css)
def csspath(query):
    """Return the XPath equivalent of the CSS selector *query*."""
    translator = cssselect.HTMLTranslator()
    return translator.css_to_xpath(query)
def convert(self):
    """Remove HTML and PGDP marker from the text.

    Applies each rule of the transformation CSS (self.mycss) to the parsed
    document tree (self.myfile.tree); returns an HTML fragment describing
    CSS/property errors, or "" when there were none.
    """
    escaped_unicode_re = re.compile(r"\\u[0-9a-fA-F]{4}")

    def escaped_unicode(m):
        # Decode a literal \uXXXX escape; on failure keep the raw text.
        try:
            newstr = bytes(m.group(0), 'utf8').decode('unicode-escape')
        except Exception:
            newstr = m.group(0)
        return newstr

    def new_content(element):
        """Process the "content:" property """
        # NOTE(review): reads `val` from the enclosing loop (late binding) —
        # it is always the declaration current at call time; confirm intended.
        retstr = ""
        for token in val.value:
            if token.type == "STRING":
                # e.g. { content: "xyz" }
                retstr += escaped_unicode_re.sub(escaped_unicode, token.value)
            elif token.type == "FUNCTION":
                if token.function_name == 'attr':
                    # e.g. { content: attr(title) }
                    retstr += element.attrib.get(token.content[0].value, "")
            elif token.type == "IDENT":
                if token.value == "content":
                    # Identity, e.g. { content: content }
                    retstr += element.text
        return retstr

    # Process each rule from our transformation CSS
    stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
    property_errors = []
    for rule in stylesheet.rules:
        # Extract values we care about
        f_transform = None
        f_replace_with_attr = None
        #f_replace_regex = None
        f_text_replace = None
        f_element_func = None
        f_move = None
        for val in rule.declarations:
            if val.name == 'content':
                # result depends on element and pseudo elements.
                pass
            elif val.name == "text-transform":
                if len(val.value) != 1:
                    property_errors += [(val.line, val.column,
                                         val.name + " takes 1 argument")]
                else:
                    v = val.value[0].value
                    if v == "uppercase":
                        f_transform = lambda x: x.upper()
                    elif v == "lowercase":
                        f_transform = lambda x: x.lower()
                    elif v == "capitalize":
                        f_transform = lambda x: x.title()
                    else:
                        property_errors += [(
                            val.line, val.column, val.name +
                            " accepts only 'uppercase', 'lowercase' or 'capitalize'"
                        )]
            elif val.name == "_replace_with_attr":
                # NOTE(review): lambda closes over `val` (late binding) —
                # effective only if this is the rule's relevant declaration.
                f_replace_with_attr = lambda el: el.attrib[val.value[0].
                                                           value]
            elif val.name == "text-replace":
                # Skip S (spaces) tokens.
                values = [v for v in val.value if v.type != "S"]
                if len(values) != 2:
                    property_errors += [
                        (val.line, val.column,
                         val.name + " takes 2 string arguments")
                    ]
                else:
                    v1 = values[0].value
                    v2 = values[1].value
                    f_text_replace = lambda x: x.replace(v1, v2)
            elif val.name == "display":
                # Support display none only. So ignore "none" argument.
                f_element_func = clear_element
            elif val.name == "_graft":
                values = [v for v in val.value if v.type != "S"]
                if len(values) < 1:
                    property_errors += [
                        (val.line, val.column,
                         val.name + " takes at least one argument")
                    ]
                    continue
                # Build the chain of tree moves (parent / prev-sib / next-sib).
                f_move = []
                for v in values:
                    print("[", v.value, "]")
                    if v.value == 'parent':
                        f_move.append(lambda el: el.getparent())
                    elif v.value == 'prev-sib':
                        f_move.append(lambda el: el.getprevious())
                    elif v.value == 'next-sib':
                        f_move.append(lambda el: el.getnext())
                    else:
                        property_errors += [
                            (val.line, val.column,
                             val.name + " invalid value " + v.value)
                        ]
                        f_move = None
                        break
                if not f_move:
                    continue
            # elif val.name == "_replace_regex":
            #     f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
            #     f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)
            else:
                property_errors += [(val.line, val.column,
                                     "Unsupported property " + val.name)]
                continue

        # Iterate through each selectors in the rule
        for selector in cssselect.parse(rule.selector.as_css()):
            pseudo_element = selector.pseudo_element
            xpath = cssselect.HTMLTranslator().selector_to_xpath(
                selector)
            find = etree.XPath(xpath)

            # Find each matching element in the HTML/XHTML document
            for element in find(self.myfile.tree):
                # Replace text with content of an attribute.
                if f_replace_with_attr:
                    element.text = f_replace_with_attr(element)
                # NOTE(review): `val` here is the last declaration seen in the
                # loop above — the content branch only fires if 'content' was
                # the final declaration of the rule; confirm intended.
                if val.name == 'content':
                    v_content = new_content(element)
                    if pseudo_element == "before":
                        element.text = v_content + (element.text or ''
                                                    )  # opening tag
                    elif pseudo_element == "after":
                        element.tail = v_content + (element.tail or ''
                                                    )  # closing tag
                    else:
                        # Replace all content
                        element.text = new_content(element)
                if f_transform:
                    self.text_apply(element, f_transform)
                if f_text_replace:
                    self.text_apply(element, f_text_replace)
                if f_element_func:
                    f_element_func(element)
                if f_move:
                    parent = element.getparent()
                    new = element
                    for f in f_move:
                        new = f(new)
                    # Move the tail to the sibling or the parent
                    if element.tail:
                        sibling = element.getprevious()
                        if sibling:
                            sibling.tail = (sibling.tail or "") + element.tail
                        else:
                            parent.text = (parent.text or "") + element.tail
                        element.tail = None
                    # Prune and graft
                    parent.remove(element)
                    new.append(element)
                # if f_replace_regex and element.text:
                #     element.text = f_replace_regex(element.text)

    css_errors = ""
    if stylesheet.errors or property_errors:
        # There is transformation CSS errors. If the default css
        # is included, take the offset into account.
        i = 0
        if self.args.css_no_default is False:
            i = DEFAULT_TRANSFORM_CSS.count('\n')
        css_errors = "<div class='error-border bbox'><p>Error(s) in the transformation CSS:</p><ul>"
        for err in stylesheet.errors:
            css_errors += "<li>{0},{1}: {2}</li>".format(
                err.line - i, err.column, err.reason)
        for err in property_errors:
            css_errors += "<li>{0},{1}: {2}</li>".format(
                err[0] - i, err[1], err[2])
        css_errors += "</ul>"
    return css_errors
# -*- coding: utf-8 -*- import urlparse import cssselect css = cssselect.HTMLTranslator().css_to_xpath from wiseguy import html_tags as ht from wiseguy.template import Transform def stylesheet(href): return ht.LINK({'rel': "stylesheet", 'type': "text/css", 'href': href}) def script(href): return ht.SCRIPT({'src': href}) def add_stylesheet(href): return Transform( [], lambda template: template.element.add("head", stylesheet(href))) def add_script(href): return Transform( [], lambda template: template.element.add("head", script(href))) _url_fixable_tags = set([ ("link", "href"),
def convert(self):
    """Remove HTML and PGDP marker from the text.

    Applies each rule of the transformation CSS (self.mycss) to the parsed
    document tree (self.myfile.tree), then returns.
    """
    # Process each rule from our transformation CSS
    stylesheet = tinycss.make_parser().parse_stylesheet(self.mycss)
    for rule in stylesheet.rules:
        # Extract values we care about
        v_content = None
        f_transform = None
        f_replace_with_attr = None
        f_replace_regex = None
        f_text_replace = None
        f_element_func = None
        for val in rule.declarations:
            if val.name == 'content':
                v_content = val.value[0].value
            elif val.name == "text-transform":
                v = val.value[0].value
                if v == "uppercase":
                    f_transform = lambda x: x.upper()
                elif v == "lowercase":
                    f_transform = lambda x: x.lower()
                elif v == "capitalize":
                    f_transform = lambda x: x.title()
            elif val.name == "_replace_with_attr":
                # NOTE(review): lambda closes over `val` (late binding over
                # the loop variable) — confirm intended.
                f_replace_with_attr = lambda el: el.attrib[val.value[0].
                                                           value]
            elif val.name == "text-replace":
                v1 = val.value[0].value
                v2 = val.value[2].value
                f_text_replace = lambda x: x.replace(v1, v2)
            elif val.name == "display":
                # Support display none only. So ignore "none" argument.
                f_element_func = clear_element
            # elif val.name == "_replace_regex":
            #     f_replace_regex = partial(re.sub, r"(\d)\u00A0(\d)", r"\1\2")
            #     f_replace_regex = partial(re.sub, val.value[0].value, val.value[1].value)

        # Iterate through each selectors in the rule
        for selector in cssselect.parse(rule.selector.as_css()):
            pseudo_element = selector.pseudo_element
            xpath = cssselect.HTMLTranslator().selector_to_xpath(
                selector)
            find = etree.XPath(xpath)

            # Find each matching element in the HTML/XHTML document
            for element in find(self.myfile.tree):
                # Replace text with content of an attribute.
                if f_replace_with_attr:
                    element.text = f_replace_with_attr(element)
                # NOTE(review): v_content stays None when the rule has no
                # 'content' declaration — a ::before/::after match would then
                # raise TypeError on the concatenation below; confirm.
                if pseudo_element == "before":
                    element.text = v_content + (element.text or ''
                                                )  # opening tag
                elif pseudo_element == "after":
                    element.tail = v_content + (element.tail or ''
                                                )  # closing tag
                if f_transform:
                    self.text_apply(element, f_transform)
                if f_text_replace:
                    self.text_apply(element, f_text_replace)
                if f_element_func:
                    f_element_func(element)
                # if f_replace_regex and element.text:
                #     element.text = f_replace_regex(element.text)

    # NOTE(review): this bare return appears to leave everything below
    # unreachable (footnote / illustration / sidenote passes) — looks like
    # legacy code deliberately disabled; confirm before removing.
    return

    # Transform footnote anchors to [..]
    find = etree.XPath("//a")
    for element in find(self.myfile.tree):
        href = element.attrib.get('href', None)
        if not href or not href.startswith("#Footnote_"):
            continue
        if element.text and not element.text.startswith('['):
            # Some PP have [xx], other have just xx for a page
            # number. Do not add [ ] if they are already there.
            element.text = '[' + (element.text or '')  # opening tag
            element.tail = ']' + (element.tail or '')  # closing tag

    # Add illustration tag, wherever we find it
    for figclass in ['figcenter', 'figleft', 'figright', 'caption']:
        find = etree.XPath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), ' " +
            figclass + " ')]")
        for element in find(self.myfile.tree):
            if element.text and len(element.text) > 1:
                element.text = '[Illustration:' + element.text  # opening tag
            else:
                element.text = '[Illustration' + (element.text or ''
                                                  )  # opening tag
            element.tail = ']' + (element.tail or '')  # closing tag

    # for figclass in [ 'caption' ]:
    #     find = etree.XPath("//p[contains(concat(' ', normalize-space(@class), ' '), ' " + figclass + " ')]")
    #     for element in find(self.myfile.tree):
    #         element.text = '[Illustration:' + (element.text or '')  # opening tag
    #         element.tail = ']' + (element.tail or '')  # closing tag

    # Add sidenote tag
    # NOTE(review): `args` here is a module-level/global name, unlike
    # self.args used elsewhere — confirm.
    if args.with_sidenote_tags:
        for sntag in ['sidenote']:
            for find in [
                    "//p[contains(concat(' ', normalize-space(@class), ' '), ' "
                    + sntag + " ')]",
                    "//div[starts-with(@class, 'sidenote')]"
            ]:
                for element in etree.XPath(find)(self.myfile.tree):
                    element.text = '[Sidenote:' + (element.text or ''
                                                   )  # opening tag
                    element.tail = ']' + (element.tail or ''
                                          )  # closing tag