def test_handlers(self):
    "cssutils.log"
    # _setHandler() installs a fresh capture handler; presumably returns a
    # StringIO-like object exposing getvalue() — TODO confirm against helper.
    s = self._setHandler()

    # At FATAL level a parse error must produce no log output.
    cssutils.log.setLevel(logging.FATAL)
    self.assertEqual(cssutils.log.getEffectiveLevel(), logging.FATAL)
    cssutils.parseString('a { color: 1 }')
    self.assertEqual(s.getvalue(), '')

    # At DEBUG level the invalid `color: 1` value must be reported.
    cssutils.log.setLevel(logging.DEBUG)
    cssutils.parseString('a { color: 1 }')
    # TODO: Fix?
    # self.assertEqual(s.getvalue(),
    #     u'ERROR Property: Invalid value for "CSS Color Module Level 3/CSS Level 2.1" property: 1 [1:5: color]\n')
    self.assertEqual(
        s.getvalue(),
        'ERROR Property: Invalid value for "CSS Level 2.1" property: 1 [1:5: color]\n'
    )

    # Fetching a non-CSS resource must log a mime-type error (network access).
    s = self._setHandler()
    cssutils.log.setLevel(logging.ERROR)
    cssutils.parseUrl('http://example.com')
    self.assertEqual(s.getvalue()[:38], 'ERROR Expected "text/css" mime type')
def test_handlers(self):
    "cssutils.log"
    # Fresh capture handler; presumably StringIO-backed — TODO confirm.
    s = self._setHandler()

    # FATAL level: the invalid declaration must produce no output.
    cssutils.log.setLevel(logging.FATAL)
    self.assertEqual(cssutils.log.getEffectiveLevel(), logging.FATAL)
    cssutils.parseString('a { color: 1 }')
    self.assertEqual(s.getvalue(), '')

    # DEBUG level: the invalid `color: 1` value must be reported.
    cssutils.log.setLevel(logging.DEBUG)
    cssutils.parseString('a { color: 1 }')
    # TODO: Fix?
    # self.assertEqual(
    #     s.getvalue(),
    #     u'ERROR Property: Invalid value for "CSS Color Module '
    #     'Level 3/CSS Level 2.1" property: 1 [1:5: color]\n')
    self.assertEqual(
        s.getvalue(),
        'ERROR Property: Invalid value for "CSS Level 2.1" '
        'property: 1 [1:5: color]\n',
    )

    s = self._setHandler()
    # Guard the networked part: bail out early when DNS resolution fails
    # so offline runs do not produce a spurious failure.
    try:
        socket.getaddrinfo('example.com', 80)
    except socket.error:
        # skip the test as the name can't resolve
        return
    cssutils.log.setLevel(logging.ERROR)
    cssutils.parseUrl('http://example.com')
    self.assertEqual(s.getvalue()[:38], 'ERROR Expected "text/css" mime type')
def scrape_style(url, zipper):
    """Scrape any instances of url(...)

    Args:
        url (str): url to css file
        zipper (html_writer): zip to write to
    Returns str of css style rules
    """
    sheet = cssutils.parseUrl(url)
    rules = sheet.cssText.decode('utf-8')

    # Parse urls in css.
    # FIX: the original loop reused the name `url`, shadowing the `url`
    # parameter above — renamed to `asset_url` to remove the shadowing.
    for asset_url in cssutils.getUrls(sheet):
        try:
            # Download any urls in css to the shared asset directory (if not already there)
            filename = asset_url.split('?')[0].split('/')[-1]
            filepath = os.path.sep.join([SHARED_ASSET_DIRECTORY, filename])
            if not os.path.isfile(filepath):
                with open(filepath, 'wb') as fobj:
                    fobj.write(read(asset_url))

            # Replace text with new url (relative to the zip's assets dir)
            new_url = zipper.write_file(filepath, filename, directory="assets")
            rules = rules.replace(asset_url, "../" + new_url)
        except requests.exceptions.HTTPError:
            # Best-effort: a missing asset should not abort the whole scrape.
            LOGGER.warning("Could not download css url {}".format(asset_url))

    return rules
def test_parseUrl(self):
    "cssutils.parseUrl()"
    # Build a file:// URL to the bundled test sheet.
    href = os.path.join(os.path.dirname(__file__),
                        '..', '..', 'sheets', 'import.css')
    #href = u'file:' + urllib.pathname2url(href)
    href = cssutils.helper.path2url(href)
    #href = 'http://seewhatever.de/sheets/import.css'
    s = cssutils.parseUrl(href, media='tv, print', title='from url')

    # Top-level sheet properties.
    self.assert_(isinstance(s, cssutils.css.CSSStyleSheet))
    self.assertEqual(href, s.href)
    self.assertEqual(self.exp, s.cssText)
    self.assertEqual(u'utf-8', s.encoding)
    self.assertEqual(u'tv, print', s.media.mediaText)
    self.assertEqual('from url', s.title)

    # Relative url() values are kept as written in the sheet.
    sr = s.cssRules[1]
    img = sr.style.getProperty('background-image').cssValue.getStringValue()
    self.assertEqual(img, 'images/example.gif')

    # Nested @import sheets are loaded recursively.
    ir = s.cssRules[0]
    self.assertEqual(u'import/import2.css', ir.href)
    irs = ir.styleSheet
    self.assertEqual(irs.cssText, '@import "../import3.css";\n@import "import-impossible.css" print;\n.import2 {\n /* sheets/import2.css */\n background: url(./../images/example.gif)\n }')

    ir2 = irs.cssRules[0]
    self.assertEqual(u'../import3.css', ir2.href)
    irs2 = ir2.styleSheet
    self.assertEqual(irs2.cssText, '/* import3 */\n.import3 {\n /* from ./import/../import3.css */\n background: url(import/images2/../../images/example3.gif)\n }')
def _concatenate_sheets(self):
    """Merge all registered sheet descriptors into one cached CSSStyleSheet.

    Rebuilds only when ``self.dirty`` is set or no cached sheet exists; the
    result is stored on ``self._cached_stylesheet`` and the collected
    URI-bearing properties on ``self._uri_properties``.
    """
    if self.dirty or (self._cached_stylesheet is None):
        r = CSSStyleSheet()
        uri_properties = []
        for d in self.sheets:
            # Each descriptor may carry inline text, a loader+uri pair, or
            # only an absolute URL to fetch from.
            local_loader = d.get('local_loader', None)
            text = d.get('text', None)
            uri = d.get('uri', None)
            absolute_url = d.get('absolute_url', None)
            if (text is None) and local_loader and uri:
                text = local_loader[uri]
            if text:
                sheet = CSSParser().parseString(text, href=absolute_url)
            else:
                # No text available: fetch the sheet from its URL.
                sheet = cssutils.parseUrl(href=absolute_url)
            for rule in sheet:
                r.add(rule)
                #print __name__, "rule=", rule
                # Track every property that references a URI so callers can
                # rewrite them later.
                for p in _get_rule_uri_properties(rule):
                    #print __name__, "_get_rule_uri_properties:", p
                    uri_properties.append(p)
        self._uri_properties = uri_properties
        #print __name__, "self._uri_properties=", self._uri_properties
        self._cached_stylesheet = r
        self.dirty = False
def test_parseUrl(self):
    "cssutils.parseUrl()"
    # Build a file:// URL to the bundled test sheet.
    href = os.path.join(os.path.dirname(__file__),
                        '..', '..', '..', 'sheets', 'import.css')
    #href = u'file:' + urllib.pathname2url(href)
    href = cssutils.helper.path2url(href)
    #href = 'http://seewhatever.de/sheets/import.css'
    s = cssutils.parseUrl(href, media='tv, print', title='from url')

    # Top-level sheet properties; cssText is bytes here, hence .encode().
    self.assertTrue(isinstance(s, cssutils.css.CSSStyleSheet))
    self.assertEqual(href, s.href)
    self.assertEqual(self.exp.encode(), s.cssText)
    self.assertEqual(u'utf-8', s.encoding)
    self.assertEqual(u'tv, print', s.media.mediaText)
    self.assertEqual('from url', s.title)

    # Relative url() values are kept as written in the sheet.
    sr = s.cssRules[1]
    img = sr.style.getProperty('background-image').propertyValue[0].value
    self.assertEqual(img, 'images/example.gif')

    # Nested @import sheets are loaded recursively.
    ir = s.cssRules[0]
    self.assertEqual(u'import/import2.css', ir.href)
    irs = ir.styleSheet
    self.assertEqual(
        irs.cssText,
        '@import "../import3.css";\n@import "import-impossible.css" print;\n.import2 {\n /* sheets/import2.css */\n background: url(http://example.com/images/example.gif);\n background: url(//example.com/images/example.gif);\n background: url(/images/example.gif);\n background: url(images2/example.gif);\n background: url(./images2/example.gif);\n background: url(../images/example.gif);\n background: url(./../images/example.gif)\n }'
        .encode())

    ir2 = irs.cssRules[0]
    self.assertEqual(u'../import3.css', ir2.href)
    irs2 = ir2.styleSheet
    self.assertEqual(
        irs2.cssText,
        '/* import3 */\n.import3 {\n /* from ./import/../import3.css */\n background: url(images/example3.gif);\n background: url(./images/example3.gif);\n background: url(import/images2/example2.gif);\n background: url(./import/images2/example2.gif);\n background: url(import/images2/../../images/example3.gif)\n }'
        .encode())
def test_parseUrl(self):
    "cssutils.parseUrl()"
    # Build a file:// URL to the bundled test sheet.
    href = os.path.join(os.path.dirname(__file__), '..', '..', 'sheets', 'import.css')
    #href = u'file:' + urllib.pathname2url(href)
    href = cssutils.helper.path2url(href)
    #href = 'http://seewhatever.de/sheets/import.css'
    s = cssutils.parseUrl(href, media='tv, print', title='from url')

    # Top-level sheet properties.
    self.assert_(isinstance(s, cssutils.css.CSSStyleSheet))
    self.assertEqual(href, s.href)
    self.assertEqual(self.exp, s.cssText)
    self.assertEqual(u'utf-8', s.encoding)
    self.assertEqual(u'tv, print', s.media.mediaText)
    self.assertEqual('from url', s.title)

    sr = s.cssRules[1]
    img = sr.style.getProperty(
        'background-image').cssValue.getStringValue()
    self.assertEqual(img, 'test/x.gif')

    # Nested @import sheets are loaded recursively.
    ir = s.cssRules[0]
    self.assertEqual(u'import/import2.css', ir.href)
    irs = ir.styleSheet
    self.assertEqual(
        u'''@import "../import3.css";
/* sheets/import2.css */''', irs.cssText)

    ir2 = irs.cssRules[0]
    self.assertEqual(u'../import3.css', ir2.href)
    irs2 = ir2.styleSheet
    self.assertEqual(u'/* import3 */', irs2.cssText)
def csscombine(path=None, url=None, cssText=None, href=None,
               sourceencoding=None, targetencoding=None,
               minify=True, resolveVariables=False):
    """Combine sheets referred to by @import rules in given CSS proxy sheet
    into a single new sheet.

    :returns: combined cssText, normal or minified
    :Parameters:
        `path` or `url` or `cssText` + `href`
            path or URL to a CSSStyleSheet or a cssText of a sheet which
            imports other sheets which are then combined into one sheet.
            `cssText` normally needs `href` to be able to resolve relative
            imports.
        `sourceencoding` = 'utf-8'
            explicit encoding of the source proxysheet
        `targetencoding`
            encoding of the combined stylesheet
        `minify` = True
            defines if the combined sheet should be minified
        `resolveVariables` = False
            defined if variables in combined should be resolved
    """
    cssutils.log.info(u'Combining files from %r' % url, neverraise=True)
    if sourceencoding is not None:
        cssutils.log.info(u'Using source encoding %r' % sourceencoding,
                          neverraise=True)

    # Pick the input source by priority: path, then url, then raw cssText.
    if path and not cssText:
        src = cssutils.parseFile(path, encoding=sourceencoding)
    elif url:
        src = cssutils.parseUrl(url, encoding=sourceencoding)
    elif cssText:
        src = cssutils.parseString(cssText, href=href, encoding=sourceencoding)
    else:
        sys.exit('Path or URL must be given')

    result = cssutils.resolveImports(src)
    result.encoding = targetencoding
    cssutils.log.info(u'Using target encoding: %r' % targetencoding,
                      neverraise=True)

    if minify:
        # Swap in a private serializer for minification.
        # FIX: restore the previous serializer even if serialization raises;
        # the original leaked the temporary serializer (and its prefs) to
        # subsequent callers on error.
        oldser = cssutils.ser
        cssutils.setSerializer(cssutils.serialize.CSSSerializer())
        try:
            cssutils.ser.prefs.useMinified()
            cssutils.ser.prefs.resolveVariables = resolveVariables
            cssText = result.cssText
        finally:
            cssutils.setSerializer(oldser)
    else:
        # Temporarily override resolveVariables on the shared serializer.
        # FIX: restore the previous setting even on error (see above).
        rv = cssutils.ser.prefs.resolveVariables
        cssutils.ser.prefs.resolveVariables = resolveVariables
        try:
            cssText = result.cssText
        finally:
            cssutils.ser.prefs.resolveVariables = rv

    return cssText
def test_handlers(self):
    "cssutils.log"
    # Fresh capture handler; presumably StringIO-backed — TODO confirm.
    s = self._setHandler()

    # FATAL level: the invalid declaration must produce no output.
    cssutils.log.setLevel(logging.FATAL)
    self.assertEqual(cssutils.log.getEffectiveLevel(), logging.FATAL)
    cssutils.parseString('a { color: 1 }')
    self.assertEqual(s.getvalue(), u'')

    # DEBUG level: the invalid `color: 1` value must be reported.
    cssutils.log.setLevel(logging.DEBUG)
    cssutils.parseString('a { color: 1 }')
    self.assertEqual(s.getvalue(),
                     u'ERROR Property: Invalid value for "CSS Color Module Level 3/CSS Level 2.1" property: 1 [1:5: color]\n')

    # Fetching a non-CSS resource must log a mime-type error (network access).
    s = self._setHandler()
    cssutils.log.setLevel(logging.ERROR)
    cssutils.parseUrl('http://example.com')
    self.assertEqual(s.getvalue()[:38], u'ERROR Expected "text/css" mime type')
def doiter(node):
    # Generator over the stylesheets reachable from `node`.
    # NOTE(review): this is a closure — `base`, `media`, `matchstyle`,
    # `matchlink`, `_doimport` and `_isstyle` come from the enclosing scope.
    for cssnode in node.walknodes(_isstyle):
        if isinstance(cssnode, html.style):
            # Inline <style> element: parse its text content.
            href = str(base) if base is not None else None
            if matchstyle(cssnode):
                stylesheet = cssutils.parseString(str(cssnode.content),
                                                  href=href,
                                                  media=str(cssnode.attrs.media))
                yield from _doimport(media, stylesheet, base)
        else:
            # link
            if "href" in cssnode.attrs:
                href = cssnode.attrs.href.asURL()
                if base is not None:
                    # URL join via the path-division operator.
                    href = base/href
                if matchlink(cssnode):
                    stylesheet = cssutils.parseUrl(str(href),
                                                   media=str(cssnode.attrs.media))
                    yield from _doimport(media, stylesheet, href)
def csscombine(path=None, url=None, sourceencoding=None,
               targetencoding=None, minify=True):
    """Combine sheets referred to by @import rules in given CSS proxy sheet
    into a single new sheet.

    :returns: combined cssText, normal or minified
    :Parameters:
        `path` or `url`
            path or URL to a CSSStyleSheet which imports other sheets which
            are then combined into one sheet
        `targetencoding`
            encoding of the combined stylesheet, default 'utf-8'
        `minify`
            defines if the combined sheet should be minified, default True
    """
    cssutils.log.info(u'Combining files from %r' % url, neverraise=True)
    if sourceencoding is not None:
        cssutils.log.info(u'Using source encoding %r' % sourceencoding,
                          neverraise=True)

    if path:
        src = cssutils.parseFile(path, encoding=sourceencoding)
    elif url:
        src = cssutils.parseUrl(url, encoding=sourceencoding)
    else:
        # NOTE(review): sys.exit in a library function aborts the process.
        sys.exit('Path or URL must be given')

    result = cssutils.resolveImports(src)
    result.encoding = targetencoding
    cssutils.log.info(u'Using target encoding: %r' % targetencoding,
                      neverraise=True)

    if minify:
        # save old setting and use own serializer
        oldser = cssutils.ser
        cssutils.setSerializer(cssutils.serialize.CSSSerializer())
        cssutils.ser.prefs.useMinified()
        cssText = result.cssText
        cssutils.setSerializer(oldser)
    else:
        cssText = result.cssText
    return cssText
def doiter(node):
    # Generator over the stylesheets reachable from `node`.
    # NOTE(review): this is a closure — `base`, `media`, `matchstyle`,
    # `matchlink`, `_doimport` and `_isstyle` come from the enclosing scope.
    for cssnode in node.walknodes(_isstyle):
        if isinstance(cssnode, html.style):
            # Inline <style> element: parse its text content.
            href = str(base) if base is not None else None
            if matchstyle(cssnode):
                stylesheet = cssutils.parseString(str(cssnode.content),
                                                  href=href,
                                                  media=str(
                                                      cssnode.attrs.media))
                yield from _doimport(media, stylesheet, base)
        else:
            # link
            if "href" in cssnode.attrs:
                href = cssnode.attrs.href.asURL()
                if base is not None:
                    # URL join via the path-division operator.
                    href = base / href
                if matchlink(cssnode):
                    stylesheet = cssutils.parseUrl(
                        str(href), media=str(cssnode.attrs.media))
                    yield from _doimport(media, stylesheet, href)
def csscombine(path=None, url=None, sourceencoding=None,
               targetencoding=None, minify=True):
    """Combine sheets referred to by @import rules in given CSS proxy sheet
    into a single new sheet.

    :returns: combined cssText, normal or minified
    :Parameters:
        `path` or `url`
            path or URL to a CSSStyleSheet which imports other sheets which
            are then combined into one sheet
        `targetencoding`
            encoding of the combined stylesheet, default 'utf-8'
        `minify`
            defines if the combined sheet should be minified, default True
    """
    cssutils.log.info(u"Combining files from %r" % url, neverraise=True)
    if sourceencoding is not None:
        cssutils.log.info(u"Using source encoding %r" % sourceencoding,
                          neverraise=True)

    if path:
        src = cssutils.parseFile(path, encoding=sourceencoding)
    elif url:
        src = cssutils.parseUrl(url, encoding=sourceencoding)
    else:
        # NOTE(review): sys.exit in a library function aborts the process.
        sys.exit("Path or URL must be given")

    result = cssutils.resolveImports(src)
    result.encoding = targetencoding
    cssutils.log.info(u"Using target encoding: %r" % targetencoding,
                      neverraise=True)

    if minify:
        # save old setting and use own serializer
        oldser = cssutils.ser
        cssutils.setSerializer(cssutils.serialize.CSSSerializer())
        cssutils.ser.prefs.useMinified()
        cssText = result.cssText
        cssutils.setSerializer(oldser)
    else:
        cssText = result.cssText
    return cssText
# Demo (Python 2 syntax): fetch a remote stylesheet and print the value of
# the `background` property for the `.w_n_8` selector when it is a URI.
import cssutils

sheet=cssutils.parseUrl("http://pogoda.tut.by/css/by2/weather1.css")
print sheet.__class__
# <class 'cssutils.css.cssstylesheet.CSSStyleSheet'>
for rule in sheet:
    if rule.__class__==cssutils.css.cssstylerule.CSSStyleRule:
        for selector in rule.selectorList:
            if selector.selectorText==".w_n_8":
                if "background" in rule.style.keys():
                    # NOTE(review): `property` shadows the builtin.
                    for property in rule.style.getPropertyCSSValue("background"):
                        #a list of Property objects set in this declaration.
                        if property.__class__==cssutils.css.value.URIValue:
                            print property.uri
def transform(self, pretty_print=True):
    """change the self.html and return it with CSS turned into style
    attributes.

    (Python 2 code: uses print statements and dict.iteritems.)
    """
    # Without lxml.etree there is nothing to do — return the HTML untouched.
    if etree is None:
        return self.html
    tree = etree.fromstring(self.html.strip()).getroottree()
    page = tree.getroot()
    # Serialize CSS minified and keep only the winning property per name.
    cssutils.ser.prefs.useMinified()
    cssutils.ser.prefs.keepAllProperties = False
    if page is None:
        print repr(self.html)
        raise PremailerError("Could not parse the html")
    assert page is not None
    ##
    ## style selectors
    ##
    self.styles = defaultdict(list)
    for style in CSSSelector('style')(page):
        # Extract the raw CSS between <style> and </style>.
        css_body = etree.tostring(style)
        css_body = css_body.split('>')[1].split('</')[0]
        leftovers = self._parse_stylesheet(page, cssutils.parseString(css_body))
        if leftovers:
            # Rules that could not be inlined stay in the <style> tag.
            style.text = '\n'.join([r.cssText for r in leftovers])
        elif not self.keep_style_tags:
            parent_of_style = style.getparent()
            parent_of_style.remove(style)
    for stylefile in self.external_styles:
        if stylefile.startswith('http://'):
            self._parse_stylesheet(page, cssutils.parseUrl(stylefile))
        elif os.path.exists(stylefile):
            self._parse_stylesheet(page, cssutils.parseFile(stylefile))
        else:
            raise ValueError(u'Could not find external style: %s' % \
                stylefile)
    # Merge collected rules (plus any pre-existing inline style) per element.
    for element, rules in self.styles.iteritems():
        rules += [element.attrib.get('style', '')]
        declarations = []
        pseudoclass_rules = defaultdict(list)
        for rule in rules:
            if not rule:
                continue
            elif isinstance(rule, tuple):
                # pseudoclass
                pseudoclass, prules = rule
                pseudoclass_rules[pseudoclass].append(prules)
            else:
                declarations.append(rule.strip(';'))
        css_text = ';'.join(declarations)
        style = cssutils.parseStyle(css_text)
        if pseudoclass_rules:
            # Pseudoclasses can't be plain inline styles; emit the
            # '{base} :hover{...}' form instead.
            prules_list = []
            for pclass, prules in pseudoclass_rules.iteritems():
                pdecl = cssutils.parseStyle(';'.join(prules))
                prules_list.append(':%s{%s}' % (pclass, pdecl.cssText))
            if css_text:
                element.attrib['style'] = '{%s} %s' % \
                    (style.cssText, ' '.join(prules_list))
            else:
                element.attrib['style'] = ' '.join(prules_list)
        else:
            element.attrib['style'] = style.cssText
        self._style_to_basic_html_attributes(element,
                                             style)
    # now we can delete all 'class' attributes (that aren't in the
    # whitelist)
    for item in page.xpath('//*[@class]'):
        classes = set(item.attrib['class'].split())
        # classes - (classes ^ whitelist) == classes & whitelist
        remaining_classes = classes - (classes ^ self.keep_classnames)
        if len(remaining_classes) == 0:
            del item.attrib['class']
        else:
            item.attrib['class'] = ' '.join(remaining_classes)
    ##
    ## URLs
    ##
    # Make href/src absolute against base_url, preserving '#fragment'
    # links when configured to.
    if self.base_url:
        for attr in ('href', 'src'):
            for item in page.xpath('//*[@%s]' % attr):
                if attr == 'href' and self.preserve_internal_links \
                        and item.attrib[attr].startswith('#'):
                    continue
                item.attrib[attr] = urlparse.urljoin(self.base_url,
                                                     item.attrib[attr])
    return etree.tostring(page, pretty_print=pretty_print) \
        .replace('<head/>', '<head></head>')
# print for i, s in enumerate(sel): print(i, s) sheet = cssutils.parseString(s + '{color: green}') print(sheet.cssText) print() sys.exit(0) if 1: href = os.path.join(os.path.dirname(__file__), '..', 'sheets', 'import.css') href = cssutils.helper.path2url(href) #href = 'http://seewhatever.de/sheets/import.css' s = cssutils.parseUrl(href, media='tv, print', title='from url') print(s.cssText) sys.exit(0) if 1: import cssutils.script #p = r'sheets\vars\vars.css' p = r'sheets\var\start.css' do = """ import cssutils t = cssutils.tokenize2.Tokenizer() css=''' @page {} @media {} @media {}
#issue 50 #s = cssutils.parseString('[a]{color: green}') s = cssutils.parseFile('sheets/acid2.css') print s print s.cssText if 1: href = os.path.join(os.path.dirname(__file__), '..', 'sheets', 'import.css') href = cssutils.helper.path2url(href) #href = 'http://seewhatever.de/sheets/import.css' s = cssutils.parseUrl(href, media='tv, print', title='from url') print s.cssText sys.exit(0) if 1: css = '\\ReD white transparent, navy #ff0000 #f0f #ff00ff #00ff00' p = cssutils.css.Property('color', css) print p print pv = p.propertyValue for c in pv: print c #print c.name, c.colorType, c.red, c.green, c.blue, c.alpha # for i, v in enumerate(pv): # print i, v
def toStylesheet(filename):
    """Parse *filename* as a stylesheet: read it as a local file when it
    exists on disk, otherwise treat it as a URL to fetch."""
    loader = cssutils.parseFile if os.path.exists(filename) else cssutils.parseUrl
    return loader(filename)
def test_load_url(cls, _url):
    """Build a rule-block collection from the stylesheet at ``_url``.

    For every style rule: strip attribute selectors, pseudo-classes and
    ``*`` from the selector text, split comma-separated selector lists,
    and feed each remaining selector with its declaration block to
    ``CSSRuleBlock._load_rule``.
    """
    # FIX 1: regex patterns are now raw strings; '\s', '\[' etc. in plain
    # strings are invalid escape sequences (DeprecationWarning, and a
    # SyntaxWarning on newer Pythons).
    # FIX 2: dropped the incorrect '-> None' annotation — the function
    # returns a cls(...) instance.
    #return cls([CSSRuleBlock.load_rule(i) for i in cssutils.parseUrl(_url).cssRules if isinstance(i, cssutils.css.CSSStyleRule)])
    _rules = [
        (re.split(r',\s+|,', re.sub(r'\[\w+\=[^\]]+\]|:\w+|\*', '', i.selectorText)),
         i.style)
        for i in cssutils.parseUrl(_url).cssRules
        if isinstance(i, cssutils.css.CSSStyleRule)
    ]
    return cls([CSSRuleBlock._load_rule(i, a) for b, a in _rules for i in b if i])
def lookup_styles(cls, *urls: str) -> typing.Callable:
    """Build a rule-block collection from the stylesheets at ``urls``.

    For every style rule in every sheet: strip attribute selectors,
    pseudo-classes and ``*`` from the selector text, split comma-separated
    selector lists, and feed each remaining selector with its declaration
    block to ``CSSRuleBlock._load_rule``.
    """
    # FIX 1: regex patterns are now raw strings; '\s', '\[' etc. in plain
    # strings are invalid escape sequences (DeprecationWarning, and a
    # SyntaxWarning on newer Pythons).
    # FIX 2: '*urls: typing.List[str]' annotated each positional argument as
    # a list; each argument is a single str.
    # NOTE(review): the '-> typing.Callable' return annotation looks wrong
    # (a cls(...) instance is returned) — kept for interface stability.
    #return cls([CSSRuleBlock.load_rule(i) for b in urls for i in cssutils.parseUrl(b).cssRules if isinstance(i, cssutils.css.CSSStyleRule)])
    _rules = [
        (re.split(r',\s+|,', re.sub(r'\[\w+\=[^\]]+\]|:+[\w\-]+|\*', '', i.selectorText)),
         i.style)
        for b in urls
        for i in cssutils.parseUrl(b).cssRules
        if isinstance(i, cssutils.css.CSSStyleRule)
    ]
    return cls([CSSRuleBlock._load_rule(i, a) for b, a in _rules for i in b if i])
def css_parse(soup, URL, socketio):
    """Run accessibility checks (contrast, small text, inaccessible colors)
    over every same-origin stylesheet linked from the parsed page *soup*,
    reporting results through the socketio JSON helpers.

    NOTE(review): reconstructed from collapsed source — the indentation of
    the trailing reporting section is a best-effort reading; verify against
    the original file.
    """
    DOMAIN = get_domain(URL)
    # Collect candidate <link> stylesheet elements.
    cssLinkElements1 = soup.findAll("link", type="text/css")
    cssLinkElements2 = soup.findAll("link", rel="stylesheet")
    cssLinkLists = cssLinkElements1 + cssLinkElements2
    finalCSSLinks = []
    for cssLink in cssLinkLists:
        # Skip vendored/bootstrap/absolute links; dedupe the rest.
        if "bootstrap" in str(cssLink) or "vendor" in str(
                cssLink) or "http" in str(cssLink):
            pass
        elif cssLink not in finalCSSLinks:
            finalCSSLinks.append(cssLink)
    first_bool = True
    smt_success_bool = True  # assume successs
    inaccess_success_bool = True  # assume success
    contrast_success_bool = True  # assume success
    if finalCSSLinks != []:
        for item in finalCSSLinks:
            stylesheetName = item['href']
            create_print_json("css stylesheet: " + str(stylesheetName),
                              socketio)
            fullCSSStyleLink = DOMAIN + item['href']
            # Silence cssutils parse warnings during the fetch.
            cssutils.log.setLevel(logging.CRITICAL)
            sheet = cssutils.parseUrl(fullCSSStyleLink)
            for rule in sheet:
                if rule.type == rule.STYLE_RULE:
                    try:
                        contrast_bool = find_contrast(soup, URL, first_bool,
                                                      stylesheetName,
                                                      fullCSSStyleLink, rule,
                                                      socketio)
                        if contrast_bool == False:
                            contrast_success_bool = False  # once it turns false, it's not going back to True
                    except:
                        pass
                    # HERE WE LOOP OVER THE PROPERTIES
                    # THIS IS WHERE WE CALL ALL THE FUNCTIONS TO CHECK AT THE SAME TIME
                    # SO WE ONLY HAVE TO LOOP OVER THE STYLE SHEET ONCE
                    for cssProperty in rule.style:
                        try:
                            smt_bool = find_small_text(soup, URL, cssProperty,
                                                       first_bool,
                                                       stylesheetName,
                                                       fullCSSStyleLink, rule,
                                                       socketio)
                            if smt_bool == False:  # error
                                smt_success_bool = False  # once it turns false, it's not going back to True
                        except:
                            pass
                        try:
                            inaccess_bool = find_inaccessible_colors(
                                soup, URL, cssProperty, first_bool,
                                stylesheetName, fullCSSStyleLink, rule,
                                socketio)
                            if inaccess_bool == False:
                                inaccess_success_bool = False  # once it turns false, it's not going back to True
                        except:
                            pass
            first_bool = False
        if smt_success_bool:  # if this stayed true the whole time it's a success
            create_success_json("small text", URL, socketio)
        if inaccess_success_bool:
            create_success_json("inaccessible colors", URL, socketio)
        else:
            TYPE = "inaccessible colors"
            SEVERITY = "warning"
            text = "We found " + str(
                len(INACCESSIBLE_COLORS_FOUND)) + " inaccessible colors."
            create_error_json(TYPE,
                              SEVERITY,
                              fullCSSStyleLink,
                              text=text,
                              meta=str(INACCESSIBLE_COLORS_FOUND),
                              socketio=socketio)
        if contrast_success_bool:
            create_success_json("accessibility for colorblind users", URL,
                                socketio)
    else:
        # No (non-vendored) stylesheets found at all.
        create_success_json("CSS design", URL, socketio)
def transform(self, pretty_print=True):
    """change the self.html and return it with CSS turned into style
    attributes.

    (Python 2 code: uses print statements and dict.iteritems.)
    """
    # Without lxml.etree there is nothing to do — return the HTML untouched.
    if etree is None:
        return self.html
    tree = etree.fromstring(self.html.strip()).getroottree()
    page = tree.getroot()
    # Serialize CSS minified and keep only the winning property per name.
    cssutils.ser.prefs.useMinified()
    cssutils.ser.prefs.keepAllProperties = False
    if page is None:
        print repr(self.html)
        raise PremailerError("Could not parse the html")
    assert page is not None
    ##
    ## style selectors
    ##
    self.styles = defaultdict(list)
    for style in CSSSelector('style')(page):
        # Extract the raw CSS between <style> and </style>.
        css_body = etree.tostring(style)
        css_body = css_body.split('>')[1].split('</')[0]
        leftovers = self._parse_stylesheet(page, cssutils.parseString(css_body))
        if leftovers:
            # Rules that could not be inlined stay in the <style> tag.
            style.text = '\n'.join([r.cssText for r in leftovers])
        elif not self.keep_style_tags:
            parent_of_style = style.getparent()
            parent_of_style.remove(style)
    for stylefile in self.external_styles:
        if stylefile.startswith('http://'):
            self._parse_stylesheet(page, cssutils.parseUrl(stylefile))
        elif os.path.exists(stylefile):
            self._parse_stylesheet(page, cssutils.parseFile(stylefile))
        else:
            raise ValueError(u'Could not find external style: %s' % \
                stylefile)
    # Merge collected rules (plus any pre-existing inline style) per element.
    for element, rules in self.styles.iteritems():
        rules += [element.attrib.get('style', '')]
        declarations = []
        pseudoclass_rules = defaultdict(list)
        for rule in rules:
            if not rule:
                continue
            elif isinstance(rule, tuple):
                # pseudoclass
                pseudoclass, prules = rule
                pseudoclass_rules[pseudoclass].append(prules)
            else:
                declarations.append(rule.strip(';'))
        css_text = ';'.join(declarations)
        style = cssutils.parseStyle(css_text)
        if pseudoclass_rules:
            # Pseudoclasses can't be plain inline styles; emit the
            # '{base} :hover{...}' form instead.
            prules_list = []
            for pclass, prules in pseudoclass_rules.iteritems():
                pdecl = cssutils.parseStyle(';'.join(prules))
                prules_list.append(':%s{%s}' % (pclass, pdecl.cssText))
            if css_text:
                element.attrib['style'] = '{%s} %s' % \
                    (style.cssText, ' '.join(prules_list))
            else:
                element.attrib['style'] = ' '.join(prules_list)
        else:
            element.attrib['style'] = style.cssText
        self._style_to_basic_html_attributes(element,
                                             style)
    # now we can delete all 'class' attributes (that aren't in the
    # whitelist)
    for item in page.xpath('//*[@class]'):
        classes = set(item.attrib['class'].split())
        # classes - (classes ^ whitelist) == classes & whitelist
        remaining_classes = classes - (classes ^ self.keep_classnames)
        if len(remaining_classes) == 0:
            del item.attrib['class']
        else:
            item.attrib['class'] = ' '.join(remaining_classes)
    ##
    ## URLs
    ##
    # Make href/src absolute against base_url, preserving '#fragment'
    # links when configured to.
    if self.base_url:
        for attr in ('href', 'src'):
            for item in page.xpath('//*[@%s]' % attr):
                if attr == 'href' and self.preserve_internal_links \
                        and item.attrib[attr].startswith('#'):
                    continue
                item.attrib[attr] = urlparse.urljoin(
                    self.base_url, item.attrib[attr])
    return etree.tostring(page, pretty_print=pretty_print) \
        .replace('<head/>', '<head></head>')