def test_parentRule(self):
    """CSSStyleDeclaration.parentRule

    A declaration must report the rule it was assigned to, and every
    rule produced by the parser must own its ``style`` declaration.
    """
    # explicit assignment
    declaration = cssutils.css.CSSStyleDeclaration()
    owner = cssutils.css.CSSStyleRule()
    declaration.parentRule = owner
    self.assertEqual(owner, declaration.parentRule)

    # reference set implicitly by the parser
    first_rule = cssutils.parseString(u'a{x:1}').cssRules[0]
    parsed_style = first_rule.style
    self.assertEqual(first_rule, parsed_style.parentRule)

    # holds for every rule kind in the sheet below
    parsed = cssutils.parseString(''' @font-face { font-weight: bold; } a { font-weight: bolder; } @page { font-weight: bolder; } ''')
    for rule in parsed:
        self.assertEqual(rule.style.parentRule, rule)
def test_cssRules(self):
    """CSSMediaRule.cssRules

    Exercises appending, deleting and extending the rule list of an
    ``@media`` rule and checks the parent references of its children.
    """
    media = cssutils.css.CSSMediaRule()
    self.assertEqual([], media.cssRules)

    style_rule = cssutils.css.CSSStyleRule()
    media.cssRules.append(style_rule)
    self.assertEqual([style_rule], media.cssRules)

    # appending an @import rule to the @media rule list must be rejected
    import_rule = cssutils.css.CSSImportRule()
    self.assertRaises(xml.dom.HierarchyRequestErr,
                      media.cssRules.append, import_rule)

    sheet = cssutils.parseString('@media all { /*1*/a {x:1} }')
    media = sheet.cssRules[0]
    self.assertEqual(2, media.cssRules.length)

    del media.cssRules[0]
    self.assertEqual(1, media.cssRules.length)

    media.cssRules.append('/*2*/')
    self.assertEqual(2, media.cssRules.length)

    extra_rules = cssutils.parseString('/*3*/x {y:2}').cssRules
    media.cssRules.extend(extra_rules)
    self.assertEqual(4, media.cssRules.length)

    self.assertEqual(
        u'@media all {\n a {\n x: 1\n }\n /*2*/\n /*3*/\n x {\n y: 2\n }\n }',
        media.cssText)

    # every nested rule points back to the sheet and the @media rule
    for nested in media.cssRules:
        self.assertEqual(nested.parentStyleSheet, sheet)
        self.assertEqual(nested.parentRule, media)
def test_children(self):
    """CSSStyleDeclaration.children()

    The children of a declaration must have the expected types and must
    reference the declaration as ``parent`` (and, for properties, via the
    deprecated ``parentStyle`` attribute).
    """
    style = u'/*1*/color: red; color: green; @x;'
    # (expected type, name of the DEPRECATED back-reference attribute)
    types = [
        (cssutils.css.CSSComment, None),
        (cssutils.css.Property, 'parentStyle'),  # DEPRECATED
        (cssutils.css.Property, 'parentStyle'),  # DEPRECATED
        (cssutils.css.CSSUnknownRule, None)
    ]

    def t(s):
        for position, child in enumerate(s.children()):
            expected_type, deprecated_attr = types[position]
            self.assertEqual(expected_type, type(child))
            self.assertEqual(s, child.parent)
            if deprecated_attr:  # DEPRECATED
                self.assertEqual(s, getattr(child, deprecated_attr))

    # standalone declaration, inside a style rule, inside @media
    t(cssutils.parseStyle(style))
    t(cssutils.parseString(u'a {'+style+'}').cssRules[0].style)
    t(cssutils.parseString(u'@media all {a {'+style+'}}').cssRules[0].cssRules[0].style)

    # properties added later get the same parent
    declaration = cssutils.parseStyle(style)
    declaration['x'] = '0'
    self.assertEqual(declaration, declaration.getProperty('x').parent)
    declaration.setProperty('y', '1')
    self.assertEqual(declaration, declaration.getProperty('y').parent)
def test_parsevalidation(self):
    # Validation messages must be logged unless ``validate=False`` is
    # given, both for complete sheets and for bare style declarations.
    style = 'color: 1'
    sheet_css = 'a { %s }' % style
    cssutils.log.setLevel(logging.DEBUG)

    # sheet, validation on by default
    captured = self._setHandler()
    cssutils.parseString(sheet_css)
    self.assertNotEqual(len(captured.getvalue()), 0)

    # sheet, validation switched off
    captured = self._setHandler()
    cssutils.parseString(sheet_css, validate=False)
    self.assertEqual(captured.getvalue(), '')

    # style, validation on by default
    captured = self._setHandler()
    cssutils.parseStyle(style)
    self.assertNotEqual(len(captured.getvalue()), 0)

    # style, validation requested explicitly
    captured = self._setHandler()
    cssutils.parseStyle(style, validate=True)
    self.assertNotEqual(len(captured.getvalue()), 0)

    # style, validation switched off
    captured = self._setHandler()
    cssutils.parseStyle(style, validate=False)
    self.assertEqual(captured.getvalue(), '')
def test_invalidstring(self): "cssutils.parseString(INVALID_STRING)" validfromhere = '@namespace "x";' csss = ( u'''@charset "ascii ;''' + validfromhere, u'''@charset 'ascii ;''' + validfromhere, u'''@namespace "y ;''' + validfromhere, u'''@import "y ;''' + validfromhere, u'''@import url('a );''' + validfromhere, u'''@unknown "y ;''' + validfromhere) for css in csss: s = cssutils.parseString(css) self.assertEqual(validfromhere, s.cssText) csss = (u'''a { font-family: "Courier ; }''', ur'''a { content: "\"; } ''', ur'''a { content: "\\\"; } ''' ) for css in csss: self.assertEqual(u'', cssutils.parseString(css).cssText)
def test_prioriy(self):
    """Property.priority

    Covers the priority reported for parsed declarations and for
    ``Property`` objects built directly, with and without
    ``cssutils.log.raiseExceptions``.
    """
    sheet = cssutils.parseString('a { color: red }')
    self.assertEqual(u'a {\n color: red\n }', sheet.cssText)
    self.assertEqual(
        u'', sheet.cssRules[0].style.getPropertyPriority('color'))

    sheet = cssutils.parseString('a { color: red !important }')
    self.assertEqual(u'a {\n color: red !important\n }', sheet.cssText)
    self.assertEqual(
        u'important', sheet.cssRules[0].style.getPropertyPriority('color'))

    # invalid but kept!
    # #cssutils.log.raiseExceptions = False
    # s = cssutils.parseString('a { color: red !x }')
    # self.assertEqual(u'a {\n color: red !x\n }', s.cssText)
    # self.assertEqual(u'x', s.cssRules[0].style.getPropertyPriority('color'))

    # strict mode: an invalid priority raises
    cssutils.log.raiseExceptions = True
    prop = cssutils.css.Property(u'color', u'red', u'')
    self.assertEqual(prop.priority, u'')
    prop = cssutils.css.Property(u'color', u'red', u'!important')
    self.assertEqual(prop.priority, u'important')
    self.assertRaisesMsg(xml.dom.SyntaxErr, u'',
                         cssutils.css.Property, u'color', u'red', u'x')

    # lenient mode: an unknown priority is kept
    cssutils.log.raiseExceptions = False
    prop = cssutils.css.Property(u'color', u'red', u'!x')
    self.assertEqual(prop.priority, u'x')
    prop = cssutils.css.Property(u'color', u'red', u'!x')
    self.assertEqual(prop.priority, u'x')
    cssutils.log.raiseExceptions = True
def test_insufficient_partial_cascade(self):
    # ``wx._styles2dict`` must raise ValueError for this pair of styles.
    sources = (
        "body {background-color: #00ff00;}",
        "body {color: #0000ff;}",
    )
    styles = [cssutils.parseString(text).cssRules[0].style
              for text in sources]
    self.assertRaises(ValueError, wx._styles2dict, styles)
def test_resolveImports(self):
    """cssutils.resolveImports(sheet)

    Uses a mocked fetcher to supply the content of ``@import`` targets
    and checks the flattened result for several scenarios.
    """
    if not mock:
        self.assertEqual(False, u'Minimock needed for this test')
        return

    def parse_with_fetched(css, fetched):
        # Parse ``css`` while the default fetcher is replaced by one built
        # from ``fetched``; the mock is removed again right after parsing.
        mock("cssutils.util._defaultFetcher",
             mock_obj=self._make_fetcher(None, fetched))
        parsed = cssutils.parseString(css)
        restore()
        return parsed

    self._tempSer()
    cssutils.ser.prefs.useMinified()

    a = u'@charset "iso-8859-1";@import"b.css";ä{color:green}'.encode('iso-8859-1')
    b = u'@charset "ascii";\E4 {color:red}'.encode('ascii')

    # normal
    s = parse_with_fetched(a, b)
    self.assertEqual(a, s.cssText)
    self.assertEqual(b, s.cssRules[1].styleSheet.cssText)
    c = cssutils.resolveImports(s)
    self.assertEqual('\xc3\xa4{color:red}\xc3\xa4{color:green}', c.cssText)
    c.encoding = 'ascii'
    self.assertEqual(
        r'@charset "ascii";\E4 {color:red}\E4 {color:green}', c.cssText)

    # b cannot be found
    s = parse_with_fetched(a, None)
    self.assertEqual(a, s.cssText)
    self.assertEqual(None, s.cssRules[1].styleSheet)
    c = cssutils.resolveImports(s)
    self.assertEqual('@import"b.css";\xc3\xa4{color:green}', c.cssText)

    # @import with media
    a = u'@import"b.css";@import"b.css" print, tv ;@import"b.css" all;'
    b = u'a {color: red}'
    s = parse_with_fetched(a, b)
    c = cssutils.resolveImports(s)
    self.assertEqual(
        'a{color:red}@media print,tv{a{color:red}}a{color:red}', c.cssText)

    # cannot resolve with media => keep original
    a = u'@import"b.css"print;'
    b = u'@namespace "http://example.com";'
    s = parse_with_fetched(a, b)
    c = cssutils.resolveImports(s)
    self.assertEqual(a, c.cssText)
def test_attributes(self):
    """cssutils.parseString(href, media)

    ``href`` and ``media`` given to the parser must show up on the sheet;
    ``media`` may be a string or a list of media names.
    """
    expected_media = "screen, projection, tv"

    from_text = cssutils.parseString(
        "a{}", href="file:foo.css", media="screen, projection, tv")
    self.assertEqual(from_text.href, "file:foo.css")
    self.assertEqual(from_text.media.mediaText, expected_media)

    from_list = cssutils.parseString(
        "a{}", href="file:foo.css", media=["screen", "projection", "tv"])
    self.assertEqual(from_list.media.mediaText, expected_media)
def test_CSSStyleSheet(self):
    """CSSSerializer.do_CSSStyleSheet

    Non-ASCII comments must round-trip as UTF-8 and be escaped once the
    sheet encoding is switched to ASCII.
    """
    for css in (u'/* κουρος */', u'@charset "utf-8";\n/* κουρος */'):
        sheet = cssutils.parseString(css)
        self.assertEqual(css, unicode(sheet.cssText, 'utf-8'))

    # ``sheet`` is still the one starting with the @charset rule
    sheet.cssRules[0].encoding = 'ascii'
    escaped = '@charset "ascii";\n/* \\3BA \\3BF \\3C5 \\3C1 \\3BF \\3C2 */'.encode()
    self.assertEqual(escaped, sheet.cssText)
def test_set(self):
    """settings.set()

    The ``DXImageTransform.Microsoft`` filter syntax is dropped by
    default and kept once the corresponding setting is enabled.
    """
    cssutils.ser.prefs.useMinified()
    text = u'a {filter: progid:DXImageTransform.Microsoft.BasicImage( rotation = 90 )}'

    # setting not enabled yet: nothing is serialized
    serialized = cssutils.parseString(text).cssText
    self.assertEqual(serialized, ''.encode())

    cssutils.settings.set('DXImageTransform.Microsoft', True)
    serialized = cssutils.parseString(text).cssText
    self.assertEqual(
        serialized,
        'a{filter:progid:DXImageTransform.Microsoft.BasicImage(rotation=90)}'.encode())

    cssutils.ser.prefs.useDefaults()
def test_roundtrip(self): "cssutils encodings" css1 = ur'''@charset "utf-8"; /* ä */''' s = cssutils.parseString(css1) css2 = unicode(s.cssText, 'utf-8') self.assertEqual(css1, css2) s = cssutils.parseString(css2) s.cssRules[0].encoding='ascii' css3 = ur'''@charset "ascii"; /* \E4 */''' self.assertEqual(css3, unicode(s.cssText, 'utf-8'))
def test_partial_cascade(self):
    # The two styles below must be merged by ``wx._styles2dict`` into a
    # single dict with converted colour tuples and a font-family list.
    sources = (
        ("body {background-color: #00ff00; "
         "font-family: monospace;}"),
        "body {color: #0000ff;}",
    )
    styles = [cssutils.parseString(text).cssRules[0].style
              for text in sources]
    expected = {
        'color': (0, 0, 255),
        'background-color': (0, 255, 0),
        'font-family': ['monospace']}
    self.assertEqual(wx._styles2dict(styles), expected)
def test_escapes(self): "cssutils escapes" css = ur'\43\x { \43\x: \43\x !import\41nt }' sheet = cssutils.parseString(css) self.assertEqual(sheet.cssText, ur'''C\x { c\x: C\x !important }''') css = ur'\ x{\ x :\ x ;y:1} ' sheet = cssutils.parseString(css) self.assertEqual(sheet.cssText, ur'''\ x { \ x: \ x; y: 1 }''')
def test_propertyNameSpacer(self):
    """Preferences.propertyNameSpacer

    The spacer placed between a property name's colon and its value is
    configurable; an empty spacer removes the space.
    """
    sheet = cssutils.parseString('a { x: 1; y: 2 }')

    default_output = u'a {\n x: 1;\n y: 2\n }'.encode()
    self.assertEqual(default_output, sheet.cssText)

    cssutils.ser.prefs.propertyNameSpacer = u''
    compact_output = u'a {\n x:1;\n y:2\n }'.encode()
    self.assertEqual(compact_output, sheet.cssText)
def test_omitLastSemicolon(self):
    """Preferences.omitLastSemicolon

    By default the last declaration has no trailing semicolon; turning
    the preference off adds it.
    """
    sheet = cssutils.parseString('a { x: 1; y: 2 }')

    without_last = u'a {\n x: 1;\n y: 2\n }'.encode()
    self.assertEqual(without_last, sheet.cssText)

    cssutils.ser.prefs.omitLastSemicolon = False
    with_last = u'a {\n x: 1;\n y: 2;\n }'.encode()
    self.assertEqual(with_last, sheet.cssText)
def test_keepUsedNamespaceRulesOnly(self):
    """Preferences.keepUsedNamespaceRulesOnly

    Maps each source sheet to the expected serialization with the
    preference switched off (first item) and on (second item).
    """
    tests = {
        # default == prefix => both are combined
        '@namespace p "u"; @namespace "u"; p|a, a {top: 0}':
            ('@namespace "u";\na, a {\n top: 0\n }',
             '@namespace "u";\na, a {\n top: 0\n }'),
        '@namespace "u"; @namespace p "u"; p|a, a {top: 0}':
            ('@namespace p "u";\np|a, p|a {\n top: 0\n }',
             '@namespace p "u";\np|a, p|a {\n top: 0\n }'),
        # default and prefix
        '@namespace p "u"; @namespace "d"; p|a, a {top: 0}':
            ('@namespace p "u";\n@namespace "d";\np|a, a {\n top: 0\n }',
             '@namespace p "u";\n@namespace "d";\np|a, a {\n top: 0\n }'),
        # prefix only
        '@namespace p "u"; @namespace "d"; p|a {top: 0}':
            ('@namespace p "u";\n@namespace "d";\np|a {\n top: 0\n }',
             '@namespace p "u";\np|a {\n top: 0\n }'),
        # default only
        '@namespace p "u"; @namespace "d"; a {top: 0}':
            ('@namespace p "u";\n@namespace "d";\na {\n top: 0\n }',
             '@namespace "d";\na {\n top: 0\n }'),
        # prefix-ns only
        '@namespace p "u"; @namespace d "d"; p|a {top: 0}':
            ('@namespace p "u";\n@namespace d "d";\np|a {\n top: 0\n }',
             '@namespace p "u";\np|a {\n top: 0\n }'),
    }
    prefs = cssutils.ser.prefs
    for source, (expwith, expwithout) in tests.items():
        sheet = cssutils.parseString(source)
        prefs.keepUsedNamespaceRulesOnly = False
        self.assertEqual(sheet.cssText, expwith.encode())
        prefs.keepUsedNamespaceRulesOnly = True
        self.assertEqual(sheet.cssText, expwithout.encode())
def test_keepUnknownAtRules(self):
    """Preferences.keepUnknownAtRules

    Maps each source sheet to the expected serialization with unknown
    at-rules kept (first item) and dropped (second item).
    """
    tests = {
        u'''@three-dee { @background-lighting { azimuth: 30deg; elevation: 190deg; } h1 { color: red } } h1 { color: blue }''':
            (u'''@three-dee { @background-lighting { azimuth: 30deg; elevation: 190deg; } h1 { color: red } } h1 { color: blue }''',
             u'''h1 { color: blue }''')
    }
    prefs = cssutils.ser.prefs
    for source, (expwith, expwithout) in tests.items():
        sheet = cssutils.parseString(source)
        prefs.keepUnknownAtRules = True
        self.assertEqual(sheet.cssText, expwith.encode())
        prefs.keepUnknownAtRules = False
        self.assertEqual(sheet.cssText, expwithout.encode())
def test_keepComments(self):
    """Preferences.keepComments

    With comments dropped the sheet serializes to nothing; additionally
    keeping empty rules brings back the empty ``a {}`` rule.
    """
    sheet = cssutils.parseString('/*1*/ a { /*2*/ }')
    prefs = cssutils.ser.prefs

    prefs.keepComments = False
    self.assertEqual(''.encode(), sheet.cssText)

    prefs.keepEmptyRules = True
    self.assertEqual('a {}'.encode(), sheet.cssText)
def html_css_stylesheet():
    """Return the parsed ``html.css`` stylesheet, loading it on first use.

    The sheet is read from ``html.css`` next to this module, parsed
    without validation, given the ``h`` namespace prefix bound to
    ``XHTML_NS`` and cached in the module-level ``_html_css_stylesheet``.

    :return: the cached stylesheet object.
    """
    global _html_css_stylesheet
    if _html_css_stylesheet is None:
        css_path = os.path.join(os.path.dirname(__file__), 'html.css')
        # Close the file deterministically instead of relying on garbage
        # collection of the anonymous file object.
        with open(css_path, 'rb') as css_file:
            html_css = css_file.read()
        sheet = parseString(html_css, validate=False)
        sheet.namespaces['h'] = XHTML_NS
        # Publish only the fully configured sheet, so a failure above does
        # not leave a half-initialised object in the cache.
        _html_css_stylesheet = sheet
    return _html_css_stylesheet
def handle_style(self, style):
    """Parse the text of ``style`` and handle each ``@font-face`` rule.

    :param style: object whose ``text`` attribute holds the CSS source.
    """
    log.info(style)
    for rule in cssutils.parseString(style.text):
        if rule.type != rule.FONT_FACE_RULE:
            continue
        self.do_handle_font_face_rule(rule)
def getStyleElementRules(self, htmltext):
    """Given an htmltext, return the CSS rules contained in the content.

    The text of all ``<style>`` elements is concatenated and parsed as a
    single stylesheet.  Several ``cssutils.ser.prefs`` serializer
    preferences are changed as a side effect.

    :param htmltext: HTML source to inspect.
    :return: the parsed stylesheet; an empty ``CSSStyleSheet`` if parsing
        raised ``ValueError``.
    """
    stylesheet = cssutils.css.CSSStyleSheet()
    myparser = etree.HTMLParser(encoding="utf-8")
    tree = etree.HTML(htmltext, parser=myparser)
    styleelements = tree.xpath('//style')

    fragments = []
    for position, styleelt in enumerate(styleelements, 1):
        if styleelt.text is not None:
            fragments.append(styleelt.text)
        else:
            logging.debug("STYLE ELEMENT %s on %s",
                          position, len(styleelements))
    # The accumulated text is always a string (possibly empty), so the
    # sheet is always parsed; the former ``else: raise ValueError`` branch
    # could never run and has been removed.
    compiledstyle = "".join(fragments)

    cssutils.ser.prefs.indentClosingBrace = False
    cssutils.ser.prefs.keepComments = False
    cssutils.ser.prefs.lineSeparator = u''
    cssutils.ser.prefs.omitLastSemicolon = False
    try:
        stylesheet = cssutils.parseString(compiledstyle)
    except ValueError as e:
        # ``e.message`` does not exist on Python 3; log the exception itself.
        logging.info("BOGUS STYLE RULE: %s", e)
    return stylesheet
def main():
    """Demonstrate parsing, modifying and serializing a sheet with cssutils.

    Parses a small stylesheet, changes the ``color`` property of every
    style rule, sets a margin, adds namespaces and rules, and prints the
    serialized result.
    """
    # -*- coding: utf-8 -*-
    import cssutils
    import logging

    cssutils.log.setLevel(logging.DEBUG)

    css = u'''/* a comment with umlaut ä */ @namespace html "http://www.w3.org/1999/xhtml"; @variables { BG: #fff } html|a { color:red; background: var(BG) }'''
    sheet = cssutils.parseString(css)

    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue
        # find property
        for prop in rule.style:
            if prop.name == 'color':
                prop.value = 'green'
                prop.priority = 'IMPORTANT'
                break
        # or simply:
        rule.style['margin'] = '01.0eM'  # or: ('1em', 'important')

    sheet.encoding = 'ascii'
    sheet.namespaces['xhtml'] = 'http://www.w3.org/1999/xhtml'
    sheet.namespaces['atom'] = 'http://www.w3.org/2005/Atom'
    sheet.add('atom|title {color: #000000 !important}')
    sheet.add('@import "sheets/import.css";')

    # cssutils.ser.prefs.resolveVariables = True # default since 0.9.7b2
    print(sheet.cssText)
def _css_parse(self, css):
    """
    Parse styling via cssutils modules

    Only style rules are considered.  Comments and at-rules, which are
    also yielded when iterating a sheet, carry no ``selectorText`` and
    are skipped instead of raising ``AttributeError``.

    :param css: CSS source text.
    :return: mapping of lower-cased selector (without a leading ``#`` or
        ``.``) to a dict of property name -> value; ``color`` values are
        converted to ``#rrggbb``.  Rules without properties are omitted.
    :rtype: dict
    """
    sheet = parseString(css)
    style_sheet = {}
    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue
        selector = rule.selectorText.lower()
        # Slicing instead of indexing tolerates an empty selector.
        if selector[:1] in (u"#", u"."):
            selector = selector[1:]
        # keep any style attributes that are needed
        new_style = {}
        for prop in rule.style:
            if prop.name == u"color":
                cv = cssutils_css.ColorValue(prop.value)
                # Code for RGB to hex conversion comes from
                # http://bit.ly/1kwfBnQ
                new_style[u"color"] = u"#%02x%02x%02x" % (
                    cv.red, cv.green, cv.blue)
            else:
                new_style[prop.name] = prop.value
        if new_style:
            style_sheet[selector] = new_style
    return style_sheet
def write(self, css_text=''):
    """ Output a human readable version of the css file in utf-8 format.

    **Notes:**

    - The file is human readable. It is not intended to be human editable
      as the file is auto-generated.
    - Pre-existing files with the same name are overwritten.
    - The file is opened with an explicit ``utf-8`` encoding so the
      output does not depend on the platform's default encoding.

    :type css_text: str

    :param css_text: Text containing the CSS to be written to the file.
    :return: None

    **Example:**

    >>> css_text = '.margin-top-50px { margin-top: 3.125em }'
    >>> css_file = CSSFile()
    >>> css_file.write(css_text=css_text)

    """
    import io  # local import: ``io.open`` accepts ``encoding`` on Python 2 and 3

    parse_string = parseString(css_text)
    ser.prefs.useDefaults()  # Enables Default / Verbose Mode
    file_path = get_file_path(file_directory=self.file_directory,
                              file_name=self.file_name,
                              extension='.css')
    with io.open(file_path, 'w', encoding='utf-8') as css_file:
        css_file.write(parse_string.cssText.decode('utf-8'))
def __get_field(self, field, cell):
    """Extract the value of ``field`` from the table cell ``cell``.

    :param field: a ``ProxyField`` member selecting the extraction rule.
    :param cell: parsed HTML cell providing ``getText``/``find``/``findAll``.
    :return: depending on ``field``: ``self.requestTime`` plus the parsed
        duration (LastUpdate), the visible cell text with hidden elements
        removed (IpAddress), the result of ``__match_enum`` (Protocol,
        Anon), the percentage digits of the indicator width (Speed,
        ConnectionTime), or the stripped cell text otherwise.
    """
    cell_text = cell.getText().replace('\n', '').strip()

    if field is ProxyField.LastUpdate:
        # Parse time format [{0}h] [{1}min[s]] [{2}sec[s]]
        tex = re.search(
            r'^(?:(\d+)(?:h\s*))?(?:(\d+)\s*mins?\s*)?(?:(\d+)\s*secs?)?$',
            cell_text)
        # Groups that did not take part in the match count as zero.
        hours, minutes, seconds = [
            int(part) if part is not None else 0 for part in tex.groups()]
        return self.requestTime + datetime.timedelta(
            hours=hours, minutes=minutes, seconds=seconds)

    if field is ProxyField.IpAddress:
        # Inline style block used to hide junk elements.
        # Extract all classes with 'display: none' and remove them.
        style = cell.find("style")
        css = cssutils.parseString(style.getText())
        hidden_css_classes = []
        for css_rule in css.cssRules:
            for prop in css_rule.style:
                if prop.name == "display" and prop.value == "none":
                    hidden_css_classes.append(
                        re.search(r"\.(.+)", css_rule.selectorText).group(1))
                    # one hit per rule is enough
                    break

        # remove style tag, all hidden elements & rebuild cellText now all
        # the hidden stuff is gone
        style.decompose()
        for css_class in hidden_css_classes:
            for tag in cell.findAll(attrs={'class': css_class}):
                tag.decompose()
        return cell.getText().replace('\n', '').strip()
    elif field is ProxyField.Protocol:
        return self.__match_enum(ProxyProtocol, self.ProtocolPatterns,
                                 cell_text, None)
    elif field is ProxyField.Speed or field is ProxyField.ConnectionTime:
        indicator = cell.find("div", "indicator")
        return re.search(r"width:\s*(\d+)%;", indicator['style'],
                         re.IGNORECASE).group(1)
    elif field is ProxyField.Anon:
        return self.__match_enum(ProxyAnon, self.AnonPatterns,
                                 cell_text, ProxyAnon.Low)
    else:
        return cell_text
def test_list(self):
    """PropertyValue[index]

    A property value must be indexable and expose the single values it
    consists of.
    """
    # issue #41
    css = """div.one {color: rgb(255, 0, 0);} """
    color_rule = cssutils.parseString(css).cssRules[0]
    pv = color_rule.style.getProperty('color').propertyValue
    self.assertEqual(pv.value, 'rgb(255, 0, 0)')
    self.assertEqual(pv[0].value, 'rgb(255, 0, 0)')

    # issue #42
    font_rule = cssutils.parseString(
        'body { font-family: "A", b, serif }').cssRules[0]
    pv = font_rule.style.getProperty('font-family').propertyValue
    self.assertEqual(3, pv.length)
    self.assertEqual(pv[0].value, 'A')
    self.assertEqual(pv[1].value, 'b')
    self.assertEqual(pv[2].value, 'serif')
def _css_parse(self, css):
    """Parse ``css`` via cssutils and collect the supported properties.

    Only style rules are considered.  Comments and at-rules, which are
    also yielded when iterating a sheet, carry no ``selectorText`` and
    are skipped instead of raising ``AttributeError``.

    :param css: CSS source text.
    :return: dict mapping the lower-cased selector (without a leading
        ``#`` or ``.``) to a dict of the kept properties ``text-align``,
        ``font-family``, ``font-size``, ``color`` and ``lang``.  Colour
        values are passed through ``_3digit_to_6digit_color``.  Rules
        without any kept property are omitted.
    """
    # properties copied verbatim
    copied_names = ('text-align', 'font-family', 'font-size', 'lang')

    sheet = parseString(css)
    style_sheet = {}
    for rule in sheet:
        if rule.type != rule.STYLE_RULE:
            continue
        selector = rule.selectorText.lower()
        # Slicing instead of indexing tolerates an empty selector.
        if selector[:1] in ('#', '.'):
            selector = selector[1:]
        # keep any style attributes that are needed
        new_style = {}
        for prop in rule.style:
            if prop.name == 'color':
                new_style['color'] = _3digit_to_6digit_color(prop.value)
            elif prop.name in copied_names:
                new_style[prop.name] = prop.value
        if new_style:
            style_sheet[selector] = new_style
    return style_sheet
def collect_global_css(self):
    """Build the page-level CSS for every spine item and register it.

    For each spine item the ``@page`` rule (with margins taken from the
    context when they are non-negative) and the font-face rules are
    serialized.  Items producing identical CSS share one manifest entry.

    :return: dict mapping each spine item to the href of its stylesheet,
        or ``None`` when its CSS is blank.
    """
    global_css = defaultdict(list)
    for item in self.oeb.spine:
        stylizer = self.stylizers[item]
        for side in ('top', 'bottom'):
            margin = float(getattr(self.context, 'margin_' + side))
            if margin >= 0:
                stylizer.page_rule['margin-' + side] = '%gpt' % margin

        items = sorted(stylizer.page_rule.items())
        declarations = ';\n'.join(
            "%s: %s" % (key, val) for key, val in items)
        if items:
            css = '@page {\n%s\n}\n' % declarations
        else:
            css = ''
        font_rules = stylizer.font_face_rules + self.embed_font_rules
        css += '\n\n' + '\n\n'.join([r.cssText for r in font_rules])
        global_css[css].append(item)

    # one manifest entry per distinct, non-blank stylesheet text
    gc_map = {}
    manifest = self.oeb.manifest
    for css in global_css:
        href = None
        if css.strip():
            id_, href = manifest.generate('page_css', 'page_styles.css')
            manifest.add(id_, href, CSS_MIME,
                         data=cssutils.parseString(css, validate=False))
        gc_map[css] = href

    return {item: gc_map[css]
            for css, items in global_css.iteritems()
            for item in items}
def style_html(html_text, style_css):
    """Apply the rules of an external CSS file as inline styles.

    :param html_text: HTML source to style.
    :param style_css: path of the CSS file to read.
    :return: ``html_text`` with a ``style`` attribute inserted after each
        ``<selector`` occurrence.
    """
    with open(style_css) as css_file:
        css = cssutils.parseString(css_file.read())

    # selector -> properties, in the order they are declared
    styles = {}
    for rule in css:
        for selector in rule.selectorText.split(", "):
            for prop in rule.style:
                styles.setdefault(selector, []).append(prop)

    styled_html = html_text
    for selector, prop_list in styles.items():
        declarations = "".join(
            "{}:{};".format(prop.name, prop.value) for prop in prop_list)
        inline = " style=\"" + declarations + "\""
        styled_html = regex.sub(
            "<"+selector, "<"+selector+inline, styled_html)
    return styled_html
def html_css_stylesheet():
    """Return the parsed ``templates/html.css`` sheet, loading it on first use.

    The sheet is parsed without validation and cached in the module-level
    ``_html_css_stylesheet``.

    :return: the cached stylesheet object.
    """
    global _html_css_stylesheet
    if _html_css_stylesheet is None:
        # Close the file deterministically instead of relying on garbage
        # collection of the anonymous file object.
        with open(P('templates/html.css'), 'rb') as css_file:
            html_css = css_file.read()
        _html_css_stylesheet = parseString(html_css, validate=False)
    return _html_css_stylesheet
list_font_basic_properties(c)[1], list_font_basic_properties(c)[2], list_font_basic_properties(c)[3])) except (lib.fntutls.UnsupportedFont, struct.error) as e: print('%sERROR! Problem with font file "%s": %s' % (_file_dec, singlefile, e)) if os.path.isdir(temp_font_dir): shutil.rmtree(temp_font_dir) elif singlefile.lower().endswith('.css'): with epubfile.open(singlefile) as f: cssutils.log.setLog(logging.getLogger(singlefile)) cssutils.log.addHandler(streamhandler) cssutils.log.setLevel(logging.WARNING) cssutils.parseString(f.read(), validate=True) check_urls_in_css(singlefile, epubfile, prepnl, _file_dec) # TODO: not a real problem with file (make separate check for it) # is_body_family, is_font_face, ff, sfound\ # = check_body_font_family( # singlefile, epubfile, _file_dec, # is_body_family, is_font_face, ff, sfound # ) else: try: c = epubfile.read(singlefile) for key in entities.iterkeys(): c = c.replace(key, entities[key]) sftree = etree.fromstring(c) except: sftree = None
def do():
    """Parse ``a`` and resolve its imports.

    :return: tuple ``(parsed sheet, sheet with imports resolved)``.
    """
    parsed = cssutils.parseString(a)
    resolved = cssutils.resolveImports(parsed)
    return parsed, resolved
def _update_fson_html_fields_and_screenshot_pending(self):
    """Recompute the rendered e-mail HTML fields of the records in ``self``.

    For a record with ``fso_email_template``, ``fso_template_view_id`` and
    ``body_html`` set, the theme view is rendered, print fields and
    anchors are rewritten, CSS is inlined with premailer, and the result
    is written to ``fso_email_html`` / ``fso_email_html_parsed`` together
    with ``screenshot=False`` and ``screenshot_pending=True``.  Otherwise
    these fields are reset if any of them is set.  Records with
    ``version_of_email_id`` are skipped.

    NOTE(review): both ``rec.write`` calls are returned from inside the
    loop, so iteration stops at the first record that reaches one of them.
    Confirm whether the method is only expected to run on single records.
    """
    for rec in self:

        # Skip this for email_template versions
        if rec.version_of_email_id:
            continue

        # Update fields 'fso_email_html', 'fso_email_html_parsed' and 'screenshot_pending'
        logger.info("Update fields 'fso_email_html', 'fso_email_html_parsed' and 'screenshot_pending' "
                    "for email.template with id %s" % rec.id)

        # Only update fields if all needed fields are set
        if rec.fso_email_template and rec.fso_template_view_id and rec.body_html:

            # Render the related theme (ir.ui.view) to get the basic html content of the email body
            email_body = rec.fso_template_view_id.render({'html_sanitize': html_sanitize,
                                                          'email_editor_mode': False,
                                                          'record': rec,
                                                          'print_fields': self.env['fso.print_field'].search([]),
                                                          })

            # Convert html content to a beautiful soup object
            email_body_soup = BeautifulSoup(email_body, "lxml")

            # Remove all <script> tags if "no_java_script" is set in the email.template
            if rec.no_java_script:
                email_body_soup_java_script_tags = email_body_soup.find_all("script")
                for jst in email_body_soup_java_script_tags:
                    jst.decompose()

            # Replace print fields in e-mail body html with correct code for Fundraising Studio
            # HINT: http://beautiful-soup-4.readthedocs.io/en/latest/#output
            # HINT: Will auto-detect encoding and convert to unicode
            # HINT: 'class_' is used by html_soup because 'class' is a reserved keyword in python
            email_body_soup_print_fields = email_body_soup.find_all(class_="drop_in_print_field")
            for pf in email_body_soup_print_fields:
                # NOTE(review): pf_class[0] / pf_span[0] raise IndexError when no
                # "pf_*" class or no matching child exists - confirm the markup guarantees both
                pf_class = [c for c in pf.get("class", []) if c.startswith("pf_")]
                pf_span = pf.find_all(class_=pf_class[0])
                fs_string = pf_span[0].get("data-fs-email-placeholder")
                pf.replace_with(fs_string)

            # Repair anchors without protocol
            # E.g.: www.google.at > https://www.google.at
            email_body_soup_anchors = email_body_soup.find_all('a')
            for a in email_body_soup_anchors:
                href = a.get('href', '').strip()
                if '://' in href or any(href.startswith(x) for x in ('http', 'mailto', '/', '#', '%')):
                    continue
                else:
                    a['href'] = 'https://' + href

            # Convert beautiful soup object to regular html
            # HINT: keep html entities by using the formatter "html" instead of "minimal"
            # ATTENTION: Do NOT pretty print the document because it may lead to wrong spaces!
            # email_body_prepared = email_body_soup.prettify(formatter="html")
            email_body_prepared = email_body_soup.decode(pretty_print=False, formatter="html")

            # Use premailer to:
            # - inline CSS and
            # - convert relative to absolute URLs
            # HINT: This step must be done before generating multimailer links
            # ATTENTION: This step will try a lot of requests.packages.urllib3.connectionpool connections
            # which may lead to long processing times.
            email_body_prepared_premailer = PremailerWithTimeout(email_body_prepared,
                                                                 method='xml',
                                                                 base_url=self.get_base_url(),
                                                                 preserve_internal_links=True,
                                                                 keep_style_tags=False,
                                                                 strip_important=True,
                                                                 align_floating_images=False,
                                                                 remove_unset_properties=True,
                                                                 include_star_selectors=False,
                                                                 cssutils_logging_handler=premailer_log_handler,
                                                                 cssutils_logging_level=logging.FATAL,)
            fso_email_html = email_body_prepared_premailer.transform(pretty_print=False)

            # Convert html content to a beautiful soup object again
            email_body_css_inline_soup = BeautifulSoup(fso_email_html, "lxml")

            # Replace anchors with FRST-Multimailer links
            email_body_css_inline_soup_anchors = email_body_css_inline_soup.find_all('a')
            for a in email_body_css_inline_soup_anchors:
                href = a.get('href', '').strip()

                # Handle and fix '%open_browser%' FRST-Multimailer links
                if '%open_browser%' in href:
                    a['href'] = '%open_browser%'
                    continue

                # Multimailer Token-Links: Add ?fs_ptoken=%xGuid% to token links for FRST
                if 'link-withtoken' in a.get('class', ''):
                    token_query = '&fs_ptoken=%xGuid%' if '?' in href else '?fs_ptoken=%xGuid%'
                    href = href+token_query
                    a['href'] = href
                    logger.info("TOKEN QUERY %s " % href)

                # Skip rewrite to tracking link if 'link-donottrack' class is set
                if 'link-donottrack' in a.get('class', ''):
                    continue

                # Convert to FRST-Multimailer link
                # (the result is '%redirector%/<protocol>//<address>', i.e. without the ':')
                if '://' in href and href.startswith('http'):
                    protocol, address = href.split('://', 1)
                    a['href'] = '%redirector%/' + protocol + '//' + address

            def cycle_rules(rules):
                # Set the priority of every property in every style rule to 'IMPORTANT',
                # descending into rules that expose 'cssRules'.
                # NOTE(review): the recursion passes the rule itself, not r.cssRules,
                # so it relies on such rules being iterable - confirm
                for r in rules:
                    if r.type == r.STYLE_RULE:
                        for p in r.style:
                            p.priority = 'IMPORTANT'
                    elif hasattr(r, 'cssRules'):
                        cycle_rules(r)

            # Add !important to all media queries CSS in the header
            # HINT: Only the media queries will be in style tags in the header (the rest is inlined)
            for styletag in email_body_css_inline_soup.find_all('style'):
                css = styletag.string
                css_parsed = cssutils.parseString(css, validate=True)
                cycle_rules(css_parsed)
                styletag.string = css_parsed.cssText

            # Convert beautiful soup object back to regular html
            # ATTENTION: Do NOT pretty print the document because it may lead to wrong spaces!
            # fso_email_html_parsed = email_body_css_inline_soup.prettify(formatter="html")
            fso_email_html_parsed = email_body_css_inline_soup.decode(pretty_print=False, formatter="html")

            # Update the email.template fields
            return rec.write({'fso_email_html': fso_email_html,
                              'fso_email_html_parsed': fso_email_html_parsed,
                              'screenshot': False,
                              'screenshot_pending': True})

        # Make sure all fields are unset if any of the mandatory fields are missing
        else:
            if any(rec[f] for f in ['fso_email_html', 'fso_email_html_parsed', 'screenshot']):
                return rec.write({'fso_email_html': False,
                                  'fso_email_html_parsed': False,
                                  'screenshot': False,
                                  'screenshot_pending': False})
def test_keepEmptyRules(self):
    """Preferences.keepEmptyRules

    Checks serialization of empty and comment-only rules, for style rules
    and for ``@media`` rules, with the preference on and off.
    """
    prefs = cssutils.ser.prefs

    # CSSStyleRule
    css = u'''a {} a { /*1*/ } a { color: red }'''
    sheet = cssutils.parseString(css)
    prefs.useDefaults()

    prefs.keepEmptyRules = True
    self.assertEqual(css, sheet.cssText)

    prefs.keepEmptyRules = False
    self.assertEqual(
        u'a {\n /*1*/\n }\na {\n color: red\n }', sheet.cssText)

    prefs.keepComments = False
    self.assertEqual(u'a {\n color: red\n }', sheet.cssText)

    # CSSMediaRule
    css = u'''@media tv { } @media all { /*1*/ } @media print { a {} } @media print { a { /*1*/ } } @media all { a { color: red } }'''
    sheet = cssutils.parseString(css)
    prefs.useDefaults()

    prefs.keepEmptyRules = True
    # self.assertEqual(css, s.cssText)

    prefs.keepEmptyRules = False
    self.assertEqual(
        '''@media all { /*1*/ } @media print { a { /*1*/ } } @media all { a { color: red } }''',
        sheet.cssText)

    prefs.keepComments = False
    self.assertEqual(
        '''@media all { a { color: red } }''',
        sheet.cssText)
def test_parse(self):
    """CSSStyleDeclaration parse

    Maps declaration sources to the expected serialized declaration;
    ``None`` means the source is expected to serialize unchanged.
    """
    # error but parse
    tests = {
        # property names are caseinsensitive
        'TOP:0': 'top: 0',
        'top:0': 'top: 0',
        # simple escape
        'c\\olor: red; color:green': 'color: green',
        'color:g\\reen': 'color: g\\reen',
        # http://www.w3.org/TR/2009/CR-CSS2-20090423/syndata.html#illegalvalues
        'color:green': 'color: green',
        'color:green; color': 'color: green',
        'color:red; color; color:green': 'color: green',
        'color:green; color:': 'color: green',
        'color:red; color:; color:green': 'color: green',
        'color:green; color{;color:maroon}': 'color: green',
        'color:red; color{;color:maroon}; color:green': 'color: green',
        # tantek hack
        r'''color: red; voice-family: "\"}\""; voice-family:inherit; color: green;''': 'voice-family: inherit;\ncolor: green',
        r'''col\or: blue; font-family: 'Courier New Times color: red; color: green;''': 'color: green',
        # special IE hacks are not preserved anymore (>=0.9.5b3)
        '/color: red; color: green': 'color: green',
        '/ color: red; color: green': 'color: green',
        '1px: red; color: green': 'color: green',
        '0: red; color: green': 'color: green',
        '1px:: red; color: green': 'color: green',
        r'$top: 0': '',
        r'$: 0': '',  # really invalid!
        # unknown rule but valid
        '@x;\ncolor: red': None,
        '@x {\n }\ncolor: red': None,
        '/**/\ncolor: red': None,
        '/**/\ncolor: red;\n/**/': None,
        # issue #28
        ';color: red': 'color: red',
        ';;color: red;;': 'color: red',
    }
    cssutils.ser.prefs.keepAllProperties = False
    for source, expected in list(tests.items()):
        sheet = cssutils.parseString('a { %s }' % source)
        if expected is None:
            # the declaration is expected to round-trip unchanged
            expected = source
        self.assertEqual(expected, sheet.cssRules[0].style.cssText)

    cssutils.ser.prefs.useDefaults()
class CSSStyleRuleTestCase(test_cssrule.CSSRuleTestCase):
    """Tests for ``cssutils.css.CSSStyleRule``."""

    def setUp(self):
        """Create a writable and a read-only rule for the shared base tests."""
        super(CSSStyleRuleTestCase, self).setUp()
        rule_class = cssutils.css.CSSStyleRule
        self.r = rule_class()
        self.rRO = rule_class(readonly=True)
        self.r_type = rule_class.STYLE_RULE
        self.r_typeString = 'STYLE_RULE'

    def test_init(self):
        "CSSStyleRule.type and init"
        super(CSSStyleRuleTestCase, self).test_init()
        fresh = self.r
        self.assertEqual(u'', fresh.cssText)
        self.assertEqual(cssutils.css.selectorlist.SelectorList,
                         type(fresh.selectorList))
        self.assertEqual(u'', fresh.selectorText)
        self.assertEqual(cssutils.css.CSSStyleDeclaration,
                         type(fresh.style))
        self.assertEqual(fresh, fresh.style.parentRule)

    def test_refs(self):
        "CSSStyleRule references"
        rule = cssutils.css.CSSStyleRule()
        sel, style = rule.selectorList, rule.style
        self.assertEqual(rule, sel.parentRule)
        self.assertEqual(rule, style.parentRule)

        # setting cssText replaces both sub objects
        rule.cssText = 'a { x:1 }'
        self.assertNotEqual(sel, rule.selectorList)
        self.assertEqual('a', rule.selectorList.selectorText)
        self.assertNotEqual(style, rule.style)
        self.assertEqual('1', rule.style.getPropertyValue('x'))

        # an invalid cssText must leave both sub objects untouched
        sel, style = rule.selectorList, rule.style
        invalids = (
            '$b { x:2 }',  # invalid selector
            'c { $x3 }',  # invalid style
            '/b { 2 }'  # both invalid
        )
        for invalid in invalids:
            try:
                rule.cssText = invalid
            except xml.dom.DOMException:
                pass
            self.assertEqual(sel, rule.selectorList)
            self.assertEqual(u'a', rule.selectorList.selectorText)
            self.assertEqual(style, rule.style)
            self.assertEqual(u'1', rule.style.getPropertyValue('x'))

        # CHANGING
        s = cssutils.parseString(u'a {s1: 1}')
        r = s.cssRules[0]
        sel1 = r.selectorList
        st1 = r.style

        # selectorList
        r.selectorText = 'b'
        self.assertNotEqual(sel1, r.selectorList)
        self.assertEqual('b', r.selectorList.selectorText)
        self.assertEqual('b', r.selectorText)

        sel1b = r.selectorList
        sel1b.selectorText = 'c'
        self.assertEqual(sel1b, r.selectorList)
        self.assertEqual('c', r.selectorList.selectorText)
        self.assertEqual('c', r.selectorText)

        sel2 = cssutils.css.SelectorList('sel2')
        s.selectorList = sel2
        self.assertEqual(sel2, s.selectorList)
        self.assertEqual('sel2', s.selectorList.selectorText)

        sel2.selectorText = 'sel2b'
        self.assertEqual('sel2b', sel2.selectorText)
        self.assertEqual('sel2b', s.selectorList.selectorText)

        s.selectorList.selectorText = 'sel2c'
        self.assertEqual('sel2c', sel2.selectorText)
        self.assertEqual('sel2c', s.selectorList.selectorText)

        # style
        r.style = 's1: 2'
        self.assertNotEqual(st1, r.style)
        self.assertEqual('s1: 2', r.style.cssText)

        st2 = cssutils.parseStyle(u's2: 1')
        r.style = st2
        self.assertEqual(st2, r.style)
        self.assertEqual('s2: 1', r.style.cssText)

        # cssText
        sl, st = r.selectorList, r.style

        # fails
        try:
            r.cssText = '$ {content: "new"}'
        except xml.dom.SyntaxErr:
            pass
def getView(document, css):
    """Compute the style that a stylesheet assigns to each element.

    document
        a DOM document, currently an lxml HTML document
    css
        a CSS StyleSheet string

    returns style view
        a dict of {DOMElement: css.CSSStyleDeclaration} for html

    Only top-level style rules are considered (no @media handling yet).
    Selectors that lxml cannot translate are skipped.
    """
    from lxml.cssselect import CSSSelector

    sheet = cssutils.parseString(css)

    view = {}
    specificities = {}  # needed temporarily

    # TODO: filter rules simpler?, add @media
    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in rules:
        for selector in rule.selectorList:
            # TODO: make this a callback to be able to use other stuff than lxml
            try:
                cssselector = CSSSelector(selector.selectorText)
            except Exception:
                # Unsupported/invalid selector: skip it, but do not swallow
                # KeyboardInterrupt/SystemExit like a bare ``except`` would.
                continue
            matching = cssselector.evaluate(document)

            for element in matching:
                if element not in view:
                    # add initial empty style declaration
                    view[element] = cssutils.css.CSSStyleDeclaration()
                    specificities[element] = {}
                declaration = view[element]
                known = specificities[element]

                for p in rule.style:
                    # update style declaration
                    if p not in declaration:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        declaration.setProperty(p.name, p.value, p.priority)
                        known[p.name] = selector.specificity
                        continue

                    sameprio = (
                        p.priority == declaration.getPropertyPriority(p.name))
                    if not sameprio and bool(p.priority) or (
                            sameprio
                            and selector.specificity >= known[p.name]):
                        # later, more specific or higher prio
                        declaration.setProperty(p.name, p.value, p.priority)
                        # Remember the winning specificity; otherwise a later
                        # but less specific rule would be compared against a
                        # stale value and could wrongly override this one.
                        known[p.name] = selector.specificity
    return view
def adddata(self):
    """Locate the style element and count the rules of its stylesheet.

    Sets ``self.style_element`` to the element found under the root via the
    tags of the root's first child and that child's first child, and
    ``self.number_of_classes`` to the number of rules cssutils parses from
    that element's text.
    """
    first_child = self.root[0]
    style_path = './' + first_child.tag + '/' + first_child[0].tag
    self.style_element = self.root.find(style_path)

    stylesheet = cssutils.parseString(self.style_element.text)
    self.number_of_classes = stylesheet.cssRules.length
def check_css(self, myfile):
    """Find unused CSS selectors and classes used without a CSS definition.

    Reads the first ``<style>`` element under ``<head>`` of ``myfile.tree``
    and fills in:

    * ``self.cssutils_errors`` -- non-empty log lines emitted by cssutils
      while parsing the stylesheet
    * ``self.sel_unchecked`` -- selectors lxml could not compile
    * ``self.sel_unused`` -- selectors matching no element
    * ``self.classes_undefined`` -- ``[sourceline, class]`` pairs for classes
      that no matching selector accounts for

    Returns early, setting nothing, when there is no head/style element.
    Matching elements get a ``__used_classes`` attribute as a side effect.

    Fails on a few corner cases, such as
    ".tdm > tbody > tr > td:first-child + td" and ".onetable td".
    Ignores @media.
    """
    # Find the CSS style
    head = myfile.tree.find('head')
    if head is None:
        return
    css = head.find('style')
    if css is None:
        return

    # The CSS can be in a comment or not
    if len(css):
        # Not sure whether that covers all the comment cases. Maybe add
        # all the children
        css_string = etree.tostring(css[0])
    else:
        css_string = css.text

    # Parse the CSS and retrieve the errors
    mylog = io.StringIO()
    handler = logging.StreamHandler(mylog)
    handler.setFormatter(logging.Formatter('%(levelname)s %(message)s'))
    cssutils.log.addHandler(handler)
    cssutils.log.setLevel(logging.INFO)
    try:
        css_sheet = cssutils.parseString(css_string)
    finally:
        # Detach the handler again; otherwise every call stacks another
        # handler (and its buffer) onto the shared cssutils logger.
        cssutils.log.removeHandler(handler)

    self.cssutils_errors = [x for x in mylog.getvalue().splitlines() if x]

    css_selectors = []
    for rule in css_sheet:
        # We don't want comments, media, ...
        if rule.type != cssutils.css.CSSRule.STYLE_RULE:
            continue
        css_selectors.extend(
            part.strip() for part in rule.selectorText.split(','))

    # Find the unused/undefined CSS. It is possible 2 rules will
    # match the same class (for instance "p.foo" and ".foo" will
    # match "class=foo"). That is not detected, and both rules
    # will be valid.
    self.sel_unchecked = []
    self.sel_unused = []

    # Last word starting with a dot (eg. "p.foo", ".foo", "#toc .foo" => "foo")
    trailing_class = re.compile(r'^.*\.([\w-]+)$')

    for selector in css_selectors:
        # Get the selector (eg. "body", "p", ".some_class")
        try:
            sel = CSSSelector(selector)
        except Exception:
            self.sel_unchecked.append(selector)
            continue

        # Retrieve where it is used in the xhtml
        occurences = sel(myfile.tree)
        if len(occurences) == 0:
            self.sel_unused.append(sel.css)
            continue

        # If it's from a class, find the name.
        m = trailing_class.match(sel.css)
        if m is None:
            continue
        cl = m.group(1)

        # Mark the class wherever it is used, in each element
        for item in occurences:
            if 'class' not in item.attrib:
                # I don't think that should happen
                print("KPPVH ERROR - matches " + cl + " but no class - line " + str(item.sourceline))
                continue

            found = 0
            # split() handles any whitespace between class names (the
            # original called strip() on each token but discarded the result).
            for it_cl in item.attrib['class'].split():
                if it_cl == cl:
                    # The class has been found. Mark it.
                    found = 1
                    if '__used_classes' not in item.attrib:
                        item.attrib['__used_classes'] = it_cl
                    else:
                        item.attrib['__used_classes'] += ' ' + it_cl
            assert found == 1

    # Look for unused classes
    self.classes_undefined = []

    # todo- use xpath instead
    for element in myfile.tree.iter(tag=etree.Element):
        if 'class' not in element.attrib:
            continue

        # split() also yields nothing for an empty attribute, so class=""
        # is no longer reported as an undefined class named ''.
        classes = element.attrib['class'].split()

        if '__used_classes' in element.attrib:
            used_classes = set(element.attrib['__used_classes'].split())
            # Subtract used_classes from classes (de-duplicated, in
            # document order so the report is deterministic).
            classes = [c for c in dict.fromkeys(classes)
                       if c not in used_classes]

        # Finally, record the warning
        for cl in classes:
            self.classes_undefined.append([element.sourceline, cl])
def getView(self, document, css):
    """Compute per-element styles and collect client-support statistics.

    document
        the document the selectors are evaluated against
    css
        a CSS stylesheet string

    Returns a dict mapping each matched element to a
    ``cssutils.css.CSSStyleDeclaration``. An element's inline ``style``
    attribute is seeded with specificity ``(1, 0, 0, 0)``.

    Side effects: appends to ``self.CSSErrors`` for selectors raising
    ``ExpressionError``, fills ``self.CSSUnsupportErrors`` with the clients
    marked "N" or "P" for each property in ``css_compliance.csv`` (read from
    this module's directory), and sets ``self.supportPercentage`` when both
    the failure and total rates are non-zero.
    """
    view = {}
    specificities = {}
    supportratios = {}
    supportFailRate = 0
    supportTotalRate = 0
    compliance = dict()

    # load CSV containing css property client support into dict
    clientCount = 0
    csv_path = os.path.join(os.path.dirname(__file__), "css_compliance.csv")
    with open(csv_path) as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            # count clients so we can calculate an overall support percentage later
            clientCount = len(row)
            compliance[row['property'].strip()] = dict(row)
    # decrement client count to account for first col which is property name
    # (left at 0 when the CSV has no rows instead of raising a NameError)
    if clientCount:
        clientCount -= 1

    sheet = cssutils.parseString(css)

    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in rules:
        for selector in rule.selectorList:
            try:
                cssselector = CSSSelector(selector.selectorText)
                matching = cssselector.evaluate(document)
            except ExpressionError as err:
                message = str(err)
                if message not in self.CSSErrors:
                    self.CSSErrors.append(message)
                continue

            for element in matching:
                # add styles for all matching DOM elements
                if element not in view:
                    # add initial
                    view[element] = cssutils.css.CSSStyleDeclaration()
                    specificities[element] = {}

                    # add inline style if present
                    inlinestyletext = element.get('style')
                    if inlinestyletext:
                        inlinestyle = cssutils.css.CSSStyleDeclaration(
                            cssText=inlinestyletext)
                        for p in inlinestyle:
                            # set inline style specificity
                            view[element].setProperty(p)
                            specificities[element][p.name] = (1, 0, 0, 0)

                declaration = view[element]
                known = specificities[element]

                for p in rule.style:
                    # create supportratio dict item for this property
                    ratio = supportratios.setdefault(
                        p.name, {'usage': 0, 'failedClients': 0})
                    # increment usage
                    ratio['usage'] += 1

                    # Client support is only evaluated until the first
                    # failures for this property have been recorded.
                    if p.name not in self.CSSUnsupportErrors:
                        support_row = compliance.get(p.name, {})
                        for client, support in support_row.items():
                            if support == "N" or support == "P":
                                # increment client failure count for this property
                                ratio['failedClients'] += 1
                                if support == "P":
                                    label = client + ' (partial support)'
                                else:
                                    label = client
                                self.CSSUnsupportErrors.setdefault(
                                    p.name, []).append(label)

                    # update styles
                    if p not in declaration:
                        declaration.setProperty(p.name, p.value, p.priority)
                        known[p.name] = selector.specificity
                        continue

                    sameprio = (
                        p.priority == declaration.getPropertyPriority(p.name))
                    if not sameprio and bool(p.priority) or (
                            sameprio
                            and selector.specificity >= known[p.name]):
                        # later, more specific or higher prio
                        declaration.setProperty(p.name, p.value, p.priority)
                        # Keep the stored specificity in step with the winning
                        # declaration so a later, less specific rule cannot
                        # override it by comparing against a stale value.
                        known[p.name] = selector.specificity

    for propvals in supportratios.values():
        supportFailRate += (propvals['usage']) * int(propvals['failedClients'])
        supportTotalRate += int(propvals['usage']) * clientCount

    if supportFailRate and supportTotalRate:
        self.supportPercentage = 100 - (
            (float(supportFailRate) / float(supportTotalRate)) * 100)
    return view
def test_reader(cls):
    """Build an instance of *cls* from the style rules in ``test_css.css``.

    For every ``CSSStyleRule`` in the parsed sheet, attribute selectors,
    pseudo-classes/elements and ``*`` are stripped from the selector text,
    which is then split on commas. Each non-empty selector is passed with
    the rule's style to ``CSSRuleBlock._load_rule``; the resulting list is
    handed to ``cls``.

    Returns the object produced by ``cls(...)`` (the original ``-> None``
    annotation was incorrect and has been dropped).
    """
    # Read through a context manager so the file handle is closed.
    with open('test_css.css') as css_file:
        sheet = cssutils.parseString(css_file.read())

    # Raw strings: the same patterns as before, without invalid-escape warnings.
    strip_pattern = re.compile(r'\[\w+\=[^\]]+\]|:+[\w\-]+|\s+\*|\*')
    split_pattern = re.compile(r',\s+|,')

    _rules = [
        (split_pattern.split(strip_pattern.sub('', rule.selectorText)),
         rule.style)
        for rule in sheet.cssRules
        if isinstance(rule, cssutils.css.CSSStyleRule)
    ]
    return cls([
        CSSRuleBlock._load_rule(selector, style)
        for selectors, style in _rules
        for selector in selectors
        if selector
    ])
def show_urls(s, data):
    """Print every URL that cssutils finds in the CSS source *s*.

    s
        CSS source text, parsed with ``cssutils.parseString``
    data
        unused; kept so existing callers keep working
    """
    stylesheet = cssutils.parseString(s)  # parseFile (f) # its a start :)
    for u in cssutils.getUrls(stylesheet):
        # Single-argument call form: same output on Python 2, and valid
        # syntax on Python 3 (the bare print statement is not).
        print(u)
import cssutils
import logging

# Demo: how serializer preferences change the output for a rule that
# declares the same property several times (including escaped names).
cssutils.log.setLevel(logging.FATAL)

# Raw string: ``c\olor`` must reach the parser with its backslash; in a normal
# literal ``\o`` is an invalid escape sequence (warning on modern Python).
css = r'''@import "example.css"; a { color: blue !important; c\olor: green !important; c\olor: pink; color: red; }'''

sheet = cssutils.parseString(css)

# Single-argument print(...) calls behave the same on Python 2 and 3.
print("\nORIGINAL CSS:")
print(css)
print("------------")

print(repr(cssutils.ser.prefs))

print("\nCSS Serialized")
print(sheet.cssText)

print("\nCSS Serialized with ``keepAllProperties`` = False")
cssutils.ser.prefs.keepAllProperties = False
print(sheet.cssText)

print("\nCSS Serialized with ``defaultPropertyName`` = True")
cssutils.ser.prefs.defaultPropertyName = True
print(sheet.cssText)

print("\nCSS Serialized with ``defaultPropertyName`` = False")
# The original printed this header and stopped; apply the announced setting
# and show the result, mirroring the sections above.
cssutils.ser.prefs.defaultPropertyName = False
print(sheet.cssText)
def test_cssRules(self):
    "CSSPageRule.cssRules"
    sheet = cssutils.parseString('@page {}')
    page = sheet.cssRules[0]
    self.assertEqual(len(page.cssRules), 0)

    # add and insert: build the three margin rules in three different ways
    top_left = cssutils.css.MarginRule('@top-left', 'color: red')
    position = page.add(top_left)
    self.assertEqual(position, 0)
    self.assertEqual(len(page.cssRules), 1)

    top_right = cssutils.css.MarginRule()
    top_right.cssText = '@top-right { color: blue }'
    position = page.insertRule(top_right)
    self.assertEqual(position, 1)
    self.assertEqual(len(page.cssRules), 2)

    top_center = cssutils.css.MarginRule()
    top_center.margin = '@top-center'
    top_center.style = 'color: green'
    position = page.insertRule(top_center, 1)
    self.assertEqual(position, 1)
    self.assertEqual(len(page.cssRules), 3)

    all_three = '''@page { @top-left { color: red } @top-center { color: green } @top-right { color: blue } }'''
    self.assertEqual(page.cssText, all_three)

    # keys and dict index
    self.assertTrue('@top-left' in page)
    self.assertFalse('@bottom-left' in page)

    margin_names = ['@top-left', '@top-center', '@top-right']
    self.assertEqual(list(page.keys()), margin_names)

    self.assertEqual(page['@bottom-left'], None)
    self.assertEqual(page['@top-left'].cssText, 'color: red')
    page['@top-left'] = 'color: #f00'
    self.assertEqual(page['@top-left'].cssText, 'color: #f00')

    # delete: by rule object, by index, then by margin name
    page.deleteRule(top_center)
    self.assertEqual(len(page.cssRules), 2)
    without_center = '''@page { @top-left { color: #f00 } @top-right { color: blue } }'''
    self.assertEqual(page.cssText, without_center)

    page.deleteRule(0)
    self.assertEqual(len(page.cssRules), 1)
    self.assertEqual(top_right, page.cssRules[0])
    only_right = '''@page { @top-right { color: blue } }'''
    self.assertEqual(page.cssText, only_right)

    del page['@top-right']
    self.assertEqual(len(page.cssRules), 0)
def test_prioriy(self):
    "Property.priority"
    # Parse a one-declaration rule and compare the serialized sheet with the
    # encoded expected text.
    sheet = cssutils.parseString('a { color: red }')
    expected = 'a {\n color: red\n }'.encode()
    self.assertEqual(sheet.cssText, expected)
def _content_length(target):
    """Open *target* and return ``(True, Content-Length header)``.

    Returns ``(False, None)`` when the URL cannot be opened. URLError and
    HTTPError are OSError subclasses; ValueError covers malformed URLs.
    The response is always closed.
    """
    try:
        with request.urlopen(target) as response:
            return True, response.info().get(name="Content-Length")
    except (OSError, ValueError):
        return False, None


def web_crawler(url, depth=0, page_assets=False):
    """Crawl *url*, printing the size of the assets and links it references.

    url
        page (or stylesheet) to fetch
    depth
        remaining recursion depth; nothing happens when it is negative
    page_assets
        when true, the response is also parsed as CSS to collect ``url()``
        references, and scripts/stylesheets are recorded in ``links`` and
        crawled recursively

    Visited pages are tracked in the module-level ``links`` list and reported
    assets in ``assets``. Assets and links that cannot be opened are skipped.
    """
    if depth < 0:
        return

    opener = request.build_opener()
    # ``addheaders`` (a list of tuples) is the attribute urllib reads; the
    # original assigned ``add_headers``, which had no effect.
    opener.addheaders = [('User-Agent', 'Mozilla')]
    request.install_opener(opener)

    base_url = "{0.scheme}://{0.netloc}/".format(parse.urlsplit(url))
    if url not in links:
        links.append(url)

    # Fetch once and reuse the response for both the HTML and the CSS pass.
    response = requests.get(url)
    raw = response.text

    if page_assets:
        try:
            sheet = cssutils.parseString(response.content)
            css_urls = list(cssutils.getUrls(sheet))
        except Exception:
            # Best effort: the body may not be CSS at all.
            css_urls = []
        # A distinct loop variable keeps ``url`` pointing at the page; the
        # original reused ``url`` here and broke every later urljoin().
        for css_asset in css_urls:
            css_asset = parse.urljoin(url, css_asset)
            if css_asset in links:
                continue
            links.append(css_asset)
            ok, size = _content_length(css_asset)
            if ok:
                print(css_asset, ' size: ', size)

    soup = bs(raw, 'html.parser')

    for script in soup.find_all("script"):
        src = script.attrs.get("src")
        if not src:
            continue
        script_url = parse.urljoin(url, src)
        if script_url in assets:
            continue
        ok, size = _content_length(script_url)
        if not ok:
            continue
        print(script_url, ' size: ', size)
        assets.append(script_url)
        if page_assets and script_url not in links:
            links.append(script_url)
            web_crawler(script_url, depth - 1, page_assets)

    for css in soup.find_all("link", {"rel": "stylesheet"}):
        css_href = css.attrs.get("href")
        if not css_href:
            continue
        css_url = parse.urljoin(url, css_href)
        if css_url in assets:
            continue
        ok, size = _content_length(css_url)
        if not ok:
            continue
        print(css_url, ' ', 'size: ', size)
        assets.append(css_url)
        if page_assets and css_url not in links:
            links.append(css_url)
            web_crawler(css_url, depth - 1, page_assets)

    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        img_url = parse.urljoin(url, src)
        ok, size = _content_length(img_url)
        if ok and img_url not in assets:
            print(img_url, ' ', 'size: ', size)
            assets.append(img_url)

    for a in soup.find_all('a'):
        href = a.get('href')
        if not href:
            # The original turned a missing href into the string 'None'.
            continue
        # Resolve against the current page and drop the fragment; the
        # original prepended base_url to href[1:], which mangled any href
        # not starting with '/'.
        href = parse.urldefrag(parse.urljoin(url, href))[0]
        if parse.urlsplit(href).scheme not in ('http', 'https'):
            continue  # mailto:, javascript:, tel:, ...
        if href in links:
            continue
        ok, size = _content_length(href)
        if not ok:
            continue
        print(href, ' ', 'size: ', size)
        links.append(href)
        web_crawler(href, depth - 1, page_assets)
def test_useDefaults(self):
    "Preferences.useDefaults()"
    # Switch to minified first so useDefaults() really has to reset things.
    cssutils.ser.prefs.useMinified()
    cssutils.ser.prefs.useDefaults()

    # (preference name, expected default) in the order they are checked
    expected_defaults = (
        ('defaultAtKeyword', True),
        ('defaultPropertyName', True),
        ('defaultPropertyPriority', True),
        ('importHrefFormat', None),
        ('indent', 4 * ' '),
        ('indentClosingBrace', True),
        ('keepAllProperties', True),
        ('keepComments', True),
        ('keepEmptyRules', False),
        ('keepUnknownAtRules', True),
        ('keepUsedNamespaceRulesOnly', False),
        ('lineNumbers', False),
        ('lineSeparator', '\n'),
        ('listItemSpacer', ' '),
        ('omitLastSemicolon', True),
        ('omitLeadingZero', False),
        ('paranthesisSpacer', ' '),
        ('propertyNameSpacer', ' '),
        ('selectorCombinatorSpacer', ' '),
        ('spacer', ' '),
        ('validOnly', False),
    )
    for pref_name, default in expected_defaults:
        self.assertEqual(getattr(cssutils.ser.prefs, pref_name), default)

    css = ''' /*1*/ @import url(x) tv , print; @namespace prefix "uri"; @namespace unused "unused"; @media all {} @media all { a {} } @media all { a { color: red; } } @page { left: 0; } a {} prefix|x, a + b > c ~ d , b { top : 1px ; font-family : arial ,'some' } '''
    parsedcss = '''/*1*/ @import url(x) tv, print; @namespace prefix "uri"; @namespace unused "unused"; @media all { a { color: red } } @page { left: 0 } prefix|x, a + b > c ~ d, b { top: 1px; font-family: arial, "some" }'''
    sheet = cssutils.parseString(css)
    self.assertEqual(sheet.cssText, parsedcss.encode())

    # value text as written -> value text as serialized with default prefs
    tests = {
        '0.1 .1 0.1px .1px 0.1% .1% +0.1 +.1 +0.1px +.1px +0.1% +.1% -0.1 -.1 -0.1px -.1px -0.1% -.1%':
        '0.1 0.1 0.1px 0.1px 0.1% 0.1% +0.1 +0.1 +0.1px +0.1px +0.1% +0.1% -0.1 -0.1 -0.1px -0.1px -0.1% -0.1%'
    }
    cssutils.ser.prefs.useDefaults()
    for source_value, serialized_value in list(tests.items()):
        sheet = cssutils.parseString('a{x:%s}' % source_value)
        wanted = ('a {\n x: %s\n }' % serialized_value).encode()
        self.assertEqual(wanted, sheet.cssText)
def __init__(self, tree, path, oeb, opts, profile=None,
             extra_css='', user_css='', base_css=''):
    """Collect every stylesheet that applies to *tree* and resolve styles.

    tree
        parsed document; queried with ``xpath`` and ``Select``
    path
        manifest href of the document (key into ``oeb.manifest.hrefs``)
    oeb, opts
        book container and conversion options, stored on the instance
    profile
        output profile; defaults to the profile whose ``short_name`` is
        ``'default'``, falling back to ``opts.output_profile``
    extra_css, user_css, base_css
        optional CSS source strings parsed as additional stylesheets

    The constructor gathers stylesheets from <style>/<link> elements and
    @import rules, flattens their rules into ``self.rules`` (sorted), applies
    them to the matching elements, then applies ``style`` attributes and
    converts ``width``/``height`` attributes of unstyled <img> elements.
    """
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from calibre.customize.ui import output_profiles
        for x in output_profiles():
            if x.short_name == 'default':
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    if base_css:
        stylesheets.append(parseString(base_css, validate=False))
    style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

    # Add cssutils parsing profiles from output_profile
    for profile in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(profile['name'],
                               profile['props'],
                               profile['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file,
                       log=logging.getLogger('calibre.css'))
    self.font_face_rules = []
    for elem in style_tags:
        if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
            text = elem.text if elem.text else u''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
            if text:
                text = oeb.css_preprocessor(text)
                # We handle @import rules separately
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname,
                                                validate=False)
                parser.setFetcher(self._fetch_css_file)
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        if rule.media.mediaText == 'amzn-mobi':
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in OEB_STYLES:
                            self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                            continue
                        stylesheets.append(sitem.data)
                for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                    stylesheet.cssRules.remove(rule)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif elem.tag == XHTML('link') and elem.get('href') \
                and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
            href = urlnormalize(elem.attrib['href'])
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warn(
                    'Stylesheet %r referenced by file %r not in manifest' %
                    (path, item.href))
                continue
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                continue
            stylesheets.append(sitem.data)
    csses = {'extra_css':extra_css, 'user_css':user_css}
    for w, x in csses.items():
        if x:
            try:
                text = x
                stylesheet = parser.parseString(text, href=cssname,
                                                validate=False)
                stylesheets.append(stylesheet)
            except Exception:
                # Bad extra/user CSS is logged and ignored; unlike a bare
                # ``except`` this lets KeyboardInterrupt/SystemExit through.
                self.logger.exception('Failed to parse %s, ignoring.'%w)
                self.logger.debug('Bad css: ')
                self.logger.debug(x)
    rules = []
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for sheet_index, stylesheet in enumerate(stylesheets):
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            if rule.type == rule.MEDIA_RULE:
                media = {rule.media.item(i) for i in
                         xrange(rule.media.length)}
                if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                    continue
                for subrule in rule.cssRules:
                    rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
                    index += 1
            else:
                rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
                index = index + 1
    rules.sort()
    self.rules = rules
    self._styles = {}
    # The pattern has no backslashes, so a plain unicode literal is identical
    # to the former ``ur''`` literal (which is Python-2-only syntax).
    pseudo_pat = re.compile(u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
    select = Select(tree, ignore_inappropriate_pseudo_classes=True)

    for _, _, cssdict, text, _ in rules:
        fl = pseudo_pat.search(text)
        try:
            matches = tuple(select(text))
        except SelectorError as err:
            self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
            continue

        if fl is not None:
            fl = fl.group(1)
            if fl == 'first-letter' and getattr(self.oeb,
                    'plumber_output_format', '').lower() in {u'mobi', u'docx'}:
                # Fake first-letter
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter('*'):
                        if x.text:
                            punctuation_chars = []
                            # Own name for the element text: the original
                            # reused ``text`` and clobbered the selector
                            # loop variable.
                            remaining = unicode(x.text)
                            while remaining:
                                category = unicodedata.category(remaining[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(remaining[0])
                                remaining = remaining[1:]

                            special_text = u''.join(punctuation_chars) + \
                                (remaining[0] if remaining else u'')
                            span = E.span(special_text)
                            span.set('data-fake-first-letter', '1')
                            span.tail = remaining[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            break
            else:  # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)
    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)
    num_pat = re.compile(r'[0-9.]+$')
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
            style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                try:
                    del elem.attrib[prop]
                except KeyError:
                    # Attribute absent: nothing to remove.
                    pass
                if val:
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
var-theme-colour-1: #009EE0; var-theme-colour-2: #FFED00; var-theme-colour-3: #E2007A; var-spacing: 24px; } a { bottom: var(b, x); color: var(theme-colour-1, rgb(14,14,14)); left: var(L, 1px); z-index: var(L, 1); top: var(T, calc( 2 * 1px )); background: var(U, url(example.png)); border-color: var(C, #f00) } ''' print(cssutils.parseString(css).cssText) sys.exit(1) if 1: css = '''@media all and (width: 10px), all and (height:20px) { a {color:red} } ''' css = '''@media (min-device-pixel-ratio: 1.3), (min-resolution: 1.3dppx){ a {color:red} }''' css = '''@media not handheld/**/,/**/ all/**/and/**/ (/**/width: 10px) and (color), tv/**/{ a {color:red} }''' css = '''@media tv,braille,tv {
def parse_page(self, url, processed_pages=None, index=None):
    """Render a Notion page, clean it up and export it as static HTML.

    url
        address of the page to load in ``self.driver``
    processed_pages
        dict of ``{page url: exported html file}`` shared across the
        recursion; a fresh dict is created when omitted
    index
        url of the page exported as ``index.html``; defaults to *url*

    Returns ``processed_pages`` after this page and (unless the
    ``single_page`` arg is set) its sub-pages have been exported, or ``None``
    when the page content did not load before the timeout.
    """
    # A ``{}`` default would be created once and shared by every top-level
    # call, leaking processed pages from one run into the next.
    if processed_pages is None:
        processed_pages = {}

    # if this is the first page being parse, set it as the index.html
    if not index:
        index = url

    log.info(f"Parsing page '{url}'")
    log.debug(f"Using page config: {self.get_page_config(url)}")
    self.driver.get(url)

    try:
        WebDriverWait(self.driver, 60).until(notion_page_loaded())
    except TimeoutException:
        log.critical(
            "Timeout waiting for page content to load, or no content found."
            " Are you sure the page is set to public?")
        return

    # scroll at the bottom of the notion-scroller element to load all elements
    # continue once there are no changes in height after a timeout
    # don't do this if the page has a calendar databse on it or it will load forever
    calendar = self.driver.find_elements_by_class_name("notion-calendar-view")
    if not calendar:
        scroller = self.driver.find_element_by_css_selector(
            ".notion-frame > .notion-scroller")
        last_height = scroller.get_attribute("scrollHeight")
        log.debug(
            f"Scrolling to bottom of notion-scroller (height: {last_height})")
        while True:
            self.driver.execute_script(
                "arguments[0].scrollTo(0, arguments[0].scrollHeight)",
                scroller)
            time.sleep(self.args["timeout"])
            new_height = scroller.get_attribute("scrollHeight")
            log.debug(
                f"New notion-scroller height after timeout is: {new_height}")
            if new_height == last_height:
                break
            last_height = new_height

    # function to expand all the toggle block in the page to make their content visible
    # so we can hook up our custom toggle logic afterwards
    def open_toggle_blocks(timeout, exclude=None):
        # ``exclude`` is the list of toggles already opened by an outer
        # call; it is extended in place and handed on to the recursion.
        opened_toggles = [] if exclude is None else exclude
        toggle_blocks = self.driver.find_elements_by_class_name(
            "notion-toggle-block")
        log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
        for toggle_block in toggle_blocks:
            if toggle_block in opened_toggles:
                continue
            toggle_button = toggle_block.find_element_by_css_selector(
                "div[role=button]")
            # check if the toggle is already open by the direction of its arrow
            is_toggled = "(180deg)" in (
                toggle_button.find_element_by_tag_name("svg").get_attribute(
                    "style"))
            if not is_toggled:
                # click on it, then wait until all elements are displayed
                toggle_button.click()
                try:
                    WebDriverWait(self.driver, timeout).until(
                        toggle_block_has_opened(toggle_block))
                except TimeoutException:
                    log.warning(
                        "Timeout waiting for toggle block to open."
                        " Likely it's already open, but doesn't hurt to check."
                    )
                except Exception as exception:
                    log.error(
                        f"Error trying to open a toggle block: {exception}")
                opened_toggles.append(toggle_block)

        # after all toggles have been opened, check the page again to see if
        # any toggle block had nested toggle blocks inside them
        new_toggle_blocks = self.driver.find_elements_by_class_name(
            "notion-toggle-block")
        if len(new_toggle_blocks) > len(toggle_blocks):
            # if so, run the function again
            open_toggle_blocks(timeout, opened_toggles)

    # open the toggle blocks in the page
    open_toggle_blocks(self.args["timeout"])

    # creates soup from the page to start parsing
    soup = BeautifulSoup(self.driver.page_source, "html.parser")

    # remove scripts and other tags we don't want / need
    for unwanted in soup.findAll("script"):
        unwanted.decompose()
    for intercom_frame in soup.findAll("div", {"id": "intercom-frame"}):
        intercom_frame.decompose()
    for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}):
        intercom_div.decompose()
    for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}):
        overlay_div.decompose()
    for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x):
        vendors_css.decompose()

    # clean up the default notion meta tags
    for tag in [
            "description",
            "twitter:card",
            "twitter:site",
            "twitter:title",
            "twitter:description",
            "twitter:image",
            "twitter:url",
            "apple-itunes-app",
    ]:
        unwanted_tag = soup.find("meta", attrs={"name": tag})
        if unwanted_tag:
            unwanted_tag.decompose()
    for tag in [
            "og:site_name",
            "og:type",
            "og:url",
            "og:title",
            "og:description",
            "og:image",
    ]:
        unwanted_og_tag = soup.find("meta", attrs={"property": tag})
        if unwanted_og_tag:
            unwanted_og_tag.decompose()

    # set custom meta tags
    custom_meta_tags = self.get_page_config(url).get("meta", [])
    for custom_meta_tag in custom_meta_tags:
        tag = soup.new_tag("meta")
        for attr, value in custom_meta_tag.items():
            tag.attrs[attr] = value
        log.debug(f"Adding meta tag {str(tag)}")
        soup.head.append(tag)

    # process images & emojis
    cache_images = True
    for img in soup.findAll("img"):
        if img.has_attr("src"):
            if cache_images and not "data:image" in img["src"]:
                img_src = img["src"]
                # if the path starts with /, it's one of notion's predefined images
                if img["src"].startswith("/"):
                    img_src = "https://www.notion.so" + img["src"]
                cached_image = self.cache_file(img_src)
                img["src"] = cached_image
            else:
                if img["src"].startswith("/"):
                    img["src"] = "https://www.notion.so" + img["src"]

        # on emoji images, cache their sprite sheet and re-set their background url
        if img.has_attr("class") and "notion-emoji" in img["class"]:
            style = cssutils.parseStyle(img["style"])
            spritesheet = style["background"]
            spritesheet_url = spritesheet[spritesheet.find("(") + 1:spritesheet.find(")")]
            cached_spritesheet_url = self.cache_file(
                "https://www.notion.so" + spritesheet_url)
            style["background"] = spritesheet.replace(
                spritesheet_url, str(cached_spritesheet_url))
            img["style"] = style.cssText

    # process stylesheets
    for link in soup.findAll("link", rel="stylesheet"):
        if link.has_attr("href") and link["href"].startswith("/"):
            # we don't need the vendors stylesheet
            if "vendors~" in link["href"]:
                continue
            cached_css_file = self.cache_file("https://www.notion.so" + link["href"])
            with open(self.dist_folder / cached_css_file, "rb") as f:
                stylesheet = cssutils.parseString(f.read())
                # open the stylesheet and check for any font-face rule,
                for rule in stylesheet.cssRules:
                    if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
                        # if any are found, download the font file
                        font_file = (rule.style["src"].split("url(/")
                                     [-1].split(") format")[0])
                        cached_font_file = self.cache_file(
                            f"https://www.notion.so/{font_file}")
                        # NOTE(review): the parsed stylesheet is never
                        # written back in this function, so this rewrite
                        # only changes the in-memory copy -- confirm
                        # whether that is intended.
                        rule.style["src"] = f"url({str(cached_font_file)})"
            link["href"] = str(cached_css_file)

    # add our custom logic to all toggle blocks
    for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}):
        toggle_id = uuid.uuid4()
        toggle_button = toggle_block.select_one("div[role=button]")
        toggle_content = toggle_block.find("div", {
            "class": None,
            "style": ""
        })
        if toggle_button and toggle_content:
            # add a custom class to the toggle button and content,
            # plus a custom attribute sharing a unique uiid so
            # we can hook them up with some custom js logic later
            # NOTE(review): the button receives the *block's* classes, not
            # its own -- kept as in the original; confirm it is deliberate.
            toggle_button["class"] = toggle_block.get(
                "class", []) + ["loconotion-toggle-button"]
            toggle_content["class"] = toggle_content.get(
                "class", []) + ["loconotion-toggle-content"]
            toggle_content.attrs[
                "loconotion-toggle-id"] = toggle_button.attrs[
                    "loconotion-toggle-id"] = toggle_id

    # if there are any table views in the page, add links to the title rows
    # the link to the row item is equal to its data-block-id without dashes
    for table_view in soup.findAll("div", {"class": "notion-table-view"}):
        for table_row in table_view.findAll(
                "div", {"class": "notion-collection-item"}):
            table_row_block_id = table_row["data-block-id"]
            table_row_href = "/" + table_row_block_id.replace("-", "")
            row_target_span = table_row.find("span")
            row_link_wrapper = soup.new_tag("a", attrs={
                "href": table_row_href,
                "style": "cursor: pointer;"
            })
            row_target_span.wrap(row_link_wrapper)

    # embed custom google font(s)
    fonts_selectors = {
        "site": "div:not(.notion-code-block)",
        "navbar": ".notion-topbar div",
        "title": ".notion-page-block > div, .notion-collection_view_page-block > div[data-root]",
        "h1": ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div",
        "h2": ".notion-sub_header-block div",
        "h3": ".notion-sub_sub_header-block div",
        "body": ".notion-scroller",
        "code": ".notion-code-block *",
    }
    custom_fonts = self.get_page_config(url).get("fonts", {})
    if custom_fonts:
        # append a stylesheet importing the google font for each unique font
        unique_custom_fonts = set(custom_fonts.values())
        for font in unique_custom_fonts:
            if font:
                google_fonts_embed_name = font.replace(" ", "+")
                font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
                custom_font_stylesheet = soup.new_tag("link",
                                                      rel="stylesheet",
                                                      href=font_href)
                soup.head.append(custom_font_stylesheet)

        # go through each custom font, and add a css rule overriding the font-family
        # to the font override stylesheet targetting the appropriate selector
        font_override_stylesheet = soup.new_tag("style", type="text/css")
        for target, custom_font in custom_fonts.items():
            if custom_font and not target == "site":
                log.debug(f"Setting {target} font-family to {custom_font}")
                font_override_stylesheet.append(fonts_selectors[target] +
                                                " {font-family:" +
                                                custom_font +
                                                " !important} ")
        site_font = custom_fonts.get("site", None)
        # process global site font last to more granular settings can override it
        if site_font:
            log.debug(f"Setting global site font-family to {site_font}")
            font_override_stylesheet.append(fonts_selectors["site"] +
                                            " {font-family:" + site_font +
                                            "} ")
        # finally append the font overrides stylesheets to the page
        soup.head.append(font_override_stylesheet)

    # inject any custom elements to the page
    custom_injects = self.get_page_config(url).get("inject", {})

    def injects_custom_tags(section):
        section_custom_injects = custom_injects.get(section, {})
        for tag, elements in section_custom_injects.items():
            for element in elements:
                injected_tag = soup.new_tag(tag)
                for attr, value in element.items():
                    injected_tag[attr] = value
                    # if the value refers to a file, copy it to the dist folder
                    if attr.lower() == "href" or attr.lower() == "src":
                        log.debug(f"Copying injected file '{value}'")
                        cached_custom_file = self.cache_file(
                            (Path.cwd() / value.strip("/")))
                        injected_tag[attr] = str(cached_custom_file)
                log.debug(f"Injecting <{section}> tag: {str(injected_tag)}")
                soup.find(section).append(injected_tag)

    injects_custom_tags("head")
    injects_custom_tags("body")

    # inject loconotion's custom stylesheet and script
    loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
    custom_css = soup.new_tag("link",
                              rel="stylesheet",
                              href=str(loconotion_custom_css))
    soup.head.insert(-1, custom_css)
    loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
    custom_script = soup.new_tag("script",
                                 type="text/javascript",
                                 src=str(loconotion_custom_js))
    soup.body.insert(-1, custom_script)

    # find sub-pages and clean slugs / links
    sub_pages = []
    for a in soup.findAll("a"):
        # Anchors without an href would raise KeyError on a["href"].
        href = a.get("href")
        if not href or not href.startswith("/"):
            continue
        sub_page_href = "https://www.notion.so" + href
        # if the link is an anchor link,
        # check if the page hasn't already been parsed
        if "#" in sub_page_href:
            sub_page_href_tokens = sub_page_href.split("#")
            sub_page_href = sub_page_href_tokens[0]
            a["href"] = "#" + sub_page_href_tokens[-1]
            a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
            if (sub_page_href in processed_pages.keys()
                    or sub_page_href in sub_pages):
                log.debug(
                    f"Original page for anchor link {sub_page_href}"
                    " already parsed / pending parsing, skipping")
                continue
        else:
            a["href"] = (self.get_page_slug(sub_page_href)
                         if sub_page_href != index else "index.html")
        sub_pages.append(sub_page_href)
        log.debug(f"Found link to page {a['href']}")

    # exports the parsed page
    html_str = str(soup)
    html_file = self.get_page_slug(url) if url != index else "index.html"
    if html_file in processed_pages.values():
        log.error(
            f"Found duplicate pages with slug '{html_file}' - previous one will be"
            " overwritten. Make sure that your notion pages names or custom slugs"
            " in the configuration files are unique")
    log.info(f"Exporting page '{url}' as '{html_file}'")
    with open(self.dist_folder / html_file, "wb") as f:
        f.write(html_str.encode("utf-8").strip())
    processed_pages[url] = html_file

    # parse sub-pages
    if sub_pages and not self.args.get("single_page", False):
        if processed_pages:
            log.debug(f"Pages processed so far: {len(processed_pages)}")
        for sub_page in sub_pages:
            if not sub_page in processed_pages.keys():
                self.parse_page(sub_page,
                                processed_pages=processed_pages,
                                index=index)

    # we're all done!
    return processed_pages
def generateFontCSS(self):
    """Write ``profiles/_font.css`` with the configured font family embedded.

    Starts from the default GUI stylesheet, appends four ``@font-face`` rules
    (regular, italic, bold, bold italic) that all declare the family name
    ``"para"``, and sets ``font-family: "para"`` on every style rule whose
    selector is listed in ``style_rules``. A missing face falls back as
    italic -> regular, bold -> regular, bold italic -> italic. When the
    stylesheet has no ``body`` rule, one is appended.

    The file is written into a ``profiles`` directory located next to
    ``self.config_file``; the directory is created when absent.
    """
    # Selectors whose rules should use the embedded font.
    style_rules = [
        '.titleblock', '.text-author', 'p', 'p.title', '.cite', '.poem',
        '.table th', '.table td', '.annotation', 'body'
    ]

    css_string = modules.default_css.gui_default_css
    css = cssutils.parseString(css_string)

    # Look the family up once instead of repeating the lookup per face.
    faces = self.gui_config.fontDb.families[self.gui_config.embedFontFamily]

    font_regular = faces['Regular'] if 'Regular' in faces else ''
    font_italic = faces['Italic'] if 'Italic' in faces else font_regular
    font_bold = faces['Bold'] if 'Bold' in faces else font_regular
    font_bolditalic = (faces['Bold Italic'] if 'Bold Italic' in faces
                       else font_italic)

    css.add('@font-face {{ font-family: "para"; src: url("fonts/{0}"); }}'
            .format(font_regular))
    css.add('@font-face {{ font-family: "para"; src: url("fonts/{0}"); '
            'font-style: italic; }}'.format(font_italic))
    css.add('@font-face {{ font-family: "para"; src: url("fonts/{0}"); '
            'font-weight: bold; }}'.format(font_bold))
    css.add('@font-face {{ font-family: "para"; src: url("fonts/{0}"); '
            'font-style: italic; font-weight: bold; }}'
            .format(font_bolditalic))

    found_body = False
    for rule in css:
        if rule.type == rule.STYLE_RULE:
            if rule.selectorText in style_rules:
                rule.style['font-family'] = '"para"'
            if rule.selectorText == 'body':
                found_body = True

    # Add a body rule when the default stylesheet does not define one.
    if not found_body:
        css.add('body {font-family: "para"; line-height: 100%; }')

    css_path = os.path.join(os.path.dirname(self.config_file), 'profiles')
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(css_path, exist_ok=True)

    with codecs.open(os.path.join(css_path, '_font.css'), 'w', 'utf-8') as f:
        # cssText is decoded as UTF-8 here, i.e. it is expected to be bytes.
        f.write(str(css.cssText, 'utf-8'))
def get_body(self, root_url, urls, visited_urls):
    """Fetch ``root_url``, store it under ``/opt/snare/pages``, then crawl ``urls``.

    Written as a generator-based coroutine (it uses ``yield from``).

    :param root_url: URL to fetch; ``http://`` is prepended when the value
        does not start with ``http``.
    :param urls: mutable list of URLs still to fetch. It is drained by this
        call, extended with URLs found in downloaded stylesheets, and also
        handed to ``self.replace_links``.
    :param visited_urls: mutable list of URLs already requested;
        ``root_url`` is appended to it.

    URLs carrying a fragment are recorded as visited and otherwise ignored.
    Exits the process via ``sys.exit`` when the domain part is shorter than
    four characters.
    """
    if not root_url.startswith("http"):
        root_url = 'http://' + root_url
    visited_urls.append(root_url)
    parsed_url = urlparse(root_url)
    if parsed_url.fragment:
        return
    domain = parsed_url.netloc
    if not domain.endswith('/'):
        domain += '/'

    # Split the local name into an optional directory part and a file name.
    file_name = self.make_new_link(root_url)
    file_path = ''
    patt = r'/.*/.*\.'
    if re.match(patt, file_name):
        file_path, file_name = file_name.rsplit('/', 1)
        file_path += '/'
    print('path: ', file_path, 'name: ', file_name)
    if len(domain) < 4:
        sys.exit('invalid taget {}'.format(root_url))

    page_path = '/opt/snare/pages/{}'.format(domain)
    if not os.path.exists(page_path):
        os.mkdir(page_path)
    if file_path and not os.path.exists(page_path + file_path):
        os.makedirs(page_path + file_path)

    data = None
    try:
        with aiohttp.Timeout(10.0):
            with aiohttp.ClientSession() as session:
                response = yield from session.get(root_url)
                data = yield from response.read()
    except Exception as e:
        # Best effort: a failed download is reported and the crawl goes on.
        print(e)
    else:
        response.release()
        session.close()

    if data is not None:
        if re.match(r'.*\.(html|php)', file_name):
            soup = self.replace_links(data, domain, urls)
            data = str(soup).encode()
        with open(page_path + file_path + file_name, 'wb') as index_fh:
            index_fh.write(data)
        if '.css' in file_name:
            # Queue every URL referenced from the stylesheet.
            css = cssutils.parseString(data)
            for carved_url in cssutils.getUrls(css):
                if carved_url.startswith('data'):
                    continue
                carved_url = os.path.normpath(
                    os.path.join(domain, carved_url))
                if not carved_url.startswith('http'):
                    if carved_url.startswith(('..', '/')):
                        carved_url = 'http://' + domain + carved_url
                    else:
                        carved_url = 'http://' + carved_url
                if carved_url not in visited_urls:
                    urls.insert(0, carved_url)

    # Drain the queue from the front. Removing items from a list while
    # iterating over it with ``for`` skips entries, so pop explicitly.
    while urls:
        url = urls.pop(0)
        if url in visited_urls:
            continue
        yield from self.get_body(url, urls, visited_urls)
def extract_css_into_flows(self):
    """Move every stylesheet into ``self.flows`` and point references at it.

    Manifest stylesheets are appended to ``self.flows`` first. Then, for each
    spine document, ``<link href>`` elements that resolve to one of those
    sheets are rewritten to a ``kindle:flow`` reference, and each non-empty
    ``<style>`` element is replaced by a ``<link>`` whose CSS text becomes a
    flow of its own (identical texts share one flow). ``@import`` rules are
    rewritten the same way, and finally any flow that still exposes
    ``cssText`` is converted to unicode text.
    """
    flow_href = 'kindle:flow:%s?mime=text/css'
    # raw CSS text -> replacement <link> elements, so identical <style>s
    # are not repeated
    links_by_css = defaultdict(list)
    flow_index_by_href = {}
    passthrough = getattr(self.opts, 'mobi_passthrough', False)

    for item in self.oeb.manifest:
        if item.media_type not in OEB_STYLES:
            continue
        sheet = self.data(item)
        if not passthrough and not self.opts.expand_css and hasattr(item.data, 'cssText'):
            condense_sheet(sheet)
        flow_index_by_href[item.href] = len(self.flows)
        self.flows.append(sheet)

    def fix_import_rules(sheet, owner):
        # Rewrite @import hrefs (resolved relative to ``owner``) that point
        # at a known sheet; report whether anything was rewritten.
        rewritten = False
        for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
            if not rule.href:
                continue
            flow = flow_index_by_href.get(owner.abshref(rule.href))
            if flow is None:
                continue
            rule.href = flow_href % to_ref(flow)
            rewritten = True
        return rewritten

    for item in self.oeb.spine:
        root = self.data(item)

        for link in XPath('//h:link[@href]')(root):
            flow = flow_index_by_href.get(item.abshref(link.get('href')))
            if flow is not None:
                link.set('href', flow_href % to_ref(flow))

        for style_tag in XPath('//h:style')(root):
            parent = style_tag.getparent()
            position = parent.index(style_tag)
            css_text = style_tag.text
            if not css_text or not css_text.strip():
                # Empty <style>: just drop it.
                extract(style_tag)
                continue
            parsed = cssutils.parseString(css_text, validate=False)
            if fix_import_rules(parsed, item):
                css_text = force_unicode(parsed.cssText, 'utf-8')

            replacement = etree.Element(XHTML('link'), type='text/css',
                                        rel='stylesheet')
            replacement.tail = '\n'
            parent.insert(position, replacement)
            extract(style_tag)
            links_by_css[css_text].append(replacement)

    # One flow per distinct inline stylesheet text.
    for css_text, link_elems in links_by_css.items():
        ref = to_ref(len(self.flows))
        self.flows.append(css_text)
        for link in link_elems:
            link.set('href', flow_href % ref)

    for item in self.oeb.manifest:
        if item.media_type in OEB_STYLES:
            sheet = self.data(item)
            if hasattr(sheet, 'cssRules'):
                fix_import_rules(sheet, item)

    # Serialize remaining stylesheet objects; iterate over a snapshot
    # because entries are replaced in place.
    for position, flow in enumerate(tuple(self.flows)):
        if hasattr(flow, 'cssText'):
            self.flows[position] = force_unicode(flow.cssText, 'utf-8')
def test_useMinified(self):
    "Preferences.useMinified()"
    cssutils.ser.prefs.useDefaults()
    cssutils.ser.prefs.useMinified()

    # every preference and the value it must have after useMinified()
    expected_prefs = (
        ('defaultAtKeyword', True),
        ('defaultPropertyName', True),
        ('importHrefFormat', 'string'),
        ('indent', u''),
        ('keepAllProperties', True),
        ('keepComments', False),
        ('keepEmptyRules', False),
        ('keepUnkownAtRules', False),
        ('keepUsedNamespaceRulesOnly', True),
        ('lineNumbers', False),
        ('lineSeparator', u''),
        ('listItemSpacer', u''),
        ('omitLastSemicolon', True),
        ('paranthesisSpacer', u''),
        ('propertyNameSpacer', u''),
        ('selectorCombinatorSpacer', u''),
        ('spacer', u''),
        ('validOnly', False),
    )
    for pref_name, pref_value in expected_prefs:
        self.assertEqual(getattr(cssutils.ser.prefs, pref_name), pref_value)

    css = (
        u''' /*1*/ @import url(x) tv , print; @namespace prefix "uri";'''
        u''' @namespace unused "unused"; @media all {} @media all { a {} }'''
        u''' @media all "name" { a { color: red; } } @page:left { left: 0 }'''
        u''' a {} prefix|x, a + b > c ~ d , b { top : 1px ;'''
        u''' font-family : arial , 'some' } @x x; '''
    )
    parsed = cssutils.parseString(css)

    # minified serialization without the trailing unknown at-rule
    minified = (
        u'''@import"x"tv,print;@namespace prefix"uri";'''
        u'''@media all"name"{a{color:red}}@page :left{left:0}'''
        u'''prefix|x,a+b>c~d,b{top:1px;font-family:arial,"some"}'''
    )

    cssutils.ser.prefs.keepUnkownAtRules = True
    self.assertEqual(parsed.cssText, minified + u'''@x x;''')

    cssutils.ser.prefs.keepUnkownAtRules = False
    self.assertEqual(parsed.cssText, minified)

    # CSSValues: (input value, expected minified value)
    value_cases = (
        (u' a a1 a-1 a-1a ', 'a a1 a-1 a-1a'),
        (u'a b 1 c 1em d -1em e', u'a b 1 c 1em d -1em e'),
        (u' 1em / 5 ', u'1em/5'),
        (u'1em/5', u'1em/5'),
        (u'a 0 a .0 a 0.0 a -0 a -.0 a -0.0 a +0 a +.0 a +0.0',
         u'a 0 a 0 a 0 a 0 a 0 a 0 a 0 a 0 a 0'),
        (u'a 0px a .0px a 0.0px a -0px a -.0px a -0.0px a +0px a +.0px a +0.0px ',
         u'a 0 a 0 a 0 a 0 a 0 a 0 a 0 a 0 a 0'),
        (u'a 1 a .1 a 1.0 a 0.1 a -1 a -.1 a -1.0 a -0.1 a +1 a +.1 a +1.0',
         u'a 1 a 0.1 a 1 a 0.1 a -1 a -0.1 a -1 a -0.1 a 1 a 0.1 a 1'),
        (u' url(x) f()', 'url(x) f()'),
        (u'#112233', '#123'),
        (u'#112234', '#112234'),
        (u'#123', '#123'),
        (u'#123 url() f()', '#123 url() f()'),
        (u'1 +2 +3 -4', u'1 2 3 -4'),  # ?
    )
    for given, wanted in value_cases:
        parsed = cssutils.parseString(u'a{x:%s}' % given)
        self.assertEqual(u'a{x:%s}' % wanted, parsed.cssText)
def getSoupView(soup, css, url=''):
    """
    soup
        a BeautifulSoup 4 object
    css
        a CSS StyleSheet string
    url
        base URL used to parse ``css`` and to resolve the URLs it contains

    returns style view
        a dict mapping ``id(element)`` to a tuple
        ``(element, CSSStyleDeclaration)`` holding the properties that apply
        to that element. Style rules of directly imported sheets are applied
        before the rules of ``css`` itself. Selectors containing ``:`` are
        ignored.
    """
    import itertools

    sheet = cssutils.parseString(css, href=url)
    cssutils.replaceUrls(sheet,
                         lambda u: urlparse.urljoin(url, u),
                         ignoreImportRules=True)
    view = {}
    specificities = {}  # needed temporarily

    # TODO: filter rules simpler?, add @media
    gens = []
    for i_rule in sheet:
        if i_rule.type == i_rule.IMPORT_RULE:
            cssutils.replaceUrls(i_rule.styleSheet,
                                 lambda u: urlparse.urljoin(i_rule.href, u),
                                 ignoreImportRules=True)
            gens.append(rule for rule in i_rule.styleSheet
                        if rule.type == rule.STYLE_RULE)
    gens.append(rule for rule in sheet if rule.type == rule.STYLE_RULE)

    for rule in itertools.chain(*gens):
        for selector in rule.selectorList:
            # TODO: make this a callback to be able to use other stuff than lxml
            if ':' in selector.selectorText:
                # Ignore pseudo:classes because we can't use them, plus they
                # match when we don't want them to on bs4
                continue

            for element in soup.select(selector.selectorText):
                ID = id(element)
                if ID not in view:
                    # add initial empty style declaration
                    view[ID] = (element, cssutils.css.CSSStyleDeclaration())
                    specificities[ID] = {}
                style = view[ID][1]
                seen = specificities[ID]

                for p in rule.style:
                    # update style declaration
                    if p not in style:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        style.setProperty(p.name, p.value, p.priority)
                        seen[p.name] = selector.specificity
                        continue

                    sameprio = (p.priority ==
                                style.getPropertyPriority(p.name))
                    if (not sameprio and bool(p.priority)) or (
                            sameprio and
                            selector.specificity >= seen[p.name]):
                        # later, more specific or higher prio
                        style.setProperty(p.name, p.value, p.priority)
                        # Remember the specificity of the declaration that
                        # now applies; otherwise a later, less specific rule
                        # would be compared against the value that was
                        # replaced and could win wrongly.
                        seen[p.name] = selector.specificity
    return view
def _parse_css_string(self, css_body, validate=True):
    """Parse ``css_body`` into a stylesheet.

    Uses ``_cache_parse_css_string`` when ``self.cache_css_parsing`` is set,
    and ``cssutils.parseString`` otherwise; ``validate`` is forwarded
    unchanged in both cases.
    """
    parse = (_cache_parse_css_string if self.cache_css_parsing
             else cssutils.parseString)
    return parse(css_body, validate=validate)
def test_styleSheet(self):
    "CSSImportRule.styleSheet"

    def nested_fetcher(url):
        # the first level sheet imports a second one; any other URL
        # delivers a plain style rule
        if url == "/root/level1/anything.css":
            return None, '@import "level2/css.css" "title2";'
        return None, 'a { color: red }'

    def check_import(rule, href, sheet_href, media, title, css_text):
        # assertions shared by both levels of import
        imported = rule.styleSheet
        self.assertEqual(rule.href, href)
        self.assertEqual(imported.href, sheet_href)
        # inherits ascii as no self charset is set
        self.assertEqual(imported.encoding, 'ascii')
        self.assertEqual(imported.ownerRule, rule)
        self.assertEqual(imported.media.mediaText, media)
        # not the sheet containing the import rule
        self.assertEqual(imported.parentStyleSheet, None)
        self.assertEqual(imported.title, title)
        self.assertEqual(imported.cssText, css_text)

    nested_parser = cssutils.CSSParser(fetcher=nested_fetcher)
    root_sheet = nested_parser.parseString(
        '''@charset "ascii"; @import "level1/anything.css" tv "title";''',
        href='/root/')
    self.assertEqual(root_sheet.href, '/root/')

    level1_rule = root_sheet.cssRules[1]
    check_import(
        level1_rule,
        href='level1/anything.css',
        sheet_href='/root/level1/anything.css',
        media='tv',
        title='title',
        css_text='@charset "ascii";\n@import "level2/css.css" "title2";'.encode())

    level2_rule = level1_rule.styleSheet.cssRules[1]
    check_import(
        level2_rule,
        href='level2/css.css',
        sheet_href='/root/level1/level2/css.css',
        media='all',
        title='title2',
        css_text='@charset "ascii";\na {\n color: red\n }'.encode())

    # an import that cannot be fetched still exposes a stylesheet object
    missing_rule = cssutils.parseString('@import "CANNOT-FIND.css";').cssRules[0]
    self.assertEqual(missing_rule.href, "CANNOT-FIND.css")
    self.assertEqual(type(missing_rule.styleSheet), cssutils.css.CSSStyleSheet)

    def bytes_fetcher(url):
        if url.endswith('level1.css'):
            return None, '@charset "ascii"; @import "level2.css";'.encode()
        return None, 'a { color: red }'.encode()

    bytes_parser = cssutils.CSSParser(fetcher=bytes_fetcher)
    current = bytes_parser.parseString(
        '@charset "iso-8859-1";@import "level1.css";')
    self.assertEqual(current.encoding, 'iso-8859-1')

    # walk down the two nested imports
    current = current.cssRules[1].styleSheet
    self.assertEqual(current.encoding, 'ascii')
    current = current.cssRules[1].styleSheet
    self.assertEqual(current.encoding, 'ascii')
def getdata(html):
    """Scrape one business listing page into the module-level ``data`` dict.

    :param html: markup of the listing page, parsed with ``html.parser``.
    :raises Exception: whatever the initial element lookup raised when the
        page does not have the expected structure (after printing a notice).

    Returns nothing. Keys written to ``data``: ``phone_number``,
    ``JustDail_business_title``, ``business_name``, ``total_rating_count``
    and ``reviews`` always; ``total_rating``, ``long_addr``, ``website``,
    ``Year_Established``, ``Modes_of_payment``, ``category`` and
    ``Also_Listed_in`` only when a non-empty value was found.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('title')
        business_name = soup.find('span', attrs={'class': 'fn'}).text
        rating = soup.find('span', attrs={'class': 'value-titles'}).text
        review_ele = soup.findAll(class_='allratingM')
        total_rating_count = soup.find('span', attrs={'class': 'votes'})
        long_addr = soup.find('span', attrs={'id': 'fulladdress'})
        long_addr = long_addr.find('span', attrs={'class': 'lng_add'}).text
        category = soup.findAll(class_='lng_als_lst')
        pay_modes = soup.findAll(class_='lng_mdpay')
        also_listed = soup.findAll(class_='lng_als_lst')
        year_lists = soup.findAll('ul', attrs={'class': 'alstdul'})
        web = soup.findAll('span', attrs={'class': 'mreinfp'})
        # the last stylesheet maps icon classes to phone number digits
        style = soup.findAll('style', attrs={'type': 'text/css'})
    except Exception:
        print("Error : Page fromat changed")
        # Nothing below can work without these elements; surface the real
        # error instead of failing later on an unbound name.
        raise

    # Map each '...before' selector to the symbol it stands for.
    phone_map = {}
    sheet = cssutils.parseString(str(style[-1].text))
    for rule in sheet:
        try:
            if 'before' not in str(rule.selectorText):
                continue
            for prop in rule.style:
                # The symbol is derived from the code point of the second
                # character of the property value.
                value = ord(prop.value.strip()[1]) - 643073
                if value == 15:
                    value = 9
                phone_map[rule.selectorText] = value
        except Exception:
            print('fail')

    # Assemble the phone numbers from the sequence of icon elements.
    phone_img = soup.findAll('span', attrs={'class': 'mobilesv'})
    phone_number = ""
    phone_number_final = []
    count_phone_num = 0
    seen_numbers = set()

    def is_complete(number, symbols):
        # 11 symbols starting with "011", or 13 symbols of any kind
        return (symbols == 11 and number.startswith('011')) or symbols == 13

    def keep(number):
        # record a finished number once
        if number not in seen_numbers:
            seen_numbers.add(number)
            phone_number_final.append(number)

    for phone_img_div in phone_img:
        try:
            if is_complete(phone_number, count_phone_num):
                keep(phone_number)
                # start a fresh number whether or not this one was new
                phone_number = ""
                count_phone_num = 0
            for key, value in phone_map.items():
                if str(phone_img_div['class'][1]) in key:
                    if value == 16:
                        phone_number = phone_number + "+"
                        count_phone_num = count_phone_num + 1
                        continue
                    phone_number = phone_number + str(value)
                    count_phone_num = count_phone_num + 1
                    break
        except Exception:
            print("")
    # A number completed by the very last icon has no following iteration
    # to store it, so flush it here.
    if is_complete(phone_number, count_phone_num):
        keep(phone_number)
    data['phone_number'] = phone_number_final

    try:
        data['JustDail_business_title'] = title.text.strip()
    except Exception:
        # no <title> element
        data['JustDail_business_title'] = 'None'
    data['business_name'] = business_name

    # website
    try:
        website = web[-1].findChildren('a', recursive=False)[0].text.strip()
    except Exception:
        website = 'None'
    # year established
    try:
        Year_Established = year_lists[-1].findChildren(
            "li", recursive=False)[0].text.strip()
    except Exception:
        Year_Established = 'None'

    # rating
    if rating:
        data['total_rating'] = rating
    try:
        data['total_rating_count'] = total_rating_count.text
    except Exception:
        # no vote counter on the page
        data['total_rating_count'] = "None"
    # long address
    if long_addr:
        data['long_addr'] = long_addr
    # website url
    if website:
        data['website'] = website
    if Year_Established:
        data['Year_Established'] = Year_Established

    # payment methods
    pay_string = [pay.text.strip() for pay in pay_modes]
    if pay_string:
        data['Modes_of_payment'] = pay_string
    # category
    cat_string = [cat.text.strip() for cat in category]
    if cat_string:
        data['category'] = cat_string
    # also listed in
    also_listed_string = [also_list.text.strip() for also_list in also_listed]
    if also_listed_string:
        data['Also_Listed_in'] = also_listed_string

    # reviews / ratings of all users
    review = []
    for div in review_ele:
        dic = {}
        name = div.find('span', attrs={'class': 'rName'})
        user_rating = div.find('span', attrs={'class': 'star_m'})
        user_rating_date = div.find('span', attrs={'class': 'dtyr'})
        user_review = div.find('p', attrs={'class': 'rwopinion2'})
        if name:
            dic['user_name'] = name.text
        if user_rating:
            try:
                # the rating is the last character of the aria-label
                dic['user_rating'] = user_rating['aria-label'][-1]
            except Exception:
                dic['user_rating'] = '0'
        if user_rating_date:
            try:
                dic['user_rating_date'] = user_rating_date['content']
            except Exception:
                dic['user_rating_date'] = "None"
        if user_review:
            dic['user_review'] = user_review.text
        if dic:
            review.append(dic)
    data['reviews'] = review
def test_resolveImports(self):
    "cssutils.resolveImports(sheet)"
    # Each scenario patches cssutils.util._defaultFetcher so that no real
    # fetch happens; without mock the test fails deliberately.
    if mock:
        self._tempSer()
        cssutils.ser.prefs.useMinified()

        a = u'@charset "iso-8859-1";@import"b.css";\xe4{color:green}'.encode(
            'iso-8859-1')
        b = u'@charset "ascii";\\E4 {color:red}'.encode('ascii')

        # normal
        m = mock.Mock()
        with mock.patch('cssutils.util._defaultFetcher', m):
            m.return_value = (None, b)
            s = cssutils.parseString(a)

            # py3 TODO
            self.assertEqual(a, s.cssText)
            self.assertEqual(b, s.cssRules[1].styleSheet.cssText)

            c = cssutils.resolveImports(s)

            # py3 TODO
            self.assertEqual(
                u'\xc3\xa4{color:red}\xc3\xa4{color:green}'.encode(
                    'iso-8859-1'), c.cssText)

            c.encoding = 'ascii'
            # Plain u'' literal with escaped backslashes: the ur'' prefix
            # is a syntax error on Python 3, and the value is the same.
            self.assertEqual(
                u'@charset "ascii";\\E4 {color:red}\\E4 {color:green}'.
                encode(), c.cssText)

        # b cannot be found
        m = mock.Mock()
        with mock.patch('cssutils.util._defaultFetcher', m):
            m.return_value = (None, None)
            s = cssutils.parseString(a)

            # py3 TODO
            self.assertEqual(a, s.cssText)
            self.assertEqual(cssutils.css.CSSStyleSheet,
                             type(s.cssRules[1].styleSheet))
            c = cssutils.resolveImports(s)
            # py3 TODO
            self.assertEqual(
                u'@import"b.css";\xc3\xa4{color:green}'.encode(
                    'iso-8859-1'), c.cssText)

        # @import with media
        a = u'@import"b.css";@import"b.css" print, tv ;@import"b.css" all;'
        b = u'a {color: red}'
        m = mock.Mock()
        with mock.patch('cssutils.util._defaultFetcher', m):
            m.return_value = (None, b)
            s = cssutils.parseString(a)
            c = cssutils.resolveImports(s)
            self.assertEqual(
                'a{color:red}@media print,tv{a{color:red}}a{color:red}'.
                encode(), c.cssText)

        # cannot resolve with media => keep original
        a = u'@import"b.css"print;'
        b = u'@namespace "http://example.com";'
        m = mock.Mock()
        with mock.patch('cssutils.util._defaultFetcher', m):
            m.return_value = (None, b)
            s = cssutils.parseString(a)
            c = cssutils.resolveImports(s)
            self.assertEqual(a.encode(), c.cssText)

        # urls are adjusted too, layout:
        # a.css
        # c.css
        # img/img.gif
        # b/
        #     b.css
        #     subimg/subimg.gif
        a = u''' @import"b/b.css"; a { x: url(/img/abs.gif); y: url(img/img.gif); z: url(b/subimg/subimg.gif); }'''

        def fetcher(url):
            # serve the two imported sheets by their file name
            c = {
                'b.css': u''' @import"../c.css"; b { x: url(/img/abs.gif); y: url(../img/img.gif); z: url(subimg/subimg.gif); }''',
                'c.css': u''' c { x: url(/img/abs.gif); y: url(./img/img.gif); z: url(./b/subimg/subimg.gif); }'''
            }
            return 'utf-8', c[os.path.split(url)[1]]

        @mock.patch.object(cssutils.util, '_defaultFetcher', new=fetcher)
        def do():
            s = cssutils.parseString(a)
            r = cssutils.resolveImports(s)
            return s, r

        s, r = do()

        cssutils.ser.prefs.useDefaults()
        cssutils.ser.prefs.keepComments = False
        # NOTE(review): the whitespace inside this expected text is kept
        # exactly as found in the file; confirm it matches the layout the
        # default serializer produces.
        self.assertEqual(
            u'''c { x: url(/img/abs.gif); y: url(img/img.gif); z: url(b/subimg/subimg.gif) } b { x: url(/img/abs.gif); y: url(img/img.gif); z: url(b/subimg/subimg.gif) } a { x: url(/img/abs.gif); y: url(img/img.gif); z: url(b/subimg/subimg.gif) }'''.encode(),
            r.cssText)

        cssutils.ser.prefs.useDefaults()
    else:
        self.assertEqual(False, u'Mock needed for this test')