def test_children(self):
    "CSSStyleDeclaration.children() wires every child back to its declaration"
    css_text = '/*1*/color: red; color: green; @x;'
    # One entry per expected child, in source order.
    expected_types = [
        cssutils.css.CSSComment,
        cssutils.css.Property,
        cssutils.css.Property,
        cssutils.css.CSSUnknownRule,
    ]

    def check(decl):
        # Each child must have the expected type and point at its declaration.
        for i, child in enumerate(decl.children()):
            self.assertEqual(expected_types[i], type(child))
            self.assertEqual(child.parent, decl)

    # The same declaration text, reached through three different parse paths.
    check(cssutils.parseStyle(css_text))
    check(cssutils.parseString('a {' + css_text + '}').cssRules[0].style)
    check(cssutils.parseString(
        '@media all {a {' + css_text + '}}').cssRules[0].cssRules[0].style)

    # Properties added after parsing get the same parent wiring.
    decl = cssutils.parseStyle(css_text)
    decl['x'] = '0'
    self.assertEqual(decl, decl.getProperty('x').parent)
    decl.setProperty('y', '1')
    self.assertEqual(decl, decl.getProperty('y').parent)
def collect_font_stats(self):
    """Gather @font-face rules and per-font character usage from the
    rendered page (via injected ``window.font_stats`` JS helpers) and
    accumulate the used characters into ``self.font_stats`` keyed by
    container name of the font file."""
    self.page.evaljs('window.font_stats.get_font_face_rules()')
    font_face_rules = self.page.bridge_value
    if not isinstance(font_face_rules, list):
        raise Exception('Unknown error occurred while reading font-face rules')

    # Weed out invalid font-face rules
    rules = []
    for rule in font_face_rules:
        ff = rule.get('font-family', None)
        if not ff:
            continue
        # Parse the raw declaration so cssutils normalizes the family list.
        style = parseStyle('font-family:%s'%ff, validate=False)
        ff = [x.value for x in style.getProperty('font-family').propertyValue]
        if not ff or ff[0] == 'inherit':
            continue
        rule['font-family'] = frozenset(icu_lower(f) for f in ff)
        src = rule.get('src', None)
        if not src:
            continue
        # background-image is (ab)used here purely to get cssutils to
        # parse the url() token out of the src value.
        style = parseStyle('background-image:%s'%src, validate=False)
        src = style.getProperty('background-image').propertyValue[0].uri
        if not src.startswith('file://'):
            self.log.warn('Unknown URI in @font-face: %r'%src)
            continue
        # Convert the file:// URI into a container-relative name.
        src = src[len('file://'):]
        if iswindows and src.startswith('/'):
            src = src[1:]
        src = src.replace('/', os.sep)
        src = unquote(src)
        name = self.container.abspath_to_name(src)
        if not self.container.has_name(name):
            self.log.warn('Font %r referenced in @font-face rule not found' %name)
            continue
        rule['src'] = name
        normalize_font_properties(rule)
        rule['width'] = widths[rule['font-stretch']]
        rule['weight'] = int(rule['font-weight'])
        rules.append(rule)

    if not rules:
        return

    # Ensure every referenced font file has a stats bucket, even if unused.
    for rule in rules:
        if rule['src'] not in self.font_stats:
            self.font_stats[rule['src']] = set()

    self.page.evaljs('window.font_stats.get_font_usage()')
    font_usage = self.page.bridge_value
    if not isinstance(font_usage, list):
        raise Exception('Unknown error occurred while reading font usage')
    # Whitespace control characters never need glyphs.
    exclude = {'\n', '\r', '\t'}
    for font in font_usage:
        text = set()
        for t in font['text']:
            text |= frozenset(t)
        text.difference_update(exclude)
        if not text:
            continue
        # Attribute the characters to every rule that matches this usage.
        for rule in get_matching_rules(rules, font):
            self.font_stats[rule['src']] |= text
def set_style_property(tag, property_name, value, editor):
    '''
    Set a style property, i.e. a CSS property inside the style attribute of the tag.
    Any existing style attribute is updated or a new attribute is inserted.

    :param tag: parsed tag descriptor with start/end block+offset positions
    :param property_name: CSS property to set
    :param value: CSS value for the property
    :param editor: the text editor widget whose cursor performs the edit
    '''
    block, offset = find_attribute_in_tag(tag.start_block, tag.start_offset + 1, 'style')
    c = editor.textCursor()

    def css(d):
        # Serialize on a single line so the attribute stays one line.
        return d.cssText.replace('\n', ' ')

    if block is None or offset is None:
        # No existing style attribute: insert a fresh one at the end of the tag.
        d = parseStyle('')
        d.setProperty(property_name, value)
        c.setPosition(tag.end_block.position() + tag.end_offset)
        c.insertText(' style="%s"' % css(d))
    else:
        # Select the existing attribute value (including quotes) and rewrite it.
        c.setPosition(block.position() + offset - 1)
        end_block, end_offset = find_end_of_attribute(block, offset + 1)
        if end_block is None:
            return error_dialog(editor, _('Invalid markup'), _(
                'The current block tag has an existing unclosed style attribute. Run the Fix HTML'
                ' tool first.'), show=True)
        c.setPosition(end_block.position() + end_offset, c.KeepAnchor)
        # [1:-1] strips the surrounding quote characters from the selection.
        d = parseStyle(editor.selected_text_from_cursor(c)[1:-1])
        d.setProperty(property_name, value)
        c.insertText('"%s"' % css(d))
def test_children(self):
    "CSSStyleDeclaration.children()"
    style = u'/*1*/color: red; color: green; @x;'
    # (expected type, deprecated back-reference attribute to also check)
    types = [
        (cssutils.css.CSSComment, None),
        (cssutils.css.Property, 'parentStyle'), #DEPRECATED
        (cssutils.css.Property, 'parentStyle'), #DEPRECATED
        (cssutils.css.CSSUnknownRule, None)
    ]
    def t(s):
        # Children must come back in source order with correct parent wiring.
        for i, x in enumerate(s.children()):
            self.assertEqual(types[i][0], type(x))
            self.assertEqual(s, x.parent)
            if types[i][1]: #DEPRECATED
                self.assertEqual(s, getattr(x, types[i][1]))
    # Same declaration reached through three different parse entry points.
    t(cssutils.parseStyle(style))
    t(cssutils.parseString(u'a {' + style + '}').cssRules[0].style)
    t(
        cssutils.parseString(u'@media all {a {' + style + '}}').cssRules[0].cssRules[0].style)
    # Properties added after parsing are wired to the declaration too.
    s = cssutils.parseStyle(style)
    s['x'] = '0'
    self.assertEqual(s, s.getProperty('x').parent)
    s.setProperty('y', '1')
    self.assertEqual(s, s.getProperty('y').parent)
def test_border_condensation(self):
    """condense_rule() must merge per-edge/per-property border declarations
    into the shortest equivalent shorthand."""
    vals = "red solid 5px"
    # All four edges with identical color/style/width -> single 'border'.
    css = "; ".join(
        "border-%s-%s: %s" % (edge, p, v)
        for edge in EDGES
        for p, v in zip(BORDER_PROPS, vals.split())
    )
    style = parseStyle(css)
    condense_rule(style)
    for e, p in product(EDGES, BORDER_PROPS):
        self.assertFalse(style.getProperty("border-%s-%s" % (e, p)))
        self.assertFalse(style.getProperty("border-%s" % e))
        self.assertFalse(style.getProperty("border-%s" % p))
    self.assertEqual(style.getProperty("border").value, vals)
    # A single edge condenses to the per-edge shorthand only.
    css = "; ".join(
        "border-%s-%s: %s" % (edge, p, v)
        for edge in ("top",)
        for p, v in zip(BORDER_PROPS, vals.split())
    )
    style = parseStyle(css)
    condense_rule(style)
    self.assertEqual(style.cssText, "border-top: %s" % vals)
    # Mixed edge values must stay as four per-edge shorthands.
    css += ";" + "; ".join(
        "border-%s-%s: %s" % (edge, p, v)
        for edge in ("right", "left", "bottom")
        for p, v in zip(BORDER_PROPS, vals.replace("red", "green").split())
    )
    style = parseStyle(css)
    condense_rule(style)
    self.assertEqual(len(style.getProperties()), 4)
    self.assertEqual(style.getProperty("border-top").value, vals)
    self.assertEqual(style.getProperty("border-left").value, vals.replace("red", "green"))
def test_border_condensation(self):
    """condense_rule() must merge per-edge/per-property border declarations
    into the shortest equivalent shorthand."""
    vals = 'red solid 5px'
    # All four edges with identical color/style/width -> single 'border'.
    css = '; '.join('border-%s-%s: %s' % (edge, p, v)
                    for edge in EDGES
                    for p, v in zip(BORDER_PROPS, vals.split()))
    style = parseStyle(css)
    condense_rule(style)
    for e, p in product(EDGES, BORDER_PROPS):
        self.assertFalse(style.getProperty('border-%s-%s' % (e, p)))
        self.assertFalse(style.getProperty('border-%s' % e))
        self.assertFalse(style.getProperty('border-%s' % p))
    self.assertEqual(style.getProperty('border').value, vals)
    # A single edge condenses to the per-edge shorthand only.
    css = '; '.join('border-%s-%s: %s' % (edge, p, v)
                    for edge in ('top', )
                    for p, v in zip(BORDER_PROPS, vals.split()))
    style = parseStyle(css)
    condense_rule(style)
    self.assertEqual(style.cssText, 'border-top: %s' % vals)
    # Mixed edge values must stay as four per-edge shorthands.
    css += ';' + '; '.join(
        'border-%s-%s: %s' % (edge, p, v)
        for edge in ('right', 'left', 'bottom')
        for p, v in zip(BORDER_PROPS, vals.replace('red', 'green').split()))
    style = parseStyle(css)
    condense_rule(style)
    self.assertEqual(len(style.getProperties()), 4)
    self.assertEqual(style.getProperty('border-top').value, vals)
    self.assertEqual(
        style.getProperty('border-left').value, vals.replace('red', 'green'))
def test_children(self):
    "CSSStyleDeclaration.children()"
    style = u'/*1*/color: red; color: green; @x;'
    # (expected type, deprecated back-reference attribute to also check)
    types = [
        (cssutils.css.CSSComment, None),
        (cssutils.css.Property, 'parentStyle'), #DEPRECATED
        (cssutils.css.Property, 'parentStyle'), #DEPRECATED
        (cssutils.css.CSSUnknownRule, None)
    ]
    def t(s):
        # Children must come back in source order with correct parent wiring.
        for i, x in enumerate(s.children()):
            self.assertEqual(types[i][0], type(x))
            self.assertEqual(s, x.parent)
            if types[i][1]: #DEPRECATED
                self.assertEqual(s, getattr(x, types[i][1]))
    # Same declaration reached through three different parse entry points.
    t(cssutils.parseStyle(style))
    t(cssutils.parseString(u'a {'+style+'}').cssRules[0].style)
    t(cssutils.parseString(u'@media all {a {'+style+'}}').cssRules[0].cssRules[0].style)
    # Properties added after parsing are wired to the declaration too.
    s = cssutils.parseStyle(style)
    s['x'] = '0'
    self.assertEqual(s, s.getProperty('x').parent)
    s.setProperty('y', '1')
    self.assertEqual(s, s.getProperty('y').parent)
def abstract_diagram(self, tile, tileset=None):
    """Build an abstract SVG diagram for *tile* from the packaged template.

    Returns a tuple ``(svg_element, 1)``. If *tileset* is given, end names
    and end colors are filled in as well.
    """
    # Load the per-base SVG template shipped with the package.
    tilediagfile = etree.parse(
        pkg_resources.resource_stream(
            __name__,
            os.path.join('seqdiagrambases',
                         '{}-abstract.svg'.format(self._abase))))
    tilediag = tilediagfile.getroot().find("./*[@class='tile']")

    # Resolve the tile color: explicit hex wins, otherwise look up by name.
    c = tile.get('color', None)
    if c is None:
        fill = None
    elif c[0] == "#":
        fill = c
    else:
        fill = xcolors.get(c, None)

    tilediag.find("./*[@class='tilename']").text = tile.name
    if fill:
        # Rewrite only the fill of the existing inline style.
        s = cssutils.parseStyle(
            tilediag.find("./*[@class='tilerect']").attrib['style'])
        s['fill'] = fill
        tilediag.find("./*[@class='tilerect']").attrib['style'] = s.cssText
    if self._orient:
        tilediag.find("./*[@class='type_sw']").text = self._orient[0]
        tilediag.find("./*[@class='type_ne']").text = self._orient[1]
    if not tileset:
        return (tilediag, 1)
    for endn, loc in zip(tile.ends, self._a_endlocs):
        # A trailing '/' denotes the complement of a named end.
        if endn in tileset.ends.keys():
            end = tileset.ends[endn]
        elif endn[:-1] in tileset['ends'].keys() and endn[-1] == '/':
            end = tileset['ends'][endn[:-1]]
        else:
            end = None
        tilediag.find("./*[@class='endname_{}']".format(loc)).text = endn
        if end and ('color' in end.keys()):
            ec = tilediag.find("./*[@class='endcolor_{}']".format(loc))
            s = cssutils.parseStyle(ec.attrib['style'])
            c = end.get('color', None)
            if c is None:
                fill = None
            elif c[0] == "#":
                fill = c
            else:
                fill = xcolors.get(c, None)
            s['fill'] = fill
            # NOTE(review): uses getCssText('') here but plain cssText for the
            # tile rect above — presumably to force single-line output; confirm
            # the inconsistency is intentional.
            ec.attrib['style'] = s.getCssText('')
    return (tilediag, 1)
def test_parseStyle(self):
    "cssutils.parseStyle()"
    s = cssutils.parseStyle('x:0; y:red')
    self.assertEqual(type(s), cssutils.css.CSSStyleDeclaration)

    # At-rules are not valid inside a style declaration.
    self.assertRaises(xml.dom.SyntaxErr, cssutils.parseStyle, '@import "x";')

    # Byte input is decoded with the given encoding before parsing.
    tests = [(u'content: "ä"', 'iso-8859-1'),
             (u'content: "€"', 'utf-8')]
    for v, e in tests:
        s = cssutils.parseStyle(v.encode(e), encoding=e)
        self.assertEqual(s.cssText, v)

    # A wrong declared encoding must surface as a decode error.
    self.assertRaises(UnicodeDecodeError, cssutils.parseStyle,
                      u'content: "ä"'.encode('utf-8'), 'ascii')
def collect_font_stats(self):
    """Gather @font-face rules and per-font character usage from the
    rendered page (via injected ``window.font_stats`` JS helpers) and
    accumulate the used characters into ``self.font_stats`` keyed by
    container name of the font file."""
    self.page.evaljs('window.font_stats.get_font_face_rules()')
    font_face_rules = self.page.bridge_value
    if not isinstance(font_face_rules, list):
        raise Exception(
            'Unknown error occurred while reading font-face rules')

    # Weed out invalid font-face rules
    rules = []
    for rule in font_face_rules:
        ff = rule.get('font-family', None)
        if not ff:
            continue
        # Parse the raw declaration so cssutils normalizes the family list.
        style = parseStyle('font-family:%s' % ff, validate=False)
        ff = [
            x.value
            for x in style.getProperty('font-family').propertyValue
        ]
        if not ff or ff[0] == 'inherit':
            continue
        rule['font-family'] = frozenset(icu_lower(f) for f in ff)
        src = rule.get('src', None)
        if not src:
            continue
        # background-image is (ab)used here purely to get cssutils to
        # parse the url() token out of the src value.
        style = parseStyle('background-image:%s' % src, validate=False)
        src = style.getProperty('background-image').propertyValue[0].uri
        name = self.href_to_name(src, '@font-face rule')
        rule['src'] = name
        normalize_font_properties(rule)
        rule['width'] = widths[rule['font-stretch']]
        rule['weight'] = int(rule['font-weight'])
        rules.append(rule)

    if not rules:
        return

    # Ensure every referenced font file has a stats bucket, even if unused.
    for rule in rules:
        if rule['src'] not in self.font_stats:
            self.font_stats[rule['src']] = set()

    self.page.evaljs('window.font_stats.get_font_usage()')
    font_usage = self.page.bridge_value
    if not isinstance(font_usage, list):
        raise Exception('Unknown error occurred while reading font usage')
    # Whitespace control characters never need glyphs.
    exclude = {'\n', '\r', '\t'}
    for font in font_usage:
        text = set()
        for t in font['text']:
            text |= frozenset(t)
        text.difference_update(exclude)
        if not text:
            continue
        # Attribute the characters to every rule that matches this usage.
        for rule in get_matching_rules(rules, font):
            self.font_stats[rule['src']] |= text
def __find_types(self, table):
    """Scan a table's cells for linked type entries and record each
    (type name, background color) pair via ``update_data``."""
    for cell in table.find_all('td'):
        anchor = cell.find('a')
        if not anchor:
            continue
        # The cell's inline style carries the color associated with this type.
        type_text = anchor.text
        type_color = cssutils.parseStyle(cell['style'])['background']
        self.update_data([[type_text, type_color]])
def test_replaceUrls(self):
    "cssutils.replaceUrls()"
    # keepAllProperties so the duplicate background-\image hack survives.
    cssutils.ser.prefs.keepAllProperties = True

    css='''
    @import "im1";
    @import url(im2);
    a {
        background-image: url(c) !important;
        background-\image: url(b);
        background: url(a) no-repeat !important;
        }'''
    s = cssutils.parseString(css)
    # The replacer sees every URL in the sheet, including @import targets.
    cssutils.replaceUrls(s, lambda old: "NEW" + old)
    self.assertEqual(u'@import "NEWim1";', s.cssRules[0].cssText)
    self.assertEqual(u'NEWim2', s.cssRules[1].href)
    self.assertEqual(u'''background-image: url(NEWc) !important;
background-\\image: url(NEWb);
background: url(NEWa) no-repeat !important''',
                     s.cssRules[2].style.cssText)

    cssutils.ser.prefs.keepAllProperties = False

    # CSSStyleDeclaration: replaceUrls also works on a bare declaration.
    style = cssutils.parseStyle(u'''color: red;
        background-image: url(1.png), url('2.png')''')
    cssutils.replaceUrls(style, lambda url: 'prefix/'+url)
    self.assertEqual(style.cssText, u'''color: red;
background-image: url(prefix/1.png), url(prefix/2.png)''')
def exceptresp():
    """Fallback scrape of the public Facebook video page for a
    description and thumbnail.

    Tries the OpenGraph meta tags first, then the embed page's
    background-image. On success populates ``self._data`` /
    ``self._results`` and returns True; returns False on any failure
    or when the embed page shows an error box.
    """
    try:
        description = ""
        imgurl = ""
        # first method: OpenGraph meta tags on the public video page
        resp = requests.get(
            'https://www.facebook.com/video.php?v=' + self._video_id)
        textsoup = BeautifulSoup(resp.content, "html5lib")
        description = textsoup.find(
            "meta", property="og:description").get("content", "")
        imgurl = textsoup.find(
            "meta", property="og:image").get("content", "")
        if not imgurl:
            # second method: background-image URL from the embed page
            resp = requests.get(
                'https://www.facebook.com/video/embed?video_id=' +
                self._video_id)
            textsoup = BeautifulSoup(resp.content, "html5lib")
            firstdiv = textsoup.body.find('div')
            firstimgstyle = textsoup.body.find('img').get('style')
            style = cssutils.parseStyle(firstimgstyle)
            imgurl = style['background-image']
            imgurl = imgurl.replace('url(', '').replace(')', '')
            # uiBoxRed is Facebook's error box: video unavailable.
            if 'uiBoxRed' in firstdiv.get("class", "uiBoxRed"):
                return False
        self._data["status"] = True
        self._results = {
            'title': "",
            'description': description,
            'duration': '',
            'status': True,
            'image': imgurl
        }
        return True
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any scraping error still reports failure.
        return False
def background_image_finder(pipeline_index,
                            soup,
                            finder_image_urls=None,
                            *args, **kwargs):
    """
    Find image URLs in inline ``background-image`` styles.

    Example:
    <div style="width: 100%; height: 100%; background-image: url(http://distilleryimage10.ak.instagram.com/bde04558a43b11e28e5d22000a1f979a_7.jpg);" class="Image iLoaded iWithTransition Frame" src="http://distilleryimage10.ak.instagram.com/bde04558a43b11e28e5d22000a1f979a_7.jpg"></div>
    to
    http://distilleryimage10.ak.instagram.com/bde04558a43b11e28e5d22000a1f979a_7.jpg

    :param pipeline_index: position in the finder pipeline (unused here)
    :param soup: BeautifulSoup document to scan
    :param finder_image_urls: URLs already found by earlier finders
    :returns: dict with 'finder_image_urls' = previous + newly found URLs
    """
    # Fixed: was a mutable default argument (finder_image_urls=[]).
    if finder_image_urls is None:
        finder_image_urls = []

    # Track known URLs in a set for O(1) de-duplication instead of
    # scanning both lists for every candidate.
    seen = set(finder_image_urls)
    now_finder_image_urls = []

    for tag in soup.find_all(style=True):
        style_string = tag['style']
        if 'background-image' not in style_string.lower():
            continue
        style = cssutils.parseStyle(style_string)
        background_image = style.getProperty('background-image')
        if not background_image:
            continue
        for property_value in background_image.propertyValue:
            background_image_url = str(property_value.value)
            if background_image_url and background_image_url not in seen:
                seen.add(background_image_url)
                now_finder_image_urls.append(background_image_url)

    output = {}
    output['finder_image_urls'] = finder_image_urls + now_finder_image_urls
    return output
def _build_format(self, element):
    """Translate an element's class/style attributes into a Format object.

    Nested styles (e.g. 'margin-left') are collected into per-group dicts;
    flat styles are mapped through FORMAT_ALIASES and kept only when listed
    in Format.optional.
    """
    args = {}
    for attribute, value in element.attrs.items():
        if attribute == "class" and value:
            # This keeps the classes which aren't empty strings. We could
            # handle multiple classes here somehow.
            vals = [v for v in value if v]
            if vals:
                args["style"] = vals
        elif attribute == "style":
            styles = cssutils.parseStyle(value)
            # Pre-create one dict per nested style group ('margin-', ...).
            for name in Format.NESTED_STYLES:
                args[name] = defaultdict(str)
            for style in styles:
                for nested_name in Format.NESTED_STYLES:
                    nested_name_with_dash = nested_name + "-"
                    if style.name.startswith(nested_name_with_dash):
                        # e.g. 'margin-left: 2px' -> args['margin']['left']
                        args[nested_name][style.name.replace(
                            nested_name_with_dash, "")] = style.value
                        break
                else:
                    # Not a nested property: normalize the name.
                    name = style.name.lower().replace("-", "_")
                    if name in Format.NESTED_STYLES:
                        # Not supported. Use explicit 'margin-right',
                        # 'margin-left' etc rather than just 'margin'.
                        continue
                    elif name in Format.FORMAT_ALIASES:
                        name = Format.FORMAT_ALIASES[name]
                    if name in Format.optional:
                        args[name] = style.value.strip()
    return Format(**args)
def process_images_and_emojis(self, soup):
    """Rewrite <img> tags in the parsed Notion page: download/cache image
    files locally and fix up emoji sprite-sheet background URLs."""
    # process images & emojis
    cache_images = True
    for img in soup.findAll("img"):
        if img.has_attr("src"):
            if cache_images and "data:image" not in img["src"]:
                img_src = img["src"]
                # if the path starts with /, it's one of notion's predefined images
                if img["src"].startswith("/"):
                    img_src = f'https://www.notion.so{img["src"]}'
                    # notion's own default images urls are in a weird format, need to sanitize them
                    # img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
                # if (not '.amazonaws' in img_src):
                #     img_src = urllib.parse.unquote(img_src)
                cached_image = self.cache_file(img_src)
                img["src"] = cached_image
            elif img["src"].startswith("/"):
                # Not caching: just make the relative URL absolute.
                img["src"] = f'https://www.notion.so{img["src"]}'
        # on emoji images, cache their sprite sheet and re-set their background url
        if img.has_attr("class") and "notion-emoji" in img["class"]:
            style = cssutils.parseStyle(img["style"])
            spritesheet = style["background"]
            # Extract the url(...) payload from the background shorthand.
            spritesheet_url = spritesheet[
                spritesheet.find("(") + 1 : spritesheet.find(")")
            ]
            cached_spritesheet_url = self.cache_file(
                f"https://www.notion.so{spritesheet_url}"
            )
            style["background"] = spritesheet.replace(
                spritesheet_url, str(cached_spritesheet_url)
            )
            img["style"] = style.cssText
def clean_html(self, html):
    """Sanitize a unicode HTML fragment: strip class attributes and keep
    only style attributes that parse as valid CSS.  (Python 2 code.)"""
    if not isinstance(html, unicode):
        raise ValueError('We only support cleaning unicode HTML fragments')

    # We wrap the content up in an extra div tag (otherwise lxml does weird things to it - like adding in <p> tags and stuff)
    divnode = fromstring(u'<div>' + html + u'</div>')
    self(divnode)

    # Strip all class attributes
    etree.strip_attributes(divnode, 'class')

    for style in divnode.xpath("//@style"):
        parent = style.getparent()
        try:
            cssStyle = cssutils.parseStyle(style)
        except Exception, e:
            # Unparseable style: drop the attribute entirely.
            logging.info("Style %s failed to parse with error %s." % (style, e))
            parent.attrib.pop('style', None)
            continue
        # Set the line separator so that the style gets serialized on one line
        cssutils.ser.prefs.lineSeparator = ''
        # Only allow valid style properties
        cssutils.ser.prefs.validOnly = True
        new_style = cssStyle.cssText
        if not new_style.strip():
            # Nothing valid survived serialization: remove the attribute.
            parent.attrib.pop('style', None)
        else:
            parent.attrib['style'] = new_style
def to_ast(css):
    """
    Our "AST" is a set of (property, value, priority) tuples, one per CSS
    statement. Value strings are normalized by cssutils.
    """
    declaration = cssutils.parseStyle(css)
    return frozenset(map(simplify, declaration.children()))
def handle_html_content(self, content):
    """Rewrite every <p> in *content*, hiding dork links (invisible-styled
    anchors) on every fifth word.  Generator-based coroutine: refills
    ``self.dorks`` via ``yield from self.get_dorks()`` when exhausted.
    Returns the modified page as UTF-8 bytes."""
    soup = BeautifulSoup(content, 'html.parser')
    for p_elem in soup.find_all('p'):
        # Preserve the paragraph's inline style on the replacement tag.
        css = None
        if 'style' in p_elem.attrs:
            css = cssutils.parseStyle(p_elem.attrs['style'])
        text_list = p_elem.text.split()
        p_new = soup.new_tag('p', style=css.cssText if css else None)
        for idx, word in enumerate(text_list):
            # Refill the dork pool lazily.
            if len(self.dorks) <= 0:
                self.dorks = yield from self.get_dorks()
            word += ' '
            if idx % 5 == 0:
                # Every fifth word becomes a link styled to look like text.
                a_tag = soup.new_tag(
                    'a',
                    href=self.dorks.pop(),
                    style='color:{color};text-decoration:none;cursor:text;'.format(
                        color=css.color if css and 'color' in css.keys() else '#000000'
                    )
                )
                a_tag.string = word
                p_new.append(a_tag)
            else:
                p_new.append(soup.new_string(word))
        p_elem.replace_with(p_new)
    content = soup.encode('utf-8')
    return content
def test_parseStyle(self):
    "cssutils.parseStyle()"
    s = cssutils.parseStyle('x:0; y:red')
    self.assertEqual(type(s), cssutils.css.CSSStyleDeclaration)

    # At-rules are not valid inside a style declaration.
    self.assertRaises(xml.dom.SyntaxErr, cssutils.parseStyle, '@import "x";')

    # Byte input is decoded with the given encoding before parsing.
    tests = [
        (u'content: "ä"', 'iso-8859-1'),
        (u'content: "€"', 'utf-8')
    ]
    for v, e in tests:
        s = cssutils.parseStyle(v.encode(e), encoding=e)
        self.assertEqual(s.cssText, v)

    # A wrong declared encoding must surface as a decode error.
    self.assertRaises(UnicodeDecodeError, cssutils.parseStyle,
                      u'content: "ä"'.encode('utf-8'), 'ascii')
def getPropertyLink(page_source, type_property, type_buy):
    """Scrape property listing URLs and thumbnail images from a
    magicbricks results page and append them to the global ``HousesLink``
    list.  (Python 2 code — note the print statements.)

    NOTE(review): ``type_property`` is accepted but never used — confirm
    whether it should filter the results.
    """
    soupProperty = BeautifulSoup(page_source, 'html5lib')
    divPropertyInfoContainer = soupProperty.find_all(
        'div', attrs={"class": "srpBlockListRow"})
    for info in divPropertyInfoContainer:
        heading = info.find_all('p', attrs={"class": "proHeading"})
        for data in heading:
            links = data.find_all('a', href=True)
            for link in links:
                print link.get_text()
                print link['href']
                propertyURL = 'http://www.magicbricks.com' + link['href']
                print propertyURL
                try:
                    # The thumbnail URL lives in an inline background-image.
                    propertyImgUrlStyle = info.find('div', attrs={"class": "thumbnailBG"})['style']
                    style = cssutils.parseStyle(propertyImgUrlStyle)
                    propertyImgUrl = style['background-image']
                    propertyImgUrl = propertyImgUrl.replace('url(', '').replace(')', '')
                    print propertyImgUrl
                    imgURL = propertyImgUrl
                    HousesLink.append({
                        "PropertyUrl": propertyURL,
                        "PropertyImg": imgURL,
                        "BuyType": type_buy
                    })
                except:
                    # No thumbnail found: record the listing with an empty image.
                    HousesLink.append({
                        "PropertyUrl": propertyURL,
                        "PropertyImg": '',
                        "BuyType": type_buy
                    })
def _extract_colors(html: str) -> List[swatch.RgbColor]:
    """Extract the theme-panel swatch colors from an Adobe Color page.

    :param html: page HTML containing the Colorwheel theme panel
    :returns: list of RgbColor objects, one per swatch
    :raises SwatchException: if the panel or swatch styles cannot be parsed
    """
    try:
        result = []
        soup = BeautifulSoup(html, "html.parser")
        # Class names are hashed (CSS modules), so match on the stable prefix.
        theme_panel_tag = soup.find(
            class_=re.compile("^Colorwheel__themepanel___*"))
        swatch_tags = theme_panel_tag.find_all(
            class_=re.compile("^Swatch__swatch___*"))
        for swatch_tag in swatch_tags:
            # Extracting CSS background color (RGB mode)
            css_rgb_color = cssutils.parseStyle(swatch_tag["style"]).background
            # Extracting RGB color ingredients
            rgb = [int(i) for i in re.findall(r"\d{1,3}", css_rgb_color)]
            # Creating RGB color
            result.append(
                swatch.RgbColor(name='', red=rgb[0], green=rgb[1],
                                blue=rgb[2]))
        return result
    except Exception as exc:
        # Narrowed from a bare ``except:`` (which also converted SystemExit /
        # KeyboardInterrupt) and chained so the original error stays visible.
        raise SwatchException from exc
def get_player_data(self, url, initial_data = None):
    """Scrape a player's profile page with the Selenium driver.

    :param url: profile page URL
    :param initial_data: optional dict of pre-filled fields; NOTE(review):
        this dict is aliased and mutated in place, not copied — confirm
        callers expect that.
    :returns: dict of player fields, or ``{'error': message}`` on failure
    """
    player_data = {}
    if initial_data is not None:
        player_data = initial_data
    try:
        self.driver.get(url)
        page = BeautifulSoup(self.driver.page_source, 'html.parser')

        player_data['first_name'] = page.find(class_ = 'firstName').get_text()
        player_data['last_name'] = page.find(class_ = 'lastName').get_text()
        player_data['position'] = page.find(class_ = 'playerPosition').get_text()

        # Profile table rows are positional: 0=name, 1=nationality,
        # 2=birth date, 4=height.
        table = page.find(class_ = 'profiletable').find_all(class_ = 'row')
        player_data['name'] = table[0].find_all('div')[1].get_text()
        player_data['nationality'] = table[1].find('nationality-flags').find('span').get_text().strip()
        player_data['birth_date'] = table[2].find_all('div')[1].get_text().replace('.', '-')
        player_data['height'] = table[4].find_all('div')[1].get_text().replace(' cm', '')

        # The portrait URL is embedded in an inline background-image style.
        image_style = page.find(class_ = 'playerImage')['style']
        style = cssutils.parseStyle(image_style)['background-image']
        player_data['image_url'] = style.replace('url(', '').replace(')', '')

        if self.verbose:
            print('Scraped player:', player_data['name'])
        return player_data
    except Exception as ex:
        # Any scraping failure is reported back as data, not raised.
        error = str(ex)
        print(error)
        return { 'error': error }
def parse(self, response):
    """Scrapy callback: collect styled tags whose *css_property* satisfies
    *condition* against *css_value*, then dump the matches to *output*
    as JSON."""
    matches = []

    def record(value):
        # Every hit is stored as {condition: "property:value"}.
        matches.append({condition: "{}:{}".format(css_property, value)})

    for style_text in response.css('{tag}::attr(style)'.format(tag=tag)).getall():
        parsed_css = dict(cssutils.parseStyle(style_text))
        if css_property not in parsed_css:
            continue
        property_value = parsed_css[css_property]
        if condition == "if_equal":
            # Compare numerically when both sides look like numbers.
            expected = css_value
            if str(css_value).isdigit() and str(property_value).isdigit():
                property_value = float(property_value)
                expected = float(css_value)
            if property_value == expected:
                record(property_value)
        elif condition == "if_lees_than":  # (sic) condition key used by callers
            if float(property_value) < float(css_value):
                record(property_value)
        elif condition == "if_more_than":
            if float(property_value) > float(css_value):
                record(property_value)

    result = {"results": matches}
    with open("{output}".format(output=output), "w+") as output_file:
        output_file.write(json.dumps(result))
def test_edge_condensation(self):
    """condense_rule() must merge margin-*/padding-* edge properties into
    the shortest equivalent shorthand (or leave them alone when a value
    is missing)."""
    # Keys are (left, top, right, bottom) inputs; values are the expected
    # condensed shorthand, or None when condensation must not happen
    # (e.g. only three edges given).
    for s, v in {
        (1, 1, 3): None,
        (1, 2, 3, 4): '2pt 3pt 4pt 1pt',
        (1, 2, 3, 2): '2pt 3pt 2pt 1pt',
        (1, 2, 1, 3): '2pt 1pt 3pt',
        (1, 2, 1, 2): '2pt 1pt',
        (1, 1, 1, 1): '1pt',
        ('2%', '2%', '2%', '2%'): '2%',
        tuple('0 0 0 0'.split()): '0',
    }.items():
        for prefix in ('margin', 'padding'):
            # Numeric inputs get a 'pt' unit; strings are used verbatim.
            css = {
                '%s-%s' % (prefix, x): str(y) + 'pt' if isinstance(y, (int, float)) else y
                for x, y in zip(('left', 'top', 'right', 'bottom'), s)
            }
            css = '; '.join(('%s:%s' % (k, v) for k, v in css.items()))
            style = parseStyle(css)
            condense_rule(style)
            val = getattr(style.getProperty(prefix), 'value', None)
            self.assertEqual(v, val)
            if val is not None:
                # When condensed, the per-edge properties must be gone.
                for edge in EDGES:
                    self.assertFalse(
                        getattr(
                            style.getProperty('%s-%s' % (prefix, edge)),
                            'value', None))
def mars_featured_image():
    """Scrape the JPL 'featured image' URL for Mars using a splinter
    browser session and return the absolute image URL."""
    #Initializing the browser by calling init_browser
    browser = init_browser()

    # Visit mars.nasa.gov/news
    url_to_scrape = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_to_scrape)

    # Time delay of 2 secs to make sure the browser loads
    time.sleep(2)

    # Find the button "full image" and instantiate button click
    button = browser.find_by_id("full_image")
    button.click()

    # Scrape page into Soup
    html = browser.html
    # Create BeautifulSoup object; parsed with 'html.parser'
    soup = BeautifulSoup(html, 'html.parser')

    # Scrape the background image url from the article's inline style
    url = cssutils.parseStyle(
        soup.find('article')['style'])['background-image']
    # Remove extra stuff from the url (the surrounding "url(...)")
    url = url.replace('url(', '').replace(')', '')

    # Define the base_url
    base_url = 'https://www.jpl.nasa.gov'
    # Create the url for the background image
    image_url = base_url + url

    # Close the browser after scraping
    browser.quit()

    # Return scraped item
    return image_url
def scrape_item_details(self, divs):
    """Parse a section heading (divs[0]) and its exhibition cards
    (divs[1]) into a {'section': ..., 'exhibitions': [...]} dict."""
    section = divs[0].find("span").text.strip()
    exhibitions = []
    for card in divs[1].find_all(class_="card exhibition-card"):
        # The thumbnail path is buried in an inline background-image style.
        div_style = card.find(class_="w-100").find("div")["style"]
        background = cssutils.parseStyle(div_style)["background-image"]
        img = background.replace("url(", "").replace(")", "")
        img_url = f"https://www.nationalgallery.org.uk/{img}"
        exhibitions.append(
            {
                # Stable id derived from the image URL.
                "id": sha256(img_url.encode()).hexdigest(),
                "payment_type": card.find(class_="exhibition-payment-type").text.strip(),
                "title": card.find(class_="exhibition-title d-flex flex-column").text.strip(),
                "date": card.find(class_="exhibition-date").text.strip(),
                "description": card.find(class_="exhibition-description").text.strip(),
                "img": img_url,
            }
        )
    return {"section": section, "exhibitions": exhibitions}
def test_replaceUrls(self):
    "cssutils.replaceUrls()"
    # keepAllProperties so the duplicate background-\image hack survives.
    cssutils.ser.prefs.keepAllProperties = True

    css = '''
    @import "im1";
    @import url(im2);
    a {
        background-image: url(c) !important;
        background-\image: url(b);
        background: url(a) no-repeat !important;
        }'''
    s = cssutils.parseString(css)
    # The replacer sees every URL in the sheet, including @import targets.
    cssutils.replaceUrls(s, lambda old: "NEW" + old)
    self.assertEqual(u'@import "NEWim1";', s.cssRules[0].cssText)
    self.assertEqual(u'NEWim2', s.cssRules[1].href)
    self.assertEqual(
        u'''background-image: url(NEWc) !important;
background-\\image: url(NEWb);
background: url(NEWa) no-repeat !important''',
        s.cssRules[2].style.cssText)

    cssutils.ser.prefs.keepAllProperties = False

    # CSSStyleDeclaration: replaceUrls also works on a bare declaration.
    style = cssutils.parseStyle(u'''color: red;
        background-image: url(1.png), url('2.png')''')
    cssutils.replaceUrls(style, lambda url: 'prefix/' + url)
    self.assertEqual(
        style.cssText, u'''color: red;
background-image: url(prefix/1.png), url(prefix/2.png)''')
def getdata_hackerearth():
    """Scrape upcoming HackerEarth challenges.

    Returns six parallel lists: names, links, image URLs, tags, start
    times and end times for every challenge that passes ``filterdata``.
    Cards with unexpected markup are skipped with a console note.
    """
    url = "https://www.hackerearth.com/challenges/?filters=competitive%2Chackathon%2Chiring%2Cuniversity"
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'lxml')
    opportunitiesList = soup.find(id="challenge-container").find(
        "div", class_="upcoming challenge-list").find_all(
            "div", class_="challenge-card-modern")
    opportunity_names = []
    opportunity_link = []
    opportunity_image = []
    opportunity_tags = []
    opportunity_starttime = []
    opportunity_endtime = []
    for opportunity in opportunitiesList:
        try:
            name = opportunity.find(
                "span", class_="challenge-list-title challenge-card-wrapper").text
            link = opportunity.find("a").get("href")
            # getting image from the card's inline background-image style
            div_style = opportunity.find("div", class_="event-image")['style']
            style = cssutils.parseStyle(div_style)
            image = style['background-image'].replace('url(', '').replace(')', '')
            tags = opportunity.find("div", class_="challenge-type").text.strip()
            # To scrape start and end date from the challenge's own page
            source = requests.get(link).text
            soup = BeautifulSoup(source, 'lxml')
            try:
                starttime = soup.find(
                    "div", class_="start-time-block").find_all("div")[1].text
                endtime = soup.find(
                    "div", class_="end-time-block").find_all("div")[1].text
            except Exception:
                # Narrowed from a bare except: alternate page layout puts
                # the timings in span.timing-text elements.
                starttime = soup.find("div", class_="event-timings").find_all(
                    "span", class_="timing-text")[0].text
                endtime = soup.find("div", class_="event-timings").find_all(
                    "span", class_="timing-text")[1].text
            start = cleantime(starttime)
            end = cleantime(endtime)
            # adding data to the lists (only if the overview passes the filter)
            if filterdata(soup.find(id="overview")):
                opportunity_names.append(name)
                opportunity_link.append(link)
                opportunity_image.append(image)
                opportunity_tags.append(tags)
                opportunity_starttime.append(start)
                opportunity_endtime.append(end)
        except Exception:
            # Narrowed from a bare except: skip cards whose markup differs,
            # without swallowing KeyboardInterrupt/SystemExit.
            print("inconsistent panel")
    return [
        opportunity_names, opportunity_link, opportunity_image,
        opportunity_tags, opportunity_starttime, opportunity_endtime
    ]
def check_page(self, page_path):
    """
    checks a page for links
    :param page_path: path to page
    :return: list of links on a page
    """
    if self.verbose:
        print('Pulling links from: %s' % page_path)
    s_content = LinkChecker.parse_page(page_path)
    # check img tags (the 'no-check' class opts an image out)
    img_links = s_content.find_all('img', class_=lambda x: x != 'no-check', src=True)
    img_links = [img['src'] for img in img_links]
    # check css backgrounds: any element whose inline style contains url(...)
    e_style = s_content.find_all(
        lambda tag: len(tag.attrs) and 'style' in tag.attrs and 'url' in tag['style'])
    for element in e_style:
        style = cssutils.parseStyle(element['style'])
        if (style['background-image'] or style['background']) \
                and (re.search(r'(png|jpg|jpe?g|gif)', style.cssText, flags=re.IGNORECASE)):
            bgi = style['background-image'] if style['background-image'] \
                else style['background']
            # Strip the url( ... ) wrapper around the image path.
            bgi = re.sub(r'(.*?\(\'*\"*)(.*?\.)(png|jpg|jpe?g|gif)(.*?\'*\"*\))',
                         r'\2\3', bgi, flags=re.I)
            # Drop anything trailing after the image extension.
            bgi = re.sub(r'(.*?\.png|gif|jpe?g)(.*)', r'\1', bgi, flags=re.I)
            img_links.append(bgi)
    return img_links
def append_list(list_name, elem_url):
    """Extract the background-image URL from *elem_url* (may be None) and
    append it to the global ``pic_url_<list_name>`` bucket."""
    image_url = ''
    if elem_url is not None:
        # from image element extract url using cssutils
        style = cssutils.parseStyle(elem_url.get('style'))
        image_url = style['background-image']
        # cleaning the url IMPORTANT: This is getting url with 128 by 128 change it to 400 by 400
        image_url = image_url.replace('url(', '').replace(')', '')
    # append in corresponding list
    if list_name == 0:
        pic_url_0.append(image_url)
    elif list_name == 1:
        pic_url_1.append(image_url)
    elif list_name == 2:
        pic_url_2.append(image_url)
    elif list_name == 3:
        pic_url_3.append(image_url)
    elif list_name == 4:
        pic_url_4.append(image_url)
    elif list_name == 5:
        pic_url_5.append(image_url)
    elif list_name == 6:
        pic_url_6.append(image_url)
    elif list_name == 7:
        pic_url_7.append(image_url)
    elif list_name == 8:
        pic_url_8.append(image_url)
    elif list_name == 9:
        pic_url_9.append(image_url)
def check_page(self, page_path):
    """
    checks a page for links
    :param page_path: path to page
    :return: list of links on a page
    """
    if self.verbose:
        print('Pulling links from: %s' % page_path)
    s_content = LinkChecker.parse_page(page_path)
    # check img tags (the 'no-check' class opts an image out)
    img_links = s_content.find_all('img',
                                   class_=lambda x: x != 'no-check',
                                   src=True)
    img_links = [img['src'] for img in img_links]
    # check css backgrounds: any element whose inline style contains url(...)
    e_style = s_content.find_all(lambda tag: len(tag.attrs) and 'style' in
                                 tag.attrs and 'url' in tag['style'])
    for element in e_style:
        style = cssutils.parseStyle(element['style'])
        if (style['background-image'] or style['background']) \
                and (re.search(r'(png|jpg|jpe?g|gif)', style.cssText, flags=re.IGNORECASE)):
            bgi = style['background-image'] if style['background-image'] \
                else style['background']
            # Strip the url( ... ) wrapper around the image path.
            bgi = re.sub(
                r'(.*?\(\'*\"*)(.*?\.)(png|jpg|jpe?g|gif)(.*?\'*\"*\))',
                r'\2\3', bgi, flags=re.I)
            # Drop anything trailing after the image extension.
            bgi = re.sub(r'(.*?\.png|gif|jpe?g)(.*)', r'\1', bgi, flags=re.I)
            img_links.append(bgi)
    return img_links
def test_list_style_normalization(self):
    """The 'list-style' shorthand must expand into the three longhands,
    with unspecified parts falling back to their defaults."""
    def ls_dict(expected):
        # Start from the defaults and overlay the expected overrides.
        ans = {
            'list-style-%s' % x: DEFAULTS['list-style-%s' % x]
            for x in ('type', 'image', 'position')
        }
        for k, v in expected.items():
            ans['list-style-%s' % k] = v
        return ans

    for raw, expected in {
        'url(http://www.example.com/images/list.png)': {
            'image': 'url(http://www.example.com/images/list.png)'
        },
        'inside square': {
            'position': 'inside',
            'type': 'square'
        },
        'upper-roman url(img) outside': {
            'position': 'outside',
            'type': 'upper-roman',
            'image': 'url(img)'
        },
    }.items():
        # Grab the parsed cssValue of the single 'list-style' property.
        cval = tuple(parseStyle('list-style: %s' % raw,
                                validate=False))[0].cssValue
        self.assertDictEqual(
            ls_dict(expected),
            normalizers['list-style']('list-style', cval))
def scrape_images():
    """Scrape the JPL featured Mars image URL with a splinter browser.

    :returns: dict with a single key 'image_url' holding the absolute URL
    """
    browser = init_browser()
    images_data = {}

    mars_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(mars_url)
    # Give the page time to render before reading its HTML.
    time.sleep(5)

    images_html = browser.html
    images_soup = bs(images_html, 'html.parser')

    # The featured image URL is embedded in the article's inline style.
    art_style = images_soup.find('article')['style']
    style = cssutils.parseStyle(art_style)
    img_url = style['background-image']
    # Strip the surrounding "url(...)" wrapper.
    img_url = img_url.replace('url(', '').replace(')', '')
    featured_image_url = 'https://www.jpl.nasa.gov' + img_url
    # (removed a leftover bare-expression statement that had no effect)
    images_data['image_url'] = featured_image_url

    browser.quit()
    return images_data
async def handle_content(self, content):
    """Inject hidden dork links into the page's leaf ``<p>`` elements.

    Every fifth word of each paragraph is wrapped in an ``<a>`` tag
    pointing at a dork URL, styled to blend in with the surrounding text
    (same color, no underline, text cursor).

    :param content: raw HTML of the page
    :return: modified HTML, UTF-8 encoded
    """
    soup = BeautifulSoup(content, 'html.parser')
    if self.no_dorks is not True:
        for p_elem in soup.find_all('p'):
            # Only rewrite leaf paragraphs; nested markup is left alone.
            if p_elem.findChildren():
                continue
            css = None
            if 'style' in p_elem.attrs:
                css = cssutils.parseStyle(p_elem.attrs['style'])
            text_list = p_elem.text.split()
            p_new = soup.new_tag('p', style=css.cssText if css else None)
            for idx, word in enumerate(text_list):
                # Lazily (re)fetch dorks whenever the pool runs dry.
                if not self.dorks:
                    self.dorks = await self.get_dorks()
                word += ' '
                if idx % 5 == 0:
                    # Mimic the paragraph's text color so the link is invisible.
                    a_tag = soup.new_tag(
                        'a',
                        href=self.dorks.pop(),
                        style='color:{color};text-decoration:none;cursor:text;'.format(
                            color=css.color if css and 'color' in css.keys() else '#000000'))
                    a_tag.string = word
                    p_new.append(a_tag)
                else:
                    p_new.append(soup.new_string(word))
            p_elem.replace_with(p_new)
    content = soup.encode('utf-8')
    return content
def fetch_telegram(url, page_name):
    """Fetch a public Telegram channel's web-widget page and store each
    linked post as a Link record.

    Posts without text, or without at least one anchor, are skipped, as
    are posts whose cleaned URL already exists in the database.

    :param url: URL of the t.me channel widget page
    :param page_name: human-readable channel name, stored as the title
    """
    print('Start fetch {}: {}'.format(page_name, url))
    s = requests.Session()
    headers = {
        "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:71.0) Gecko/20100101 Firefox/71.0'
    }
    # NOTE(review): verify=False disables TLS certificate checking — confirm
    # this is intentional for this source.
    resp = s.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(resp.text, 'html.parser')
    soup = soup.find_all("div", {"class": "tgme_widget_message_bubble"})
    for i in range(len(soup)):
        info_post = soup[i].find_all("div", {"class": "tgme_widget_message_info"})
        mess_text = soup[i].find_all("div", {"class": "tgme_widget_message_text"})
        # remove posts with only images
        if mess_text == []:
            continue
        content = BeautifulSoup(str(mess_text[0]), 'html.parser')
        info_post = BeautifulSoup(str(info_post[0]), 'html.parser')
        media = soup[i].find_all("i", {"class": "link_preview_image"})
        # Post announcements removed
        if content.find_all('a') == []:
            continue
        get_url = content.find_all('a')[0]
        url = get_url.get('href')
        origin = info_post.find_all('a')[0].get('href')
        content = mess_text[0].decode_contents()
        post_time = info_post.find_all('time')[0].get('datetime')
        if media != []:
            # The preview image URL is embedded as a CSS background-image.
            div_style = BeautifulSoup(str(media[0]), 'html.parser').find('i')['style']
            style = cssutils.parseStyle(div_style)
            url_image = style['background-image']
            # Strip the surrounding "url(" and ")".
            url_image = url_image[4:-1]
        else:
            url_image = ''
        info = {
            "category": LINK.CATEGORY_WEB,
            "content": content,
            # The datetime string carries a "+HH:MM" offset; drop it before parsing.
            "created_at": datetime.strptime(post_time[:-6], "%Y-%m-%dT%H:%M:%S"),
            "kind": LINK.KIND_LINK,
            "media": url_image,
            "origin": origin,
            "read": LINK.UNREAD,
            "status": LINK.STATUS_DONE,
            "title": page_name,
            "url": url
        }
        info['url'] = clean_up_url(info['url'])
        # Skip URLs already stored; otherwise insert a new record.
        if Link.query.filter(Link.url == info['url']).first():
            continue
        else:
            Link.insert_from(info)
def scrape_info():
    """Scrape the current JPL featured Mars image and return it with
    placeholder temperature data.

    :return: dict with keys 'mars_img', 'min_temp', 'max_temp'
    """
    browser = init_browser()
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    try:
        browser.visit(url)
        time.sleep(1)  # give the carousel time to render

        # Scrape page into Soup
        soup = bs(browser.html, "html.parser")
        # The featured image URL is embedded in the inline CSS
        # background-image of the <article> inside the carousel container.
        carousel = soup.find('div', class_='carousel_items')
        div_style = carousel.find('article')['style']
        style = cssutils.parseStyle(div_style)
        partial_url = style['background-image'].replace('url(', '').replace(')', '')
        featured_image_url = "https://jpl.nasa.gov" + partial_url
    finally:
        # Close the browser even if scraping fails (was leaked on error).
        browser.quit()

    # Temperatures are hard-coded placeholders; the weather scrape is disabled.
    return {
        "mars_img": featured_image_url,
        "min_temp": 66,
        "max_temp": 99,
    }
def get_image_src_from_style(self, style_content):
    """Extract the bare image URL from an inline CSS background-image.

    :param style_content: the element's style attribute text
    :return: the URL with the url(...) wrapper and any quotes removed
    """
    import cssutils
    declaration = cssutils.parseStyle(style_content)
    raw = declaration['background-image']
    # Drop surrounding quotes first, then the url(...) wrapper.
    raw = raw.replace("'", '').replace('"', '')
    raw = raw.replace('url(', '').replace(')', '')
    return raw
def styled(element, styles):
    """Apply *styles* (a mapping of CSS property -> value) to *element*.

    Non-<span> elements are first wrapped in a child <span> so the style
    only affects the intended content. Returns the styled element.
    """
    if element.tag != 'span':
        element = sub_element(element, 'span')
    declaration = parseStyle(element.attrib.get('style', ''))
    for prop, value in styles.items():
        declaration.setProperty(prop, value)
    element.attrib['style'] = declaration.getCssText(' ')
    return element
def craft_image_url(image):
    """Pull the image URL out of an element's inline CSS background-image
    and open it.

    :param image: tag whose "style" attribute holds the background-image
    :return: (url, urlopen response) tuple
    """
    declaration = cssutils.parseStyle(image["style"])
    url = declaration["background-image"]
    url = url.replace("url(", "").replace(")", "")
    # Drop the leading quote plus the trailing quote-and-suffix characters.
    url = url[1:-15]
    response = urllib.request.urlopen(url)
    return url, response
def test_border_normalization(self):
    """Border shorthands expand into the per-edge longhand properties.

    Uses dict.items() instead of the Python-2-only iteritems() so the
    test runs under both Python 2 and 3.
    """

    def border_edge_dict(expected, edge="right"):
        # Defaults for one edge's longhands, overridden by *expected*.
        ans = {
            "border-%s-%s" % (edge, x): DEFAULTS["border-%s-%s" % (edge, x)]
            for x in ("style", "width", "color")
        }
        for x, v in expected.items():
            ans["border-%s-%s" % (edge, x)] = v
        return ans

    def border_dict(expected):
        # The same overrides applied to all four edges.
        ans = {}
        for edge in EDGES:
            ans.update(border_edge_dict(expected, edge))
        return ans

    def border_val_dict(expected, val="color"):
        # One longhand (e.g. border-*-color) set to *expected* on every edge.
        ans = {"border-%s-%s" % (edge, val): DEFAULTS["border-%s-%s" % (edge, val)] for edge in EDGES}
        for edge in EDGES:
            ans["border-%s-%s" % (edge, val)] = expected
        return ans

    # border-<edge> shorthands
    for raw, expected in {
        "solid 1px red": {"color": "red", "width": "1px", "style": "solid"},
        "1px": {"width": "1px"},
        "#aaa": {"color": "#aaa"},
        "2em groove": {"width": "2em", "style": "groove"},
    }.items():
        for edge in EDGES:
            br = "border-%s" % edge
            val = tuple(parseStyle("%s: %s" % (br, raw), validate=False))[0].cssValue
            self.assertDictEqual(border_edge_dict(expected, edge), normalizers[br](br, val))

    # the border shorthand (applies to all edges at once)
    for raw, expected in {
        "solid 1px red": {"color": "red", "width": "1px", "style": "solid"},
        "1px": {"width": "1px"},
        "#aaa": {"color": "#aaa"},
        "thin groove": {"width": "thin", "style": "groove"},
    }.items():
        val = tuple(parseStyle("%s: %s" % ("border", raw), validate=False))[0].cssValue
        self.assertDictEqual(border_dict(expected), normalizers["border"]("border", val))

    # border-width / border-color / border-style applied uniformly
    for name, val in {"width": "10%", "color": "rgb(0, 1, 1)", "style": "double"}.items():
        cval = tuple(parseStyle("border-%s: %s" % (name, val), validate=False))[0].cssValue
        self.assertDictEqual(border_val_dict(val, name), normalizers["border-" + name]("border-" + name, cval))
def strip_dom(self, tree):
    """Strip an Inkscape SVG down to its distributable form.

    Removes hidden layers and editor metadata, renders the SVG via
    inkscape + ImageMagick to measure the trimmed content bounding box,
    then rewrites the viewBox/size and noisicaa origin attributes so the
    image is cropped to its content (with an 8px margin, at half scale).

    :param tree: an ElementTree of the SVG document (modified in place)
    """
    root = tree.getroot()
    # Drop any Inkscape layer group whose CSS display is 'none' (hidden layer).
    for grp in root.findall('svg:g', namespaces):
        if grp.get(_fqattr('inkscape', 'groupmode')) != 'layer':
            continue
        style = cssutils.parseStyle(grp.get('style', ''))
        if style.getProperty('display').value == 'none':
            root.remove(grp)
            continue
    # Remove Inkscape/Sodipodi editor state and document metadata.
    elem = root.find('sodipodi:namedview', namespaces)
    if elem is not None:  # pragma: no branch
        root.remove(elem)
    elem = root.find('svg:metadata', namespaces)
    if elem is not None:  # pragma: no branch
        root.remove(elem)
    with tempfile.TemporaryDirectory() as tmpdirname:
        svgpath = os.path.join(tmpdirname, 'svg')
        with open(svgpath, 'wb') as fp:
            tree.write(fp, encoding='utf-8', xml_declaration=True)
        # OMG.. there must be a better way...
        # OTOH, distributed archives should always contain the stripped
        # version, so this would only ever run on dev machines.
        pngpath = os.path.join(tmpdirname, 'png')
        subprocess.check_call(['inkscape', svgpath, '-e', pngpath],
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.DEVNULL)
        # 'convert -trim info:-' reports the content bounding box, e.g.
        # "... PNG WxH CWxCH+X+Y ...".
        info = subprocess.check_output(
            ['convert', pngpath, '-trim', 'info:-'])
        info = info.decode('ascii')
        m = re.match(
            r'.* PNG (\d+)x(\d+) (\d+)x(\d+)\+(\d+)\+(\d+) .*$', info)
        assert m is not None, info
        width, height, canvas_width, canvas_height, xoffset, yoffset = [
            int(g) for g in m.groups()]
        # Grow the trimmed box by an 8px margin on every side.
        xoffset -= 8
        yoffset -= 8
        width += 16
        height += 16
        root.set(
            'viewBox', '%d %d %d %d' % (xoffset, yoffset, width, height))
        # Record the original canvas center relative to the crop, at half scale.
        root.set(
            _fqattr('noisicaa', 'origin-x'),
            '%f' % ((canvas_width / 2 - xoffset) / 2))
        root.set(
            _fqattr('noisicaa', 'origin-y'),
            '%f' % ((canvas_height / 2 - yoffset) / 2))
        root.set('width', '%f' % (width / 2))
        root.set('height', '%f' % (height / 2))
def get_note_esp_vert(soup):
    """Read the 'Espaces verts' (green spaces) score from the page.

    The score is rendered as a progress bar whose CSS pixel width encodes
    the value; the width is converted back to a 0-6 scale. Returns None
    when the bar cannot be found or parsed.
    """
    # Markup looks like:
    # <span style="float: left; width: 160px;">Espaces verts :</span>
    # <div class="progress_bar"><div style="...;width: 124px; ..."></div></div>
    note = None
    try:
        label = soup.find("span", text=re.compile("Espaces verts"))
        bar_style = label.find_next("div").find_next("div")["style"]
        width_px = int(re.match(r'\d+', parseStyle(bar_style).width).group())
        note = 6 - (width_px / 31)
    except Exception as exc:
        # Best effort: the section may simply be absent from this page.
        print("no note_esp_vert", exc)
    print("note_esp_vert", note)
    return note
def make_text(text, width=-1, style=''):
    """
    Make and return a TextFragment built from a TextBox in an HTML document.
    """
    # Force a monospace font stack first, then layer any caller style on top.
    base = ('font-family: Nimbus Mono L, Liberation Mono, FreeMono, '
            'Monospace; ')
    declarations = cssutils.parseStyle(base + style)
    cascaded = dict(effective_declarations(declarations))
    computed = computed_from_cascaded(None, cascaded, None)
    # A throwaway 1x1 surface is enough to obtain a cairo context.
    surface = cairo.SVGSurface(None, 1, 1)
    return TextFragment(text, computed, cairo.Context(surface), width)
def dl_stickers(page):
    """Download every sticker image referenced by a styled <span> on *page*
    and hand each response to resize_sticker()."""
    # attrs={"style": not ""} matches spans that carry any style attribute.
    for span in page.find_all('span', attrs={"style": not ""}):
        declaration = cssutils.parseStyle(span['style'])
        url = declaration['background-image']
        url = url.replace('url(', '').replace(')', '')
        # Drop the leading quote plus the trailing quote-and-suffix characters.
        url = url[1:-15]
        response = urllib.request.urlopen(url)
        resize_sticker(response, url)
def test_border_normalization(self):
    """Border shorthands expand into the per-edge longhand properties.

    Uses dict.items() instead of the Python-2-only iteritems() so the
    test runs under both Python 2 and 3.
    """

    def border_edge_dict(expected, edge='right'):
        # Defaults for one edge's longhands, overridden by *expected*.
        ans = {'border-%s-%s' % (edge, x): DEFAULTS['border-%s-%s' % (edge, x)]
               for x in ('style', 'width', 'color')}
        for x, v in expected.items():
            ans['border-%s-%s' % (edge, x)] = v
        return ans

    def border_dict(expected):
        # The same overrides applied to all four edges.
        ans = {}
        for edge in EDGES:
            ans.update(border_edge_dict(expected, edge))
        return ans

    def border_val_dict(expected, val='color'):
        # One longhand (e.g. border-*-color) set to *expected* on every edge.
        ans = {'border-%s-%s' % (edge, val): DEFAULTS['border-%s-%s' % (edge, val)]
               for edge in EDGES}
        for edge in EDGES:
            ans['border-%s-%s' % (edge, val)] = expected
        return ans

    # border-<edge> shorthands
    for raw, expected in {
        'solid 1px red': {'color':'red', 'width':'1px', 'style':'solid'},
        '1px': {'width': '1px'},
        '#aaa': {'color': '#aaa'},
        '2em groove': {'width':'2em', 'style':'groove'},
    }.items():
        for edge in EDGES:
            br = 'border-%s' % edge
            val = tuple(parseStyle('%s: %s' % (br, raw), validate=False))[0].cssValue
            self.assertDictEqual(border_edge_dict(expected, edge),
                                 normalizers[br](br, val))

    # the border shorthand (applies to all edges at once)
    for raw, expected in {
        'solid 1px red': {'color':'red', 'width':'1px', 'style':'solid'},
        '1px': {'width': '1px'},
        '#aaa': {'color': '#aaa'},
        'thin groove': {'width':'thin', 'style':'groove'},
    }.items():
        val = tuple(parseStyle('%s: %s' % ('border', raw), validate=False))[0].cssValue
        self.assertDictEqual(border_dict(expected), normalizers['border']('border', val))

    # border-width / border-color / border-style applied uniformly
    for name, val in {
        'width': '10%', 'color': 'rgb(0, 1, 1)', 'style': 'double',
    }.items():
        cval = tuple(parseStyle('border-%s: %s' % (name, val), validate=False))[0].cssValue
        self.assertDictEqual(border_val_dict(val, name),
                             normalizers['border-'+name]('border-'+name, cval))
def toBootstrap(form):
    """Convert a table-based HTML layout into a Bootstrap grid.

    tbody/table become div.containers, tr becomes div.row, td becomes
    stacked columns; fixed sizing styles and legacy attributes are
    stripped and empty spacer tags are removed.

    :param form: HTML string of the form
    :return: the converted BeautifulSoup tree
    """
    soup = BeautifulSoup(form)

    def _rename_all(tag_name, extra_classes):
        # Repeatedly rename matching tags to <div> until none are left.
        while True:
            tag = soup.find(tag_name)
            if not tag:
                break
            tag.name = 'div'
            tag['class'] = tag.get('class', []) + extra_classes

    # .containers are stackable in Bootstrap (unlike container-fluid).
    _rename_all('tbody', ['containers'])
    _rename_all('table', ['containers'])
    _rename_all('tr', ['row'])
    # xs targets smaller screens, sm bigger; these can be changed easily
    # later to fit the layout needs.
    _rename_all('td', ['col-xs-12', 'col-sm-12'])

    # Strip every width, height and alignment inline css style so the
    # Bootstrap grid can control sizing.
    for div in soup.findAll('div'):
        css = cssutils.parseStyle(div.get('style', ''))
        del css['width']
        del css['height']
        del css['text-align']
        del css['vertical-align']
        div['style'] = css.cssText

    # Recursively delete empty p/div/span tags. On the source websites they
    # are often used to add space, which is not portable across browsers.
    while True:
        empty = soup.find(lambda tag: tag.name in ['p', 'div', 'span'] and
                          tag.find(True) is None and
                          (tag.string is None or tag.string.strip() == ""))
        if not empty:
            break
        empty.extract()

    # Drop legacy sizing/alignment attributes everywhere.
    for tag in soup():
        for attr in ('align', 'valign', 'width', 'height',
                     'max-width', 'max-height'):
            del tag[attr]
    return soup
def test_border_condensation(self):
    """condense_rule() collapses per-edge border longhands into shorthands."""
    vals = 'red solid 5px'
    parts = vals.split()

    def longhands(edges, values):
        return '; '.join(
            'border-%s-%s' % (edge, prop) + ': ' + value
            for edge in edges
            for prop, value in zip(BORDER_PROPS, values))

    # All four edges identical -> a single 'border' shorthand remains.
    style = parseStyle(longhands(EDGES, parts))
    condense_rule(style)
    for edge, prop in product(EDGES, BORDER_PROPS):
        self.assertFalse(style.getProperty('border-%s-%s' % (edge, prop)))
        self.assertFalse(style.getProperty('border-%s' % edge))
        self.assertFalse(style.getProperty('border-%s' % prop))
    self.assertEqual(style.getProperty('border').value, vals)

    # Only one edge specified -> condensed to 'border-top'.
    css = longhands(('top',), parts)
    style = parseStyle(css)
    condense_rule(style)
    self.assertEqual(style.cssText, 'border-top: %s' % vals)

    # Remaining edges differ in color -> one shorthand per edge survives.
    css += ';' + longhands(('right', 'left', 'bottom'),
                           vals.replace('red', 'green').split())
    style = parseStyle(css)
    condense_rule(style)
    self.assertEqual(len(style.getProperties()), 4)
    self.assertEqual(style.getProperty('border-top').value, vals)
    self.assertEqual(style.getProperty('border-left').value,
                     vals.replace('red', 'green'))
def test_edge_normalization(self):
    """Margin/padding shorthands expand into the four per-edge longhands.

    Uses dict.items() instead of the Python-2-only iteritems() so the
    test runs under both Python 2 and 3.
    """

    def edge_dict(prefix, expected):
        # Map each edge to its expected value, e.g. margin-top: '1em'.
        return {'%s-%s' % (prefix, edge): x for edge, x in zip(EDGES, expected)}

    for raw, expected in {
        '2px': ('2px', '2px', '2px', '2px'),
        '1em 2em': ('1em', '2em', '1em', '2em'),
        '1em 2em 3em': ('1em', '2em', '3em', '2em'),
        '1 2 3 4': ('1', '2', '3', '4'),
    }.items():
        for prefix in ('margin', 'padding'):
            cval = tuple(parseStyle('%s: %s' % (prefix, raw), validate=False))[0].cssValue
            self.assertDictEqual(edge_dict(prefix, expected),
                                 normalizers[prefix](prefix, cval))
def test_parsevalidation(self):
    """Validation messages are logged unless validate=False is passed."""
    style = 'color: 1'
    t = 'a { %s }' % style
    cssutils.log.setLevel(logging.DEBUG)

    # Stylesheet parsing: validates by default, silent with validate=False.
    log = self._setHandler()
    cssutils.parseString(t)
    self.assertNotEqual(len(log.getvalue()), 0)

    log = self._setHandler()
    cssutils.parseString(t, validate=False)
    self.assertEqual(log.getvalue(), '')

    # Style-declaration parsing behaves the same way; validate=True is
    # equivalent to the default.
    log = self._setHandler()
    cssutils.parseStyle(style)
    self.assertNotEqual(len(log.getvalue()), 0)

    log = self._setHandler()
    cssutils.parseStyle(style, validate=True)
    self.assertNotEqual(len(log.getvalue()), 0)

    log = self._setHandler()
    cssutils.parseStyle(style, validate=False)
    self.assertEqual(log.getvalue(), '')
def test_list_style_normalization(self):
    """The list-style shorthand expands into its three longhands.

    Uses dict.items() instead of the Python-2-only iteritems() so the
    test runs under both Python 2 and 3.
    """

    def ls_dict(expected):
        # Defaults for all three longhands, overridden by *expected*.
        ans = {'list-style-%s' % x : DEFAULTS['list-style-%s' % x]
               for x in ('type', 'image', 'position')}
        for k, v in expected.items():
            ans['list-style-%s' % k] = v
        return ans

    for raw, expected in {
        'url(http://www.example.com/images/list.png)':
            {'image': 'url(http://www.example.com/images/list.png)'},
        'inside square': {'position':'inside', 'type':'square'},
        'upper-roman url(img) outside': {'position':'outside',
                                         'type':'upper-roman',
                                         'image':'url(img)'},
    }.items():
        cval = tuple(parseStyle('list-style: %s' % raw, validate=False))[0].cssValue
        self.assertDictEqual(ls_dict(expected),
                             normalizers['list-style']('list-style', cval))
def test_edge_normalization(self):
    """Margin/padding shorthands expand into the four per-edge longhands.

    Uses dict.items() instead of the Python-2-only iteritems() so the
    test runs under both Python 2 and 3.
    """

    def edge_dict(prefix, expected):
        # Map each edge to its expected value, e.g. margin-top: "1em".
        return {"%s-%s" % (prefix, edge): x for edge, x in zip(EDGES, expected)}

    for raw, expected in {
        "2px": ("2px", "2px", "2px", "2px"),
        "1em 2em": ("1em", "2em", "1em", "2em"),
        "1em 2em 3em": ("1em", "2em", "3em", "2em"),
        "1 2 3 4": ("1", "2", "3", "4"),
    }.items():
        for prefix in ("margin", "padding"):
            cval = tuple(parseStyle("%s: %s" % (prefix, raw), validate=False))[0].cssValue
            self.assertDictEqual(edge_dict(prefix, expected),
                                 normalizers[prefix](prefix, cval))
def iterstyles(node, rules):
    """Yield every style rule that applies to *node*: the stylesheet
    *rules* first, then the node's inline style attribute (if any)."""
    yield from rules
    # According to CSS 2.1 (http://www.w3.org/TR/CSS21/cascade.html#specificity)
    # style attributes have the highest weight, so we yield it last
    # (CSS 3 uses the same weight)
    if "style" in node.attrs:
        inline = node.attrs.style
        if not inline.isfancy():
            specificity = (1, 0, 0, 0)
            selector = xfind.IsSelector(node)
            # parse the style out of the style attribute
            yield (specificity, selector, cssutils.parseStyle(str(inline)))
def test_children(self):
    "CSSStyleDeclaration.children()"
    style = u'/*1*/color: red; color: green; @x;'
    # Expected child types, in declaration order.
    expected_types = [
        cssutils.css.CSSComment,
        cssutils.css.Property,
        cssutils.css.Property,
        cssutils.css.CSSUnknownRule,
    ]

    def check(decl):
        for idx, child in enumerate(decl.children()):
            self.assertEqual(expected_types[idx], type(child))
            self.assertEqual(child.parent, decl)

    # The same declaration parsed three ways yields identical children.
    check(cssutils.parseStyle(style))
    check(cssutils.parseString(u'a {' + style + '}').cssRules[0].style)
    check(cssutils.parseString(u'@media all {a {' + style + '}}')
          .cssRules[0].cssRules[0].style)

    # Properties added afterwards get the declaration as their parent too.
    decl = cssutils.parseStyle(style)
    decl['x'] = '0'
    self.assertEqual(decl, decl.getProperty('x').parent)
    decl.setProperty('y', '1')
    self.assertEqual(decl, decl.getProperty('y').parent)
def test_list_style_normalization(self):
    """The list-style shorthand expands into its three longhands.

    Uses dict.items() instead of the Python-2-only iteritems() so the
    test runs under both Python 2 and 3.
    """

    def ls_dict(expected):
        # Defaults for all three longhands, overridden by *expected*.
        ans = {"list-style-%s" % x: DEFAULTS["list-style-%s" % x]
               for x in ("type", "image", "position")}
        for k, v in expected.items():
            ans["list-style-%s" % k] = v
        return ans

    for raw, expected in {
        "url(http://www.example.com/images/list.png)":
            {"image": "url(http://www.example.com/images/list.png)"},
        "inside square": {"position": "inside", "type": "square"},
        "upper-roman url(img) outside": {"position": "outside",
                                         "type": "upper-roman",
                                         "image": "url(img)"},
    }.items():
        cval = tuple(parseStyle("list-style: %s" % raw, validate=False))[0].cssValue
        self.assertDictEqual(ls_dict(expected),
                             normalizers["list-style"]("list-style", cval))
def csstext_to_pairs(csstext):
    """
    csstext_to_pairs takes css text and make it to list of tuple of key,value.
    """
    # The lock is required to avoid ``cssutils`` concurrency
    # issues documented in issue #65
    with csstext_to_pairs._lock:
        pairs = [
            (declaration.name.strip(), format_value(declaration))
            for declaration in cssutils.parseStyle(csstext)
        ]
    # Sorting is pure, so it can safely happen outside the lock.
    return sorted(pairs, key=itemgetter(0))
def _filterStyles(self, article):
    """Remove invalid CSS properties from every element's style attribute.

    Elements whose style cannot be parsed at all have the attribute
    removed entirely and are skipped.
    """
    for node in article.tree.xpath('//*[@style]'):
        try:
            styles = cssutils.parseStyle(node.get('style'), validate=True)
        except ValueError:
            # the node style is broken and cssutils crashes - we remove the style.
            # Must also skip this node: without the `continue`, `styles`
            # would be unbound (first node) or stale from a previous
            # iteration, silently filtering the wrong declaration.
            del node.attrib['style']
            continue
        removed_style = False
        for style in styles.children():
            if hasattr(style, 'valid') and not style.valid:
                styles.removeProperty(style.name)
                removed_style = True
        if removed_style:
            node.set('style', styles.getCssText().replace('\n', ''))
def _apply_style_attr(self, url_replacer=None):
    """Fold this element's inline style attribute into its computed style.

    Microsoft-Office-specific declarations (matching MS_PAT) are dropped,
    and url(...) values are optionally rewritten.

    :param url_replacer: optional callable passed to replaceUrls()
    """
    attrib = self._element.attrib
    if 'style' not in attrib:
        return
    # Split the declaration list, drop empties and MS-specific properties,
    # then re-join for the parser. (Single strip pass; the second pass in
    # the original was redundant.)
    css = '; '.join(
        decl for decl in (x.strip() for x in attrib['style'].split(';'))
        if decl and self.MS_PAT.match(decl) is None)
    try:
        style = parseStyle(css, validate=False)
    except CSSSyntaxError:
        # Unparseable inline style: ignore it entirely.
        return
    if url_replacer is not None:
        replaceUrls(style, url_replacer, ignoreImportRules=True)
    self._style.update(self._stylizer.flatten_style(style))
def parse_tag(tag, update_time): obj = {} # 車両番号 if not tag.has_attr("src"): logging.error('no "src" attribute: %s', tag) return None car_id, _ = os.path.splitext(os.path.basename(tag["src"])) obj["car_id"] = car_id # 位置 if not tag.has_attr("style"): logging.error('no "style" attribute: %s', tag) return None style = parseStyle(tag["style"]) top = style.getProperty("top").propertyValue if top.length == 0: logging.error('no css style "top": %s', tag) return None left = style.getProperty("left").propertyValue if left.length == 0: logging.error('no css style "left": %s', tag) return None pos = resource.get_position(top[0].value, left[0].value) if pos is None: logging.error("undefined position: <top=%d, left=%d>", top[0].value, left[0].value) return None obj["status"] = pos[0] obj["direction"] = pos[1] obj["current_stop"] = pos[2] if pos[3] is not None: obj["next_stop"] = pos[3] # 行先 if not tag.has_attr("title"): logging.error('no "title" attribute: %s', tag) return None if not tag["title"].startswith(u"行先:"): logging.error('"title" attribute is not destination: %s', tag) return None dest = tag["title"].replace(u"行先:", "").replace(u"行き", "") obj["destination"] = resource.find_stop_code(dest) # 更新日時 obj["update_time"] = update_time return Car(**obj)