class Pynliner(object):
    """Inline CSS rules into the ``style`` attributes of an HTML document.

    Typical use is a chain such as
    ``Pynliner().from_string(html).with_cssString(css).run()``.
    """

    # Class-level defaults; ``False`` means "not produced yet".
    soup = False          # parsed document tree (set by _get_soup)
    style_string = False  # concatenated CSS source text
    stylesheet = False    # parsed stylesheet (set by _get_styles)
    output = False        # serialised result (set by _get_output)

    def __init__(self, log=None, allow_conditional_comments=False):
        """Create a converter.

        `log` is stored on the instance; cssutils logging is enabled only
        when a log object is supplied.  `allow_conditional_comments` turns on
        the conditional-comment clean-up performed by `_clean_output`.
        """
        self.log = log
        cssutils.log.enabled = log is not None
        self.extra_style_strings = []
        self.allow_conditional_comments = allow_conditional_comments
        self.root_url = None
        self.relative_url = None

    def from_url(self, url):
        """Gets remote HTML page for conversion

        Downloads the HTML page at `url` and stores it as the source string.
        Also sets `self.root_url` and `self.relative_url` for use in importing
        <link> elements.

        Returns self.

        >>> p = Pynliner()
        >>> p.from_url('http://somewebsite.com/file.html')
        <Pynliner object at 0x26ac70>
        """
        self.url = url
        parts = url.split('/')
        # Everything up to (and including) the last '/'.
        self.relative_url = '/'.join(parts[:-1]) + '/'
        # Scheme plus host, i.e. the first three '/'-separated pieces.
        self.root_url = '/'.join(parts[:3])
        self.source_string = self._get_url(self.url)
        return self

    def from_string(self, string):
        """Generates a Pynliner object from the given HTML string.

        Returns self.

        >>> p = Pynliner()
        >>> p.from_string('<style>h1 {color:#ffcc00;}</style><h1>Hi</h1>')
        <Pynliner object at 0x26ac70>
        """
        self.source_string = string
        return self

    def with_cssString(self, css_string):
        """Adds external CSS to the Pynliner object. Can be "chained".

        Returns self.

        >>> html = "<h1>Hello World!</h1>"
        >>> css = "h1 { color:#ffcc00; }"
        >>> p = Pynliner()
        >>> p.from_string(html).with_cssString(css)
        <pynliner.Pynliner object at 0x2ca810>
        """
        self.extra_style_strings.append(css_string)
        return self

    def run(self):
        """Applies each step of the process if they have not already been
        performed.

        Returns the serialised document with the styles applied, e.g.
        ``Pynliner().from_string(html).run()``.
        """
        # Compare against the "unset" sentinels explicitly: an lxml element
        # without children evaluates as false, so a plain truth test could
        # re-parse a document that has already been parsed.
        if self.soup is False or self.soup is None:
            self._get_soup()
        if not self.stylesheet:
            self._get_styles()
        self._apply_styles()
        self._get_output()
        self._clean_output()
        return self.output

    def _get_url(self, url):
        """Returns the response content from the given url
        """
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    def _get_soup(self):
        """Parse the source string with lxml and store the tree in
        `self.soup`.
        """
        import lxml.html

        self.soup = lxml.html.fromstring(self.source_string)

    def _get_styles(self):
        """Gets all CSS content from and removes all <link rel="stylesheet">
        and <style> tags, concatenating into one CSS string (together with
        any strings added via `with_cssString`) which is then parsed with
        tinycss; the resulting stylesheet is stored in `self.stylesheet`.
        """
        import tinycss

        self._get_external_styles()
        self._get_internal_styles()

        for extra_css in self.extra_style_strings:
            self.style_string += extra_css

        parser = tinycss.make_parser()
        self.stylesheet = parser.parse_stylesheet(self.style_string)

    def _get_external_styles(self):
        """Gets <link> element styles

        Each stylesheet link with an href is downloaded, appended to
        `self.style_string` and removed from the document.
        """
        if not self.style_string:
            self.style_string = u''
        else:
            self.style_string += u'\n'

        for tag in self.soup.cssselect("link[rel='stylesheet']"):
            href = tag.attrib.get('href', None)
            if not href:
                continue

            # Convert the relative URL to an absolute URL ready to fetch.
            base_url = self.relative_url or self.root_url
            absolute_url = urlparse.urljoin(base_url, href)
            self.style_string += self._get_url(absolute_url)
            tag.drop_tree()

    def _get_internal_styles(self):
        """Gets <style> element styles

        The text of every <style> element is appended to `self.style_string`
        and the element is removed from the document.
        """
        if not self.style_string:
            self.style_string = u''
        else:
            self.style_string += u'\n'

        for tag in self.soup.cssselect('style'):
            # Only the element's own text is CSS.  The tail is document text
            # that follows </style> (and drop_tree() keeps it in the tree),
            # so it must not be fed to the CSS parser.  An empty <style> has
            # text None, hence the fallback.
            self.style_string += (tag.text or u'') + u'\n'
            tag.drop_tree()

    def _get_specificity_from_list(self, lst):
        """
        Takes an array of ints and returns an integer formed by concatenating
        their decimal digits:

        (1, 0, 0, 1) => 1001

        This equals sum(n * 10**position) only while every component is
        below 10; larger components simply contribute more digits.
        """
        return int(''.join(map(str, lst)))

    def _get_rule_specificity(self, rule):
        """
        For a given rule set get its selector specificity in base 10.

        When the rule has several comma-separated selectors, their individual
        specificities are added together.
        """
        import cssselect

        parsed_selectors = cssselect.parse(rule.selector.as_css())
        return sum(self._get_specificity_from_list(sel.specificity())
                   for sel in parsed_selectors)

    def _apply_styles(self):
        """Steps through CSS rules and applies each to all the proper elements
        as @style attributes prepending any current @style attributes.
        """
        import tinycss
        import lxml.cssselect

        # element -> list of (specificity, declarations), in stylesheet order
        matched = {}
        for rule in self.stylesheet.rules:
            if not isinstance(rule, tinycss.css21.RuleSet):
                continue
            try:
                elements = self.soup.cssselect(rule.selector.as_css())
            except lxml.cssselect.ExpressionError:
                # Bad rule (likely a pseudo-selector)
                continue
            if not elements:
                continue

            # The specificity depends only on the rule, so compute it once
            # rather than once per matched element.
            specificity = self._get_rule_specificity(rule)
            for elem in elements:
                matched.setdefault(elem, []).append(
                    (specificity, rule.declarations))

        for elem, entries in matched.items():
            declaration = cssutils.css.CSSStyleDeclaration()
            # Ascending, stable sort: more specific rules are written last
            # and overwrite earlier values; ties keep stylesheet order.
            for _, declarations in sorted(entries, key=lambda e: e[0]):
                for prop in declarations:
                    declaration[prop.name] = prop.value.as_css()

            css_text = declaration.cssText.replace('\n', ' ')
            if 'style' in elem.attrib:
                # Stylesheet rules go first so the element's own inline
                # style comes later in the attribute.
                elem.attrib['style'] = u'%s; %s' % (css_text,
                                                    elem.attrib['style'])
            else:
                elem.attrib['style'] = css_text

    def _get_output(self):
        """Generate Unicode string of `self.soup` and set it to `self.output`

        Returns self.output
        """
        import lxml.html

        # Without an explicit encoding tostring() returns an encoded byte
        # string; 'unicode' requests a text string instead.
        self.output = lxml.html.tostring(self.soup, encoding='unicode')
        return self.output

    def _clean_output(self):
        """Restore conditional comments in `self.output`.

        Only active when `allow_conditional_comments` is set.  Inside every
        ``<!--[if ...]> ... <![endif]-->`` comment (whose closing marker may
        itself be escaped as ``&lt;![endif]``), ``&gt;`` and ``&lt;`` are
        turned back into ``>`` and ``<``.  Text outside such comments is left
        untouched.
        """
        if not self.allow_conditional_comments:
            return

        def _unescape(match):
            comment = match.group(0)
            return comment.replace('&gt;', '>').replace('&lt;', '<')

        # A single re.sub pass avoids splicing with offsets that go stale as
        # soon as one replacement changes the string length.  The condition
        # part is non-greedy so two comments on one line stay separate.
        self.output = re.sub(
            r'<!--\[if .+?\].+?(?:<|&lt;)!\[endif\]-->',
            _unescape,
            self.output)