Python BeautifulSoup.cssselect Examples

Programming Language: Python
Namespace/Package Name: BeautifulSoup
Class/Type: BeautifulSoup
Method/Function: cssselect
Examples at hotexamples.com: 1
Python BeautifulSoup.cssselect - 1 example found. This is the top-rated real-world Python example of BeautifulSoup.BeautifulSoup.cssselect extracted from open source projects. You can rate examples to help us improve their quality.
Frequently Used Methods
Show Hide
BeautifulSoup(30)
decompose(30)
first(30)
find_all(30)
findAll(30)
find(30)
fetch(30)
feed(30)
getText(29)
insert(20)
findChildren(19)
body(12)
close(11)
__str__(11)
encode(8)
new_tag(6)
findChild(5)
append(4)
prettify(4)
findSelect(4)
decode(4)
get(4)
__unicode__(3)
goahead(3)
lower(3)
div(3)
findall(3)
pretify(3)
__init__(3)
firstText(2)
pop(2)
data(2)
findNext(2)
read(2)
index(1)
html(1)
query(1)
json(1)
load(1)
re_left(1)
noscript(1)
orig_url(1)
partition(1)
popTag(1)
pretiffy(1)
head(1)
findNextSiblings(1)
group(1)
encodeContents(1)
attrs(1)
Example #1
Show file
File: __init__.py Project: subblime/pynliner
class Pynliner(object):
    """Inline the CSS of an HTML document into per-element ``style`` attributes.

    Typical use is a fluent chain: load the HTML with `from_string` or
    `from_url`, optionally add CSS with `with_cssString`, then call `run`.
    """

    # Parsed lxml document; False until `_get_soup` has run.
    soup = False
    # Accumulated CSS text; False until the first style collection step.
    style_string = False
    # Parsed tinycss stylesheet; False until `_get_styles` has run.
    stylesheet = False
    # Serialized result; False until `_get_output` has run.
    output = False

    def __init__(self, log=None, allow_conditional_comments=False):
        """Create an inliner.

        `log` is stored on the instance and, when given, enables the cssutils
        log. `allow_conditional_comments` makes `_clean_output` restore
        escaped markup inside conditional comments.
        """
        self.log = log
        cssutils.log.enabled = log is not None
        self.extra_style_strings = []
        self.allow_conditional_comments = allow_conditional_comments
        self.root_url = None
        self.relative_url = None

    def from_url(self, url):
        """Gets remote HTML page for conversion

        Downloads HTML page from `url` as a string and stores it as the
        source to convert. Also sets `self.root_url` and `self.relative_url`
        for use in importing <link> elements.

        Returns self.

        >>> p = Pynliner()
        >>> p.from_url('http://somewebsite.com/file.html')
        <Pynliner object at 0x26ac70>
        """
        parts = url.split('/')
        self.url = url
        # Everything up to (and including) the last slash of the URL.
        self.relative_url = '/'.join(parts[:-1]) + '/'
        # The first three slash-separated pieces: "scheme:", "", "host".
        self.root_url = '/'.join(parts[:3])
        self.source_string = self._get_url(self.url)
        return self

    def from_string(self, string):
        """Generates a Pynliner object from the given HTML string.

        Returns self.

        >>> p = Pynliner()
        >>> p.from_string('<style>h1 {color:#ffcc00;}</style><h1>Hi</h1>')
        <Pynliner object at 0x26ac70>
        """
        self.source_string = string
        return self

    def with_cssString(self, css_string):
        """Adds external CSS to the Pynliner object. Can be "chained".

        Returns self.

        >>> html = "<h1>Hello World!</h1>"
        >>> css = "h1 { color:#ffcc00; }"
        >>> p = Pynliner()
        >>> p.from_string(html).with_cssString(css)
        <pynliner.Pynliner object at 0x2ca810>
        """
        self.extra_style_strings.append(css_string)
        return self

    def run(self):
        """Applies each step of the process if they have not already been
        performed.

        Returns Unicode output with applied styles.

        >>> html = "<style>h1 { color:#ffcc00; }</style><h1>Hello World!</h1>"
        >>> Pynliner().from_string(html).run()
        u'<h1 style="color: #fc0">Hello World!</h1>'
        """
        # Compare by identity: an lxml element without children is falsy, so
        # a plain truth test could re-parse an already parsed document.
        if self.soup is False or self.soup is None:
            self._get_soup()
        if not self.stylesheet:
            self._get_styles()
        self._apply_styles()
        self._get_output()
        self._clean_output()
        return self.output

    def _get_url(self, url):
        """Returns the response content from the given url
        """
        import contextlib

        # closing() releases the connection even if read() raises.
        with contextlib.closing(urllib2.urlopen(url)) as response:
            return response.read()

    def _get_soup(self):
        """Parse the source string with lxml and set the result to self.soup.

        Despite the attribute name, `self.soup` is an lxml.html element, not
        a BeautifulSoup object.
        """
        import lxml.html

        self.soup = lxml.html.fromstring(self.source_string)

    def _get_styles(self):
        """Gets all CSS content from and removes all <link rel="stylesheet"> and
        <style> tags concatenating into one CSS string, appends the extra
        strings added with `with_cssString`, then parses the result with
        tinycss and sets the stylesheet object to `self.stylesheet`.
        """
        import tinycss

        self._get_external_styles()
        self._get_internal_styles()
        for style_string in self.extra_style_strings:
            self.style_string += style_string
        cssparser = tinycss.make_parser()
        self.stylesheet = cssparser.parse_stylesheet(self.style_string)

    def _begin_style_chunk(self):
        """Initialise `self.style_string`, or newline-separate a new chunk."""
        if not self.style_string:
            self.style_string = u''
        else:
            self.style_string += u'\n'

    def _get_external_styles(self):
        """Gets <link> element styles

        Downloads every linked stylesheet that has an href, appends its
        content to `self.style_string` and removes the <link> element.
        """
        self._begin_style_chunk()

        for tag in self.soup.cssselect("link[rel='stylesheet']"):
            url = tag.attrib.get('href', None)
            if not url:
                continue

            # Convert the relative URL to an absolute URL ready to pass to urllib
            base_url = self.relative_url or self.root_url
            url = urlparse.urljoin(base_url, url)

            css = self._get_url(url)
            if isinstance(css, bytes):
                # The download is a byte string while style_string is text.
                # NOTE(review): UTF-8 is assumed; the response charset is
                # not inspected.
                css = css.decode('utf-8', 'replace')
            self.style_string += css
            tag.drop_tree()

    def _get_internal_styles(self):
        """Gets <style> element styles

        Appends the text of every <style> element to `self.style_string` and
        removes the element.
        """
        self._begin_style_chunk()

        for tag in self.soup.cssselect('style'):
            # Only the element's own text is CSS. The tail is document text
            # following </style> (and is None when nothing follows), so it
            # must not be joined into the stylesheet.
            self.style_string += (tag.text or u'') + u'\n'
            tag.drop_tree()

    def _get_specificity_from_list(self, lst):
        """
        Takes a sequence of ints and returns the integer formed by
        concatenating their decimal representations.

        (1, 0, 0, 1) => "1001" => 1001

        For single-digit components this equals
        (1 * 10**3) + (0 * 10**2) + (0 * 10**1) + (1 * 10**0).
        """
        return int(''.join(map(str, lst)))

    def _get_rule_specificity(self, rule):
        """
        For a given rule get its selector specificity in base 10
        """
        import cssselect

        # NOTE(review): for a grouped selector ("h1, p") this sums the
        # specificity of every selector in the group, not only the one that
        # matched a given element -- confirm this is intended.
        sels = (s.specificity()
                for s in cssselect.parse(rule.selector.as_css()))
        return sum(map(self._get_specificity_from_list, sels))

    def _apply_styles(self):
        """Steps through CSS rules and applies each to all the proper elements
        as @style attributes prepending any current @style attributes.
        """
        import tinycss
        import lxml.cssselect

        rules = (rule for rule in self.stylesheet.rules
                 if isinstance(rule, tinycss.css21.RuleSet))
        elem_prop_map = {}

        # build up a property list for every styled element
        for rule in rules:
            try:
                elements = self.soup.cssselect(rule.selector.as_css())
            except lxml.cssselect.ExpressionError:
                # Bad rule (likely a pseudo-selector)
                continue
            if not elements:
                continue
            # The specificity depends only on the rule, so compute it once
            # instead of once per matched element.
            specificity = self._get_rule_specificity(rule)
            for elem in elements:
                elem_prop_map.setdefault(elem, []).append({
                    'specificity': specificity,
                    'props': rule.declarations,
                })

        for elem, prop_lists in elem_prop_map.items():
            declaration = cssutils.css.CSSStyleDeclaration()
            # Ascending, stable sort: more specific rules are written later
            # and overwrite less specific ones; ties keep stylesheet order.
            for entry in sorted(prop_lists, key=lambda p: p['specificity']):
                for prop in entry['props']:
                    declaration[prop.name] = prop.value.as_css()

            css_text = declaration.cssText.replace('\n', ' ')
            if 'style' in elem.attrib:
                # Existing inline styles go last so that they take precedence.
                elem.attrib['style'] = u'%s; %s' % (css_text,
                                                    elem.attrib['style'])
            else:
                elem.attrib['style'] = css_text

    def _get_output(self):
        """Generate Unicode string of `self.soup` and set it to `self.output`

        Returns self.output
        """
        import lxml.html

        # Without an explicit encoding tostring() returns a byte string.
        self.output = lxml.html.tostring(self.soup, encoding='unicode')
        return self.output

    def _clean_output(self):
        """Restore escaped markup inside conditional comments.

        Does nothing unless `self.allow_conditional_comments` is set. Within
        each conditional comment whose closing marker was escaped, the
        "&gt;" and "&lt;" entities are turned back into ">" and "<". Text
        outside those comments is left untouched. A comment is only matched
        when it sits on a single line.
        """
        if not self.allow_conditional_comments:
            return

        def _unescape(match):
            return match.group(0).replace('&gt;', '>').replace('&lt;', '<')

        # One substitution pass over the original string: rewriting
        # self.output while iterating precomputed match offsets corrupts the
        # result as soon as an earlier replacement changes its length. The
        # condition is matched up to its first "]" so that neighbouring
        # comments are handled separately.
        self.output = re.sub(
            r'<!--\[if [^\]\n]+\].+?&lt;!\[endif\]-->',
            _unescape,
            self.output)