Python BeautifulSoup.cssselect Beispiele

Programmiersprache: Python

Namespace / Paketname: BeautifulSoup

Klasse / Typ: BeautifulSoup

Methode / Funktion: cssselect

Beispiele auf hotexamples.com: 1

Python BeautifulSoup.cssselect – 1 Beispiel gefunden. Dies ist das am besten bewertete Python-Beispiel für BeautifulSoup.BeautifulSoup.cssselect, das aus Open-Source-Projekten extrahiert wurde. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

BeautifulSoup(30)

decompose(30)

first(30)

find_all(30)

findAll(30)

find(30)

fetch(30)

feed(30)

getText(29)

insert(20)

findChildren(19)

body(12)

close(11)

__str__(11)

encode(8)

new_tag(6)

findChild(5)

append(4)

prettify(4)

findSelect(4)

decode(4)

get(4)

__unicode__(3)

goahead(3)

lower(3)

div(3)

findall(3)

pretify(3)

__init__(3)

firstText(2)

pop(2)

data(2)

findNext(2)

read(2)

index(1)

html(1)

query(1)

json(1)

load(1)

re_left(1)

noscript(1)

orig_url(1)

partition(1)

popTag(1)

pretiffy(1)

head(1)

findNextSiblings(1)

group(1)

encodeContents(1)

attrs(1)

Beispiel #1

Datei anzeigen

Datei: __init__.py Projekt: subblime/pynliner

class Pynliner(object):
    """Moves CSS from <style> and <link> elements into @style attributes.

    A source document is supplied with `from_url` or `from_string`, extra CSS
    can be queued with `with_cssString`, and `run` performs the conversion.
    """

    # Each attribute below starts out as False, i.e. "not produced yet".
    # Parsed document tree; assigned by `_get_soup`.
    soup = False
    # CSS source text collected by `_get_external_styles`,
    # `_get_internal_styles` and `_get_styles`.
    style_string = False
    # Parsed stylesheet; assigned by `_get_styles`.
    stylesheet = False
    # Serialized document; assigned by `_get_output`.
    output = False

    def __init__(self, log=None, allow_conditional_comments=False):
        """Set up an empty converter.

        `log` is stored on the instance; cssutils logging is enabled only
        when a log object is given. `allow_conditional_comments` controls
        whether `_clean_output` un-escapes conditional comments.
        """
        self.log = log
        # cssutils stays quiet unless the caller supplied a log object
        cssutils.log.enabled = log is not None
        self.extra_style_strings = []
        self.allow_conditional_comments = allow_conditional_comments
        # filled in by from_url(); used to resolve <link> hrefs
        self.root_url = None
        self.relative_url = None

    def from_url(self, url):
        """Fetches a remote HTML page and uses it as the conversion source.

        The page at `url` is downloaded with `_get_url` and stored in
        `self.source_string`. `self.root_url` (the first three
        '/'-separated parts of `url`, e.g. 'http://somewebsite.com') and
        `self.relative_url` (everything up to and including the last '/')
        are recorded for use in importing <link> elements.

        Returns self.

        >>> p = Pynliner()
        >>> p.from_url('http://somewebsite.com/file.html')
        <Pynliner object at 0x26ac70>
        """
        segments = url.split('/')
        self.url = url
        self.relative_url = '/'.join(segments[:-1]) + '/'
        self.root_url = '/'.join(segments[:3])
        self.source_string = self._get_url(url)
        return self

    def from_string(self, string):
        """Generates a Pynliner object from the given HTML string.

        Returns self.

        >>> p = Pynliner()
        >>> p.from_string('<style>h1 {color:#ffcc00;}</style><h1>Hi</h1>')
        <Pynliner object at 0x26ac70>
        """
        self.source_string = string
        return self

    def with_cssString(self, css_string):
        """Adds external CSS to the Pynliner object. Can be "chained".

        Returns self.

        >>> html = "<h1>Hello World!</h1>"
        >>> css = "h1 { color:#ffcc00; }"
        >>> p = Pynliner()
        >>> p.from_string(html).with_cssString(css)
        <pynliner.Pynliner object at 0x2ca810>
        """
        self.extra_style_strings.append(css_string)
        return self

    def run(self):
        """Applies each step of the process if they have not already been
        performed.

        The document is parsed and the stylesheet is built only when they do
        not exist yet; the styles are then applied, the document serialized
        and the output cleaned up.

        Returns the output with applied styles (`self.output`).

        >>> html = "<style>h1 { color:#ffcc00; }</style><h1>Hello World!</h1>"
        >>> Pynliner().from_string(html).run()
        u'<h1 style="color: #fc0">Hello World!</h1>'
        """
        # Compare against the "not built yet" sentinels explicitly: an lxml
        # element is falsy when it has no children, so `not self.soup` would
        # treat an already parsed, childless root element as missing.
        if self.soup is False or self.soup is None:
            self._get_soup()
        if not self.stylesheet:
            self._get_styles()
        self._apply_styles()
        self._get_output()
        self._clean_output()
        return self.output

    def _get_url(self, url):
        """Returns the response content from the given url.

        The response object is closed once its body has been read, also when
        reading fails, so the underlying connection is not leaked.
        """
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def _get_soup(self):
        """Parse the source string with lxml and set the tree to self.soup.

        `self.source_string` is handed to `lxml.html.fromstring`; the other
        methods rely on the lxml element API (`cssselect`, `attrib`,
        `drop_tree`). The former BeautifulSoup / mod_wsgi fallback sat behind
        an unconditional `return` and could never run, so it is gone.
        """
        import lxml.html
        self.soup = lxml.html.fromstring(self.source_string)

    def _get_styles(self):
        """Collects all CSS and parses it into `self.stylesheet`.

        CSS from <link rel="stylesheet"> elements comes first, then CSS from
        <style> elements, then every string queued in
        `self.extra_style_strings`. The combined text in `self.style_string`
        is parsed with tinycss.
        """
        self._get_external_styles()
        self._get_internal_styles()
        # queued extra CSS goes last, in the order it was added
        self.style_string += ''.join(self.extra_style_strings)
        import tinycss
        self.stylesheet = tinycss.make_parser().parse_stylesheet(
            self.style_string)

    def _get_external_styles(self):
        """Gets <link> element styles
        """
        if not self.style_string:
            self.style_string = u''
        else:
            self.style_string += u'\n'

        link_tags = self.soup.cssselect("link[rel='stylesheet']")
        # link_tags = self.soup.findAll('link', {'rel': 'stylesheet'})
        for tag in link_tags:
            url = tag.attrib.get('href', None)
            if not url:
                continue
            # url = tag['href']

            # Convert the relative URL to an absolute URL ready to pass to urllib
            base_url = self.relative_url or self.root_url
            url = urlparse.urljoin(base_url, url)

            self.style_string += self._get_url(url)
            tag.drop_tree()
            # tag.extract()

    def _get_internal_styles(self):
        """Gets <style> element styles
        """
        if not self.style_string:
            self.style_string = u''
        else:
            self.style_string += u'\n'

        style_tags = self.soup.cssselect('style')
        # style_tags = self.soup.findAll('style')
        for tag in style_tags:
            self.style_string += u'\n'.join([tag.text, tag.tail]) + u'\n'
            # self.style_string += u'\n'.join(tag.contents) + u'\n'
            tag.drop_tree()
            # tag.extract()

    def _get_specificity_from_list(self, lst):
        """
        Takes an array of ints and returns an integer formed
        by adding all ints multiplied by the power of 10 of the current index

        (1, 0, 0, 1) => (1 * 10**3) + (0 * 10**2) + (0 * 10**1) + (1 * 10**0) => 1001
        """
        return int(''.join(map(str, lst)))

    def _get_rule_specificity(self, rule):
        """
        Returns the base-10 specificity of a rule's selector.

        The selector text is parsed with cssselect; every selector in the
        group is converted with `_get_specificity_from_list` and the results
        are added up.
        """
        import cssselect
        parsed_selectors = cssselect.parse(rule.selector.as_css())
        return sum(
            self._get_specificity_from_list(selector.specificity())
            for selector in parsed_selectors
        )

    def _apply_styles(self):
        """Steps through CSS rules and applies each to all the proper elements
        as @style attributes.

        Only tinycss `RuleSet` rules are considered. For every element the
        declarations of all matching rules are merged in ascending order of
        selector specificity, so a more specific rule overwrites a less
        specific one. The merged declarations are written in front of any
        @style content the element already has.
        """
        import tinycss
        import lxml.cssselect

        # element -> [(specificity, declarations), ...] in stylesheet order
        matched = {}
        for rule in self.stylesheet.rules:
            if not isinstance(rule, tinycss.css21.RuleSet):
                continue
            try:
                elements = self.soup.cssselect(rule.selector.as_css())
            except lxml.cssselect.ExpressionError:
                # Bad rule (likely a pseudo-selector)
                continue
            if not elements:
                continue
            # The specificity depends on the rule alone, so it is computed
            # once per rule instead of once per matched element.
            specificity = self._get_rule_specificity(rule)
            for elem in elements:
                matched.setdefault(elem, []).append(
                    (specificity, rule.declarations))

        for elem, candidates in matched.items():
            declaration = cssutils.css.CSSStyleDeclaration()
            # sorted() is stable: rules of equal specificity keep their
            # stylesheet order, and later assignments overwrite earlier ones.
            for _, declarations in sorted(candidates, key=lambda c: c[0]):
                for prop in declarations:
                    declaration[prop.name] = prop.value.as_css()

            inline_css = declaration.cssText.replace('\n', ' ')
            if 'style' in elem.attrib:
                # keep the element's own inline declarations after ours
                elem.attrib['style'] = u'%s; %s' % (
                    inline_css, elem.attrib['style'])
            else:
                elem.attrib['style'] = inline_css
        
    def _get_output(self):
        """Generate Unicode string of `self.soup` and set it to `self.output`

        The tree is serialized with `encoding='unicode'`; without it lxml
        returns an encoded byte string instead of the documented Unicode
        text.

        Returns self.output
        """
        import lxml.html
        self.output = lxml.html.tostring(self.soup, encoding='unicode')
        return self.output
    
    def _clean_output(self):
        """Clean up after BeautifulSoup's output.
        """
        if self.allow_conditional_comments:
            matches = re.finditer('(<!--\[if .+\].+?&lt;!\[endif\]-->)', self.output)
            for match in matches:
                comment = match.group()
                comment = comment.replace('&gt;', '>')
                comment = comment.replace('&lt;', '<')
                self.output = (self.output[:match.start()] + comment +
                               self.output[match.end():])