Python BeautifulSoup.findAll Examples

Programming Language: Python

Namespace/Package Name: bibim.util.beautifulsoup

Class/Type: BeautifulSoup

Method/Function: findAll

Examples at hotexamples.com: 6

Python BeautifulSoup.findAll - 6 examples found. These are the top-rated real-world Python examples of bibim.util.beautifulsoup.BeautifulSoup.findAll, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

BeautifulSoup(13)

findAll(3)

find(1)

Example #1

Show file

 def test_get_text_from_non_leaf(self):
     """Check findAll('div', text=True) on a div that wraps two spans.

     The search is expected to yield exactly two results, the first of
     which compares equal to u'Text 01'.
     """
     # NOTE(review): the closing tags below are out of order
     # ('</html>' comes before '</body>'); the literal is kept as-is --
     # confirm whether the malformed markup is intentional.
     soup = BeautifulSoup('<html><body>'
                          '<div>'
                          '<span>Text 01</span>'
                          '<span>Text 02</span>'
                          '</div>'
                          '</html></body>')
     text = soup.findAll('div', text=True)
     # assertEqual replaces the deprecated failUnless alias and reports
     # both operands when the check fails.
     self.assertEqual(len(text), 2)
     self.assertEqual(text[0], u'Text 01')

Example #2

Show file

File: test_beautifulsoup.py Project: rxuriguera/bibtexIndexMaker

 def test_get_text_from_non_leaf(self):
     """Search for text below a non-leaf tag with findAll.

     Parses a document whose single div contains two spans, calls
     findAll('div', text=True) and asserts that two items come back,
     the first one equal to u'Text 01'.
     """
     # NOTE(review): '</html>' precedes '</body>' in the markup below;
     # left untouched here -- verify that this ordering is on purpose.
     soup = BeautifulSoup('<html><body>'
                          '<div>'
                          '<span>Text 01</span>'
                          '<span>Text 02</span>'
                          '</div>'
                          '</html></body>')
     text = soup.findAll('div', text=True)
     # failUnless is a deprecated alias; assertEqual also prints the
     # differing values on failure.
     self.assertEqual(len(text), 2)
     self.assertEqual(text[0], u'Text 01')

Example #3

Show file

File: helpers.py Project: Alex-Linhares/bibtexIndexMaker

    def clean_content(self, content):
        """Normalise raw markup and parse it with BeautifulSoup.

        Line breaks, tabs, ``<br>`` tags and a fixed set of HTML entities
        are replaced by plain characters, runs of spaces are collapsed and
        spaces between adjacent tags are dropped.  The result is parsed
        with BeautifulSoup, after which comments and ``meta``, ``link``,
        ``style`` and ``script`` elements are extracted from the tree.

        :param content: markup as a string; a falsy value short-circuits.
        :returns: the cleaned BeautifulSoup object, or ``None`` when
            ``content`` is empty or ``None``.
        """
        if not content:
            return None

        # An ordered sequence rather than a dict, so the substitutions run
        # in the same order on every interpreter.  The ampersand entities
        # come last: decoding '&amp;' first would turn an escaped entity
        # such as '&amp;quot;' into '&quot;' and then decode it again.
        replacements = (
            ('\n', ' '),
            ('\r', ''),
            ('\t', ''),
            ('<br>', ' '),
            ('<br/>', ' '),
            ('&#34;', '"'),
            ('&quot;', '"'),
            ('&rsquo;', "'"),
            ('&#39;', "'"),
            ('&apos;', "'"),
            ('&#x2013;', '-'),
            ('&nbsp;', ' '),
            ('&amp;', '&'),
            ('&#38;', '&'),
        )
        for old, new in replacements:
            content = content.replace(old, new)

        # Remove consecutive whitespaces, then spaces between tags
        content = re.sub(r' {2,}', ' ', content)
        content = re.sub(r'>( *)<', '><', content)

        content = BeautifulSoup(content)

        # Remove comments
        comments = content.findAll(text=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()

        # Remove unnecessary HTML elements
        for tag in ('meta', 'link', 'style', 'script'):
            for element in content.findAll(tag):
                element.extract()

        return content

Example #4

Show file

File: helpers.py Project: rxuriguera/bibtexIndexMaker

    def clean_content(self, content):
        """Clean up an HTML string and return it as a BeautifulSoup tree.

        Steps, in order:

        1. replace newlines, carriage returns, tabs, ``<br>`` tags and a
           fixed list of HTML entities with plain characters;
        2. collapse runs of spaces and drop spaces between adjacent tags;
        3. parse the text with BeautifulSoup;
        4. extract comments and ``meta``/``link``/``style``/``script``
           elements from the parsed tree.

        :param content: markup as a string.
        :returns: the cleaned BeautifulSoup object, or ``None`` if
            ``content`` is falsy (empty string or ``None``).
        """
        if not content:
            return None

        # Pairs are applied top to bottom.  A dict was used before, whose
        # iteration order is not guaranteed on older interpreters; and
        # '&amp;'/'&#38;' must be handled last so that text such as
        # '&amp;#39;' is decoded once ('&#39;') rather than twice ("'").
        to_replace = (('\n', ' '),
                      ('\r', ''),
                      ('\t', ''),
                      ('<br>', ' '),
                      ('<br/>', ' '),
                      ('&#34;', '"'),
                      ('&quot;', '"'),
                      ('&rsquo;', "'"),
                      ('&#39;', "'"),
                      ('&apos;', "'"),
                      ('&#x2013;', '-'),
                      ('&nbsp;', ' '),
                      ('&amp;', '&'),
                      ('&#38;', '&'))
        for key, value in to_replace:
            content = content.replace(key, value)

        # Remove consecutive whitespaces
        content = re.sub(r' {2,}', ' ', content)
        content = re.sub(r'>( *)<', '><', content)

        content = BeautifulSoup(content)

        # Remove comments (plain loop: extract() is called for its effect)
        is_comment = lambda text: isinstance(text, Comment)
        for element in content.findAll(text=is_comment):
            element.extract()

        # Remove unnecessary HTML elements
        for tag in ['meta', 'link', 'style', 'script']:
            for element in content.findAll(tag):
                element.extract()

        return content

Example #5

Show file

File: searcher.py Project: rxuriguera/bibtexIndexMaker

 def check_result_url(self, url, check_string):
     """Look up ``url`` in the results cache, fetching the page on a miss.

     On a cache hit the cached value is returned directly.  On a miss the
     method sleeps for five seconds, retrieves the page through
     ``self.browser``, cleans and parses it, and searches the tree for
     text matching ``check_string`` (lower-cased, used as a regular
     expression).  Browser and other errors are printed, not raised.

     NOTE(review): the code shown stops after the exception handlers, so
     nothing is returned or cached on a miss in this excerpt -- confirm
     against the full source.

     :param url: address of the result page; also the cache key.
     :param check_string: pattern to look for in the page text.
     """
     # Membership test on the mapping itself; .keys() is unnecessary.
     if url in self.results_cache:
         return self.results_cache[url]
     else:
         elements = None
         try:
             time.sleep(5)
             page = self.browser.get_page(url)
             page = self._clean_content(page)
             page = BeautifulSoup(page)
             elements = page.findAll(True,
                                     text=re.compile(check_string.lower()))
         # 'except ... as' and print(...) with a single argument behave
         # the same on Python 2.6+ and are also valid on Python 3.
         except BrowserError as e:
             print('ERROR: Browser error: %s' % e)
         except Exception as e:
             print('ERROR: Error checking error: %s' % e)

Example #6

Show file

File: searcher.py Project: Alex-Linhares/bibtexIndexMaker

 def check_result_url(self, url, check_string):
     """Return the cached result for ``url`` or inspect the page itself.

     A cached entry in ``self.results_cache`` is returned immediately.
     Otherwise the method waits five seconds, downloads the page with
     ``self.browser.get_page``, passes it through ``self._clean_content``
     and BeautifulSoup, and collects the text nodes that match the
     lower-cased ``check_string`` compiled as a regular expression.
     Errors are reported with ``print`` instead of being propagated.

     NOTE(review): this excerpt ends right after the exception handlers;
     what happens with ``elements`` afterwards is not visible here.

     :param url: address of the result page; also the cache key.
     :param check_string: pattern to look for in the page text.
     """
     # 'in' on the cache directly avoids building a key list first.
     if url in self.results_cache:
         return self.results_cache[url]
     else:
         elements = None
         try:
             time.sleep(5)
             page = self.browser.get_page(url)
             page = self._clean_content(page)
             page = BeautifulSoup(page)
             elements = page.findAll(True,
                                     text=re.compile(check_string.lower()))
         # Syntax below works on Python 2.6+ and Python 3 alike.
         except BrowserError as e:
             print('ERROR: Browser error: %s' % e)
         except Exception as e:
             print('ERROR: Error checking error: %s' % e)