Python BeautifulSoup Examples

Programming Language: Python

Namespace/Package Name: rooibos.util.BeautifulSoup

Class/Type: BeautifulSoup

Examples at hotexamples.com: 4

Python BeautifulSoup - 4 examples found. These are the top-rated real-world Python examples of rooibos.util.BeautifulSoup.BeautifulSoup, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

BeautifulSoup(2)

find(2)

Example #1

Show file

File: svohp_import.py Project: cit-jmu/rooibos-apps

 def get_topics(self):
     """Map each topic title on the browse page to its link target.

     Reads ``browse.html`` from ``settings.SVOHP_MEDIA_ROOT``, locates the
     element whose id is ``content`` and walks every ``<a>`` inside it.

     Returns:
         dict: anchor text (passed through ``self.clean_string``) mapped to
         the anchor's ``href`` with every ``%20`` replaced by a space.
     """
     path = os.path.join(settings.SVOHP_MEDIA_ROOT, 'browse.html')
     # Use a context manager so the file handle is closed deterministically
     # instead of being left for the garbage collector.
     with open(path, 'r') as f:
         html = f.read()
     html = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
     content = html.find(id='content')
     topics = dict()
     for a in content.findAll('a'):
         href = a.get('href')
         if href is None:
             # An anchor without an href has no target to record; calling
             # .replace() on None would abort the whole import.
             continue
         topics[self.clean_string(a.string)] = href.replace('%20', ' ')
     return topics

Example #2

Show file

File: svohp_import.py Project: hanleybrand/rooibos-apps

 def get_topics(self):
     """Collect the topic links found on ``browse.html``.

     The page is read from ``settings.SVOHP_MEDIA_ROOT``; only anchors
     inside the element with id ``content`` are considered.

     Returns:
         dict: ``self.clean_string(a.string)`` for each anchor, mapped to
         that anchor's ``href`` with ``%20`` sequences turned into spaces.
     """
     # Read inside a ``with`` block so the handle is always closed.
     with open(os.path.join(settings.SVOHP_MEDIA_ROOT, 'browse.html'),
               'r') as f:
         html = f.read()
     html = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
     content = html.find(id='content')
     topics = dict()
     for a in content.findAll('a'):
         href = a.get('href')
         if href is None:
             # Skip anchors that carry no href rather than failing on
             # None.replace(...).
             continue
         topics[self.clean_string(a.string)] = href.replace('%20', ' ')
     return topics

Example #3

Show file

File: svohp_import.py Project: hanleybrand/rooibos-apps

 def get_items(self, file, heading_only=False):
     html = open(os.path.join(settings.SVOHP_MEDIA_ROOT, file), 'r').read()
     html = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
     content = html.find(id='content')
     result = dict()
     for item in content.findChildren('strong'):
         heading = self.clean_string(''.join(
             filter(lambda s: type(s) == NavigableString, item.contents)))
         if not heading:
             print "Bad heading (%s):" % file, item.contents
         if heading:
             h = re.search(r'^(.+), interview(ed)? by (.+) on (.+)$',
                           heading)
             if not h:
                 print "Bad heading (%s):" % file, heading
                 continue
             if heading_only:
                 result[heading] = None
                 continue
             tag = item.parent.findNextSibling('p')
             description = tag.string
             attachments = []
             while True:
                 tag = tag.findNextSibling('p')
                 if not tag.get('align') == 'right':
                     break
                 for a in tag.findAll('a'):
                     attachments.append(
                         dict(url=a.get('href'),
                              ext=a.get('href')[-3:].lower(),
                              title=self.clean_string(a.string)))
             d = re.search(
                 r'^(.+) Duration:? ((\d+) hr )?(\d+) ?min( (\d+) sec)?\.',
                 description, re.MULTILINE)
             if not d:
                 duration = 0
             else:
                 description = d.group(1)
                 duration = int(d.group(3) or '0') * 3600 + int(
                     d.group(4)) * 60 + int(d.group(6) or '0')
             result[heading] = dict(interviewee=h.group(1),
                                    interviewer=h.group(3),
                                    date=h.group(4),
                                    description=description,
                                    duration=duration,
                                    attachments=attachments)
     return result

Example #4

Show file

File: svohp_import.py Project: cit-jmu/rooibos-apps

 def get_items(self, file, heading_only=False):
     html = open(os.path.join(settings.SVOHP_MEDIA_ROOT, file), 'r').read()
     html = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
     content = html.find(id='content')
     result = dict()
     for item in content.findChildren('strong'):
         heading = self.clean_string(''.join(filter(lambda s: type(s)==NavigableString, item.contents)))
         if not heading:
             print "Bad heading (%s):" % file, item.contents
         if heading:
             h = re.search(r'^(.+), interview(ed)? by (.+) on (.+)$', heading)
             if not h:
                 print "Bad heading (%s):" % file, heading
                 continue
             if heading_only:
                 result[heading] = None
                 continue
             tag = item.parent.findNextSibling('p')
             description = tag.string
             attachments = []
             while True:
                 tag = tag.findNextSibling('p')
                 if not tag.get('align') == 'right':
                     break
                 for a in tag.findAll('a'):
                     attachments.append(dict(url=a.get('href'),
                                             ext=a.get('href')[-3:].lower(),
                                             title=self.clean_string(a.string)))
             d = re.search(r'^(.+) Duration:? ((\d+) hr )?(\d+) ?min( (\d+) sec)?\.', description, re.MULTILINE)
             if not d:
                 duration = 0
             else:
                 description = d.group(1)
                 duration = int(d.group(3) or '0') * 3600 + int(d.group(4)) * 60 + int(d.group(6) or '0')
             result[heading] = dict(interviewee=h.group(1),
                                       interviewer=h.group(3),
                                       date=h.group(4),
                                       description=description,
                                       duration=duration,
                                       attachments=attachments)
     return result