Example #1
0
 def get_topics(self):
     """Return the topics listed in ``browse.html`` as a name -> link dict.

     The file is read from ``settings.SVOHP_MEDIA_ROOT``. Every ``<a>``
     element inside the element whose id is ``content`` contributes one
     entry: the key is the anchor text passed through
     ``self.clean_string`` and the value is the anchor's ``href`` with
     ``%20`` replaced by a space.

     Anchors without an ``href`` attribute are skipped, and an empty dict
     is returned when the page has no ``content`` element.
     """
     path = os.path.join(settings.SVOHP_MEDIA_ROOT, 'browse.html')
     # Use a context manager so the file handle is closed deterministically
     # instead of being left for the garbage collector.
     with open(path, 'r') as source:
         markup = source.read()
     soup = BeautifulSoup(markup, convertEntities=BeautifulSoup.HTML_ENTITIES)
     content = soup.find(id='content')
     topics = dict()
     if content is None:
         # find() returns None when nothing matches; there is nothing to list.
         return topics
     for a in content.findAll('a'):
         href = a.get('href')
         if href is None:
             # get() returns None for anchors that carry no href; calling
             # .replace() on it would raise AttributeError.
             continue
         topics[self.clean_string(a.string)] = href.replace('%20', ' ')
     return topics
Example #2
0
 def get_topics(self):
     """Build a dict of topic names to link targets from ``browse.html``.

     ``browse.html`` is looked up under ``settings.SVOHP_MEDIA_ROOT``. For
     each ``<a>`` found inside the element with id ``content``, the anchor
     text (cleaned with ``self.clean_string``) becomes the key and the
     ``href`` value, with ``%20`` turned into a space, becomes the value.

     Anchors that have no ``href`` are ignored. If the document has no
     element with id ``content`` the result is an empty dict.
     """
     browse_path = os.path.join(settings.SVOHP_MEDIA_ROOT, 'browse.html')
     # The with-block guarantees the file is closed even if read() fails.
     with open(browse_path, 'r') as handle:
         raw = handle.read()
     html = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES)
     content = html.find(id='content')
     topics = dict()
     if content is None:
         # No container element matched, so there are no links to collect.
         return topics
     for a in content.findAll('a'):
         target = a.get('href')
         if target is None:
             # An anchor without href would otherwise raise AttributeError
             # on the .replace() call below.
             continue
         topics[self.clean_string(a.string)] = target.replace(
             '%20', ' ')
     return topics
Example #3
0
 def get_items(self, file, heading_only=False):
     """Parse the interview entries of an HTML page into a dict.

     The page is read from ``settings.SVOHP_MEDIA_ROOT``. Each ``<strong>``
     element inside the element with id ``content`` is a candidate
     heading; its direct text nodes are joined and cleaned with
     ``self.clean_string``. Headings must look like
     ``<interviewee>, interview[ed] by <interviewer> on <date>``; empty or
     non-matching headings are reported with ``print`` and skipped.

     Args:
         file: Name of the HTML file, relative to ``SVOHP_MEDIA_ROOT``.
         heading_only: When true, only the headings are collected and
             every value in the result is ``None``.

     Returns:
         A dict keyed by heading. Unless ``heading_only`` is set, each
         value is a dict with the keys ``interviewee``, ``interviewer``,
         ``date``, ``description``, ``duration`` (in seconds, 0 when no
         duration could be parsed) and ``attachments`` (a list of dicts
         with ``url``, ``ext`` and ``title``). An empty dict is returned
         when the page has no ``content`` element.
     """
     path = os.path.join(settings.SVOHP_MEDIA_ROOT, file)
     # Close the file deterministically instead of leaking the handle.
     with open(path, 'r') as source:
         markup = source.read()
     soup = BeautifulSoup(markup, convertEntities=BeautifulSoup.HTML_ENTITIES)
     content = soup.find(id='content')
     result = dict()
     if content is None:
         return result

     # Compile once; both patterns are reused for every entry.
     heading_re = re.compile(r'^(.+), interview(ed)? by (.+) on (.+)$')
     duration_re = re.compile(
         r'^(.+) Duration:? ((\d+) hr )?(\d+) ?min( (\d+) sec)?\.',
         re.MULTILINE)

     for item in content.findChildren('strong'):
         # Exact type comparison is intentional: isinstance() would also
         # accept NavigableString subclasses (in BeautifulSoup e.g.
         # comments) and change which nodes end up in the heading.
         heading = self.clean_string(''.join(
             [s for s in item.contents if type(s) == NavigableString]))
         if not heading:
             # Single formatted string: prints the same text as the former
             # two-item print statement and is valid under Python 2 and 3.
             print("Bad heading (%s): %s" % (file, item.contents))
             continue
         h = heading_re.search(heading)
         if not h:
             print("Bad heading (%s): %s" % (file, heading))
             continue
         if heading_only:
             result[heading] = None
             continue

         # The description is the first <p> after the heading's parent.
         tag = item.parent.findNextSibling('p')
         description = tag.string if tag is not None else None

         # Following right-aligned paragraphs hold the attachment links.
         attachments = []
         while tag is not None:
             tag = tag.findNextSibling('p')
             # findNextSibling() returns None after the last paragraph;
             # without this check the final entry raised AttributeError.
             if tag is None or tag.get('align') != 'right':
                 break
             for a in tag.findAll('a'):
                 href = a.get('href')
                 if href is None:
                     # Anchors without href cannot be attachments.
                     continue
                 attachments.append(
                     dict(url=href,
                          ext=href[-3:].lower(),
                          title=self.clean_string(a.string)))

         # tag.string is None when the paragraph is missing or contains
         # nested markup; searching None would raise TypeError.
         d = None
         if description is not None:
             d = duration_re.search(description)
         if d:
             description = d.group(1)
             duration = (int(d.group(3) or '0') * 3600 +
                         int(d.group(4)) * 60 +
                         int(d.group(6) or '0'))
         else:
             duration = 0
         result[heading] = dict(interviewee=h.group(1),
                                interviewer=h.group(3),
                                date=h.group(4),
                                description=description,
                                duration=duration,
                                attachments=attachments)
     return result
Example #4
0
 def get_items(self, file, heading_only=False):
     """Extract interview entries from an HTML file under SVOHP_MEDIA_ROOT.

     Candidate headings are the ``<strong>`` elements inside the element
     with id ``content``. The heading text is built from the element's
     direct text nodes and cleaned with ``self.clean_string``. It has to
     match ``<interviewee>, interview[ed] by <interviewer> on <date>``;
     empty or non-matching headings are printed as "Bad heading" and
     skipped.

     Args:
         file: File name relative to ``settings.SVOHP_MEDIA_ROOT``.
         heading_only: If true, map every valid heading to ``None`` and
             skip parsing of description, duration and attachments.

     Returns:
         Dict mapping heading text to ``None`` (``heading_only``) or to a
         dict with ``interviewee``, ``interviewer``, ``date``,
         ``description``, ``duration`` (seconds; 0 if none was found) and
         ``attachments`` (list of dicts with ``url``, ``ext``, ``title``).
         The dict is empty when no ``content`` element exists.
     """
     # The with-block closes the file even when read() raises.
     with open(os.path.join(settings.SVOHP_MEDIA_ROOT, file), 'r') as handle:
         raw = handle.read()
     html = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES)
     content = html.find(id='content')
     result = dict()
     if content is None:
         return result

     # Hoisted out of the loop: the patterns do not change per entry.
     heading_pattern = re.compile(r'^(.+), interview(ed)? by (.+) on (.+)$')
     duration_pattern = re.compile(r'^(.+) Duration:? ((\d+) hr )?(\d+) ?min( (\d+) sec)?\.', re.MULTILINE)

     for item in content.findChildren('strong'):
         # Keep the exact type test: isinstance() would additionally match
         # NavigableString subclasses and alter the resulting heading.
         text_nodes = [s for s in item.contents if type(s) == NavigableString]
         heading = self.clean_string(''.join(text_nodes))
         if not heading:
             # One formatted string yields the same output as the former
             # print statement and parses under both Python 2 and 3.
             print("Bad heading (%s): %s" % (file, item.contents))
             continue
         h = heading_pattern.search(heading)
         if not h:
             print("Bad heading (%s): %s" % (file, heading))
             continue
         if heading_only:
             result[heading] = None
             continue

         # First paragraph after the heading's parent is the description.
         tag = item.parent.findNextSibling('p')
         description = tag.string if tag is not None else None

         # Subsequent right-aligned paragraphs list the attachments.
         attachments = []
         while tag is not None:
             tag = tag.findNextSibling('p')
             # Stop at the end of the sibling chain (None) as well as at
             # the first paragraph that is not right-aligned; the None case
             # used to raise AttributeError on tag.get().
             if tag is None or tag.get('align') != 'right':
                 break
             for a in tag.findAll('a'):
                 href = a.get('href')
                 if href is None:
                     # Slicing None for the extension would raise TypeError.
                     continue
                 attachments.append(dict(url=href,
                                         ext=href[-3:].lower(),
                                         title=self.clean_string(a.string)))

         # A missing paragraph or one with nested markup gives no string;
         # in that case there is no duration to parse.
         d = duration_pattern.search(description) if description is not None else None
         if d:
             description = d.group(1)
             duration = int(d.group(3) or '0') * 3600 + int(d.group(4)) * 60 + int(d.group(6) or '0')
         else:
             duration = 0
         result[heading] = dict(interviewee=h.group(1),
                                interviewer=h.group(3),
                                date=h.group(4),
                                description=description,
                                duration=duration,
                                attachments=attachments)
     return result