def get_topics(self):
    """Return the topics linked from ``browse.html`` under the media root.

    Reads ``browse.html`` from ``settings.SVOHP_MEDIA_ROOT``, finds the
    element whose id is ``content`` and collects every anchor inside it
    that carries an ``href`` attribute.

    Returns:
        dict: maps ``self.clean_string(a.string)`` for each anchor to its
        ``href`` with every ``%20`` replaced by a space.
    """
    path = os.path.join(settings.SVOHP_MEDIA_ROOT, 'browse.html')
    # Close the file deterministically instead of leaving the handle to
    # the garbage collector.
    with open(path, 'r') as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, convertEntities=BeautifulSoup.HTML_ENTITIES)
    content = soup.find(id='content')

    topics = {}
    # href=True restricts the search to anchors that have an href, so an
    # anchor without one no longer fails on None.replace().
    for a in content.findAll('a', href=True):
        topics[self.clean_string(a.string)] = a.get('href').replace('%20', ' ')
    return topics
def get_topics(self):
    """Return the topics linked from ``browse.html`` under the media root.

    Reads ``browse.html`` from ``settings.SVOHP_MEDIA_ROOT``, finds the
    element whose id is ``content`` and collects every anchor inside it
    that carries an ``href`` attribute.

    Returns:
        dict: maps ``self.clean_string(a.string)`` for each anchor to its
        ``href`` with every ``%20`` replaced by a space.
    """
    path = os.path.join(settings.SVOHP_MEDIA_ROOT, 'browse.html')
    # Close the file deterministically instead of leaving the handle to
    # the garbage collector.
    with open(path, 'r') as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, convertEntities=BeautifulSoup.HTML_ENTITIES)
    content = soup.find(id='content')

    topics = {}
    # href=True restricts the search to anchors that have an href, so an
    # anchor without one no longer fails on None.replace().
    for a in content.findAll('a', href=True):
        topics[self.clean_string(a.string)] = a.get('href').replace('%20', ' ')
    return topics
def get_items(self, file, heading_only=False):
    """Parse the interview entries listed in one HTML page of the media root.

    Every ``<strong>`` element inside the element whose id is ``content`` is
    treated as an entry heading of the form
    ``<interviewee>, interview[ed] by <interviewer> on <date>``. Headings
    that are empty or do not match that form are reported on stdout and
    skipped.

    Args:
        file: file name relative to ``settings.SVOHP_MEDIA_ROOT``.
        heading_only: when true, only the headings are collected and every
            value in the result is ``None``.

    Returns:
        dict: maps each cleaned heading to ``None`` (``heading_only``) or to
        a dict with the keys ``interviewee``, ``interviewer``, ``date``,
        ``description``, ``duration`` (seconds; 0 when no duration text is
        found) and ``attachments`` (a list of dicts with the keys ``url``,
        ``ext`` and ``title``).
    """
    path = os.path.join(settings.SVOHP_MEDIA_ROOT, file)
    # Close the file deterministically instead of leaving the handle to
    # the garbage collector.
    with open(path, 'r') as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, convertEntities=BeautifulSoup.HTML_ENTITIES)
    content = soup.find(id='content')

    # Both patterns are loop invariants, so build them once.
    heading_re = re.compile(r'^(.+), interview(ed)? by (.+) on (.+)$')
    duration_re = re.compile(
        r'^(.+) Duration:? ((\d+) hr )?(\d+) ?min( (\d+) sec)?\.',
        re.MULTILINE)
    bad_heading = "Bad heading (%s):" % file

    result = {}
    for item in content.findChildren('strong'):
        # Exact type comparison is kept (rather than isinstance) so that
        # instances of NavigableString subclasses stay excluded, exactly
        # as before.
        heading = self.clean_string(''.join(
            s for s in item.contents if type(s) is NavigableString))
        if not heading:
            # A single pre-formatted argument prints the same text under
            # the print statement and the print function.
            print("%s %s" % (bad_heading, item.contents))
            continue

        h = heading_re.search(heading)
        if not h:
            print("%s %s" % (bad_heading, heading))
            continue
        if heading_only:
            result[heading] = None
            continue

        # The description is the first <p> after the heading's parent; it
        # may be missing altogether.
        tag = item.parent.findNextSibling('p')
        description = tag.string if tag is not None else None

        # The right-aligned <p> elements that follow hold the attachment
        # links. Stop at the first other paragraph, or when the siblings
        # run out (findNextSibling then returns None).
        attachments = []
        while tag is not None:
            tag = tag.findNextSibling('p')
            if tag is None or tag.get('align') != 'right':
                break
            # href=True skips anchors without an href, which have no URL
            # or extension to record.
            for a in tag.findAll('a', href=True):
                href = a.get('href')
                attachments.append(dict(url=href,
                                        ext=href[-3:].lower(),
                                        title=self.clean_string(a.string)))

        # The description can be None (no paragraph, or no single string
        # in it); there is nothing to search in that case.
        d = None
        if description is not None:
            d = duration_re.search(description)
        if d:
            # Drop the trailing duration text and convert it to seconds.
            description = d.group(1)
            duration = (int(d.group(3) or '0') * 3600 +
                        int(d.group(4)) * 60 +
                        int(d.group(6) or '0'))
        else:
            duration = 0

        result[heading] = dict(interviewee=h.group(1),
                               interviewer=h.group(3),
                               date=h.group(4),
                               description=description,
                               duration=duration,
                               attachments=attachments)
    return result
def get_items(self, file, heading_only=False):
    """Parse the interview entries listed in one HTML page of the media root.

    Every ``<strong>`` element inside the element whose id is ``content`` is
    treated as an entry heading of the form
    ``<interviewee>, interview[ed] by <interviewer> on <date>``. Headings
    that are empty or do not match that form are reported on stdout and
    skipped.

    Args:
        file: file name relative to ``settings.SVOHP_MEDIA_ROOT``.
        heading_only: when true, only the headings are collected and every
            value in the result is ``None``.

    Returns:
        dict: maps each cleaned heading to ``None`` (``heading_only``) or to
        a dict with the keys ``interviewee``, ``interviewer``, ``date``,
        ``description``, ``duration`` (seconds; 0 when no duration text is
        found) and ``attachments`` (a list of dicts with the keys ``url``,
        ``ext`` and ``title``).
    """
    path = os.path.join(settings.SVOHP_MEDIA_ROOT, file)
    # Close the file deterministically instead of leaving the handle to
    # the garbage collector.
    with open(path, 'r') as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, convertEntities=BeautifulSoup.HTML_ENTITIES)
    content = soup.find(id='content')

    # Both patterns are loop invariants, so build them once.
    heading_re = re.compile(r'^(.+), interview(ed)? by (.+) on (.+)$')
    duration_re = re.compile(
        r'^(.+) Duration:? ((\d+) hr )?(\d+) ?min( (\d+) sec)?\.',
        re.MULTILINE)
    bad_heading = "Bad heading (%s):" % file

    result = {}
    for item in content.findChildren('strong'):
        # Exact type comparison is kept (rather than isinstance) so that
        # instances of NavigableString subclasses stay excluded, exactly
        # as before.
        heading = self.clean_string(''.join(
            s for s in item.contents if type(s) is NavigableString))
        if not heading:
            # A single pre-formatted argument prints the same text under
            # the print statement and the print function.
            print("%s %s" % (bad_heading, item.contents))
            continue

        h = heading_re.search(heading)
        if not h:
            print("%s %s" % (bad_heading, heading))
            continue
        if heading_only:
            result[heading] = None
            continue

        # The description is the first <p> after the heading's parent; it
        # may be missing altogether.
        tag = item.parent.findNextSibling('p')
        description = tag.string if tag is not None else None

        # The right-aligned <p> elements that follow hold the attachment
        # links. Stop at the first other paragraph, or when the siblings
        # run out (findNextSibling then returns None).
        attachments = []
        while tag is not None:
            tag = tag.findNextSibling('p')
            if tag is None or tag.get('align') != 'right':
                break
            # href=True skips anchors without an href, which have no URL
            # or extension to record.
            for a in tag.findAll('a', href=True):
                href = a.get('href')
                attachments.append(dict(url=href,
                                        ext=href[-3:].lower(),
                                        title=self.clean_string(a.string)))

        # The description can be None (no paragraph, or no single string
        # in it); there is nothing to search in that case.
        d = None
        if description is not None:
            d = duration_re.search(description)
        if d:
            # Drop the trailing duration text and convert it to seconds.
            description = d.group(1)
            duration = (int(d.group(3) or '0') * 3600 +
                        int(d.group(4)) * 60 +
                        int(d.group(6) or '0'))
        else:
            duration = 0

        result[heading] = dict(interviewee=h.group(1),
                               interviewer=h.group(3),
                               date=h.group(4),
                               description=description,
                               duration=duration,
                               attachments=attachments)
    return result