def getNewTalks(self):
    """Yield one dict per talk medallion found in self.html.

    Each yielded dict has the keys 'url', 'Title' and 'Thumb':
    'url' is URLTED joined with the anchor's href, 'Title' is the anchor's
    title attribute passed through cleanHTML, and 'Thumb' is the src of the
    first matching .jpg <img> passed through resizeImage.
    """
    # Only build a tree for elements whose class matches 'talkMedallion'.
    medallionFilter = SoupStrainer(attrs={'class': re.compile('talkMedallion')})
    jpgSource = re.compile('.+?\.jpg')
    for medallion in BeautifulSoup(self.html, parseOnlyThese=medallionFilter):
        # Values are evaluated in the same order as the original
        # assignments (link, then title, then thumbnail).
        yield {
            'url': URLTED + medallion.dt.a['href'],
            'Title': cleanHTML(medallion.dt.a['title']),
            'Thumb': resizeImage(
                medallion.find('img', attrs={'src': jpgSource})['src']),
        }
def getPrograms(self):
    """Return all programs in self.html.

    This is a generator: it yields one dict per program with the keys
    'url', 'Title' and 'Thumb'. The link and the thumbnail source are both
    prefixed with URLASI, and the title is the UTF-8 encoded 'title'
    attribute of the program's anchor.

    Entries for which no preceding <a> tag exists, or whose anchor holds no
    <img> with a .png/.jpg source, are skipped rather than raising.
    """
    # Selecting 'div', {'class': 'bloc-contenu-8'} directly does not work:
    # BeautifulSoup returns nothing for that class. Instead, match the
    # 'contenu-descr-8 ' divs (note the trailing space in the class value)
    # and walk back to the anchor that precedes each one.
    soup = BeautifulSoup(cleanHTML(self.html))

    # The previous pattern used the character class [png|jpg], which matches
    # a single character out of "p", "n", "g", "|", "j" (so ".gif" or ".js"
    # passed) instead of either extension. Use a real alternation.
    thumbSource = re.compile(r'.+?\.(?:png|jpg)')

    for media in soup.findAll('div', {'class': 'contenu-descr-8 '}):
        aTag = media.findPrevious('a')
        if aTag is None:
            # No anchor before this description: nothing to link to.
            continue

        imgTag = aTag.find('img', attrs={'src': thumbSource})
        if imgTag is None:
            # Anchor without a png/jpg thumbnail.
            continue

        # Get link, title and thumb
        mediaLink = URLASI + aTag['href']
        mediaTitle = aTag['title'].encode('utf-8')
        mediaThumb = URLASI + imgTag['src']
        yield {'url': mediaLink, 'Title': mediaTitle, 'Thumb': mediaThumb}
def getTalks(self):
    """Yield one dict per talk listed by the JSON source referenced in self.html.

    Each yielded dict has the keys 'url', 'Title' and 'Thumb', built from
    the HTML fragment stored under each result's 'markup' key.
    """
    # Theme pages load their talks through a JSON call, so the decoder is
    # imported here, at function scope.
    from simplejson import loads

    # The path of that JSON endpoint sits inside a DataSource("...") call in
    # the page; a regex is simpler than a soup lookup for this. Indexing [0]
    # takes the first occurrence.
    dataSourcePaths = re.findall('DataSource\("(.+?)"', self.html)
    jsonUrl = URLTED + dataSourcePaths[0]

    # Decode the JSON document fetched from that URL.
    payload = loads(getHTML(jsonUrl))

    jpgSource = re.compile('.+?\.jpg')
    for result in payload['resultSet']['result']:
        # Every result carries its own HTML snippet describing one talk.
        fragment = BeautifulSoup(result['markup'])
        yield {
            'url': URLTED + fragment.dt.a['href'],
            'Title': cleanHTML(fragment.dt.a['title']),
            'Thumb': resizeImage(
                fragment.find('img', attrs={'src': jpgSource})['src']),
        }
def getPrograms(self):
    """Return all programs in self.html.

    This is a generator: it yields one dict per program with the keys
    'url', 'Title' and 'Thumb'. The link and the thumbnail source are both
    prefixed with URLASI, and the title is the UTF-8 encoded 'title'
    attribute of the program's anchor.

    Entries for which no preceding <a> tag exists, or whose anchor holds no
    <img> with a .png/.jpg source, are skipped rather than raising.
    """
    # Selecting 'div', {'class': 'bloc-contenu-8'} directly does not work:
    # BeautifulSoup returns nothing for that class. Instead, match the
    # 'contenu-descr-8 ' divs (note the trailing space in the class value)
    # and walk back to the anchor that precedes each one.
    soup = BeautifulSoup(cleanHTML(self.html))

    # The previous pattern used the character class [png|jpg], which matches
    # a single character out of "p", "n", "g", "|", "j" (so ".gif" or ".js"
    # passed) instead of either extension. Use a real alternation.
    thumbSource = re.compile(r'.+?\.(?:png|jpg)')

    for media in soup.findAll('div', {'class': 'contenu-descr-8 '}):
        aTag = media.findPrevious('a')
        if aTag is None:
            # No anchor before this description: nothing to link to.
            continue

        imgTag = aTag.find('img', attrs={'src': thumbSource})
        if imgTag is None:
            # Anchor without a png/jpg thumbnail.
            continue

        # Get link, title and thumb
        yield {
            'url': URLASI + aTag['href'],
            'Title': aTag['title'].encode('utf-8'),
            'Thumb': URLASI + imgTag['src'],
        }