def extractData(self):
    """Extract title, headline, rubric and body from the parsed print page.

    Populates self.title, self.headline, self.rubric and self.body from
    self.soup, strips trailing junk elements and the pagination-filter
    paragraph from the body, and downloads every non-spacer image,
    rewriting its <img> tag to point at the local copy.
    """
    self.title = ''
    # NOTE(review): assumes the print page always contains an
    # <h1 class="print-title"> — confirm against the site markup,
    # otherwise .string on None raises AttributeError.
    self.headline = self.soup.find('h1', attrs={'class': 'print-title'}).string
    self.rubric = ''

    # Now do the body processing.
    self.body = self.soup.find('div', attrs={'class': 'print-content'})

    # Remove empty and rubbish elements from the end of the body.
    # The slice makes a copy, so extracting while iterating is safe.
    for element in self.body.contents[-5:]:
        element.extract()

    # This is the stupid paginfilter paragraph.
    self.body.contents[1].extract()

    # Download the images.
    for element in self.body.findAll('img'):
        src = element['src']
        # Skip past the spacer images; fix: reuse the bound `src` and use
        # endswith instead of the magic slice src[-10:] (same semantics,
        # 'spacer.gif' is exactly 10 characters).
        if not src.endswith('spacer.gif'):
            imgtag = ImageTag('http://moreintelligentlife.com', src)
            imgtag.downloadImage()
            # Update the image tag to point at the downloaded file.
            element.replaceWith(imgtag.getImageTagString())
        else:
            element.extract()
def extractData(self):
    """Extract title, headline, rubric and body from the parsed article page.

    Dispatches on self.article_type ('node'/'cfm' vs. 'blog') to select the
    correct CSS classes, then downloads each embedded image (rewriting its
    tag to point at the local copy) and removes the related-items divs.
    """
    # Fix: the original dispatched on article_type twice (once for the
    # title/headline/rubric, once for the body); merged into one chain so
    # the two cannot drift out of sync.
    if self.article_type in ('node', 'cfm'):
        self.title = unicode(self.soup.find('h2', attrs={'class': 'fly-title'}).string.strip())
        self.headline = unicode(self.soup.find('div', attrs={'class': 'headline'}).string.strip())
        self.rubric = unicode(self.soup.find('h1', attrs={'class': 'rubric'}).string.strip())
        # Extract the body of the article.
        self.body = self.soup.find('div', attrs={'class': 'ec-article-content clear'})
    elif self.article_type == 'blog':
        self.title = unicode(self.soup.find('h2', attrs={'class': 'ec-blog-fly-title'}).string.strip())
        self.headline = unicode(self.soup.find('p', attrs={'class': 'ec-blog-headline'}).string.strip())
        self.rubric = u''  # blogs have no rubric; u'' replaces redundant unicode('')
        self.body = self.soup.find('div', attrs={'class': 'ec-blog-body'})
    # NOTE(review): any other article_type leaves self.body unset, exactly
    # as the original two-chain version did — the loop below would then
    # raise AttributeError. Confirm whether other types can occur.

    # Download the images.
    for element in self.body.findAll('img'):
        imgtag = ImageTag('http://www.economist.com', element['src'])
        imgtag.downloadImage()
        # Update the image tag to point at the downloaded file.
        element.replaceWith(imgtag.getImageTagString())

    # Remove the related-items divs.
    for element in self.body.findAll('div', {'class': 'related-items'}):
        element.extract()