Beispiel #1
0
 def extractData(self):
     """Fill in title, headline, rubric and body from the parsed print page.

     Reads ``self.soup`` and sets:

     * ``self.title`` and ``self.rubric`` to empty strings,
     * ``self.headline`` to the ``.string`` of the ``h1.print-title`` tag,
     * ``self.body`` to the ``div.print-content`` tag, cleaned up in place.

     Clean-up removes the last five child nodes of the body, then the node
     at index 1 when one exists, and finally rewrites every ``<img>``:
     spacer images are dropped, all others are downloaded through
     ``ImageTag`` and replaced by ``imgtag.getImageTagString()``.

     An AttributeError propagates if either ``find`` call yields ``None``
     (presumably meaning the tag is absent from the page -- confirm against
     the parser in use).
     """
     self.title = ''
     self.headline = self.soup.find('h1', attrs={'class': 'print-title'}).string
     self.rubric = ''

     # now do the body processing
     self.body = self.soup.find('div', attrs={'class': 'print-content'})

     # Remove empty and rubbish elements from the end of the body.  The
     # slice is a separate list, so extracting while looping is safe.
     for element in self.body.contents[-5:]:
         element.extract()

     # Index 1 is the pagination-filter paragraph.  A short body may have
     # fewer than two nodes left after the trim above; skip instead of
     # raising IndexError in that case.
     if len(self.body.contents) > 1:
         self.body.contents[1].extract()

     # download the images
     for element in self.body.findAll('img'):
         src = element['src']
         if src.endswith('spacer.gif'):
             # layout spacer: nothing worth downloading, just drop the tag
             element.extract()
             continue
         imgtag = ImageTag('http://moreintelligentlife.com', src)
         imgtag.downloadImage()
         # update the image tag to point at the downloaded file
         element.replaceWith(imgtag.getImageTagString())
Beispiel #2
0
    def extractData(self):
        """Fill in title, headline, rubric and body for the current article.

        Which tags are read from ``self.soup`` depends on
        ``self.article_type``:

        * ``'node'`` / ``'cfm'``: title from ``h2.fly-title``, headline from
          ``div.headline``, rubric from ``h1.rubric``, body from
          ``div.ec-article-content clear``.
        * ``'blog'``: title from ``h2.ec-blog-fly-title``, headline from
          ``p.ec-blog-headline``, empty rubric, body from
          ``div.ec-blog-body``.

        Text fields are stored as stripped ``unicode`` values.  Any other
        article type assigns nothing here, so the body clean-up below works
        on whatever ``self.body`` already holds (or fails with
        AttributeError if it was never set).

        The body is then cleaned in place: ``div.related-items`` blocks are
        removed, and every remaining ``<img>`` is downloaded through
        ``ImageTag`` and replaced by ``imgtag.getImageTagString()``.
        """
        if self.article_type in ('node', 'cfm'):
            self.title = unicode(self.soup.find('h2', attrs={'class': 'fly-title'}).string.strip())
            self.headline = unicode(self.soup.find('div', attrs={'class': 'headline'}).string.strip())
            self.rubric = unicode(self.soup.find('h1', attrs={'class': 'rubric'}).string.strip())
            # extract the body of the article
            self.body = self.soup.find('div', attrs={'class': 'ec-article-content clear'})
        elif self.article_type == 'blog':
            self.title = unicode(self.soup.find('h2', attrs={'class': 'ec-blog-fly-title'}).string.strip())
            self.headline = unicode(self.soup.find('p', attrs={'class': 'ec-blog-headline'}).string.strip())
            self.rubric = unicode('')
            # extract the body of the article
            self.body = self.soup.find('div', attrs={'class': 'ec-blog-body'})

        # Remove the related-items divs first, so images that only appear
        # inside them are never downloaded just to be thrown away.
        for related in self.body.findAll('div', {'class': 'related-items'}):
            related.extract()

        # download the images that are still part of the body
        for element in self.body.findAll('img'):
            imgtag = ImageTag('http://www.economist.com', element['src'])
            imgtag.downloadImage()
            # update the image tag to point at the downloaded file
            element.replaceWith(imgtag.getImageTagString())