Example #1
 def extract(self):
     soup = make_soup(FourFourTwo.base_url)
     divs = soup.find('div', {'class': 'content-wrapper'})
     divs = divs.find('div', {'class': 'view-content'})
     divs = iter(divs.findChildren(recursive=False))
     self.articles = (self.crawl(div) for div in divs)
     return super().extract()
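All of these examples call a make_soup helper that the listing never shows. A minimal sketch of what it presumably does, assuming it fetches the page with requests and parses it with BeautifulSoup (the timeout and parser choice here are guesses, not the original implementation):

 import requests
 from bs4 import BeautifulSoup

 def make_soup(url):
     # Fetch the page and parse the HTML; raising on HTTP errors keeps
     # failed requests visible to callers such as crawl().
     response = requests.get(url, timeout=10)
     response.raise_for_status()
     return BeautifulSoup(response.text, 'html.parser')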
Example #2
 def crawl(self, tag):
     url = None
     try:
         anchor = tag.find('div', {'class': 'title'}).find('a')
         url = self.get_url(anchor)
         title = self.get_title(anchor)
         if title:
             date_published = self.get_date_published(tag.find('div',
                                                      {'class': 'created'}))
             div = make_soup(url)
             author = self.get_author(div.find('p', {'class': 'authorName'}))
             return Article(FourFourTwo.source, title, url, author,
                            date_published)
     except (exceptions.WebCrawlException, AttributeError) as e:
         # An AttributeError carries neither .message nor .tag, so fall
         # back to str(e) and None rather than raising inside the handler.
         return InvalidArticle(FourFourTwo.source, e.__class__.__name__,
                               getattr(e, 'message', str(e)), url,
                               str(getattr(e, 'tag', None)))
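The Article and InvalidArticle containers returned above are not shown either. A sketch consistent with the positional call sites, assuming plain dataclasses (the field names are inferred from the arguments and are not confirmed by the source):

 from dataclasses import dataclass

 @dataclass
 class Article:
     source: str
     title: str
     url: str
     author: str
     date_published: str

 @dataclass
 class InvalidArticle:
     source: str
     error_name: str   # e.__class__.__name__ at the call site
     message: str
     url: str
     tag: str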
Example #3
 def crawl(self, tag):
     url = None
     try:
         anchor = tag.find('a')
         url = self.get_url(anchor)
         # Added here because URLs are filtered via method _is_valid_article
         if not url:
             return None
         title = self.get_title(anchor)
         if title:
             div = make_soup(url)
             div = div.find('div', {'class': 'content__meta-container'})
             author = self.get_author(div.find('a', {'rel': 'author'}))
             date_published = self.get_date_published(None)
             return Article(TheGuardian.source, title, url, author,
                            date_published)
     except exceptions.WebCrawlException as e:
         return InvalidArticle(TheGuardian.source, e.__class__.__name__,
                               e.message, url, str(e.tag))
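Every extract() method assigns a generator of crawl() results to self.articles and then defers to super().extract(). The base class is not part of the listing; a minimal sketch, assuming its only job is to drain that generator and drop the None entries crawl() returns for filtered URLs:

 class BaseCrawler:
     def extract(self):
         # Hypothetical base implementation: materialise the lazy
         # generator the subclass prepared and discard skipped items.
         return [article for article in self.articles
                 if article is not None]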
Example #4
 def extract(self):
     soup = make_soup(ESPNFC.base_url)
     divs = soup.find('div', {'alt': ' TOP STORIES '})
     divs = iter(divs.find_all('div', {'class': 'grid-item-content'}))
     self.articles = (self.crawl(div) for div in divs)
     return super().extract()
Example #5
 def extract(self):
     url = self._generate_url()
     soup = make_soup(url)
     divs = iter(soup.find_all('div', {'class': 'fc-item__container'}))
     self.articles = (self.crawl(div) for div in divs)
     return super().extract()
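Hypothetical usage, assuming the crawler classes take no constructor arguments and extract() returns the list sketched earlier:

 crawler = TheGuardian()
 for result in crawler.extract():
     if isinstance(result, Article):
         print(result.title, result.url)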