def extract(self):
    soup = make_soup(FourFourTwo.base_url)
    divs = soup.find('div', {'class': 'content-wrapper'})
    divs = divs.find('div', {'class': 'view-content'})
    divs = iter(divs.findChildren(recursive=False))
    self.articles = (self.crawl(div) for div in divs)
    return super().extract()
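# `make_soup` is not defined in this section; below is a minimal sketch of
# what it likely does, assuming the requests + BeautifulSoup stack that the
# `find`/`find_all` calls above imply. The timeout value and the
# 'html.parser' backend are assumptions, not taken from the original code.
import requests
from bs4 import BeautifulSoup

def make_soup(url):
    # Fetch the page and parse it into a navigable BeautifulSoup object.
    response = requests.get(url, timeout=10)  # timeout is an assumed value
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')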
def crawl(self, tag):
    url = None
    try:
        anchor = tag.find('div', {'class': 'title'}).find('a')
        url = self.get_url(anchor)
        title = self.get_title(anchor)
        if title:
            date_published = self.get_date_published(tag.find('div', {'class': 'created'}))
            # The byline lives on the article page itself, so fetch it.
            div = make_soup(url)
            author = self.get_author(div.find('p', {'class': 'authorName'}))
            return Article(FourFourTwo.source, title, url, author, date_published)
    except (exceptions.WebCrawlException, AttributeError) as e:
        # AttributeError has no `message` or `tag` attribute in Python 3, so
        # fall back via getattr rather than raising inside the handler.
        return InvalidArticle(FourFourTwo.source, e.__class__.__name__,
                              getattr(e, 'message', str(e)), url,
                              str(getattr(e, 'tag', None)))
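# `Article` and `InvalidArticle` are constructed positionally throughout; a
# plausible sketch of those containers as namedtuples, with field names
# inferred from the call sites above. The real definitions may differ.
from collections import namedtuple

Article = namedtuple('Article', ['source', 'title', 'url', 'author', 'date_published'])
InvalidArticle = namedtuple('InvalidArticle', ['source', 'error', 'message', 'url', 'tag'])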
def crawl(self, tag):
    url = None
    try:
        anchor = tag.find('a')
        url = self.get_url(anchor)
        # Added here because URLs are filtered via method _is_valid_article
        if not url:
            return None
        title = self.get_title(anchor)
        if title:
            div = make_soup(url)
            div = div.find('div', {'class': 'content__meta-container'})
            author = self.get_author(div.find('a', {'rel': 'author'}))
            # No date tag is parsed from this markup; the helper is expected
            # to handle None.
            date_published = self.get_date_published(None)
            return Article(TheGuardian.source, title, url, author, date_published)
    except exceptions.WebCrawlException as e:
        return InvalidArticle(TheGuardian.source, e.__class__.__name__,
                              getattr(e, 'message', str(e)), url,
                              str(getattr(e, 'tag', None)))
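# The comment above references `_is_valid_article`; only its name appears in
# this section, so the following is a hypothetical sketch of the kind of URL
# filter it might be, assuming Guardian football links are the target. The
# substring check is illustrative, not the project's actual rule.
def _is_valid_article(self, url):
    # Hypothetical filter: keep only links into the football section.
    return url is not None and '/football/' in url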
def extract(self):
    soup = make_soup(ESPNFC.base_url)
    divs = soup.find('div', {'alt': ' TOP STORIES '})
    divs = iter(divs.find_all('div', {'class': 'grid-item-content'}))
    self.articles = (self.crawl(div) for div in divs)
    return super().extract()
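# `get_url` and `get_title` are shared helpers not shown here; a minimal
# sketch assuming they live on a common crawler base class and raise the
# custom `WebCrawlException` seen in the handlers when a tag is missing.
# The exception's (message, tag) signature is inferred from the `e.message`
# and `e.tag` accesses in the except blocks, not confirmed by the source.
def get_url(self, anchor):
    if anchor is None or not anchor.get('href'):
        raise exceptions.WebCrawlException('missing article URL', anchor)
    return anchor['href']

def get_title(self, anchor):
    return anchor.get_text(strip=True) if anchor else None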
def extract(self):
    url = self._generate_url()
    soup = make_soup(url)
    divs = iter(soup.find_all('div', {'class': 'fc-item__container'}))
    self.articles = (self.crawl(div) for div in divs)
    return super().extract()
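# Each extractor ends with `return super().extract()`, so a shared base class
# presumably consumes the `self.articles` generator. A minimal sketch of that
# contract, assuming the base class simply materialises the generator and
# drops the None entries that crawl methods can return after URL filtering.
class Extractor:
    def extract(self):
        # Force the lazy generator and discard skipped/filtered articles.
        return [article for article in self.articles if article is not None]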