def get_article(self, url):
    '''Implementation for getting an article from the CBC.

    url: A URL in the cbc.ca/news/* domain.

    Returns: The Article representing the article at that url, or None if
    unable to scrape the article.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    try:
        headline = soup.h1.string
        # The story-content div may be missing on non-standard pages, in
        # which case `article` is None and find_all raises AttributeError.
        # The docstring promises None on scrape failure, so catch it here
        # rather than crashing (previously only the headline was guarded).
        article = soup.find('div', attrs={'class': 'story-content'})
        paragraphs = article.find_all('p', attrs={'class': None})
    except AttributeError:
        log.error('Exception trying to scrape CBC article from %s' % (url))
        return None
    body = ' '.join([p.get_text() for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.CBC)
def get_article(self, url):
    '''Implementation for getting an article from CNN.

    Args:
      url: A URL in the www.cnn.* domain.

    Returns:
      The Article representing the article at that url, or None if the
      page cannot be scraped.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    title_tag = soup.find("title")
    # CNN titles look like "Headline - CNN.com". Split only on the first
    # dash so headlines containing dashes stay intact, and don't assume a
    # dash exists (k[1] previously raised IndexError when it didn't).
    headline = title_tag.text.split("-", 1)[0]
    paragraphs = soup.findAll("p", attrs={'class': 'zn-body__paragraph'})
    # Use helpers.decode like the other scrapers. The old
    # paragraph.text.decode("utf-8") implicitly ascii-encoded the unicode
    # text first, and the UnicodeEncodeError handler silently dropped
    # every paragraph containing non-ascii characters.
    body = " ".join(
        [helpers.decode(p.text).replace("\"", "'") for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.CNN)
def get_article(self, url):
    '''Scrape a Guardian article.

    url: A URL in the guardian.com domain.

    Returns: The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = soup.h1.string.strip('\n')
    # Two page layouts exist, distinguished by the host part of the URL.
    if url.split('.com/')[1].startswith('theguardian'):
        body_class = 'flexible-content-body'
    else:
        body_class = 'content__article-body'
    article = soup.find('div', attrs={'class': body_class})
    body_parts = []
    for paragraph in article.find_all('p', attrs={'class': None}):
        body_parts.append(paragraph.get_text())
    body = ' '.join(body_parts)
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.GUARDIAN)
def get_article(self, url):
    '''Scrape an article from Russia Today.

    url: A URL in the russia_today.com domain.

    Returns: The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = helpers.decode(soup.h1.string)
    article = soup.find('div', attrs={'class': 'cont-wp'})
    paragraphs = article.find_all('p', attrs={'class': None})

    def _is_article_text(text):
        # RT embeds 'Tags'/'Trends' section headers and 'READ MORE' links
        # inline with the article paragraphs; filter them out.
        return not (text.startswith('\nREAD') or
                    text == 'Tags' or text == 'Trends')

    decoded = [helpers.decode(p.get_text()) for p in paragraphs]
    body = ' '.join([text for text in decoded if _is_article_text(text)])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.RUSSIA_TODAY)
def get_article(self, url):
    '''Scrape an article from the Globe and Mail.

    url: A URL in the theglobeandmail.com/* domain.

    Returns: The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    # Remove the anchor inside the headline so only its text remains.
    soup.h1.a.extract()
    headline = soup.h1.get_text().encode('ascii', 'ignore').strip('\n')
    article = soup.find('div', attrs={'class': 'entry-content'})
    # Strip related-content boxes and asides that sit inline with the
    # article text before collecting paragraphs.
    for related in article.find_all('div',
                                    attrs={'class': 'entry-related'}):
        related.extract()
    for aside in article.find_all('aside'):
        aside.extract()
    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join([p.get_text().encode('ascii', 'ignore')
                     for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.GLOBE_AND_MAIL)
def get_article(self, url):
    '''Implementation for getting an article from JPost.

    Args:
      url: A URL in the www.jpost.com/* domain.

    Returns:
      The Article representing the article at that url, or None if
      scraping fails.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    try:
        h1 = soup.find('h1', attrs={'class': 'article-title'})
        headline = h1.text.strip().strip('\r\n')
        text_div = soup.find("div", {"class": "article-text"})
        article = text_div.find("p")
        body = article.text
    except Exception as e:
        log.error('Error scraping JPost article at %s: %s' % (url, e))
        # Bug fix: previously execution fell through after logging and
        # hit a NameError on the undefined 'article'; return None instead.
        return None
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.JPOST)
def get_article(self, url):
    '''Returns an Article representing the article at url.'''
    # Stage 1: fetch and scrape. Any scraping exception is logged with
    # the offending line number and turned into a None return.
    try:
        html = helpers.get_content(url)
        if not html:
            return None
        soup = BeautifulSoup(html)
        headline = self.get_headline(soup)
        body = self.get_body(soup)
        date = self.get_date(soup)
    except Exception as e:
        line_no = sys.exc_info()[-1].tb_lineno
        logger.log.error(
            "Hit exception on line number %s getting article for %s:"
            " %s" % (line_no, url, e))
        return None

    # Stage 2: normalize the scraped strings.
    try:
        headline = helpers.decode(headline)
        body = helpers.decode(body)
        date = helpers.decode(date)
    except Exception as e:
        line_no = sys.exc_info()[-1].tb_lineno
        logger.log.error('Error on line %s decoding url %s: %s' %
                         (line_no, url, e))
        return None

    logger.log.info('URL: %s' % url)
    logger.log.info('headline: %s' % headline)
    logger.log.info('Body: %s' % body)
    return news_interface.Article(headline, body, url, self.news_org, date)
def get_article(self, url):
    '''Scrape an article from the NYTimes.

    url: A URL in the ny_times.com domain.

    Returns: The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = helpers.decode(soup.h1.string)
    try:
        # Older layout: an 'articleBody' div with itemprop'd paragraphs.
        container = soup.find('div', attrs={'class': 'articleBody'})
        paragraphs = container.find_all('p',
                                        attrs={'itemprop': 'articleBody'})
    except AttributeError:
        # Some articles use a different markup variant; hopefully there
        # are only these two versions.
        container = soup.find('div', attrs={'class': 'story-body'})
        paragraphs = container.find_all('p',
                                        attrs={'class': 'story-content'})
    body = ' '.join([helpers.decode(p.get_text()) for p in paragraphs])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.NY_TIMES)
def get_article(self, url):
    '''Scrape an article from USA Today.

    url: A URL in the http://www.usatoday.com/story/* domain.

    Returns: The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    story = soup.article
    headline = helpers.decode(story.h1.string)
    texts = [helpers.decode(p.get_text())
             for p in story.find_all('p', attrs={'class': None})]
    body = ' '.join(texts)
    return news_interface.Article(headline, body, url, news_orgs.USA_TODAY)
def get_article(self, url):
    '''Scrape an article from the New York Post.

    url: A URL in the nypost.com domain.

    Returns: The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    # The headline text lives inside an anchor within the <h1>.
    headline = helpers.decode(soup.h1.a.string)
    content = soup.find('div', attrs={'class': 'entry-content'})
    texts = []
    for paragraph in content.find_all('p', attrs={'class': None}):
        texts.append(helpers.decode(paragraph.get_text()))
    body = ' '.join(texts)
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.NY_POST)
def get_article(self, url):
    '''Scrape an article from BBC.

    url: A URL in the www.bbc.* domain.

    Returns: The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline = soup.h1.string
    story = soup.find('div', attrs={'class': 'story-body'})
    texts = [p.get_text()
             for p in story.find_all('p', attrs={'class': None})]
    body = ' '.join(texts)
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.BBC)
def get_article(self, url):
    '''Scrape an article from Al Jazeera.

    Args:
      url: A URL in the aljazeera.* domain.

    Returns:
      The Article representing the article at that url, or None if unable
      to get the Article.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    # News stories and opinion pieces use different headline classes;
    # take the first one present.
    headline = None
    for h1_class in ("heading-story", "articleOpinion-title"):
        h1 = soup.find("h1", {"class": h1_class})
        if h1 is not None:
            headline = h1.string
            break
    if not headline:
        log.error(
            'Exception trying to scrape Al Jazeera headline from %s' % (url))
        return None
    headline = helpers.decode(headline)

    # The body container likewise differs between the two page layouts.
    container = soup.find("div", {"class": "article-body"})
    if container is None:
        container = soup.find("div", {"class": "text"})
    article = container.findAll("p")
    body = ' '.join([helpers.decode(p.text) for p in article])
    return news_interface.Article(headline, body, url, news_orgs.ALJAZEERA)
def get_article(self, url):
    '''Scrape an article from Todays Zaman.

    Args:
      url: A URL in the www.todayszaman.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    # The page <title> doubles as the headline.
    title_tag = soup.find("title")
    headline = helpers.decode(title_tag.text)
    news_text = soup.find("div", {"id": "newsText"})
    body = ' '.join([helpers.decode(p.text)
                     for p in news_text.findAll("p")])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.TODAYS_ZAMAN)
def get_article(self, url):
    '''Scrape an article from REUTERS.

    url: A URL in the www.reuters.com* domain.

    Returns: The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    # Headline and body both live in the main grid column; look it up
    # once and reuse it.
    column = soup.find('div', attrs={'class': 'column1 gridPanel grid8'})
    headline = helpers.decode(column.h1.string)
    body = ' '.join([helpers.decode(p.text) for p in column.findAll("p")])
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.REUTERS)
def get_article(self, url):
    '''Scrape an article from Times of Israel.

    Args:
      url: A URL in the www.timesofisrael.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
        return None

    soup = BeautifulSoup(html)
    headline_tag = soup.find('h1', attrs={'class': 'headline'})
    headline = helpers.decode(headline_tag.text)
    texts = [helpers.decode(p.text)
             for p in soup.findAll("p", {"itemprop": "articleBody"})]
    body = ' '.join(texts)
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url,
                                  news_orgs.TIMES_OF_ISRAEL)