def scrape_article(article):
    """Fetch the full text for one article and store it under ``'fulltext'``.

    Args:
        article: Mapping with a ``'link'`` key. It is updated in place.

    Returns:
        The same ``article`` object. ``'fulltext'`` is assigned only when
        ``utils.scrape`` succeeds; on failure the error is logged (with
        traceback) and ``'fulltext'`` is left as it was.

    Raises:
        KeyError: If ``article`` has no ``'link'`` key.
    """
    link = article['link']
    try:
        article['fulltext'] = utils.scrape(link)
    except Exception:
        # Scraping is best-effort: record the failure and keep going.
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        logger.error('[scrapper] [error] Unable to scrape %s', link,
                     exc_info=True)
    else:
        logger.info('[scraped] %s', link)
    return article
def scrape_articles(limit=100):
    """Scrape and save the full text of stored articles, newest first.

    Iterates ``ArticleModel`` rows selected with
    ``exclude(fulltext__gt='')`` (presumably the ones that have no full
    text yet -- confirm against the model) ordered by ``-date``.  For each
    row the result of ``utils.scrape(article.link)`` is saved as
    ``fulltext``.  If scraping or saving fails, ``fulltext`` falls back to
    ``article.content`` and the row is saved again.

    Args:
        limit: Stop after this many *successful* scrapes.  Failures do not
            count towards the limit.  A falsy value (``0`` / ``None``)
            disables the cap.

    Returns:
        None.
    """
    articles = ArticleModel.objects.exclude(fulltext__gt='').order_by('-date')
    count = 0
    for article in articles:
        try:
            article.fulltext = utils.scrape(article.link)
            article.save()
        except Exception:
            # ``Exception`` rather than a bare ``except``: a Ctrl-C during a
            # long batch must abort the run, not be recorded as a failed
            # scrape that overwrites ``fulltext`` with ``content``.
            # (Double space kept so the output matches the former
            # ``print a, b`` statement byte for byte.)
            print('[scrapper] [error] Unable to scrape  %s' % article.link)
            article.fulltext = article.content
            article.save()
        else:
            print('[scraped]  %s' % article.link)
            count += 1
        if limit and count >= limit:
            break
def test_news24_1(self):
    """Scrape a news24.com article and check an expected phrase is present."""
    text = scrape('http://www.news24.com/SouthAfrica/News/R25m-worth-of-counterfeit-CDs-seized-20140320')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('producing the fake CDs', text)
def test_rt(self):
    """Scrape an rt.com article and check an expected phrase is present."""
    text = scrape('http://rt.com/politics/lavrov-crimea-slander-annexation-289/')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('On Thursday the bills were passed', text)
def test_eonline(self):
    """Scrape an eonline.com article and check an expected phrase is present."""
    # Adjacent literals are concatenated at compile time; the URL value
    # (including its query string) is unchanged.
    url = ('http://www.eonline.com/news/523549/'
           'shirtless-andrew-garfield-hits-the-waves-to-teach-autistic-kids-'
           'how-to-surf-swoon-alert'
           '?cmpid=rss-000000-rssfeed-365-topstories'
           '&utm_source=eonline&utm_medium=rssfeeds&utm_campaign=rss_topstories')
    text = scrape(url)
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('Because his girlfriend', text)
def test_supersport(self):
    """Scrape a supersport.com article and check an expected phrase is present."""
    text = scrape('http://www.supersport.com/motorsport/article.aspx?Id=2367604')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('Agag told AFP that people', text)
def test_sport24(self):
    """Scrape a sport24.co.za article and check an expected phrase is present."""
    text = scrape('http://www.sport24.co.za/Golf/PGATour/Scott-sizzles-at-Bay-Hill-20140321')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('World No 2 Scott', text)
def test_moneyweb(self):
    """Scrape a moneyweb.co.za article and check an expected phrase is present."""
    text = scrape('http://www.moneyweb.co.za/moneyweb-financial/rand-slips-against-dollar-2')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('consumer inflation data for February', text)
def test_newage(self):
    """Scrape a thenewage.co.za article and check an expected phrase is present."""
    text = scrape('http://www.thenewage.co.za/Detail.aspx?news_id=121438&cat_id=1007')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('KwaZulu-Natal transport', text)
def test_ewn(self):
    """Scrape an ewn.co.za article and check an expected phrase is present."""
    text = scrape('http://ewn.co.za/2014/03/21/Police-hunt-for-Bosmont-school-robbers')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('The pair targeted school officials', text)
def test_timeslive2(self):
    """Scrape a timeslive.co.za politics article and check an expected phrase."""
    text = scrape('http://www.timeslive.co.za/politics/2014/03/20/kzn-leads-in-unauthorised-expenditure-with-r1.2billion')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('Researcher Georgina Alexander', text)
def test_timeslive(self):
    """Scrape a timeslive.co.za world article and check an expected phrase."""
    text = scrape('http://www.timeslive.co.za/world/2014/03/20/more-eu-sanctions-on-russia-g-8-suspended-merkel')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('United States slapped sanctions', text)
def test_sowetan(self):
    """Scrape a sowetanlive.co.za article and check an expected phrase is present."""
    text = scrape('http://www.sowetanlive.co.za/news/2014/03/14/oscar-trial-first-cop-ignorant-about-window')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('Botha was replaced', text)
def test_news24_3(self):
    """Scrape a news24.com article and check an expected phrase is present."""
    text = scrape('http://www.news24.com/SouthAfrica/News/Energy-DG-steps-down-for-health-reasons-20140320')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('Deputy director general of Petroleum', text)
def test_news24_2(self):
    """Scrape a news24.com article and check an expected phrase is present."""
    text = scrape('http://www.news24.com/SouthAfrica/News/Transport-dept-to-probe-Kokstad-crash-20140320')
    # assertIn reports both the phrase and the scraped text on failure, so
    # the unconditional debug print of the whole article is not needed.
    self.assertIn('At least two people were seriously', text)