Example #1
0
 def scrapItem(self, url=None, category=None):
     '''
     Scrape one news page and save its content as a news item.

     When url is None, the item to scrape is taken from
     NewsSource.findEmptyNewsItem() and its 'url' and 'category' values
     replace the arguments. The category is used as a key into
     self.sources to find the 'news_item_expr' selector.

     Raises IndexError when the selector matches nothing on the page.
     An OperationFailure raised by the insert is logged and not re-raised.
     '''
     newsSource = NewsSource(self.default_database, self.default_collection)
     newsItem = NewsItem(self.default_database, self.default_collection)

     if url is None:
         item = newsSource.findEmptyNewsItem()
         title = item['title']
         url = item['url']
         category = item['category']
         self.logger.debug("Item: "+title+" / "+category)

     details = self.sources[category]
     scrap = Scraper(url).get()

     # The selector may match several elements; all of them make up the
     # news content.
     allNews = scrap.select(details['news_item_expr'])
     if len(allNews) == 0:
         # Same exception type as the former bare allNews[0] lookup, but
         # with enough context to find the offending page and selector.
         message = ("No content matched '"+str(details['news_item_expr'])
                    + "' at "+url)
         self.logger.warning(message)
         raise IndexError(message)

     if len(allNews) > 1:
         content = " ".join(str(n) for n in allNews)
     else:
         content = allNews[0]

     newsItem.addNewsItem(url, content)
     try:
         newsItem.insertNewsItem()
     except OperationFailure as error:
         # Best effort: a failed insert must not abort the run, but it
         # should leave a trace in the log.
         self.logger.warning("Item not saved: "+url+" ("+str(error)+")")
     else:
         self.logger.debug(Stripper().strip(str(content)))
     finally:
         newsItem.resetNewsItem()

     info = "Item Scraped: "+url
     self.logger.info(info)
     print(info)
Example #2
0
 def utusan(self):
     '''
     Print the stripped content stored for a fixed Utusan article URL.

     Looks the URL up through NewsItem('news', 'utusan').findOne and
     prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.utusan.com.my/utusan/Dalam_Negeri/20130223/dn_24/Kerajaan-kaji-BI-sebagai-wajib-lulus-di-peringkat-SPM'
     store = NewsItem('news', 'utusan')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #3
0
 def theSunDaily(self):
     '''
     Print the stripped content stored for a fixed theSunDaily article URL.

     Looks the URL up through NewsItem('news', 'theSunDaily').findOne and
     prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.thesundaily.my/news/572375'
     store = NewsItem('news', 'theSunDaily')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #4
0
 def theStar(self):
     '''
     Print the stripped content stored for a fixed theStar article URL.

     Looks the URL up through NewsItem('news', 'theStar').findOne and
     prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://thestar.com.my/news/story.asp?file=/2012/12/17/nation/20121217144240&sec=nation'
     store = NewsItem('news', 'theStar')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #5
0
 def theMalaysianTimes(self):
     '''
     Print the stripped content stored for a fixed theMalaysianTimes
     article URL.

     Looks the URL up through NewsItem('news', 'theMalaysianTimes').findOne
     and prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.themalaysiantimes.com.my/?p=62774'
     store = NewsItem('news', 'theMalaysianTimes')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #6
0
 def theMalaysianInsider(self):
     '''
     Print the stripped content stored for a fixed theMalaysianInsider
     article URL.

     Looks the URL up through
     NewsItem('news', 'theMalaysianInsider').findOne and prints the
     record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.themalaysianinsider.com/bahasa/article/mb-terengganu-akui-kontraktor-lemah-punca-bumbung-runtuh-kali-kedua/'
     store = NewsItem('news', 'theMalaysianInsider')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #7
0
 def selangorTimes(self):
     '''
     Print the stripped content stored for a fixed selangorTimes article
     URL.

     Looks the URL up through NewsItem('news', 'selangorTimes').findOne
     and prints the record's 'content' after Stripper.strip.
     '''
     # Alternative article kept for manual switching:
     #article_url = 'http://www.selangortimes.com/index.php?section=news&permalink=20130207180252-mindef-spending-raises-eyebrows'
     article_url = 'http://www.selangortimes.com/index.php?section=culture&permalink=20130222113339-a-chance-to-dazzle-like-teresa-teng'
     store = NewsItem('news', 'selangorTimes')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #8
0
 def selangorku(self):
     '''
     Print the stripped content stored for a fixed selangorku article URL.

     Looks the URL up through NewsItem('news', 'selangorku').findOne and
     prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.selangorku.com/?p=23146'
     store = NewsItem('news', 'selangorku')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #9
0
 def newStraitsTimes(self):
     '''
     Print the stripped content stored for a fixed newStraitsTimes article
     URL.

     Looks the URL up through NewsItem('news', 'newStraitsTimes').findOne
     and prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.nst.com.my/nation/general/indians-now-backing-bn-says-dpm-1.221564'
     store = NewsItem('news', 'newStraitsTimes')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #10
0
 def mySinchew(self):
     '''
     Print the stripped content stored for a fixed mySinchew article URL.

     Looks the URL up through NewsItem('news', 'mySinchew').findOne and
     prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.mysinchew.com/node/83148'
     store = NewsItem('news', 'mySinchew')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #11
0
 def malaysiaKini(self):
     '''
     Print the stripped content stored for a fixed malaysiaKini article
     URL.

     Looks the URL up through NewsItem('news', 'malaysiaKini').findOne and
     prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.malaysiakini.com/news/221767'
     store = NewsItem('news', 'malaysiaKini')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #12
0
 def malaysiaChronicle(self):
     '''
     Print the stripped content stored for a fixed malaysiaChronicle
     article URL.

     Looks the URL up through NewsItem('news', 'malaysiaChronicle').findOne
     and prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.malaysia-chronicle.com/index.php?option=com_k2&view=item&id=57921:have-you-gone-senile-karpal?-apologize-immediately-to-jui-meng&Itemid=2'
     store = NewsItem('news', 'malaysiaChronicle')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #13
0
 def malayMail(self):
     '''
     Print the stripped content stored for a fixed malayMail article URL.

     Looks the URL up through NewsItem('news', 'malayMail').findOne and
     prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://www.mmail.com.my/story/bukit-aman-task-force-probe-info-dept-hacking-48005'
     store = NewsItem('news', 'malayMail')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))
Example #14
0
 def ipohEcho(self):
     '''
     Print the stripped content stored for a fixed ipohEcho article URL.

     Looks the URL up through NewsItem('news', 'ipohEcho').findOne and
     prints the record's 'content' after Stripper.strip.
     '''
     article_url = 'http://ipohecho.com.my/v2/2013/02/16/flood-mitigation-projects-whats-been-done/'
     store = NewsItem('news', 'ipohEcho')
     record = store.findOne({'url': article_url})
     stripper = Stripper()
     body = record['content']
     print(stripper.strip(body))