Example #1
0
 def get_page(self, title, cache=True, beautiful=False, **kwargs):
   '''Get the wikitext (or html) of a wikipedia page with a given title.

   title -- title of the page to load
   cache [True] -- Read from / write to self.cache so that we are not
     abusing wikipedia
   beautiful [False] -- Request the rendered html instead of the wikitext
     and return it wrapped in BeautifulSoup -- for tables
   **kwargs -- updates the requests params (e.g. section=1)

   Returns the page content as a string, or a BeautifulSoup object when
   beautiful is set.  If the content looks like a redirect, the target
   page is fetched and returned instead; in that case nothing is stored
   under the original title.

   Raises KeyError when the API response has no 'parse' entry (for
   example when the API reports an error for the requested title).

   NOTE(review): the cache is keyed by title only, so wikitext and html
   (or different kwargs such as section) for the same title share one
   entry -- confirm callers do not mix them with cache=True.

   Example URL:
   http://en.wikipedia.org/w/api.php?
   action=parse&prop=wikitext&page=List_of_Nobel_laureates_in_Physics&
   format=json&section=1

   '''
   # Load from the cache
   if cache and title in self.cache:
     cached = self.cache[title]
     return BeautifulSoup(cached) if beautiful else cached

   # If using BeautifulSoup get the html rather than the wikitext
   if beautiful:
     kwargs['prop'] = 'text'

   # url parameters for the wikipedia api; kwargs win over the defaults
   params = dict(
     action='parse',
     prop='wikitext',
     page=title.encode('utf-8'),
     format='json',
   )
   params.update(kwargs)

   request = self.session.get(URL, params=params)
   # Single-argument call form prints the same on Python 2 and Python 3.
   print('Loaded: {}'.format(request.url))

   data = json.loads(request.content)
   if 'parse' not in data:
     # Still a KeyError (as before), but say what went wrong instead of
     # failing with a bare KeyError('parse').
     raise KeyError('No parse result for {!r}: {}'.format(
       title, data.get('error')))
   page = data['parse'][params['prop']]['*']

   # handle redirects nicely
   if '#REDIRECT' in page:
     if beautiful:
       code = BeautifulSoup(page)
       # attrs has to be a dict; a set literal here does not filter on
       # class="redirectText" as intended.
       target = code.find('span', {'class': 'redirectText'})
       newtitle = target.find('a').get('title')
     else:
       code = mwparserfromhell.parse(page)
       # The redirect target is taken from the first wikilink of the page.
       newtitle = u'{}'.format(code.filter_wikilinks()[0].title)
     return self.get_page(newtitle, cache=cache, beautiful=beautiful, **kwargs)

   # Save to the cache for the future
   if cache:
     self.cache[title] = page

   # Output the wikitext string or make it beautiful
   return BeautifulSoup(page) if beautiful else page