def get_page(self, title, cache=True, beautiful=False, **kwargs):
    '''Fetch the wikitext (or rendered html) of a Wikipedia page.

    title     -- Page title to fetch.
    cache     [True]  -- Serve/store the page via self.cache so we are not
                         abusing Wikipedia.
    beautiful [False] -- Return a BeautifulSoup of the rendered html
                         (prop='text') instead of the raw wikitext -- useful
                         for tables.
    **kwargs  -- Extra/overriding parameters for the API request.

    Example URL:
        http://en.wikipedia.org/w/api.php?action=parse&prop=wikitext&
        page=List_of_Nobel_laureates_in_Physics&format=json&section=1

    NOTE(review): the cache is keyed on title only, so a wikitext result and
    an html (beautiful) result for the same title share one slot -- whichever
    was fetched first is returned for both. Confirm this is acceptable.
    '''
    # Fast path: serve from the cache.
    if title in self.cache and cache:
        if beautiful:
            return BeautifulSoup(self.cache[title])
        else:
            return self.cache[title]

    # For BeautifulSoup output ask the API for rendered html, not wikitext.
    if beautiful:
        kwargs['prop'] = 'text'

    # URL parameters for the Wikipedia parse API.
    params = dict(
        action='parse',
        prop='wikitext',
        page=title.encode('utf-8'),
        format='json',
    )
    params.update(kwargs)

    request = self.session.get(URL, params=params)
    print('Loaded: {}'.format(request.url))
    page = json.loads(request.content)['parse'][params['prop']]['*']

    # Handle redirects nicely: extract the target title and recurse.
    # NOTE(review): the redirected page is cached under the *target* title,
    # not the title originally requested -- a repeat request for the original
    # title re-fetches the redirect stub.
    if '#REDIRECT' in page:
        if beautiful:
            code = BeautifulSoup(page)
            # BUG FIX: attrs must be a dict, not a set, for .find() to
            # match the redirect target's <span class="redirectText">.
            newtitle = code.find('span', {'class': 'redirectText'}).find('a').get('title')
        else:
            code = mwparserfromhell.parse(page)
            newtitle = u'{}'.format(code.filter_wikilinks()[0].title)
        kwargs.update(dict(cache=cache, beautiful=beautiful))
        return self.get_page(newtitle, **kwargs)

    # Save to the cache for the future.
    if cache:
        self.cache[title] = page

    # Output the raw wikitext/html string, or make it beautiful.
    if beautiful:
        return BeautifulSoup(page)
    else:
        return page