def refresh_messages(site=None):
    site = site or wikipedia.getSite()
    # get 'all messages' special page's path
    path = site.allmessages_address()
    print 'Retrieving MediaWiki messages for %s' % repr(site)
    wikipedia.put_throttle()  # It actually is a get, but a heavy one.
    allmessages = site.getUrl(path)

    print 'Parsing MediaWiki messages'
    soup = BeautifulSoup(allmessages,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    # The MediaWiki namespace in URL-encoded format, as it can contain
    # non-ASCII characters and spaces.
    quotedMwNs = urllib.quote(
        site.namespace(8).replace(' ', '_').encode(site.encoding()))
    mw_url = site.path() + "?title=" + quotedMwNs + ":"
    altmw_url = site.path() + "/" + quotedMwNs + ":"
    nicemw_url = site.nice_get_address(quotedMwNs + ":")
    shortmw_url = "/" + quotedMwNs + ":"
    ismediawiki = lambda url: url and (url.startswith(mw_url) or
                                       url.startswith(altmw_url) or
                                       url.startswith(nicemw_url) or
                                       url.startswith(shortmw_url))
    # we will save the found key:value pairs here
    dictionary = {}

    try:
        for keytag in soup('a', href=ismediawiki):
            # Key strings only contain ASCII characters, so we can save them
            # as strs
            key = str(keytag.find(text=True))
            keyrow = keytag.parent.parent
            if keyrow['class'] == "orig":
                valrow = keyrow.findNextSibling('tr')
                assert valrow['class'] == "new"
                value = unicode(valrow.td.string).strip()
            elif keyrow['class'] == 'def':
                value = unicode(keyrow('td')[1].string).strip()
            else:
                raise AssertionError("Unknown tr class value: %s"
                                     % keyrow['class'])
            dictionary[key] = value
    except Exception, e:
        wikipedia.debugDump(
            'MediaWiki_Msg', site,
            u'%s: %s while processing URL: %s'
            % (repr(e), str(e), unicode(path)),
            allmessages)
        raise
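
# A minimal, standalone sketch of the href test built in refresh_messages()
# above. The URL prefixes below are assumed, Wikipedia-style values used
# purely for illustration; the real prefixes are derived from the site
# object as shown in the function.
def _demo_ismediawiki(url):
    mw_url = "/w/index.php?title=MediaWiki:"
    altmw_url = "/w/index.php/MediaWiki:"
    nicemw_url = "/wiki/MediaWiki:"
    shortmw_url = "/MediaWiki:"
    return bool(url) and (url.startswith(mw_url) or
                          url.startswith(altmw_url) or
                          url.startswith(nicemw_url) or
                          url.startswith(shortmw_url))

# _demo_ismediawiki("/wiki/MediaWiki:Aboutsite")   -> True  (namespace 8 link)
# _demo_ismediawiki("/wiki/Special:AllMessages")   -> False (not a message page)
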
def refresh(site, sysop=False, witheditsonly=True):
    #if not site.has_api() or site.versionnumber() < 10:
    #    _refreshOld(site)

    # get botlist special page's URL
    if not site.loggedInAs(sysop=sysop):
        site.forceLogin(sysop=sysop)

    params = {
        'action': 'query',
        'list': 'allusers',
        'augroup': 'bot',
    }
    if witheditsonly:
        params['auwitheditsonly'] = ''

    pywikibot.output(u'Retrieving bot user list for %s via API.' % repr(site))
    pywikibot.put_throttle()  # It actually is a get, but a heavy one.
    botlist = []
    while True:
        data = pywikibot.query.GetData(params, site, sysop=sysop)
        if 'error' in data:
            raise RuntimeError('ERROR: %s' % data)
        botlist.extend([w['name'] for w in data['query']['allusers']])

        if 'query-continue' in data:
            params['aufrom'] = data['query-continue']['allusers']['aufrom']
        else:
            break

    pywikibot.output(u'Retrieving global bot user list for %s.' % repr(site))
    pywikibot.put_throttle()  # It actually is a get, but a heavy one.
    m1 = True
    offset = ''
    if site.versionnumber() >= 17:
        PATTERN = u'<li>(.*?) *\((.*?),\s(.*?)\)(?:.*?)</li>'
    else:
        PATTERN = u'<li>(.*?) *\((.*?),\s(.*?)\)</li>'
    while m1:
        text = site.getUrl(
            site.globalusers_address(offset=offset, group='Global_bot'))
        m1 = re.findall(u'<li>.*?</li>', text)
        for item in m1:
            m2 = re.search(PATTERN, item)
            (bot, flag_local, flag_global) = m2.groups()
            flag_local = (flag_local[:2] == u'<a')
            flag_global = True  # since group='Global_bot'

            if bot not in botlist:
                botlist.append(bot)
        #print len(botlist)
        offset = bot.encode(site.encoding())

    # Save the botlist to disk
    # The file is stored in the botlists subdir. Create if necessary.
    if sysop:
        f = open(
            pywikibot.config.datafilepath(
                'botlists',
                'botlist-%s-%s-sysop.dat' % (site.family.name, site.lang)),
            'w')
    else:
        f = open(
            pywikibot.config.datafilepath(
                'botlists',
                'botlist-%s-%s.dat' % (site.family.name, site.lang)),
            'w')
    pickle.dump(botlist, f)
    f.close()
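
# A small sketch of what PATTERN (the >= 1.17 form used above) extracts from
# one list item of the global bot listing. The sample HTML below is an
# assumption inferred from the regular expression and the flag_local check,
# not copied from a real Special:GlobalUsers page.
_sample_item = (u'<li>ExampleBot (<a href="/wiki/Special:ListUsers/bot">bot'
                u'</a>, global bot)</li>')
_m = re.search(u'<li>(.*?) *\((.*?),\s(.*?)\)(?:.*?)</li>', _sample_item)
# _m.group(1) -> u'ExampleBot'                 (the user name, used as `bot`)
# _m.group(2) -> the anchor tag for the local flag, so flag_local[:2] == u'<a'
# _m.group(3) -> u'global bot'                 (always true here, see comment)
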
def refresh(site, sysop=False, witheditsonly=True):
    #if not site.has_api() or site.versionnumber() < 10:
    #    _refreshOld(site)

    # get botlist special page's URL
    if not site.loggedInAs(sysop=sysop):
        site.forceLogin(sysop=sysop)

    params = {
        'action': 'query',
        'list': 'allusers',
        'augroup': 'bot',
    }
    if witheditsonly:
        params['auwitheditsonly'] = ''

    pywikibot.output(u'Retrieving bot user list for %s via API.' % repr(site))
    pywikibot.put_throttle()  # It actually is a get, but a heavy one.
    botlist = []
    while True:
        data = pywikibot.query.GetData(params, site, sysop=sysop)
        if 'error' in data:
            raise RuntimeError('ERROR: %s' % data)
        botlist.extend([w['name'] for w in data['query']['allusers']])

        if 'query-continue' in data:
            params['aufrom'] = data['query-continue']['allusers']['aufrom']
        else:
            break

    pywikibot.output(u'Retrieving global bot user list for %s.' % repr(site))
    pywikibot.put_throttle()  # It actually is a get, but a heavy one.
    m1 = True
    offset = ''
    while m1:
        text = site.getUrl(
            site.globalusers_address(offset=offset, group='Global_bot'))
        m1 = re.findall(u'<li>.*?</li>', text)
        for item in m1:
            m2 = re.search(u'<li>(.*?)\((.*?),\s(.*?)\)</li>', item)
            (bot, flag_local, flag_global) = m2.groups()
            bot = bot[:-2]
            flag_local = (flag_local[:2] == u'<a')
            flag_global = True  # since group='Global_bot'

            if bot not in botlist:
                botlist.append(bot)
        #print len(botlist)
        offset = bot.encode(site.encoding())

    # Save the botlist to disk
    # The file is stored in the botlists subdir. Create if necessary.
    if sysop:
        f = open(
            pywikibot.config.datafilepath(
                'botlists',
                'botlist-%s-%s-sysop.dat' % (site.family.name, site.lang)),
            'w')
    else:
        f = open(
            pywikibot.config.datafilepath(
                'botlists',
                'botlist-%s-%s.dat' % (site.family.name, site.lang)),
            'w')
    pickle.dump(botlist, f)
    f.close()
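
# Sketch of reading the saved list back. The file name and location simply
# mirror the save code in refresh() above; whether the library itself ships
# a matching loader is not assumed here. `pywikibot` refers to the same
# module object used by the functions above, however it is imported there.
import pickle

def load_botlist(site, sysop=False):
    suffix = '-sysop' if sysop else ''
    fn = pywikibot.config.datafilepath(
        'botlists',
        'botlist-%s-%s%s.dat' % (site.family.name, site.lang, suffix))
    f = open(fn, 'r')
    try:
        return pickle.load(f)
    finally:
        f.close()

# Usage (assuming a configured site object):
#     bots = load_botlist(pywikibot.getSite())
#     if username in bots:
#         ...  # treat edits by this account as bot edits
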