def titlegrabber(lang="English"):
    import xmlreader
    totalfile = open(scriptdir + "\\all_titles.txt", "w")
    writefile = open(scriptdir + "\\en_titles.txt", "w")
    English = set()
    all = set()
    if "wikt.xml" not in os.listdir(scriptdir):
        dump = xmlreader.XmlDump(scriptdir + "\\wikt.bz2")
    else:
        dump = xmlreader.XmlDump(scriptdir + "\\wikt.xml")
    for d in dump.parse():
        all.add(d.title)
        if ":" in d.title:
            continue
        elif "==" + lang + "==" not in d.text:
            continue
        else:
            English.add(d.title)
            try:
                print d.title, len(English)
            except:
                pass
    for e in English:
        writefile.write(e.encode("utf-8") + "\r\n")
    for a in all:
        totalfile.write(a.encode("utf-8") + "\r\n")
    writefile.close()
    totalfile.close()
    return English
def __iter__(self):
    import xmlreader
    dump = xmlreader.XmlDump(self.xmlFilename)
    for entry in dump.parse():
        text = pywikibot.removeDisabledParts(entry.text)
        if self.refR.search(text) and not self.referencesR.search(text):
            yield pywikibot.Page(pywikibot.getSite(), entry.title)
def dump_entries(dump=None, namespace=None, main_only=False, offline=False):
    """
    Return an iterator over every entry in the `dump`.

    If the `dump` is not specified, `latest_dump()` is used.
    """
    if dump is None:
        dump = latest_dump(offline=offline)

    def do_main_only(namespaces):
        for entry in xmlreader.XmlDump(dump).parse():
            if ":" not in entry.title:
                yield entry
            elif entry.title[:entry.title.find(":")] not in namespaces:
                yield entry

    def do_namespace(namespace):
        for entry in xmlreader.XmlDump(dump).parse():
            if entry.title[:entry.title.find(":")] == namespace:
                yield entry

    if main_only == False and namespace != "":
        if namespace:
            return do_namespace(namespace)
        else:
            return xmlreader.XmlDump(dump).parse()
    else:
        return do_main_only(namespaces)
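# A minimal usage sketch for dump_entries(). Hedged: latest_dump() and the
# module-level `namespaces` list used by do_main_only() are assumed to be
# defined elsewhere in the same module; the driver below is illustrative only.
if __name__ == "__main__":
    # Print the first ten main-namespace titles from the latest dump.
    for i, entry in enumerate(dump_entries(main_only=True)):
        print entry.title.encode("utf-8")
        if i >= 9:
            break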
def __iter__(self):
    """
    Yield page objects until the entire XML dump has been read.
    """
    import xmlreader
    mysite = pywikibot.getSite()
    dump = xmlreader.XmlDump(self.xmlfilename)
    # Regular expression to find the original template.
    # {{vfd}} does the same thing as {{Vfd}}, so both will be found.
    # The old syntax, {{msg:vfd}}, will also be found.
    # TODO: check site.nocapitalize()
    templatePatterns = []
    for template in self.templates:
        templatePattern = template.titleWithoutNamespace()
        if not pywikibot.getSite().nocapitalize:
            templatePattern = '[' + templatePattern[0].upper() + \
                              templatePattern[0].lower() + ']' + templatePattern[1:]
        templatePattern = re.sub(' ', '[_ ]', templatePattern)
        templatePatterns.append(templatePattern)
    templateRegex = re.compile(
        r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}'
        % '|'.join(templatePatterns))
    for entry in dump.parse():
        if templateRegex.search(entry.text):
            page = pywikibot.Page(mysite, entry.title)
            yield page
def test_XmlDumpFirstRev(self):
    pages = [r for r in xmlreader.XmlDump("data/article-pear.xml").parse()]
    self.assertEquals(1, len(pages))
    self.assertEquals(u"Automated conversion", pages[0].comment)
    self.assertEquals(u"Pear", pages[0].title)
    self.assertEquals(u"24278", pages[0].id)
    self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
    self.assertTrue(not pages[0].isredirect)
def __init__(self, xmlFilename, xmlStart, namespaces):
    self.xmlStart = xmlStart
    self.namespaces = namespaces
    self.skipping = bool(xmlStart)
    self.site = pywikibot.getSite()
    dump = xmlreader.XmlDump(xmlFilename)
    self.parser = dump.parse()
def test_XmlDumpAllRevs(self):
    pages = [r for r in
             xmlreader.XmlDump("data/article-pear.xml", allrevisions=True).parse()]
    self.assertEquals(4, len(pages))
    self.assertEquals(u"Automated conversion", pages[0].comment)
    self.assertEquals(u"Pear", pages[0].title)
    self.assertEquals(u"24278", pages[0].id)
    self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
    self.assertEquals(u"Quercusrobur", pages[1].username)
    self.assertEquals(u"Pear", pages[0].title)
def __iter__(self):
    import xmlreader
    mysite = pywikibot.getSite()
    dump = xmlreader.XmlDump(self.xmlFilename)
    for entry in dump.parse():
        if mysite.nocapitalize:
            title = re.escape(entry.title)
        else:
            title = '[%s%s]%s' % (re.escape(entry.title[0].lower()),
                                  re.escape(entry.title[0].upper()),
                                  re.escape(entry.title[1:]))
        selflinkR = re.compile(r'\[\[' + title + '(\|[^\]]*)?\]\]')
        if selflinkR.search(entry.text):
            yield pywikibot.Page(mysite, entry.title)
            continue
def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
    self.xmlFilename = xmlFilename
    self.replacements = replacements
    self.exceptions = exceptions
    self.xmlStart = xmlStart
    self.skipping = bool(xmlStart)
    self.excsInside = []
    if "inside-tags" in self.exceptions:
        self.excsInside += self.exceptions['inside-tags']
    if "inside" in self.exceptions:
        self.excsInside += self.exceptions['inside']
    import xmlreader
    self.site = pywikibot.getSite()
    dump = xmlreader.XmlDump(self.xmlFilename)
    self.parser = dump.parse()
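# The two constructors above only record xmlStart and set self.skipping; the
# companion iterator is not shown in these snippets. A hedged sketch of how
# such an iterator typically fast-forwards to xmlStart: the method body below
# is an assumption, only the attribute names come from the constructors.
def __iter__(self):
    # Sketch only: skip dump entries until the title given as xmlStart is
    # reached, then yield a Page object for every remaining entry.
    for entry in self.parser:
        if self.skipping:
            if entry.title != self.xmlStart:
                continue
            self.skipping = False
        yield pywikibot.Page(self.site, entry.title)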
def main(*args):
    genFactory = pagegenerators.GeneratorFactory()
    # If xmlfilename is None, references will be loaded from the live wiki.
    xmlfilename = None
    user = None
    skip = False
    timestamp = None
    # read command line parameters
    for arg in pywikibot.handleArgs(*args):
        xmlfilename = arg
    print xmlfilename
    importsite = "speedydeletion"
    outsite = pywikibot.getSite("en", importsite)
    outsite.forceLogin()
    dump = xmlreader.XmlDump(xmlfilename)
    count = 0
    for entry in dump.parse():
        # print file_store[entry.title]
        title = entry.title.encode("ascii", "ignore")
        m = re.search("Wikipedia:", entry.title)
        if m:
            pywikibot.output(u'skipping %s' % entry.title)
            continue  # the original bare "next" did nothing; "continue" actually skips the entry
        if entry.title != "Main Page":
            try:
                if (file_store[title]):
                    count = count + 1
                    # pywikibot.output(u'was cached %s' % entry.title)
                else:
                    pywikibot.output(u'not exists %s' % entry.title)
            except KeyError:
                print sys.exc_type, ":", "%s is not in the list." % sys.exc_value
                pywikibot.output(u'key error %s' % entry.title)
                try:
                    outpage = pywikibot.Page(outsite, entry.title)
                    if outpage.exists():
                        pywikibot.output(u'there is an article %s' % entry.title)
                        file_store[title] = 1
                    else:
                        pywikibot.output(u'is not there %s' % entry.title)
                        contents = entry.text
                        usernames = entry.username
                        contents = contents + "\n{{wikipedia-deleted|%s}}" % usernames
                        outpage.put(contents)
                        try:
                            file_store[title] = 1
                        except:
                            pywikibot.output(
                                u'could not save %s! to the list of article'
                                % entry.title)
                        finally:
                            count = count + 1
                finally:
                    count = count + 1
# -*- coding: UTF-8 -*-
import wikipedia, xmlreader, codecs, re, json, sys

dump = xmlreader.XmlDump(
    "/data/project/dexbot/pywikipedia-git/wikidatawiki-20150603-pages-articles.xml.bz2"
)
langs = ['en', 'fa']
a = 0
db = {}
with codecs.open(
        '/data/project/dexbot/pywikipedia-git/snowball2_%s_%s.txt'
        % (langs[0], langs[1]), 'w', 'utf-8') as f:
    f.write('')


def sep(lang):
    if lang == 'ja':
        return u'・'
    if lang == 'zh':
        return u'·'
    return ' '


def _make_old_dict(_contents):
    """Convert the new dictionary to the old one for consistency."""
    if isinstance(_contents.get('claims', {}), list) and not _contents.get('sitelinks'):
        return _contents
    old_dict = _contents
    new_dict = {
        'links': {},
def main(*args):
    genFactory = pagegenerators.GeneratorFactory()
    # If xmlfilename is None, references will be loaded from the live wiki.
    xmlfilename = None
    user = None
    skip = False
    timestamp = None
    # read command line parameters
    for arg in pywikibot.handleArgs(*args):
        xmlfilename = arg
    print xmlfilename
    importsite = "speedydeletion"
    outsite = pywikibot.getSite("en", importsite)
    outsite.forceLogin()
    mysite = pywikibot.getSite()
    dump = xmlreader.XmlDump(xmlfilename)  # , allrevisions=True
    count = 0
    for entry in dump.parse():
        print entry.username
        # print entry.revisionid
        if entry.title != "Main Page":
            page = pywikibot.Page(mysite, entry.title)
            try:
                if (file_store[entry.title]):
                    pywikibot.output(u'skipping at %s' % entry.title)
                    count = count + 1
            except:
                try:
                    pywikibot.output(u'updating %s' % entry.title)
                    outpage = pywikibot.Page(outsite, entry.title)
                    contents = ""
                    try:
                        contents = outpage.get()
                    except pywikibot.NoPage:
                        contents = ""
                    except pywikibot.IsRedirectPage:
                        print "skipping redirect"
                    if (not (contents)):
                        contents = entry.text
                    usernames = entry.username
                    print ("http://%s%s" % (outpage.site().hostname(),
                                            outpage.site().nice_get_address(outpage.title())))
                    match = re.search(r'\{(wikipedia-deleted)', contents)
                    if (match == None):
                        contents = contents + "\n{{wikipedia-deleted|%s}}" % usernames
                        try:
                            status = outpage.put(contents, "adding the username %s" % usernames)
                        except pywikibot.exceptions.LockedPage:
                            print "locked, skipping"
                    else:
                        print match
                        print match.group(0)
                        pywikibot.output(u'skipping, already done %s' % entry.title)
                    # except:
                    #     print "hiccup"
                    # finally:
                    #     count = count + 1
                    file_store[entry.title] = entry.title
                finally:
                    count = count + 1
            finally:
                count = count + 1
    print "done with %s %d" % (entry.title, count)
#
# MIT license
# http://dumps.wikimedia.your.org/fawiki/
#
# fawiki-20140802-pages-meta-current.xml.bz2
# http://dumps.wikimedia.your.org/fawiki/20150325/fawiki-20150325-pages-meta-current.xml.bz2
import wikipedia, xmlreader, codecs, re
import os

bot_adress = "/data/project/rezabot/"
TheDay = '20150325'
urllinkmain = 'http://dumps.wikimedia.your.org/fawiki/%s/fawiki-%s-pages-meta-current.xml.bz2' % (
    TheDay, TheDay)
print urllinkmain
# os.system('wget ' + urllinkmain + " " + bot_adress + "fawiki-" + TheDay + "-pages-meta-current.xml.bz2")
dump = xmlreader.XmlDump(bot_adress + "fawiki-" + TheDay + "-pages-meta-current.xml.bz2")
pre, noinclude, includeonly, tags1, tags2 = u'\n', u'\n', u'\n', u'\n', u'\n'
for entry in dump.new_parse():
    if entry.ns == '0':
        text = entry.text.replace(u' /', u'/').replace(u'/ ', u'/').replace(
            u'< ', u'<').replace(u' >', u'>')
        if u'<noinclude>' in text or u'</noinclude>' in text:
            noinclude += u"#[[%s]]\n" % entry.title
        elif u'<includeonly>' in text or u'</includeonly>' in text:
            includeonly += u"#[[%s]]\n" % entry.title
        elif u'<pre>' in text or u'</pre>' in text:
            pre += u"#[[%s]]\n" % entry.title
        elif u'__NOGALLERY__' in text:
            tags1 += u"#[[%s]]\n" % entry.title
def __init__(self, api):
    self.api = api
    self.dump = xmlreader.XmlDump(DUMP)
def main():
    """Missing articles"""
    xml = xmlreader.XmlDump(
        '%s%s' % (dumppath and '%s/' % dumppath or '', dumpfilename),
        allrevisions=False)
    c = 0
    bios = 0
    global skip
    if skip:
        print 'Skipping to...', skip
    for x in xml.parse():  # parsing the whole dump, one page at a time
        c += 1
        if c % 10000 == 0:
            print 'Total pages analysed =', c, '| Bios =', bios
        if skip:
            if x.title == skip:
                skip = ''
            continue
        # filtering unuseful pages
        if re.search(title_ex_r, x.title) or \
           re.search(red_r, x.text) or \
           re.search(dis_r, x.text) or \
           len(x.text.splitlines()) < 3 or len(x.text) < 1024 * 2:
            continue
        # if it has interwikis to targetlang we are not interested, the bio already exists
        if re.search(iws_target_r, x.text):
            continue
        # name with at least two long words
        trozos = []
        # do not assign from the for loop directly, otherwise it stores True/False instead of the pieces
        [len(trozo) >= 3 and trozos.append(trozo) for trozo in x.title.split(' ')]
        if not len(trozos) >= 2:
            continue
        # add accent-free variants
        [(trozo != quitaracentos(trozo) and trozo not in trozos) and
         trozos.append(quitaracentos(trozo)) for trozo in trozos]
        # discard some bios
        if not re.search(birth_r, x.text) or not re.search(death_r, x.text):
            # if it is a BLP, skip it
            continue
        # if we cannot extract a birth or death year, skip it
        if not re.search(birth_r, x.text) and not re.search(death_r, x.text) and \
           bdtemplate_r.has_key(lang) and not re.search(bdtemplate_r[lang], x.text):
            continue
        print 'Analysing http://%s.wikipedia.org/wiki/%s' % (lang, re.sub(' ', '_', x.title))
        # looking for useful images for the bio
        images = re.findall(ur"(?im)[\s\/\:\|\=]+([^\/\:\|\=]+\.jpe?g)[\s\|]", x.text)
        image_cand = ''
        if images:
            for image in images:
                if len(re.findall(ur"(%s)" % ('|'.join(trozos)), image)) >= 1:
                    image_cand = image
                    break
        if image_cand:
            print 'We have image_cand'
        else:
            print 'No image_cand'
            # continue
        # description
        desc = re.findall(
            ur"(?im)^(\'{2,5}\s*.{,25}\s*%s[^\n\r]+)[\n\r]" % (x.title.split(' ')[0]),
            x.text)
        if not desc:
            print 'No description'
            continue
        else:
            print 'We have description'
            desc = desc[0]
        # birth and death dates
        birthdate = ''
        deathdate = ''
        # first try with birth/death categories
        m = birth_r.finditer(x.text)
        for i in m:
            birthdate = i.group('birthyear')
            break
        m = death_r.finditer(x.text)
        for i in m:
            deathdate = i.group('deathyear')
            break
        # second attempt uses bio first paragraph
        if not birthdate and not deathdate:
            m = dates_r[lang].finditer(desc)
            for i in m:
                """birthmonth = ''
                if i.group('birthday') and i.group('birthmonth'):
                    if monthstoen.has_key(quitaracentos(i.group('birthmonth').lower())):
                        birthmonth = monthstoen[i.group('birthmonth').lower()]
                deathmonth = ''
                if i.group('deathday') and i.group('deathmonth'):
                    if monthstoen.has_key(quitaracentos(i.group('deathmonth').lower())):
                        deathmonth = monthstoen[i.group('deathmonth').lower()]
                if birthmonth:
                    #continue #temp
                    birthdate = u'%s %s, %s' % (birthmonth, i.group('birthday'), i.group('birthyear'))
                else:
                    birthdate = u'%s' % (i.group('birthyear'))
                if deathmonth:
                    #continue #temp
                    deathdate = u'%s %s, %s' % (deathmonth, i.group('deathday'), i.group('deathyear'))
                else:
                    deathdate = u'%s' % (i.group('deathyear'))"""
                birthdate = i.group('birthyear')
                deathdate = i.group('deathyear')
                break
        # third case uses special templates
        # special cases for es: {{BD|XXXX|YYYY|DEFAULTSORT}}, or vi:, or others
        if not birthdate and not deathdate and bdtemplate_r.has_key(lang):
            m = bdtemplate_r[lang].finditer(x.text)
            for i in m:
                birthdate = u'%s' % (i.group('birthyear'))
                deathdate = u'%s' % (i.group('deathyear'))
                break
        if birthdate and deathdate:
            print 'We have birthdate and deathdate'
            if (int(deathdate[-4:]) - int(birthdate[-4:])) < 20:
                # weird, child prodigy?
                print 'But dates are weird', birthdate, deathdate
                continue  # skipping bio
        else:
            print 'No birthdate or deathdate'
        # end birth and death dates
        # defaultsort
        m = defaultsort_r.finditer(x.text)
        defaultsort = ''
        for d in m:
            defaultsort = d.group("defaultsort")
            break
        if not defaultsort and bdtemplate_r.has_key(lang):
            m = bdtemplate_r[lang].finditer(x.text)
            for i in m:
                defaultsort = u'%s' % (i.group('defaultsort'))
                break
        if not defaultsort:
            # create it myself
            defaultsort = u'%s, %s' % (' '.join(quitaracentos(x.title).split(' ')[1:]),
                                       quitaracentos(x.title).split(' ')[0])
        # iws
        m = iws_r.finditer(x.text)
        iws = []
        for iw in m:
            if not iw.group('iwlang') in [targetlang, lang]:
                iws.append([iw.group('iwlang'), iw.group('iwtitle')])
        iws.append([lang, x.title])
        if len(iws) < minimumiws:
            print 'No minimum interwikis'
            continue  # this language and other wiki at least
        print 'We have %d interwikis' % len(iws)
        iws.sort()
        iws_plain = ''
        for iwlang, iwtitle in iws:
            iws_plain += u'[[%s:%s]]\n' % (iwlang, iwtitle)
        if desc and len(desc) < 2500 and birthdate and deathdate:
            # check if live version has interwiki or not
            sourcebio = wikipedia.Page(wikipedia.Site(lang, 'wikipedia'), x.title)
            if not sourcebio.exists():
                print 'Page doesnt exist'
                continue
            if sourcebio.isRedirectPage():
                print 'Page is redirect'
                continue
            if sourcebio.isDisambig():
                print 'Page is disambig'
                continue
            if len(re.findall(iws_target_r, sourcebio.get())) != 0:
                print 'Found iw to target lang in the current version of article'
                continue
            # cats: this is the most time-consuming part, so we leave it for this
            # last if, just before generating the output
            m = cats_r.finditer(x.text)
            cats = []
            [translatecat(cat.group('catname'), lang) and
             translatecat(cat.group('catname'), lang) not in cats and
             cats.append(translatecat(cat.group('catname'), lang)) for cat in m]
            cats.sort()
            # nationality
            nationality = ''
            if cats:
                n = [cat.split(' ')[0] for cat in cats]
                for nn in n:
                    if nn in nationalitytonation.keys():
                        if nationality:
                            if nn != nationality:
                                # conflict, several nationalities for this bio, blank nationality and exit
                                nationality = ''
                                break
                        else:
                            nationality = nn
                    else:
                        if not nn.isdigit():
                            f = open('missingarticlesxml.output.errors', 'a')
                            f.write((u'missing nationality = %s\n' % (nn)).encode('utf-8'))
                            f.close()
            if nationality:
                print 'We have nationality'
            else:
                print 'No nationality found'
                continue
            # occupations (using cats)
            occupations = []
            if nationality:
                for cat in cats:
                    t = cat.split(' ')
                    if (t[0] == nationality or t[0].split('-')[0] == nationality) and len(t) == 2:
                        # [[Category:Spanish writers]] [[Category:Spanish-language writers]]
                        if t[1][-3:] == 'ies':
                            if not '%sy' % t[1].rstrip('ies') in occupations:
                                occupations.append('%sy' % t[1].rstrip('ies'))  # remove final ies and add y
                        elif t[1][-1] == 's':
                            if not t[1].rstrip('s') in occupations:
                                occupations.append(t[1].rstrip('s'))  # remove final s
                        elif t[1] == 'businesspeople':
                            if not 'businessman' in occupations:
                                occupations.append('businessman')
            if occupations:
                print 'We have occupation'
            else:
                print 'No occupations found'
                continue
            # the output for this bio
            output = u"""\n<br clear="all"/>\n==== [[%s]] ([[:%s:%s|%s]]) ====""" % (
                x.title, lang, x.title, lang)
            if image_cand:
                output += u"""\n[[File:%s|thumb|right|120px|%s]]""" % (image_cand, x.title)
            output += u"""\n<small><nowiki>%s</nowiki></small>""" % (linkstoiws(desc, lang).strip())
            output += u"""\n<pre>"""
            output += u"""\n{{Expand %s|%s}}""" % (langisotolang[lang], x.title)
            if image_cand:
                output += u"""\n[[File:%s|thumb|right|%s]]""" % (image_cand, x.title)
            output += u"""\n\'\'\'%s\'\'\' (%s–%s) was %s %s %s.""" % (
                x.title, birthdate, deathdate,
                nationality and nationalitytonation[nationality][0] in ['A', 'E', 'I', 'O', 'U'] and 'an' or 'a',
                nationality and '[[%s|%s]]' % (nationalitytonation[nationality], nationality),
                occupations and (len(occupations) > 1 and
                                 '%s and %s' % (', '.join(occupations[:-1]), occupations[-1:][0]) or
                                 occupations[0]) or '...')
            output += u"""\n\n{{Persondata <!-- Metadata: see [[Wikipedia:Persondata]]. -->"""
            output += u"""\n| NAME = %s """ % (defaultsort)
            output += u"""\n| ALTERNATIVE NAMES = """
            output += u"""\n| SHORT DESCRIPTION = """
            output += u"""\n| DATE OF BIRTH = %s """ % (birthdate)
            output += u"""\n| PLACE OF BIRTH = """
            output += u"""\n| DATE OF DEATH = %s """ % (deathdate)
            output += u"""\n| PLACE OF DEATH = """
            output += u"""\n}}"""
            output += u"""\n{{DEFAULTSORT:%s}}""" % (defaultsort)
            if cats:
                output += u"""\n"""
                for cat in cats:
                    if not cat in ['Men', 'Women', 'Fascists'] and not cat.startswith('Death'):
                        output += u"""\n[[Category:%s]]""" % (cat)
            output += u"""\n\n%s""" % (iws_plain)
            output += u"""\n%s""" % (
                nationality and nationalitytonation[nationality] and
                '{{%s-bio-stub}}' % (nationalitytonation[nationality]) or '{{bio-stub}}')
            output += u"""\n</pre>"""
            # last replacements...
            output = re.sub(ur"{{United States-bio-stub}}", ur"{{US-bio-stub}}", output)
            output = re.sub(ur"{{Czech Republic-bio-stub}}", ur"{{Czech-bio-stub}}", output)
            # end last replacements
            print '#' * 70
            print x.title, 'https://%s.wikipedia.org/wiki/%s' % (lang, x.title.replace(' ', '_'))
            print output
            bios += 1
            print 'Total pages analysed =', c, '| Bios =', bios
            f = open('missingarticlesxml.output.%s.%02d.txt' % (lang, len(iws)), 'a')
            f.write(output.encode('utf-8'))
            f.close()
# -*- coding: utf-8 -*-
import codecs
import re

import xmlreader

dump = xmlreader.XmlDump("fawiki-20150325-pages-articles.xml.bz2")
a = 0
f = codecs.open("markup.txt", "w", "utf-8")
f.write("")
f.close()
rer = re.compile(ur'(<table|<pre>\s*?</pre>|<noinclude>\s*?</noinclude>|'
                 '<includeonly>\s*?</includeonly>|__NOGALLERY__|'
                 '__NOEDITSECTION__|__TOC__|__NOTOC__)')
for entry in dump.new_parse():
    if entry.ns in ['0', '14', '6', '4']:
        if rer.search(entry.text):
            a += 1
            print "found one: %d" % a
            f = codecs.open("markup.txt", "a", "utf-8")
            f.write(u"[[%s]]\n" % entry.title)
            f.close()
def __init__(self, xmlfilename):
    import xmlreader
    self.xmldump = xmlreader.XmlDump(xmlfilename)
def do_main_only(namespaces):
    for entry in xmlreader.XmlDump(dump).parse():
        if ":" not in entry.title:
            yield entry
        elif entry.title[:entry.title.find(":")] not in namespaces:
            yield entry
def test_XmlDumpRedirect(self):
    pages = [r for r in xmlreader.XmlDump("data/article-pyrus.xml").parse()]
    self.assertTrue(pages[0].isredirect)
def do_namespace(namespace):
    for entry in xmlreader.XmlDump(dump).parse():
        if entry.title[:entry.title.find(":")] == namespace:
            yield entry
    return 5


def cleantitle(title):
    title = re.sub(ur"[&]", ur"-", title)
    return title


dumppath = ''
if len(sys.argv) == 2:
    dumpfilename = sys.argv[1]
# download commons dump
dumppath = '/mnt/user-store/emijrp'
dumpfilename = 'commonswiki-latest-pages-articles.xml.bz2'
os.system('wget -c http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-pages-articles.xml.bz2 -O %s/%s'
          % (dumppath, dumpfilename))
xml = xmlreader.XmlDump('%s%s' % (dumppath and '%s/' % dumppath or '', dumpfilename),
                        allrevisions=False)
path = '/home/emijrp/public_html/commonsexplorer'
errors = 0
minpics = 1  # min pics to show for a year
maximages = 100000  # max images to show in the sum of all years
maxyear = 2000
minyear = 1850
c = 0
s = 0
coord_dec_r = re.compile(ur"(?im)(?P<all>{{\s*(Location dec|Object location dec)\s*\|\s*(?P<lat>[\d\.\-\+]+)\s*\|\s*(?P<lon>[\d\.\-\+]+)\s*\|?\s*[^\|\}]*\s*}})")
coord_r = re.compile(ur"(?im)(?P<all>{{\s*(Location|Object location)\s*\|\s*(?P<lat_d>[\d\.\-\+]+)\s*\|\s*(?P<lat_m>[\d\.\-\+]+)\s*\|\s*(?P<lat_s>[\d\.\-\+]+)\s*\|\s*(?P<lat>[NS])\s*\|\s*(?P<lon_d>[\d\.\-\+]+)\s*\|\s*(?P<lon_m>[\d\.\-\+]+)\s*\|\s*(?P<lon_s>[\d\.\-\+]+)\s*\|\s*(?P<lon>[EW])\s*\|?\s*[^\|\}]*\s*}})")
date_r = re.compile(ur"(?im)^\s*\|\s*Date\s*=\s*(?P<date>(\d{4}(-\d{2}-\d{2})?))\D")
description_r = re.compile(ur"(?im)\{\{\s*en\s*\|\s*(1\s*\=)?\s*(?P<description>[^\{\}]{10,300})\s*\}\}")
exclude_images_r = re.compile(ur"(?im)\b(maps?|mapa)\b")
images_by_year = {}
def get_redirects_from_dump(self, alsoGetPageTitles=False):
    '''
    Load a local XML dump file, look at all pages which have the
    redirect flag set, and find out where they're pointing at. Return
    a dictionary where the redirect names are the keys and the redirect
    targets are the values.
    '''
    xmlFilename = self.xmlFilename
    redict = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(xmlFilename)
    redirR = self.site.redirectRegex()
    readPagesCount = 0
    if alsoGetPageTitles:
        pageTitles = set()
    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 10000 pages
        if readPagesCount % 10000 == 0:
            pywikibot.output(u'%i pages read...' % readPagesCount)
        if len(self.namespaces) > 0:
            if pywikibot.Page(self.site, entry.title).namespace() \
                    not in self.namespaces:
                continue
        if alsoGetPageTitles:
            pageTitles.add(entry.title.replace(' ', '_'))
        m = redirR.match(entry.text)
        if m:
            target = m.group(1)
            # There might be redirects to another wiki. Ignore these.
            for code in self.site.family.iwkeys:
                if target.startswith('%s:' % code) \
                        or target.startswith(':%s:' % code):
                    if code == self.site.language():
                        # link to our wiki, but with the lang prefix
                        target = target[(len(code) + 1):]
                        if target.startswith(':'):
                            target = target[1:]
                    else:
                        pywikibot.output(
                            u'NOTE: Ignoring %s which is a redirect to %s:'
                            % (entry.title, code))
                        target = None
                        break
            # if the redirect does not link to another wiki
            if target:
                source = entry.title.replace(' ', '_')
                target = target.replace(' ', '_')
                # remove leading and trailing whitespace
                target = target.strip('_')
                # capitalize the first letter
                if not pywikibot.getSite().nocapitalize:
                    source = source[:1].upper() + source[1:]
                    target = target[:1].upper() + target[1:]
                if '#' in target:
                    target = target[:target.index('#')].rstrip("_")
                if '|' in target:
                    pywikibot.output(
                        u'HINT: %s is a redirect with a pipelink.'
                        % entry.title)
                    target = target[:target.index('|')].rstrip("_")
                if target:  # in case preceding steps left nothing
                    redict[source] = target
    if alsoGetPageTitles:
        return redict, pageTitles
    else:
        return redict
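# A hedged usage sketch for the method above. The small holder class and the
# double-redirect check are illustrative assumptions; only the function itself
# and the sample dump file name come from the snippets in this section.
class _DumpRedirectSource(object):
    def __init__(self, xmlFilename):
        self.xmlFilename = xmlFilename
        self.namespaces = []          # empty list: do not filter by namespace
        self.site = pywikibot.getSite()

source = _DumpRedirectSource('data/article-pyrus.xml')
redict = get_redirects_from_dump(source)   # call the function with our holder as `self`
for src, target in redict.items():
    if target in redict:                   # redirect that points at another redirect
        print (u'double redirect: %s -> %s -> %s' % (src, target, redict[target])).encode('utf-8')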
def main():
    """Localisation for dates (YYYY-MM-DD)"""
    month2number = {
        # English
        u"en": {
            u"january": u"01", u"jan": u"01",
            u"february": u"02", u"feb": u"02",
            u"march": u"03", u"mar": u"03",
            u"april": u"04", u"apr": u"04",
            u"may": u"05",
            u"june": u"06", u"jun": u"06",
            u"july": u"07", u"jul": u"07",
            u"august": u"08", u"aug": u"08",
            u"september": u"09", u"sep": u"09", u"sept": u"09",
            u"october": u"10", u"oct": u"10",
            u"november": u"11", u"nov": u"11",
            u"december": u"12", u"dec": u"12",
        },
        # Spanish
        u"es": {
            u"enero": u"01", u"ene": u"01",
            u"febrero": u"02", u"feb": u"02",
            u"marzo": u"03", u"mar": u"03",
            u"abril": u"04", u"abr": u"04",
            u"mayo": u"05", u"may": u"05",
            u"junio": u"06", u"jun": u"06",
            u"julio": u"07", u"jul": u"07",
            u"agosto": u"08", u"ago": u"08", u"agos": u"08",
            u"setiembre": u"09", u"septiembre": u"09", u"sep": u"09", u"sept": u"09",
            u"octubre": u"10", u"oct": u"10",
            u"noviembre": u"11", u"nov": u"11",
            u"diciembre": u"12", u"dic": u"12",
        },
        # French
        u"fr": {
            u"janvier": u"01", u"jan": u"01",
            u"février": u"02", u"fevrier": u"02",
            u"mars": u"03",
            u"avril": u"04", u"avr": u"04",
            u"mai": u"05",
            u"juin": u"06",
            u"juillet": u"07",
            u"août": u"08", u"aout": u"08",
            u"septembre": u"09", u"sept": u"09", u"sep": u"09",
            u"octobre": u"10", u"oct": u"10",
            u"novembre": u"11", u"nov": u"11",
            u"décembre": u"12", u"decembre": u"12", u"dec": u"12",
        },
        # German
        u"de": {
            u"januar": u"01", u"jan": u"01",
            u"februar": u"02", u"feb": u"02",
            u"märz": u"03", u"marz": u"03", u"mar": u"03",
            u"april": u"04", u"apr": u"04",
            u"mai": u"05",
            u"juni": u"06",
            u"juli": u"07",
            u"august": u"08", u"aug": u"08",
            u"september": u"09", u"sept": u"09", u"sep": u"09",
            u"oktober": u"10", u"okt": u"10",
            u"november": u"11", u"nov": u"11",
            u"dezember": u"12", u"dez": u"12",
        },
        # Italian
        u"it": {
            u"gennaio": u"01", u"gen": u"01",
            u"febbraio": u"02", u"feb": u"02",
            u"marzo": u"03", u"mar": u"03",
            u"aprile": u"04", u"apr": u"04",
            u"maggio": u"05", u"mag": u"05",
            u"giugno": u"06",
            u"luglio": u"07",
            u"agosto": u"08", u"ago": u"08",
            u"settembre": u"09", u"sett": u"09", u"set": u"09",
            u"ottobre": u"10", u"ott": u"10",
            u"novembre": u"11", u"nov": u"11",
            u"diciembre": u"12", u"dic": u"12",
        },
        # Dutch
        u"nl": {
            u"januari": u"01", u"jan": u"01",
            u"februari": u"02", u"feb": u"02",
            u"maart": u"03",
            u"april": u"04", u"apr": u"04",
            u"mei": u"05",
            u"juni": u"06",
            u"juli": u"07",
            u"augustus": u"08", u"aug": u"08",
            u"september": u"09", u"sept": u"09", u"sep": u"09",
            u"oktober": u"10", u"okt": u"10",
            u"november": u"11", u"nov": u"11",
            u"december": u"12", u"dec": u"12",
        },
        # Polish
        u"pl": {
            u"styczeń": u"01", u"luty": u"02", u"marzec": u"03",
            u"kwiecień": u"04", u"maj": u"05", u"czerwiec": u"06",
            u"lipiec": u"07", u"sierpień": u"08", u"wrzesień": u"09",
            u"październik": u"10", u"listopad": u"11", u"grudzień": u"12",
        },
        # Portuguese
        u"pt": {
            u"janeiro": u"01", u"jan": u"01",
            u"fevereiro": u"02", u"fev": u"02",
            u"março": u"03", u"mar": u"03",
            u"abril": u"04", u"abr": u"04",
            u"maio": u"05",
            u"junho": u"06",
            u"julho": u"07",
            u"agosto": u"08",
            u"setembro": u"09",
            u"outubro": u"10",
            u"novembre": u"11",
            u"dezembro": u"12",
        },
    }
    # regexps
    spliter1 = ur'[\s\-\,\.\/\\]*'  # spliter for months in words
    spliter2 = ur''  # todo: spliter for dates with month in numbers
    suffix1 = ur'[\s\.]*(st|nd|rd|th)?[\s\.]*'  # March 1st, ..., not mandatory
    regexp_r = {
        'en-ddmonthyyyy': re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (suffix1, spliter1, '|'.join(month2number['en'].keys()), spliter1)),
        'en-monthddyyyy': re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<month>%s)%s(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s%s(?P<year>\d{4}))(?P<end>\s*))$"
            % ('|'.join(month2number['en'].keys()), spliter1, suffix1, spliter1)),
        'es-ddmonthyyyy': re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])\s+de\s+(?P<month>%s)\s+de\s+(?P<year>\d{4}))(?P<end>\s*))$"
            % ('|'.join(month2number['es'].keys()))),
        'fr-ddmonthyyyy': re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])\s+(?P<month>%s)\s+(?P<year>\d{4}))(?P<end>\s*))$"
            % ('|'.join(month2number['fr'].keys()))),
        'de-ddmonthyyyy': re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (spliter1, '|'.join(month2number['de'].keys()), spliter1)),
        'it-ddmonthyyyy': re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (spliter1, '|'.join(month2number['it'].keys()), spliter1)),
        'nl-ddmonthyyyy': re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (spliter1, '|'.join(month2number['nl'].keys()), spliter1)),
        'pl-ddmonthyyyy': re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (spliter1, '|'.join(month2number['pl'].keys()), spliter1)),
        'pt-ddmonthyyyy': re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])\s+de\s+(?P<month>%s)\s+de\s+(?P<year>\d{4}))(?P<end>\s*))$"
            % ('|'.join(month2number['pt'].keys()))),
    }
    dumpfilename = ''
    modes = []
    skip = u''  # 'File:Lagothrix lagotricha.jpg'
    if len(sys.argv) >= 2:
        dumpfilename = sys.argv[1]
    else:
        print 'python script.py dumpfilename [mode] [skipuntilthispage]'
        sys.exit()
    if len(sys.argv) >= 3:  # en1, fr1, etc, regexps
        if sys.argv[2] != 'all':
            modes = [sys.argv[2]]
    if not modes:
        modes = regexp_r.keys()
    if len(sys.argv) >= 4:
        skip = re.sub('_', ' ', sys.argv[3])
    xml = xmlreader.XmlDump(dumpfilename, allrevisions=False)
    c = 0
    if skip:
        print 'Skipping to...', skip
    for x in xml.parse():  # parsing the whole dump
        if not x.title.strip().startswith('File:'):
            continue
        c += 1
        if skip:
            if x.title.strip() != skip:
                continue
            else:
                skip = ''
        for mode in modes:
            m = re.findall(regexp_r[mode], x.text)  # check dump text
            if m:
                print c, 'Candidate found in dump: ', x.title
                page = wikipedia.Page(wikipedia.Site("commons", "commons"), x.title)
                if not page.exists() or page.isRedirectPage() or page.isDisambig():
                    print ' Page not found, deleted or redirect?'
                    continue  # next page in dump
                if not page.canBeEdited():
                    print ' Page cannot be edited, protected?'
                    continue  # next page in dump
                wtext = page.get()
                newtext = wtext
                if re.findall(regexp_r[mode], wtext):
                    # check live text to verify that the date is still in the Commons page
                    m = re.finditer(regexp_r[mode], wtext)
                    for i in m:
                        print ' Commons page has a date to translate:', x.title
                        # text to remove
                        if mode in ['en-ddmonthyyyy', 'en-monthddyyyy',
                                    'es-ddmonthyyyy', 'fr-ddmonthyyyy',
                                    'de-ddmonthyyyy', 'it-ddmonthyyyy',
                                    'nl-ddmonthyyyy', 'pl-ddmonthyyyy',
                                    'pt-ddmonthyyyy']:
                            regexp_rep = i.group('all')
                        elif False:
                            # other modes...
                            pass
                        # text to insert
                        monthname = i.group('month').strip().lower()
                        if mode in ['en-ddmonthyyyy', 'en-monthddyyyy',
                                    'es-ddmonthyyyy', 'fr-ddmonthyyyy',
                                    'de-ddmonthyyyy', 'it-ddmonthyyyy',
                                    'nl-ddmonthyyyy', 'pl-ddmonthyyyy',
                                    'pt-ddmonthyyyy']:
                            regexp_sub = ur"%s%s-%s-%02d%s" % (
                                i.group('ini'), i.group('year'),
                                month2number[mode.split('-')[0]][monthname],
                                int(i.group('day')), i.group('end'))
                        elif False:
                            # other modes...
                            pass
                        newtext = newtext.replace(regexp_rep, regexp_sub, 1)  # replace only the first occurrence
                        if wtext != newtext:  # submit only if a difference appears
                            wikipedia.showDiff(wtext, newtext)
                            page.put(newtext,
                                     u"BOT - Changes to allow localization: %s → %s"
                                     % (regexp_rep, regexp_sub))
                        break  # only one replacement and break
                else:
                    print ' Text in Commons page does not contain a date to be localised'
                break  # only one mode, then skip to the following page
def main(*args):
    print "ARGS:%s\n" % sys.argv
    genFactory = pagegenerators.GeneratorFactory()
    # If xmlfilename is None, references will be loaded from the live wiki.
    xmlfilename = None
    user = None
    skip = False
    timestamp = None
    # read command line parameters
    for arg in pywikibot.handleArgs(*args):
        xmlfilename = arg
    print xmlfilename
    insite = pywikibot.getSite("en", "wikipedia")
    importsite = "speedydeletion"
    outsite = pywikibot.getSite("en", importsite)
    outsite.forceLogin()
    try:
        print "try to open %s\n" % xmlfilename
        with open(xmlfilename) as f:
            pass
    except:
        print "cannot open %s\n" % xmlfilename
        exit(0)
    if sys.argv[1] == "--validate":
        tempfile = "%s.tmp" % xmlfilename
        status = subprocess.call("xmllint --recover %s -o %s" % (xmlfilename, tempfile),
                                 shell=True)
        print "status %d\n" % status
    else:
        tempfile = xmlfilename
    dump = xmlreader.XmlDump(tempfile)
    count = 0
    for entry in dump.parse():
        # print file_store[entry.title]
        title = entry.title.encode("utf8", "ignore")
        # this condition was partly masked in the source; it skipped User: and Wikipedia: pages
        if re.search("^User:", entry.title) or re.search("^Wikipedia:", entry.title):
            # pywikibot.output(u'skipping %s' % entry.title)
            continue
        # if re.search("^User:", entry.title) or re.search("^User Talk:", entry.title):
        #     pywikibot.output(u'skipping %s' % entry.title)
        #     continue
        if re.search(".css$", entry.title):
            # pywikibot.output(u'skipping %s' % entry.title)
            continue
        if re.search("^Main Page", entry.title):
            # pywikibot.output(u'skipping %s' % entry.title)
            continue
        # pywikibot.output(u'Considering %s' % entry.title)
        title = title.replace(":", "_")
        title = title.replace("!", "_")
        title = title.replace("/", "_")
        title = title.replace("\\", "_")
        title = decode(title)
        try:
            if (len(title) < 1):
                pywikibot.output(u'empty title:%s' % entry.title)
                continue
            if (file_store[title]):
                count = count + 1
            else:
                pywikibot.output(u'not exists %s' % entry.title)
        except KeyError:
            try:
                outpage = pywikibot.Page(site=outsite, title=entry.title, insite=outsite)
                exists = False
                try:
                    exists = outpage.exists()
                except:
                    pywikibot.output(u'key error exiting article %s transformed to %s'
                                     % (entry.title, title))
                if exists:
                    # pywikibot.output(u'there is an article %s' % entry.title)
                    try:
                        file_store[title] = 1
                    except KeyError:
                        pywikibot.output(u'key error saving article %s transformed to %s'
                                         % (entry.title, title))
                else:
                    pywikibot.output(u'is not there, adding %s' % entry.title)
                    contents = entry.text
                    usernames = entry.username
                    if re.search('Template:', title):
                        contents = contents + "<noinclude>{{wikipedia-template|%s}}</noinclude>" % usernames
                    else:
                        contents = contents + "\n{{wikipedia-deleted|%s}}" % usernames
                    outpage._site = outsite
                    try:
                        outpage.put(contents)
                    except:
                        pywikibot.output(u'cannot put article %s / %s' % (entry.title, title))
                    try:
                        file_store[title] = 1
                    except KeyboardInterrupt:
                        print "Bye"
                        sys.exit()
                    except KeyError:
                        pywikibot.output(u'could not save %s! to the list of article' % entry.title)
            except KeyboardInterrupt:
                print "Bye"
                sys.exit()
            except KeyError:
                pywikibot.output(u'problem with %s! ' % entry.title)
            finally:
                count = count + 1
        except KeyboardInterrupt:
            print "Bye"
            sys.exit()
        except KeyError:
            pywikibot.output(u'problem2 with %s! ' % entry.title)
        finally:
            count = count + 1
createDB(conn=conn, cursor=cursor)
limit = 1000
c = 0
c_page = 0
t1 = time.time()
tt = time.time()
r_internal_links = re.compile(ur'(?i)(\[\[[^\|\]\r\n]+?(\|[^\|\]\r\n]*?)?\]\])')
# should external links, images, categories and interwikis be discounted?
r_external_links = re.compile(ur'(?i)\b(ftps?|git|gopher|https?|irc|mms|news|svn|telnet|worldwind)://')
# http://en.wikipedia.org/wiki/Special:SiteMatrix
r_interwikis = re.compile(ur'(?i)(\[\[([a-z]{2,3}|simple|classical)(\-([a-z]{2,3}){1,2}|tara)?\:[^\[\]]+?\]\])')
r_sections = re.compile(ur'(?im)^(={1,6})[^=]+\1')
r_templates = re.compile(ur'(?im)(^|[^\{])\{\{[^\{\}\|]+[\}\|]')  # {{T1|...}} or {{T1}}
xml = xmlreader.XmlDump(dumpfilename, innerxml=xmlfilename, allrevisions=True)
errors = 0
errors_page = 0
page_id = -1  # impossible value
page_title = ''
page_editcount = 0
page_creation_timestamp = ''
page_last_timestamp = ''
page_text = ''
page_size = 0
page_internal_links = 0
page_external_links = 0
page_interwikis = 0
page_sections = 0
page_templates = 0
rev_prev_text_for_diff = ''