def __iter__(self): """Yield page objects until the entire XML dump has been read.""" from pywikibot import xmlreader mysite = pywikibot.getSite() dump = xmlreader.XmlDump(self.xmlfilename) # regular expression to find the original template. # {{vfd}} does the same thing as {{Vfd}}, so both will be found. # The old syntax, {{msg:vfd}}, will also be found. # TODO: check site.nocapitalize() templatePatterns = [] for template in self.templates: templatePattern = template.titleWithoutNamespace() if not pywikibot.getSite().nocapitalize: templatePattern = '[%s%s]%s' % (templatePattern[0].upper(), templatePattern[0].lower(), templatePattern[1:]) templatePattern = re.sub(' ', '[_ ]', templatePattern) templatePatterns.append(templatePattern) templateRegex = re.compile( r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}' % '|'.join(templatePatterns)) for entry in dump.parse(): if templateRegex.search(entry.text): page = pywikibot.Page(mysite, entry.title) yield page
def first_part(fileToParse):
    file = open("vajadzigie-raksti-et-part1.txt", "w", encoding='utf-8')
    num = 0
    kopejaissaraksts = []  # "kopejais saraksts" = the combined list of links
    regexpat = r"\[\[([^\|\]#]+)"
    # XmlDump opens and decompresses the file itself; the BZ2File handle
    # is not actually used.
    with BZ2File(fileToParse) as xml_file:
        for page in xmlreader.XmlDump(fileToParse).parse():
            if page.ns == "0" and not page.isredirect:
                num += 1
                if num % 1000 == 0:
                    print(num)
                links = [
                    match.group(1)[0].upper() + match.group(1)[1:]
                    for match in re.finditer(regexpat,
                                             textlib.unescape(page.text))
                    if not re.search('^[a-z]{2,3}:', match.group(1))
                ]
                links = [link.replace('_', ' ') for link in links]
                links = list(set(links))
                kopejaissaraksts.extend(links)
    # pywikibot.output(kopejaissaraksts)
    # pywikibot.output(Counter(kopejaissaraksts))
    # file.write(str(Counter(kopejaissaraksts)))
    file.close()
    return Counter(kopejaissaraksts)
def _get_entries(self, filename, **kwargs):
    """Get all entries via XmlDump."""
    entries = list(
        xmlreader.XmlDump(join_xml_data_path(filename), **kwargs).parse())
    return entries
def main():
    lista_stron1 = xmlreader.XmlDump(
        'plwiktionary-20101011-pages-articles.xml')
    lista_stron2 = lista_stron1.parse()
    licz_ipa = 0
    licz_puste = 0
    licz_all = 0
    lista = []
    sekcja = re.compile(
        r'==\s*.*?\({{język francuski}}\)\s*?==((.*?)==|(.*))', re.DOTALL)
    # Despite the name, this regex matches etymology templates, not IPA.
    ipa = re.compile(r'({{etym\||{{etymn\||{{etymn2\|)')
    for a in lista_stron2:
        if '{{język francuski}}' in a.text:
            lista.append(a)
    for a in lista:
        s_sekcja = re.search(sekcja, a.text)
        if s_sekcja:
            tekst = s_sekcja.group(1)
            s_ipa = re.search(ipa, tekst)
            if s_ipa:
                licz_ipa += 1
            else:
                print(a.title)
                licz_puste += 1
            licz_all += 1
    # 'bez ipa' = without, 'z ipa' = with, 'wszystkie' = all
    print('bez ipa: %d\nz ipa: %d\nwszystkie: %d'
          % (licz_puste, licz_ipa, licz_all))
def test_XmlDumpRedirect(self):
    pages = self._get_entries('article-pyrus.xml', allrevisions=True)
    self.assertTrue(pages[0].isredirect)
def __iter__(self):
    from pywikibot import xmlreader
    dump = xmlreader.XmlDump(self.xmlFilename)
    for entry in dump.parse():
        text = textlib.removeDisabledParts(entry.text)
        if self.refR.search(text) and not self.referencesR.search(text):
            yield pywikibot.Page(pywikibot.Site(), entry.title)
def get_redirects_from_dump(self, alsoGetPageTitles=False) -> Tuple[
        Dict[str, str], Set[str]]:
    """
    Extract redirects from dump.

    Load a local XML dump file, look at all pages which have the
    redirect flag set, and find out where they're pointing at. Return
    a dictionary where the redirect names are the keys and the
    redirect targets are the values.
    """
    xmlFilename = self.opt.xml
    redict = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(xmlFilename)
    redirR = self.site.redirect_regex
    readPagesCount = 0
    pageTitles = set()
    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 10000 pages
        if readPagesCount % 10000 == 0:
            pywikibot.output('{} pages read...'.format(readPagesCount))
        if self.opt.namespaces:
            if pywikibot.Page(self.site, entry.title).namespace() \
                    not in self.opt.namespaces:
                continue
        if alsoGetPageTitles:
            pageTitles.add(
                space_to_underscore(pywikibot.Link(entry.title, self.site)))
        m = redirR.match(entry.text)
        if m:
            target = m.group(1)
            # There might be redirects to another wiki. Ignore these.
            target_link = pywikibot.Link(target, self.site)
            try:
                target_link.parse()
            except SiteDefinitionError as e:
                pywikibot.log(e)
                pywikibot.output(
                    'NOTE: Ignoring {} which is a redirect ({}) to an '
                    'unknown site.'.format(entry.title, target))
                target_link = None
            else:
                if target_link.site != self.site:
                    pywikibot.output(
                        'NOTE: Ignoring {} which is a redirect to '
                        'another site {}.'
                        .format(entry.title, target_link.site))
                    target_link = None
            # if the redirect does not link to another wiki
            if target_link and target_link.title:
                source = pywikibot.Link(entry.title, self.site)
                if target_link.anchor:
                    pywikibot.output(
                        'HINT: {} is a redirect with a pipelink.'
                        .format(entry.title))
                redict[space_to_underscore(source)] = (
                    space_to_underscore(target_link))
    return redict, pageTitles
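# Hedged usage sketch for the method above: assuming 'bot' is an instance of
# the redirect bot with self.opt.xml pointing at a local dump file (the
# surrounding class is not shown in the snippet), the call returns the
# redirect mapping and, when requested, the set of all page titles.
#
# redirects, titles = bot.get_redirects_from_dump(alsoGetPageTitles=True)
# for source, target in redirects.items():
#     pywikibot.output('{} -> {}'.format(source, target))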
def _compare(self, previous, variant, all_revisions):
    result = [entry.__dict__
              for entry in xmlreader.XmlDump(
                  os.path.join(_data_dir, 'article-pyrus' + variant),
                  all_revisions).parse()]
    if previous:
        self.assertEqual(previous, result)
    return result
def getListFromXML(date, findLatest=False):
    # Converts a Wikimedia dump to a Python generator of XML entries.
    # If findLatest is True, it searches for the newest dump in the dumps
    # folder instead, checking day by day up to 90 days back.
    filename = config.path[
        'dumps'] + '{0}/plwiktionary-{0}-pages-articles.xml.bz2'.format(date)
    if findLatest:
        now = datetime.datetime.now()
        checked = now
        found = 0
        while checked > (now - datetime.timedelta(days=90)):
            tempDate = checked.strftime('%Y%m%d')
            tempFilename = config.path[
                'dumps'] + '{0}/plwiktionary-{0}-pages-articles.xml.bz2'.format(
                    tempDate)
            if os.path.isfile(tempFilename):
                found = 1
                break
            checked -= datetime.timedelta(days=1)  # checking day by day
        if found:
            filename = tempFilename
    if os.path.isfile(filename):
        generator = xmlreader.XmlDump(filename).parse()
        return generator
    else:
        print(filename)
        raise DumpNotFound
def test_XmlDumpRedirect(self):
    """Test XmlDump correctly parsing whether a page is a redirect."""
    pages = self._get_entries('article-pyrus.xml', allrevisions=True)
    self.assertTrue(pages[0].isredirect)
def __init__(self, xmlFilename, xmlStart, namespaces):
    self.xmlStart = xmlStart
    self.namespaces = namespaces
    self.skipping = bool(xmlStart)
    self.site = pywikibot.Site()
    dump = xmlreader.XmlDump(xmlFilename)
    self.parser = dump.parse()
def generator(self):
    dump = xmlreader.XmlDump(self.dump_file)
    gen = dump.parse()
    for page in gen:
        if page.isredirect:
            continue
        # main and File namespaces only
        if page.ns not in ("0", "6"):
            continue
        if self.make_fixes(page.text):
            yield page.title
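# Hedged usage sketch for a generator method like the one above: assuming
# the owning class is called FixerBot (an illustrative name) and exposes
# dump_file and make_fixes() as shown, the yielded titles could drive a
# page-by-page edit loop. Commented out because FixerBot is not defined here.
#
# bot = FixerBot(dump_file='dump.xml.bz2')
# site = pywikibot.Site()
# for title in bot.generator():
#     page = pywikibot.Page(site, title)
#     # ...apply the fixes to page.text and save...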
def test_XmlDumpFirstRev(self):
    pages = [r for r in xmlreader.XmlDump(
        os.path.join(_data_dir, "article-pear.xml")).parse()]
    self.assertEqual(1, len(pages))
    self.assertEqual("Automated conversion", pages[0].comment)
    self.assertEqual("Pear", pages[0].title)
    self.assertEqual("24278", pages[0].id)
    self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
    self.assertFalse(pages[0].isredirect)
def run(dumps):
    number = 500000
    counter = 0
    start_time = time.time()
    for casee in dumps:
        lang = casee.split('/')[-1].split('wiki')[0]
        dump = xmlreader.XmlDump(casee, True)
        bot = Bot()
        for case in page_info(dump, lang):
            counter += 1
            if number and counter > number:
                break
            bot.parse_edits(case.values())
        bot.parse_bad_edits(250)
        bot.dump()
        print(time.time() - start_time)
        site = pywikibot.Site('meta', fam='meta')
        page = pywikibot.Page(
            site,
            'Research:Revision scoring as a service/Word lists/' + lang)
        try:
            text = page.get()
        except pywikibot.NoPage:
            text = (
                "{{Research:Revision scoring as a service/template/word list "
                "data\n |lang=%s\n |gen=250\n |badwords=-\n |informal=-"
                "\n |stopwords=-\n |dictionary=-\n |stemmer=-\n |contact="
                "\n |features=no\n |labels=requested\n |campaign=no\n "
                "|needs=-\n |list-generated=\n |list-stop=\n}}\n" % lang)
        except Exception:  # was a bare 'except:'; narrowed to Exception
            return False
        new_text = text
        # Fill in |list-generated= if it is present and empty; if the
        # parameter is missing entirely, append it before the closing braces.
        if re.search(r'\|\s*?list\-generated\s*?\=\s*?', text):
            if re.search(r'\|\s*?list\-generated\s*?\=\s*?(\||\}\})', text):
                new_text = re.sub(
                    r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})',
                    r'\1%s\2' % bot.bad_words_res_text, new_text)
        else:
            new_text = re.sub(
                r'\}\}',
                r'|list-generated=%s\n}}' % bot.bad_words_res_text,
                new_text)
        if re.search(r'\|\s*?list\-stop\s*?\=\s*?', text):
            if re.search(r'\|\s*?list\-stop\s*?\=\s*?(\||\}\})', text):
                new_text = re.sub(
                    r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})',
                    r'\1%s\2' % bot.stop_words_res_text, new_text)
        else:
            new_text = re.sub(
                r'\}\}',
                r'|list-stop=%s\n}}' % bot.stop_words_res_text,
                new_text)
        if new_text != text:
            page.text = new_text
            page.save('Bot: update results')
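# Worked example of the parameter-filling substitution above, with an
# illustrative value in place of bot.bad_words_res_text: an empty
# |list-generated= parameter gets the result text spliced in before the
# delimiter that follows it.
#
# >>> re.sub(r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})',
# ...        r'\1[[w:foo|foo]]\2',
# ...        '{{template|lang=en|list-generated=|list-stop=-}}')
# '{{template|lang=en|list-generated=[[w:foo|foo]]|list-stop=-}}'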
def test_XmlDumpAllRevs(self):
    pages = [r for r in xmlreader.XmlDump(
        os.path.join(_data_dir, "article-pear.xml"),
        allrevisions=True).parse()]
    self.assertEqual(4, len(pages))
    self.assertEqual("Automated conversion", pages[0].comment)
    self.assertEqual("Pear", pages[0].title)
    self.assertEqual("24278", pages[0].id)
    self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
    self.assertEqual("Quercusrobur", pages[1].username)
    self.assertEqual("Pear", pages[0].title)
def main():
    import argparse
    from pywikibot import xmlreader

    parser = argparse.ArgumentParser(description="Find fixable entries")
    parser.add_argument("xmlfile", help="Wiktionary dump")
    parser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    parser.add_argument("--limit", type=int,
                        help="Limit processing to first N articles")
    parser.add_argument("--progress", help="Display progress",
                        action='store_true')
    args = parser.parse_args()

    dump = xmlreader.XmlDump(args.xmlfile)
    parser = dump.parse()

    stats = defaultdict(int)
    samples = defaultdict(set)
    count = 0
    for page in parser:
        if ":" in page.title or "/" in page.title or page.isredirect:
            continue
        count += 1
        if count % 1000 == 0 and args.progress:
            print(count, file=sys.stderr, end="\r")
        if args.limit and count > args.limit:
            break

        entry = SectionParser(page.text, page.title)
        validate_entry(entry)

        for section in entry.ifilter_sections():
            item = f"{section.level}:{section.title}"
            stats[item] += 1
            # Stop collecting sample titles for an item after 100 pages.
            if samples[item] is not None:
                samples[item].add(page.title)
                if len(samples[item]) > 100:
                    samples[item] = None

    export_errors("User:JeffDoozan/lists", args.save)
    if args.save:
        base_url = "User:JeffDoozan/stats/sections/latest"
        upload_stats(base_url, stats, args.save)
        upload_samples(base_url, samples, args.save)
def parseFile(fileToParse):
    num = 0
    with BZ2File(fileToParse) as xml_file:
        for page in xmlreader.XmlDump(fileToParse).parse():
            if page.ns == "0" and not page.isredirect:
                num += 1
                if num % 2500 == 0:
                    print(num)
                pagetext = textlib.unescape(page.text)
                pagetitle = page.title
                # 'nākošais' = "next"
                parse_findings(check_nakosais, pagetext, pagetitle,
                               file_nakosais, mas_nakosais)
def main():
    import argparse

    argparser = argparse.ArgumentParser(
        description="Detect possibly mismatched POS headers from an "
                    "enwiktionary dump. By default, scans all languages.")
    argparser.add_argument("--xml", help="XML file to load", required=True)
    argparser.add_argument("--limit", type=int,
                           help="Limit processing to first N articles")
    argparser.add_argument("--progress", help="Display progress",
                           action='store_true')
    argparser.add_argument(
        "--date",
        help="Date of the database dump (used to generate page messages)")
    argparser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    args = argparser.parse_args()

    if not os.path.isfile(args.xml):
        raise FileNotFoundError(f"Cannot open: {args.xml}")

    dump = xmlreader.XmlDump(args.xml)
    parser = dump.parse()

    count = 0
    for page in parser:
        if ":" in page.title or "/" in page.title:
            continue
        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)
        if args.limit and count >= args.limit:
            break
        count += 1
        check_page(page.title, page.text)

    if args.save:
        base_url = "User:JeffDoozan/lists/mismatched pos"
        logger.save(base_url, WikiByLanguage, commit_message=args.save,
                    page_limit=1000, data_date=args.date)
    else:
        dest = "mismatched"
        logger.save(dest, FileByLanguage, page_limit=1000,
                    data_date=args.date)
def __iter__(self):
    """Yield page objects until the entire XML dump has been read."""
    mysite = pywikibot.Site()
    dump = xmlreader.XmlDump(self.xmlfilename)
    # Regular expression to find the original template.
    # {{vfd}} does the same thing as {{Vfd}}, so both will be found.
    # The old syntax, {{msg:vfd}}, will also be found.
    templatePatterns = []
    for template in self.templates:
        templatePattern = template.title(withNamespace=False)
        if mysite.namespaces[10].case == 'first-letter':
            templatePattern = '[%s%s]%s' % (templatePattern[0].upper(),
                                            templatePattern[0].lower(),
                                            templatePattern[1:])
        templatePattern = re.sub(' ', '[_ ]', templatePattern)
        templatePatterns.append(templatePattern)
    templateRegex = re.compile(
        r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}'
        % '|'.join(templatePatterns))
    for entry in dump.parse():
        if templateRegex.search(entry.text):
            yield pywikibot.Page(mysite, entry.title)
def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
    self.xmlFilename = xmlFilename
    self.replacements = replacements
    self.exceptions = exceptions
    self.xmlStart = xmlStart
    self.skipping = bool(xmlStart)

    self.excsInside = []
    if "inside-tags" in self.exceptions:
        self.excsInside += self.exceptions['inside-tags']
    if "inside" in self.exceptions:
        self.excsInside += self.exceptions['inside']

    from pywikibot import xmlreader
    self.site = pywikibot.Site()
    dump = xmlreader.XmlDump(self.xmlFilename)
    self.parser = dump.parse()
def main():
    import argparse

    argparser = argparse.ArgumentParser(
        description="Find forms with data beyond a simple form declaration")
    argparser.add_argument("--xml", help="XML file to load", required=True)
    argparser.add_argument("--limit", type=int,
                           help="Limit processing to first N articles")
    argparser.add_argument("--progress", help="Display progress",
                           action='store_true')
    argparser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    args = argparser.parse_args()

    if not os.path.isfile(args.xml):
        raise FileNotFoundError(f"Cannot open: {args.xml}")

    def log(error, page, item, line=None):
        section = item._parent._name
        # print("logged:", [error, page, section, line])
        logger.add(error, page, section, line)

    dump = xmlreader.XmlDump(args.xml)
    parser = dump.parse()

    count = 0
    for page in parser:
        if ":" in page.title or "/" in page.title:
            continue
        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)
        if args.limit and count >= args.limit:
            break
        count += 1
        check_page(page.title, page.text, log_function=log)

    if args.save:
        base_url = "User:JeffDoozan/lists"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        dest = ""
        logger.save(dest, FileSaver)
def main():
    data = '20110723'
    # Date in words, e.g. '23.07.2011'
    data_slownie = data[6:8] + '.' + data[4:6] + '.' + data[0:4]
    lista_stron1 = xmlreader.XmlDump(
        '/mnt/user-store/dumps/plwiktionary/plwiktionary-%s-pages-articles.xml'
        % data)
    lista_stron = lista_stron1.parse()
    wikt = pywikibot.Site('pl', 'wiktionary')
    # 'bez źródła' = "without a source"
    outputPage = pywikibot.Page(wikt,
                                'Wikipedysta:AlkamidBot/listy/bez_źródła')
    notFoundList = collections.defaultdict(list)
    for a in lista_stron:
        try:
            word = Haslo(a.title, a.text)
        except notFromMainNamespace:
            pass
        except sectionsNotFound:
            pass
        else:
            if word.type == 3:
                for lang in word.listLangs:
                    # 'arabski' = Arabic
                    if lang.type != 2 and lang.lang == 'arabski':
                        lang.pola()
                        if lang.type == 1:
                            if not refs(lang.content, lang.zrodla):
                                notFoundList['arabski'].append(lang.title)
    text = ''
    for a in notFoundList:
        text += '== %s ==' % a
        for b in notFoundList[a]:
            text += '\n*[[%s]]' % b
        text += '\n'
    file = open('output/bez_zrodla.txt', 'w', encoding='utf-8')
    file.write(text)
    file.close()
    # 'Aktualizacja listy' = "Updating the list"
    outputPage.put(text, comment="Aktualizacja listy")
def __init__(self, xmlFilename, xmlStart, replacements, exceptions, site):
    """Constructor."""
    self.xmlFilename = xmlFilename
    self.replacements = replacements
    self.exceptions = exceptions
    self.xmlStart = xmlStart
    self.skipping = bool(xmlStart)

    self.excsInside = []
    if "inside-tags" in self.exceptions:
        self.excsInside += self.exceptions['inside-tags']
    if "inside" in self.exceptions:
        self.excsInside += self.exceptions['inside']

    from pywikibot import xmlreader
    if site:
        self.site = site
    else:
        self.site = swwsite
    dump = xmlreader.XmlDump(self.xmlFilename)
    self.parser = dump.parse()
def scanWiki(self, fileToParse, plugins):
    counter = 0
    whatToSearch = self.search[self.wiki]
    reflistTaskName = 'reflist'
    with BZ2File(fileToParse) as xml_file:
        for page in xmlreader.XmlDump(fileToParse).parse():
            if page.ns == "0" and not page.isredirect:
                pagetext = textlib.unescape(page.text)
                pagetitle = page.title
                counter += 1
                # if counter == 5000: break
                if counter % 10000 == 0:
                    print(counter)
                for task in whatToSearch:
                    for entry in whatToSearch[task]:
                        doesHaveMatch = self.parse_findings(
                            pagetext, entry['regex'], entry['flag'])
                        if doesHaveMatch:
                            if task in self.findings:
                                self.findings[task].append(pagetitle)
                            else:
                                self.findings[task] = [pagetitle]
                if 'reflist' in plugins:
                    doesHaveMatch = self.parse_reflist_search(pagetext)
                    if doesHaveMatch:
                        if reflistTaskName in self.findings:
                            self.findings[reflistTaskName].append(pagetitle)
                        else:
                            self.findings[reflistTaskName] = [pagetitle]
    print('scan ended')
    return self.saveResultsToDatabase()
def main():
    data = '20111102'
    lista_stron1 = xmlreader.XmlDump(
        '/mnt/user-store/dumps/plwiktionary/plwiktionary-%s-pages-articles.xml'
        % data)
    lista_stron2 = lista_stron1.parse()
    text = ''
    tempLangs = []
    notFound = []
    notFoundList = collections.defaultdict(list)
    LangsMediaWiki = getAllLanguages()
    for a in lista_stron2:
        try:
            word = Haslo(a.title, a.text)
        except sectionsNotFound:
            pass
        else:
            if word.type == 3:
                for lang in word.listLangs:
                    if lang.type != 2:
                        lang.pola()
                        if lang.type == 1 and lang.znaczeniaDetail:
                            for d in lang.znaczeniaDetail:
                                # Phrases marking a plural form, e.g.
                                # '{{lm}} od' = "{{plural}} of",
                                # 'liczba mnoga od' = "plural of".
                                if ('{{lm}} od' in d[1]
                                        or 'liczba mnoga od' in d[1]
                                        or 'zwykle w {{lm}}' in d[1]
                                        or 'zwykle w liczbie mnogiej' in d[1]
                                        or 'w {{lm}}' in d[1]
                                        or 'w liczbie mnogiej' in d[1]
                                        or 'l.m.' in d[1]):
                                    notFoundList[lang.lang].append(word.title)
    for a in LangsMediaWiki:
        if notFoundList[a.shortName] and a.shortName:
            text += '== %s ==' % a.longName
            for b in notFoundList[a.shortName]:
                text += '\n*[[%s]]' % b
            text += '\n'
    # 'liczba mnoga' = "plural"
    file = open('output/liczba_mnoga.txt', 'w', encoding='utf-8')
    file.write(text)
    file.close()
def countMean():
    global data
    data = '20110502'
    lista_stron1 = xmlreader.XmlDump('plwiktionary-%s-pages-articles.xml'
                                     % data)
    lista_stron = lista_stron1.parse()
    re_count = re.compile(r'(\: \([0-9]\.[0-9]\))')
    counter = 0
    text = ''
    lista = []
    for page in lista_stron:
        word = Haslo(page.title, page.text)
        if word.type == 3:
            for lang in word.listLangs:
                if lang.type == 1:
                    lang.pola()  # may update lang.type
                    if lang.znaczeniaWhole:
                        if lang.type == 7:
                            lista.append([lang.lang, word.title])

    def sortkey(row):
        return row[0]

    lista.sort(key=sortkey)
    for a in lista:
        text += '* [[%s]] (%s)\n' % (a[1], a[0])
    # 'brak części mowy' = "missing part of speech"
    file = open("output/brak_części_mowy.txt", 'a', encoding='utf-8')
    file.write(text)
    file.close()
def __init__(self, xmlfilename):
    """Constructor."""
    self.xmldump = xmlreader.XmlDump(xmlfilename)
def __init__(self, xmlfilename):
    """Initializer."""
    self.xmldump = xmlreader.XmlDump(xmlfilename)
num = 0
context = 30
numprint = 150
finds = []
start = time.time()
paths = ('/public/dumps/public/lvwiki/20190201/'
         'lvwiki-20190201-pages-articles.xml.bz2')
with BZ2File(paths) as xml_file:
    blah = False
    for page in xmlreader.XmlDump(paths).parse():
        if page.ns == "0" and not page.isredirect:
            pagetext = textlib.unescape(page.text)
            pagetitle = page.title
            # if num == 100:
            #     break
            num += 1
            if num % numprint == 0:
                print(num)
                sys.stdout.flush()
            for checkR in checklist:
                m = checkR.finditer(pagetext)
                if m:
def get_redirects_from_dump(self, alsoGetPageTitles=False):
    '''
    Load a local XML dump file, look at all pages which have the
    redirect flag set, and find out where they're pointing at. Return
    a dictionary where the redirect names are the keys and the
    redirect targets are the values.
    '''
    xmlFilename = self.xmlFilename
    redict = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(xmlFilename)
    redirR = self.site.redirectRegex()
    readPagesCount = 0
    if alsoGetPageTitles:
        pageTitles = set()
    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 10000 pages
        if readPagesCount % 10000 == 0:
            pywikibot.output(u'%i pages read...' % readPagesCount)
        if len(self.namespaces) > 0:
            if pywikibot.Page(self.site, entry.title).namespace() \
                    not in self.namespaces:
                continue
        if alsoGetPageTitles:
            pageTitles.add(entry.title.replace(' ', '_'))
        m = redirR.match(entry.text)
        if m:
            target = m.group(1)
            # There might be redirects to another wiki. Ignore these.
            for code in self.site.family.langs.keys():
                if target.startswith('%s:' % code) \
                        or target.startswith(':%s:' % code):
                    if code == self.site.language():
                        # link to our wiki, but with the lang prefix
                        target = target[(len(code) + 1):]
                        if target.startswith(':'):
                            target = target[1:]
                    else:
                        pywikibot.output(
                            u'NOTE: Ignoring %s which is a redirect to %s:'
                            % (entry.title, code))
                        target = None
                        break
            # if the redirect does not link to another wiki
            if target:
                source = entry.title.replace(' ', '_')
                target = target.replace(' ', '_')
                # remove leading and trailing whitespace
                target = target.strip('_')
                # capitalize the first letter
                if not pywikibot.Site().nocapitalize:
                    source = source[:1].upper() + source[1:]
                    target = target[:1].upper() + target[1:]
                if '#' in target:
                    target = target[:target.index('#')].rstrip("_")
                if '|' in target:
                    pywikibot.output(
                        u'HINT: %s is a redirect with a pipelink.'
                        % entry.title)
                    target = target[:target.index('|')].rstrip("_")
                if target:  # in case preceding steps left nothing
                    redict[source] = target
    if alsoGetPageTitles:
        return redict, pageTitles
    else:
        return redict
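# A minimal, self-contained sketch of the XmlDump API that all of the
# snippets above rely on ('dump.xml.bz2' is a placeholder path). XmlDump
# streams XmlEntry objects with title, text, ns and isredirect attributes,
# so even very large dumps can be scanned without loading them into memory.

import re

from pywikibot import xmlreader


def count_stub_templates(path):
    """Count main-namespace pages that transclude a {{stub}} template."""
    stub_regex = re.compile(r'\{\{\s*[Ss]tub\s*\}\}')
    count = 0
    for entry in xmlreader.XmlDump(path).parse():
        if entry.ns == "0" and not entry.isredirect:
            if stub_regex.search(entry.text):
                count += 1
    return count


# count_stub_templates('dump.xml.bz2')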