Example #1
 def __iter__(self):
     """Yield page objects until the entire XML dump has been read."""
     from pywikibot import xmlreader
     mysite = pywikibot.getSite()
     dump = xmlreader.XmlDump(self.xmlfilename)
     # regular expression to find the original template.
     # {{vfd}} does the same thing as {{Vfd}}, so both will be found.
     # The old syntax, {{msg:vfd}}, will also be found.
     # TODO: check site.nocapitalize()
     templatePatterns = []
     for template in self.templates:
         templatePattern = template.titleWithoutNamespace()
         if not pywikibot.getSite().nocapitalize:
             templatePattern = '[%s%s]%s' % (templatePattern[0].upper(),
                                             templatePattern[0].lower(),
                                             templatePattern[1:])
         templatePattern = re.sub(' ', '[_ ]', templatePattern)
         templatePatterns.append(templatePattern)
     templateRegex = re.compile(
         r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}' %
         '|'.join(templatePatterns))
     for entry in dump.parse():
         if templateRegex.search(entry.text):
             page = pywikibot.Page(mysite, entry.title)
             yield page
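
The compiled pattern above accepts either capitalization of the template name as well as the old {{msg:...}} syntax. A minimal standalone sketch of what it matches follows (not part of the original example; the template name 'Vfd' is purely illustrative):

# Self-contained demonstration of the template regex built above.
import re

templatePattern = '[Vv]fd'  # hypothetical template name, first letter in either case
templateRegex = re.compile(
    r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}'
    % templatePattern)

for sample in ('{{vfd}}', '{{Vfd|reason}}', '{{msg:vfd}}', '{{vfdx}}'):
    print(sample, bool(templateRegex.search(sample)))
# The first three samples match; '{{vfdx}}' does not.
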
Example #2
def first_part(fileToParse):
    file = open("vajadzigie-raksti-et-part1.txt", "w", encoding='utf-8')
    num = 0
    kopejaissaraksts = []
    regexpat = r"\[\[([^\|\]#]+)"

    with BZ2File(fileToParse) as xml_file:
        for page in xmlreader.XmlDump(fileToParse).parse():
            if page.ns == "0" and not page.isredirect:
                num += 1
                if num % 1000 == 0:
                    print(num)

                links = [
                    match.group(1)[0].upper() + match.group(1)[1:]
                    for match in re.finditer(regexpat,
                                             textlib.unescape(page.text))
                    if not re.search('^[a-z]{2,3}:', match.group(1))
                ]
                links = [link.replace('_', ' ') for link in links]
                links = list(set(links))

                kopejaissaraksts.extend(links)

    #pywikibot.output(kopejaissaraksts)
    #pywikibot.output(Counter(kopejaissaraksts))

    #file.write( str(Counter(kopejaissaraksts)))

    return Counter(kopejaissaraksts)
Example #3
 def _get_entries(self, filename, **kwargs):
     """Get all entries via XmlDump."""
     return list(
         xmlreader.XmlDump(join_xml_data_path(filename), **kwargs).parse())
Example #4
def main():

    lista_stron1 = xmlreader.XmlDump('plwiktionary-20101011-pages-articles.xml')
    lista_stron2 = xmlreader.XmlDump.parse(lista_stron1)
    licz_ipa = 0
    licz_puste = 0
    licz_all = 0
    lista = []

    sekcja = re.compile(r'==\s*.*?\({{język francuski}}\)\s*?==((.*?)==|(.*))', re.DOTALL)
    ipa = re.compile(r'({{etym\||{{etymn\||{{etymn2\|)')

    for a in lista_stron2:
        if '{{język francuski}}' in a.text:
            lista.append(a)

    for a in lista:
        s_sekcja = re.search(sekcja, a.text)
        if s_sekcja:

            tekst = s_sekcja.group(1)
            s_ipa = re.search(ipa, tekst)
            if s_ipa:
                licz_ipa = licz_ipa + 1
            else:
                print(a.title)
                licz_puste = licz_puste + 1
        licz_all = licz_all + 1

    print('bez ipa: %d\nz ipa: %d\nwszystkie: %d' % (licz_puste, licz_ipa, licz_all))
Example #5
 def test_XmlDumpRedirect(self):
     pages = self._get_entries('article-pyrus.xml', allrevisions=True)
     pages = [
         r for r in xmlreader.XmlDump(
             os.path.join(_xml_data_dir, "article-pyrus.xml")).parse()
     ]
     self.assertTrue(pages[0].isredirect)
Example #6
 def __iter__(self):
     from pywikibot import xmlreader
     dump = xmlreader.XmlDump(self.xmlFilename)
     for entry in dump.parse():
         text = textlib.removeDisabledParts(entry.text)
         if self.refR.search(text) and not self.referencesR.search(text):
             yield pywikibot.Page(pywikibot.Site(), entry.title)
Example #7
    def get_redirects_from_dump(self, alsoGetPageTitles=False) -> Tuple[
            Dict[str, str], Set[str]]:
        """
        Extract redirects from dump.

        Load a local XML dump file, look at all pages which have the
        redirect flag set, and find out where they're pointing at. Return
        a dictionary where the redirect names are the keys and the redirect
        targets are the values.
        """
        xmlFilename = self.opt.xml
        redict = {}
        # open xml dump and read page titles out of it
        dump = xmlreader.XmlDump(xmlFilename)
        redirR = self.site.redirect_regex
        readPagesCount = 0
        pageTitles = set()
        for entry in dump.parse():
            readPagesCount += 1
            # always print status message after 10000 pages
            if readPagesCount % 10000 == 0:
                pywikibot.output('{} pages read...'.format(readPagesCount))
            if self.opt.namespaces:
                if pywikibot.Page(self.site, entry.title).namespace() \
                        not in self.opt.namespaces:
                    continue
            if alsoGetPageTitles:
                pageTitles.add(space_to_underscore(pywikibot.Link(entry.title,
                                                                  self.site)))

            m = redirR.match(entry.text)
            if m:
                target = m.group(1)
                # There might be redirects to another wiki. Ignore these.
                target_link = pywikibot.Link(target, self.site)
                try:
                    target_link.parse()
                except SiteDefinitionError as e:
                    pywikibot.log(e)
                    pywikibot.output(
                        'NOTE: Ignoring {} which is a redirect ({}) to an '
                        'unknown site.'.format(entry.title, target))
                    target_link = None
                else:
                    if target_link.site != self.site:
                        pywikibot.output(
                            'NOTE: Ignoring {} which is a redirect to '
                            'another site {}.'
                            .format(entry.title, target_link.site))
                        target_link = None
                # if the redirect does not link to another wiki
                if target_link and target_link.title:
                    source = pywikibot.Link(entry.title, self.site)
                    if target_link.anchor:
                        pywikibot.output(
                            'HINT: {} is a redirect with a pipelink.'
                            .format(entry.title))
                    redict[space_to_underscore(source)] = (
                        space_to_underscore(target_link))
        return redict, pageTitles
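
For orientation, here is a minimal standalone sketch of the same redirect-scanning idea; the dump filename is a placeholder and a configured pywikibot site is assumed:

# Hedged sketch: map redirect sources to their targets from a local XML dump.
import pywikibot
from pywikibot import xmlreader

site = pywikibot.Site()
redirect_regex = site.redirect_regex  # same attribute used in the example above
redirects = {}
for entry in xmlreader.XmlDump('pages-articles.xml').parse():  # placeholder path
    match = redirect_regex.match(entry.text)
    if match:
        # Keys and values are underscore-separated page titles.
        redirects[entry.title.replace(' ', '_')] = match.group(1)
print(len(redirects), 'redirects found')
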
Example #8
 def _compare(self, previous, variant, all_revisions):
     result = [entry.__dict__ for entry in xmlreader.XmlDump(
         os.path.join(_data_dir, 'article-pyrus' + variant),
         all_revisions).parse()]
     if previous:
         self.assertEqual(previous, result)
     return result
Example #9
def getListFromXML(date, findLatest=False):
    # Converts a Wikimedia dump into a Python generator of XML entries.
    # If findLatest is True, searches the dumps folder for the newest dump.

    filename = config.path[
        'dumps'] + '{0}/plwiktionary-{0}-pages-articles.xml.bz2'.format(date)

    if findLatest:
        now = datetime.datetime.now()
        checked = now
        found = 0

        while checked > (now - datetime.timedelta(days=90)):

            tempDate = checked.strftime('%Y%m%d')
            tempFilename = config.path[
                'dumps'] + '{0}/plwiktionary-{0}-pages-articles.xml.bz2'.format(
                    tempDate)

            if os.path.isfile(tempFilename):
                found = 1
                break

            checked -= datetime.timedelta(days=1)  #checking day by day

        if found:
            filename = tempFilename

    if os.path.isfile(filename):
        generator = xmlreader.XmlDump.parse(xmlreader.XmlDump(filename))
        return generator
    else:
        print(filename)
        raise DumpNotFound
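
A hedged usage sketch for the helper above; the date string is illustrative, and DumpNotFound is the exception raised by getListFromXML itself:

# Hypothetical call: iterate the entries yielded for a given dump date.
try:
    for entry in getListFromXML('20101011', findLatest=True):
        if entry.ns == '0' and not entry.isredirect:
            pass  # process main-namespace, non-redirect pages here
except DumpNotFound:
    print('No dump found within the last 90 days')
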
Example #10
 def test_XmlDumpRedirect(self):
     """Test XmlDump correctly parsing whether a page is a redirect."""
     pages = self._get_entries('article-pyrus.xml', allrevisions=True)
     pages = [
         r for r in xmlreader.XmlDump(
             join_xml_data_path('article-pyrus.xml')).parse()
     ]
     self.assertTrue(pages[0].isredirect)
Example #11
    def __init__(self, xmlFilename, xmlStart, namespaces):
        self.xmlStart = xmlStart
        self.namespaces = namespaces
        self.skipping = bool(xmlStart)
        self.site = pywikibot.Site()

        dump = xmlreader.XmlDump(xmlFilename)
        self.parser = dump.parse()
Example #12
 def generator(self):
     dump = xmlreader.XmlDump(self.dump_file)
     gen = dump.parse()
     for page in gen:
         if page.isredirect:
             continue
         if page.ns not in ("0", "6"):
             continue
         if self.make_fixes(page.text):
             yield page.title
Example #13
 def test_XmlDumpFirstRev(self):
     pages = [r for r in
              xmlreader.XmlDump(os.path.join(_data_dir,
                                             "article-pear.xml")).parse()]
     self.assertEqual(1, len(pages))
     self.assertEqual(u"Automated conversion", pages[0].comment)
     self.assertEqual(u"Pear", pages[0].title)
     self.assertEqual(u"24278", pages[0].id)
     self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
     self.assertTrue(not pages[0].isredirect)
Example #14
def run(dumps):
    number = 500000
    counter = 0
    start_time = time.time()
    for casee in dumps:
        lang = casee.split('/')[-1].split('wiki')[0]
        dump = xmlreader.XmlDump(casee, True)
        bot = Bot()
        for case in page_info(dump, lang):
            counter += 1
            if number and counter > number:
                break
            bot.parse_edits(case.values())
    bot.parse_bad_edits(250)
    bot.dump()
    print(time.time() - start_time)
    site = pywikibot.Site('meta', fam='meta')
    page = pywikibot.Page(
        site, 'Research:Revision scoring as a service/Word lists/' + lang)
    try:
        text = page.get()
    except pywikibot.NoPage:
        text = ("{{Research:Revision scoring as a service/template/word list "
                "data\n  |lang=%s\n  |gen=250\n  |badwords=-\n  |informal=-"
                "\n  |stopwords=-\n  |dictionary=-\n  |stemmer=-\n  |contact="
                "\n  |features=no\n  |labels=requested\n  |campaign=no\n  "
                "|needs=-\n  |list-generated=\n  |list-stop=\n}}\n" % lang)
    except:
        return False
    new_text = text
    if re.search(r'\|\s*?list\-generated\s*?\=\s*?', text):
        if re.search(r'\|\s*?list\-generated\s*?\=\s*?(\||\}\})', text):
            new_text = re.sub(
                r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})',
                r'\1%s\2' % bot.bad_words_res_text,
                new_text)
    else:
        new_text = re.sub(
            r'\}\}',
            r'|list-generated=%s\n}}' % bot.bad_words_res_text,
            new_text)
    if re.search(r'\|\s*?list\-stop\s*?\=\s*?', text):
        if re.search(r'\|\s*?list\-stop\s*?\=\s*?(\||\}\})', text):
            new_text = re.sub(
                r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})',
                r'\1%s\2' % bot.stop_words_res_text,
                new_text)
    else:
        new_text = re.sub(
            r'\}\}',
            r'|list-stop=%s\n}}' % bot.stop_words_res_text,
            new_text)
    if new_text != text:
        page.text = new_text
        page.save('Bot: update results')
Example #15
 def test_XmlDumpAllRevs(self):
     pages = [r for r in
              xmlreader.XmlDump(os.path.join(_data_dir,
                                             "article-pear.xml"),
                                allrevisions=True).parse()]
     self.assertEqual(4, len(pages))
     self.assertEqual(u"Automated conversion", pages[0].comment)
     self.assertEqual(u"Pear", pages[0].title)
     self.assertEqual(u"24278", pages[0].id)
     self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
     self.assertEqual(u"Quercusrobur", pages[1].username)
     self.assertEqual(u"Pear", pages[0].title)
Example #16
def main():

    import argparse
    from pywikibot import xmlreader

    parser = argparse.ArgumentParser(description="Find fixable entries")
    parser.add_argument("xmlfile", help="Wiktionary dump")
    parser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    parser.add_argument("--limit",
                        type=int,
                        help="Limit processing to first N articles")
    parser.add_argument("--progress",
                        help="Display progress",
                        action='store_true')
    args = parser.parse_args()

    dump = xmlreader.XmlDump(args.xmlfile)
    parser = dump.parse()

    stats = defaultdict(int)
    samples = defaultdict(set)

    count = 0
    for page in parser:
        if ":" in page.title or "/" in page.title or page.isredirect:
            continue

        count += 1
        if count % 1000 == 0 and args.progress:
            print(count, file=sys.stderr, end="\r")
        if args.limit and count > args.limit:
            break

        entry = SectionParser(page.text, page.title)

        validate_entry(entry)

        for section in entry.ifilter_sections():
            item = f"{section.level}:{section.title}"
            stats[item] += 1
            if samples[item] is not None:
                samples[item].add(page.title)
                if len(samples[item]) > 100:
                    samples[item] = None

    export_errors("User:JeffDoozan/lists", args.save)
    if args.save:
        base_url = "User:JeffDoozan/stats/sections/latest"
        upload_stats(base_url, stats, args.save)
        upload_samples(base_url, samples, args.save)
Example #17
def parseFile(fileToParse):
	num = 0
	with BZ2File(fileToParse) as xml_file:
		for page in xmlreader.XmlDump(fileToParse).parse():
			if page.ns == "0" and not page.isredirect:
				num += 1
				if num % 2500 == 0:
					print(num)

				pagetext = textlib.unescape(page.text)
				pagetitle = page.title

				#nākošais
				parse_findings(check_nakosais,pagetext,pagetitle,file_nakosais,mas_nakosais)
Example #18
def main():

    import argparse
    argparser = argparse.ArgumentParser(
        description=
        "Detect possibly mismatched POS headers from enwiktionary dump.\nBy default, scans all languages."
    )
    argparser.add_argument("--xml", help="XML file to load", required=True)
    argparser.add_argument("--limit",
                           type=int,
                           help="Limit processing to first N articles")
    argparser.add_argument("--progress",
                           help="Display progress",
                           action='store_true')
    argparser.add_argument(
        "--date",
        help="Date of the database dump (used to generate page messages)")
    argparser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    args = argparser.parse_args()

    if not os.path.isfile(args.xml):
        raise FileNotFoundError(f"Cannot open: {args.xml}")

    dump = xmlreader.XmlDump(args.xml)
    parser = dump.parse()
    count = 0
    for page in parser:
        if ":" in page.title or "/" in page.title:
            continue

        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)

        if args.limit and count >= args.limit:
            break
        count += 1

        check_page(page.title, page.text)

    if args.save:
        base_url = "User:JeffDoozan/lists/mismatched pos"
        logger.save(base_url,
                    WikiByLanguage,
                    commit_message=args.save,
                    page_limit=1000,
                    data_date=args.date)
    else:
        dest = "mismatched"
        logger.save(dest, FileByLanguage, page_limit=1000, data_date=args.date)
Example #19
 def __iter__(self):
     """Yield page objects until the entire XML dump has been read."""
     mysite = pywikibot.Site()
     dump = xmlreader.XmlDump(self.xmlfilename)
     # regular expression to find the original template.
     # {{vfd}} does the same thing as {{Vfd}}, so both will be found.
     # The old syntax, {{msg:vfd}}, will also be found.
     templatePatterns = []
     for template in self.templates:
         templatePattern = template.title(withNamespace=False)
         if mysite.namespaces[10].case == 'first-letter':
             templatePattern = '[%s%s]%s' % (templatePattern[0].upper(),
                                             templatePattern[0].lower(),
                                             templatePattern[1:])
         templatePattern = re.sub(' ', '[_ ]', templatePattern)
         templatePatterns.append(templatePattern)
     templateRegex = re.compile(
         r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}' %
         '|'.join(templatePatterns))
     for entry in dump.parse():
         if templateRegex.search(entry.text):
             yield pywikibot.Page(mysite, entry.title)
Example #20
    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        self.skipping = bool(xmlStart)

        self.excsInside = []
        if "inside-tags" in self.exceptions:
            self.excsInside += self.exceptions['inside-tags']
        if "inside" in self.exceptions:
            self.excsInside += self.exceptions['inside']
        from pywikibot import xmlreader
        self.site = pywikibot.Site()
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()
Example #21
def main():

    import argparse
    argparser = argparse.ArgumentParser(
        description="Find forms with data beyond a simple form declaration")
    argparser.add_argument("--xml", help="XML file to load", required=True)
    argparser.add_argument("--limit",
                           type=int,
                           help="Limit processing to first N articles")
    argparser.add_argument("--progress",
                           help="Display progress",
                           action='store_true')
    argparser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    args = argparser.parse_args()

    if not os.path.isfile(args.xml):
        raise FileNotFoundError(f"Cannot open: {args.xml}")

    def log(error, page, item, line=None):
        section = item._parent._name
        #print("logged:", [error, page, section, line])
        logger.add(error, page, section, line)

    dump = xmlreader.XmlDump(args.xml)
    parser = dump.parse()
    count = 0
    for page in parser:
        if ":" in page.title or "/" in page.title:
            continue

        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)

        if args.limit and count >= args.limit:
            break
        count += 1

        check_page(page.title, page.text, log_function=log)

    if args.save:
        base_url = "User:JeffDoozan/lists"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        dest = ""
        logger.save(dest, FileSaver)
Example #22
def main():

    data = '20110723'
    data_slownie = data[6] + data[7] + '.' + data[4] + data[5] + '.' + data[
        0] + data[1] + data[2] + data[3]
    lista_stron1 = xmlreader.XmlDump(
        '/mnt/user-store/dumps/plwiktionary/plwiktionary-%s-pages-articles.xml'
        % data)
    lista_stron = xmlreader.XmlDump.parse(lista_stron1)
    wikt = pywikibot.Site('pl', 'wiktionary')
    outputPage = pywikibot.Page(wikt,
                                'Wikipedysta:AlkamidBot/listy/bez_źródła')

    notFoundList = collections.defaultdict(list)

    for a in lista_stron:
        try:
            word = Haslo(a.title, a.text)
        except notFromMainNamespace:
            pass
        except sectionsNotFound:
            pass
        else:
            if word.type == 3:
                for lang in word.listLangs:
                    if lang.type != 2 and lang.lang == 'arabski':
                        lang.pola()
                        if lang.type == 1:
                            if not refs(lang.content, lang.zrodla):
                                notFoundList['arabski'].append(lang.title)

    text = ''
    for a in notFoundList:
        text += '== %s ==' % (a)
        for b in notFoundList[a]:
            text += '\n*[[%s]]' % (b)
        text += '\n'

    file = open('output/bez_zrodla.txt', 'w', encoding='utf-8')
    file.write(text)
    file.close()

    outputPage.put(text, comment="Aktualizacja listy")
Example #23
    def __init__(self, xmlFilename, xmlStart, replacements, exceptions, site):
        """Constructor."""
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        self.skipping = bool(xmlStart)

        self.excsInside = []
        if "inside-tags" in self.exceptions:
            self.excsInside += self.exceptions['inside-tags']
        if "inside" in self.exceptions:
            self.excsInside += self.exceptions['inside']
        from pywikibot import xmlreader
        if site:
            self.site = site
        else:
            self.site = swwsite
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()
Example #24
    def scanWiki(self, fileToParse, plugins):
        counter = 0
        whatToSearch = self.search[self.wiki]

        reflistTaskName = 'reflist'

        with BZ2File(fileToParse) as xml_file:
            for page in xmlreader.XmlDump(fileToParse).parse():
                if page.ns == "0" and not page.isredirect:
                    pagetext = textlib.unescape(page.text)
                    pagetitle = page.title

                    counter += 1
                    #if counter == 5000: break

                    if counter % 10000 == 0:
                        print(counter)

                    for task in whatToSearch:
                        for entry in whatToSearch[task]:
                            doesHaveMatch = self.parse_findings(
                                pagetext, entry['regex'], entry['flag'])
                            if doesHaveMatch:
                                if task in self.findings:
                                    self.findings[task].append(pagetitle)
                                else:
                                    self.findings[task] = [pagetitle]
                    if 'reflist' in plugins:
                        doesHaveMatch = self.parse_reflist_search(pagetext)

                        if doesHaveMatch:
                            if reflistTaskName in self.findings:
                                self.findings[reflistTaskName].append(
                                    pagetitle)
                            else:
                                self.findings[reflistTaskName] = [pagetitle]

        print('scan ended')
        return self.saveResultsToDatabase()
Example #25
def main():

    data = '20111102'

    lista_stron1 = xmlreader.XmlDump('/mnt/user-store/dumps/plwiktionary/plwiktionary-%s-pages-articles.xml' % data)
    lista_stron2 = xmlreader.XmlDump.parse(lista_stron1)
    text = ''

    tempLangs = []

    notFound = []
    notFoundList = collections.defaultdict(list)

    LangsMediaWiki = getAllLanguages()

    for a in lista_stron2:
        try: word = Haslo(a.title, a.text)
        except sectionsNotFound:
            pass
        else:
            if word.type == 3:
                for lang in word.listLangs:
                    if lang.type != 2:
                        lang.pola()
                        if lang.type == 1 and lang.znaczeniaDetail:
                            for d in lang.znaczeniaDetail:
                                if any(marker in d[1] for marker in (
                                        '{{lm}} od', 'liczba mnoga od',
                                        'zwykle w {{lm}}',
                                        'zwykle w liczbie mnogiej',
                                        'w {{lm}}', 'w liczbie mnogiej',
                                        'l.m.')):
                                    notFoundList['%s' % lang.lang].append(word.title)

    for a in LangsMediaWiki:
        if notFoundList['%s' % a.shortName] and a.shortName:
            text += '== %s ==' % (a.longName)
            for b in notFoundList['%s' % a.shortName]:
                text += '\n*[[%s]]' % (b)
            text += '\n'

    file = open('output/liczba_mnoga.txt', 'w', encoding='utf-8')
    file.write(text)
    file.close()
Example #26
def countMean():

    global data
    data = '20110502'
    lista_stron1 = xmlreader.XmlDump('plwiktionary-%s-pages-articles.xml' %
                                     data)
    lista_stron = xmlreader.XmlDump.parse(lista_stron1)

    re_count = re.compile(r'(\: \([0-9]\.[0-9]\))')
    counter = 0

    text = ''
    lista = []

    for page in lista_stron:
        word = Haslo(page.title, page.text)
        if word.type == 3:
            for lang in word.listLangs:
                if lang.type == 1:
                    lang.pola()
                    if lang.znaczeniaWhole:
                        if lang.type == 7:
                            temp = []
                            temp.append(lang.lang)
                            temp.append(word.title)
                            lista.append(temp)

    def sortkey(row):
        return row[0]

    lista.sort(key=sortkey)
    for a in lista:
        text = text + '* [[%s]] (%s)\n' % (a[1], a[0])

    file = open("output/brak_części_mowy.txt", 'a')
    file.write(text.encode("utf-8"))
    file.close
Example #27
 def __init__(self, xmlfilename):
     """Constructor."""
     self.xmldump = xmlreader.XmlDump(xmlfilename)
Example #28
 def __init__(self, xmlfilename):
     """Initializer."""
     self.xmldump = xmlreader.XmlDump(xmlfilename)
Example #29
num = 0

context = 30

numprint = 150

finds = []

start = time.time()

paths = '/public/dumps/public/lvwiki/20190201/lvwiki-20190201-pages-articles.xml.bz2'

with BZ2File(paths) as xml_file:
    blah = False
    for page in xmlreader.XmlDump(paths).parse():
        if page.ns == "0" and not page.isredirect:
            pagetext = textlib.unescape(page.text)
            pagetitle = page.title

            #if num==100:
            #	break

            num += 1
            if num % numprint == 0:
                print(num)
                sys.stdout.flush()

            for checkR in checklist:
                m = checkR.finditer(pagetext)
                if m:
Example #30
    def get_redirects_from_dump(self, alsoGetPageTitles=False):
        '''
        Load a local XML dump file, look at all pages which have the
        redirect flag set, and find out where they're pointing at. Return
        a dictionary where the redirect names are the keys and the redirect
        targets are the values.
        '''
        xmlFilename = self.xmlFilename
        redict = {}
        # open xml dump and read page titles out of it
        dump = xmlreader.XmlDump(xmlFilename)
        redirR = self.site.redirectRegex()
        readPagesCount = 0
        if alsoGetPageTitles:
            pageTitles = set()
        for entry in dump.parse():
            readPagesCount += 1
            # always print status message after 10000 pages
            if readPagesCount % 10000 == 0:
                pywikibot.output(u'%i pages read...' % readPagesCount)
            if len(self.namespaces) > 0:
                if pywikibot.Page(self.site, entry.title).namespace() \
                        not in self.namespaces:
                    continue
            if alsoGetPageTitles:
                pageTitles.add(entry.title.replace(' ', '_'))

            m = redirR.match(entry.text)
            if m:
                target = m.group(1)
                # There might be redirects to another wiki. Ignore these.
                for code in self.site.family.langs.keys():
                    if target.startswith('%s:' % code) \
                            or target.startswith(':%s:' % code):
                        if code == self.site.language():
                            # link to our wiki, but with the lang prefix
                            target = target[(len(code) + 1):]
                            if target.startswith(':'):
                                target = target[1:]
                        else:
                            pywikibot.output(
                                u'NOTE: Ignoring %s which is a redirect to %s:'
                                % (entry.title, code))
                            target = None
                            break
                # if the redirect does not link to another wiki
                if target:
                    source = entry.title.replace(' ', '_')
                    target = target.replace(' ', '_')
                    # remove leading and trailing whitespace
                    target = target.strip('_')
                    # capitalize the first letter
                    if not pywikibot.Site().nocapitalize:
                        source = source[:1].upper() + source[1:]
                        target = target[:1].upper() + target[1:]
                    if '#' in target:
                        target = target[:target.index('#')].rstrip("_")
                    if '|' in target:
                        pywikibot.output(
                            u'HINT: %s is a redirect with a pipelink.' %
                            entry.title)
                        target = target[:target.index('|')].rstrip("_")
                    if target:  # in case preceding steps left nothing
                        redict[source] = target
        if alsoGetPageTitles:
            return redict, pageTitles
        else:
            return redict