Example #1
def workon(page):
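    # Fetch the page text, preload every linked page in a single getall() request,
    # then rewrite links that point at redirect pages and save the result.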
    mysite = wikipedia.getSite()
    try:
        text = page.get()
    except wikipedia.IsRedirectPage:
        wikipedia.output(u'%s is a redirect page. Skipping' % page.aslink())
        return
    except wikipedia.NoPage:
        wikipedia.output(u'%s does not exist. Skipping' % page.aslink())
        return
    wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
    links = page.linkedPages()
    if len(links) > 0:
        wikipedia.getall(mysite,links)
    else:
        wikipedia.output('Nothing left to do.')
        return
    
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (wikipedia.Error,wikipedia.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = wikipedia.translate(mysite, msg)
        wikipedia.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (wikipedia.Error):
            wikipedia.output('Error: unable to put %s' % page.aslink())
Example #2
 def oneQuery(self):
     """Perform one step in the solution process"""
     # First find the best language to work on
     code = self.selectQueryCode()
     if code == None:
         print "NOTE: Nothing left to do"
         return False
     # Now assemble a reasonable list of pages to get
     group = []
     plgroup = []
     for subj in self.subjects:
         # Promise the subject that we will work on the code language
         # We will get a list of pages we can do.
         x = subj.willWorkOn(code)
         if x:
             plgroup.extend(x)
             group.append(subj)
             if len(plgroup)>=globalvar.maxquerysize:
                 break
     if len(plgroup) == 0:
         print "NOTE: Nothing left to do 2"
         return False
     # Get the content of the assembled list in one blow
     try:
         wikipedia.getall(code, plgroup)
     except wikipedia.SaxError:
         # Ignore this error, and get the pages the traditional way.
         pass
     # Tell all of the subjects that the promised work is done
     for subj in group:
         subj.workDone(self)
     return True
Example #3
def workon(page):
    mysite = pywikibot.getSite()
    try:
        text = page.get()
    except pywikibot.IsRedirectPage:
        pywikibot.output(u'%s is a redirect page. Skipping'
                         % page.title(asLink=True))
        return
    except pywikibot.NoPage:
        pywikibot.output(u'%s does not exist. Skipping'
                         % page.title(asLink=True))
        return
    pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title())
    links = page.linkedPages()
    if len(links) > 0:
        pywikibot.getall(mysite,links)
    else:
        pywikibot.output('Nothing left to do.')
        return

    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (pywikibot.Error,pywikibot.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = i18n.twtranslate(mysite, 'fixing_redirects-fixing')
        pywikibot.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (pywikibot.Error):
            pywikibot.output('Error: unable to put %s'
                             % page.title(asLink=True))
Example #4
def workon(page):
    mysite = pywikibot.getSite()
    try:
        text = page.get()
    except pywikibot.IsRedirectPage:
        pywikibot.output(u'%s is a redirect page. Skipping' %
                         page.title(asLink=True))
        return
    except pywikibot.NoPage:
        pywikibot.output(u'%s does not exist. Skipping' %
                         page.title(asLink=True))
        return
    pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" %
                     page.title())
    links = page.linkedPages()
    if len(links) > 0:
        pywikibot.getall(mysite, links)
    else:
        pywikibot.output('Nothing left to do.')
        return

    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (pywikibot.Error, pywikibot.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = pywikibot.translate(mysite, msg)
        pywikibot.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (pywikibot.Error):
            pywikibot.output('Error: unable to put %s' %
                             page.title(asLink=True))
Example #5
def main():
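    # Walk the wiki's disambiguation category (article namespace only), collecting
    # the pages each article links to and preloading them in batches of roughly 50.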
    global mysite, linktrail, page
    start = []
    for arg in wikipedia.handleArgs():
        start.append(arg)
    if start:
        start = " ".join(start)
    else:
        start = "!"
    mysite = wikipedia.getSite()
    linktrail = mysite.linktrail()
    try:
        generator = pagegenerators.CategorizedPageGenerator(
            mysite.disambcategory(), start=start)
    except wikipedia.NoPage:
        print "The bot does not know the disambiguation category for your wiki."
        raise
    # only work on articles
    generator = pagegenerators.NamespaceFilterPageGenerator(generator, [0])
    generator = pagegenerators.PreloadingGenerator(generator)
    pagestodo = []
    pagestoload = []
    for page in generator:
        if page.isRedirectPage():
            continue
        linked = page.linkedPages()
        pagestodo.append((page, linked))
        pagestoload += linked
        if len(pagestoload) > 49:
            wikipedia.getall(mysite, pagestoload)
            for page, links in pagestodo:
                workon(page, links)
            pagestoload = []
            pagestodo = []
Example #6
def main():
    global mysite, linktrail, page
    start = []
    for arg in wikipedia.handleArgs():
        start.append(arg)
    if start:
        start = " ".join(start)
    else:
        start = "!"
    mysite = wikipedia.getSite()
    linktrail = mysite.linktrail()
    try:
        generator = pagegenerators.CategorizedPageGenerator(
            mysite.disambcategory(), start=start)
    except wikipedia.NoPage:
        print "The bot does not know the disambiguation category for your wiki."
        raise
    # only work on articles
    generator = pagegenerators.NamespaceFilterPageGenerator(generator, [0])
    generator = pagegenerators.PreloadingGenerator(generator)
    pagestodo = []
    pagestoload = []
    for page in generator:
        if page.isRedirectPage():
            continue
        linked = page.linkedPages()
        pagestodo.append((page,linked))
        pagestoload += linked
        if len(pagestoload) > 49:
            wikipedia.getall(mysite,pagestoload)
            for page, links in pagestodo:
                workon(page,links)
            pagestoload = []
            pagestodo = []
Example #7
def testSite(site):
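    # Probe a site by requesting a dummy page through getall(); report unknown
    # languages and log any other failure with a full traceback.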
    try:
        wikipedia.getall(site, [wikipedia.Page(site, 'Any page name')])
    except KeyboardInterrupt:
        raise
    except wikipedia.NoSuchSite:
        wikipedia.output(u'No such language %s' % site.lang)
    except:
        wikipedia.output(u'Error processing language %s' % site.lang)
        wikipedia.output(u''.join(traceback.format_exception(*sys.exc_info())))
Example #8
def testSite(site):
    try:
        wikipedia.getall(site, [wikipedia.Page(site, "Any page name")])
    except KeyboardInterrupt:
        raise
    except wikipedia.NoSuchSite:
        wikipedia.output(u"No such language %s" % site.lang)
    except:
        wikipedia.output(u"Error processing language %s" % site.lang)
        wikipedia.output(u"".join(traceback.format_exception(*sys.exc_info())))
Example #9
def testSite(site):
    try:
        pywikibot.getall(site, [pywikibot.Page(site, 'Any page name')])
    except KeyboardInterrupt:
        raise
    except pywikibot.NoSuchSite:
        pywikibot.output(u'No such language %s' % site.lang)
    except:
        pywikibot.output(u'Error processing language %s' % site.lang)
        pywikibot.output(u''.join(traceback.format_exception(*sys.exc_info())))
Example #10
def workon(page):
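    # Variant of the redirect-fixing routine: when a linked redirect's target no
    # longer exists, fall back to the site's move log to find the new title.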
    mysite = pywikibot.getSite()
    try:
        text = page.get()
    except pywikibot.IsRedirectPage:
        pywikibot.output(u'%s is a redirect page. Skipping' % page)
        return
    except pywikibot.NoPage:
        pywikibot.output(u'%s does not exist. Skipping' % page)
        return
    pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title())
    links = page.linkedPages()
    if len(links) > 0:
        pywikibot.getall(mysite,links)
    else:
        pywikibot.output('Nothing left to do.')
        return

    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except pywikibot.NoPage:
            gen = mysite.logpages(number=1, mode='move', title=page2.title(),
                                  dump=True)
            try:
                lastmove = gen.next()['move']
            except StopIteration:
                continue
            target = pywikibot.Page(mysite, lastmove['new_title'])
        except (pywikibot.Error, pywikibot.SectionError):
            continue
        # no fix to user namespaces
        if target.namespace() in [0, 1] and not page2.namespace() in [0, 1]:
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = i18n.twtranslate(mysite, 'fixing_redirects-fixing')
        pywikibot.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (pywikibot.Error):
            pywikibot.output('Error: unable to put %s' % page)
Example #11
def workon(page):
    mysite = pywikibot.getSite()
    try:
        text = page.get()
    except pywikibot.IsRedirectPage:
        pywikibot.output(u'%s is a redirect page. Skipping' % page)
        return
    except pywikibot.NoPage:
        pywikibot.output(u'%s does not exist. Skipping' % page)
        return
    pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" %
                     page.title())
    links = page.linkedPages()
    if len(links):
        pywikibot.getall(mysite, links)
    else:
        pywikibot.output('Nothing left to do.')
        return

    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except pywikibot.NoPage:
            try:
                target = page2.getMovedTarget()
            except pywikibot.NoPage:
                continue
        except (pywikibot.Error, pywikibot.SectionError):
            continue
        # no fix to user namespaces
        if target.namespace() in [0, 1] and not page2.namespace() in [0, 1]:
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = i18n.twtranslate(mysite, 'fixing_redirects-fixing')
        pywikibot.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (pywikibot.Error):
            pywikibot.error('unable to put %s' % page)
Example #12
 def preload(self, page_list, retry=False):
     try:
         while len(page_list) > 0:
             # It might be that the pages are on different sites,
             # e.g. because the -interwiki parameter was used.
             # Query the sites one by one.
             site = page_list[0].site()
             pagesThisSite = [page for page in page_list if page.site() == site]
             page_list = [page for page in page_list if page.site() != site]
             pywikibot.getall(site, pagesThisSite)
             for page in pagesThisSite:
                 yield page
     except IndexError:
         # Can happen if the pages list is empty. Don't care.
         pass
     except pywikibot.SaxError:
         if not retry:
             # Retry once.
             self.preload(page_list, retry=True)
         # Ignore this error, and get the pages the traditional way later.
         pass
Example #13
def workon(page):
    mysite = wikipedia.getSite()
    try:
        text = page.get()
    except wikipedia.IsRedirectPage:
        return
    wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
    links = page.linkedPages()
    wikipedia.getall(mysite,links)
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (wikipedia.Error,wikipedia.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = wikipedia.translate(mysite, msg)
        try:
            page.put(text, comment)
        except (wikipedia.Error):
            wikipedia.output('Error : unable to put %s' % page.aslink())
Example #14
 def preload(self, page_list, retry=False):
     try:
         while len(page_list) > 0:
             # It might be that the pages are on different sites,
             # e.g. because the -interwiki parameter was used.
             # Query the sites one by one.
             site = page_list[0].site()
             pagesThisSite = [
                 page for page in page_list if page.site() == site
             ]
             page_list = [page for page in page_list if page.site() != site]
             wikipedia.getall(site, pagesThisSite)
             for page in pagesThisSite:
                 yield page
     except IndexError:
         # Can happen if the pages list is empty. Don't care.
         pass
     except wikipedia.SaxError:
         if not retry:
             # Retry once.
             self.preload(page_list, retry=True)
         # Ignore this error, and get the pages the traditional way later.
         pass
Example #15
def main():
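    # Find every page that links to the given spam site, preload them with getall(),
    # and strip the offending lines (interactively, via an editor, or automatically).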
    automatic = False
    namespaces = []
    msg = {
        'ar': u'إزالة الوصلات إلى موقع سبام %s',
        'de': u'Entferne in Spam-Blacklist eingetragenen Weblink auf %s',
        'en': u'Removing links to spamming site %s',
        'es': u'Removiendo enlaces a sitio publicitario %s',
        'fa': u'حذف پیوند به وبگاه هرزنگاری %s',
        'he': u'מסיר קישורים לאתר ספאם %s',
        'fr': u'Suppression du lien blacklisté %s',
        'it': u'Rimuovo link contenuto nella Spam-Blacklist %s',
        'ja': u'ロボットによる: 迷惑リンク削除 %s',
        'nl': u'Links naar gespamde site: %s verwijderd',
        'pt': u'Removendo links de spam do site %s',
        'ta': u'எரிதமாக இணைக்கப்பட்ட %s இணையத்தளம் நீக்கப்பட்டது',
        'vi': u'xóa các liên kết đến website spam %s',
        'zh': u'機器人: 移除廣告黑名單連結 %s',
    }
    spamSite = ''
    for arg in pywikibot.handleArgs():
        if arg.startswith("-automatic"):
            automatic = True
        elif arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[len('-namespace:'):]))
            except ValueError:
                namespaces.append(arg[len('-namespace:'):])
        else:
            spamSite = arg
    if not automatic:
        pywikibot.put_throttle.setDelay(1)
    if not spamSite:
        pywikibot.showHelp('spamremove')
        pywikibot.output(u"No spam site specified.")
        sys.exit()
    mysite = pywikibot.getSite()
    pages = list(set(mysite.linksearch(spamSite)))
    if namespaces:
        pages = list(set(pagegenerators.NamespaceFilterPageGenerator(pages,
                                                                     namespaces)))
    if len(pages) == 0:
        pywikibot.output('No page found.')
    else:
        pywikibot.getall(mysite, pages)
        for p in pages:
            text = p.get()
            if spamSite not in text:
                continue
            # Show the title of the page we're working on.
            # Highlight the title in purple.
            pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                             % p.title())
            lines = text.split('\n')
            newpage = []
            lastok = ""
            for line in lines:
                if spamSite in line:
                    if lastok:
                        pywikibot.output(lastok)
                    pywikibot.output('\03{lightred}%s\03{default}' % line)
                    lastok = None
                else:
                    newpage.append(line)
                    if line.strip():
                        if lastok is None:
                            pywikibot.output(line)
                        lastok = line
            if automatic:
                answer = "y"
            else:
                answer = pywikibot.inputChoice(u'\nDelete the red lines?',
                                               ['yes', 'no', 'edit'],
                                               ['y', 'N', 'e'], 'n')
            if answer == "n":
                continue
            elif answer == "e":
                editor = editarticle.TextEditor()
                newtext = editor.edit(text, highlight=spamSite,
                                      jumpIndex=text.find(spamSite))
            else:
                newtext = "\n".join(newpage)
            if newtext != text:
                p.put(newtext, pywikibot.translate(mysite, msg) % spamSite)
Example #16
         exclude(line, real_exclude=False)
         pl = pywikibot.Page(mysite, line)
         checked[pl] = pl
     f.close()
     excludefile = codecs.open(filename, 'a', encoding=mysite.encoding())
 except IOError:
     # File does not exist
     excludefile = codecs.open(filename, 'w', encoding=mysite.encoding())
 try:
     parentcats = workingcat.categories()
 except pywikibot.Error:
     parentcats = []
 # Do not include articles already in subcats; only checking direct subcats
 subcatlist = workingcat.subcategoriesList()
 if subcatlist:
     pywikibot.getall(mysite, subcatlist)
     for cat in subcatlist:
         list = cat.articlesList()
         for page in list:
             exclude(page.title(), real_exclude=False)
             checked[page] = page
 list = workingcat.articlesList()
 if list:
     for pl in list:
         checked[pl] = pl
     pywikibot.getall(mysite, list)
     for pl in list:
         include(pl)
 else:
     pywikibot.output(
         u"Category %s does not exist or is empty. Which page to start with?"
Example #17
def main():
    automatic = False
    namespaces = []
    msg = {
        'ar': u'إزالة الوصلات إلى موقع سبام %s',
        'de': u'Entferne in Spam-Blacklist eingetragenen Weblink auf %s',
        'en': u'Removing links to spamming site %s',
        'es': u'Removiendo enlaces a sitio publicitario %s',
        'fa': u'حذف پیوند به وبگاه هرزنگاری %s',
        'he': u'מסיר קישורים לאתר ספאם %s',
        'fr': u'Suppression du lien blacklisté %s',
        'it': u'Rimuovo link contenuto nella Spam-Blacklist %s',
        'ja': u'ロボットによる: 迷惑リンク削除 %s',
        'nl': u'Links naar gespamde site: %s verwijderd',
        'pt': u'Removendo links de spam do site %s',
        'ta': u'எரிதமாக இணைக்கப்பட்ட %s இணையத்தளம் நீக்கப்பட்டது',
        'vi': u'xóa các liên kết đến website spam %s',
        'zh': u'機器人: 移除廣告黑名單連結 %s',
    }
    spamSite = ''
    for arg in pywikibot.handleArgs():
        if arg.startswith("-automatic"):
            automatic = True
        elif arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[len('-namespace:'):]))
            except ValueError:
                namespaces.append(arg[len('-namespace:'):])
        else:
            spamSite = arg
    if not automatic:
        pywikibot.put_throttle.setDelay(1)
    if not spamSite:
        pywikibot.showHelp('spamremove')
        pywikibot.output(u"No spam site specified.")
        sys.exit()
    mysite = pywikibot.getSite()
    pages = list(set(mysite.linksearch(spamSite)))
    if namespaces:
        pages = list(
            set(pagegenerators.NamespaceFilterPageGenerator(pages,
                                                            namespaces)))
    if len(pages) == 0:
        pywikibot.output('No page found.')
    else:
        pywikibot.getall(mysite, pages)
        for p in pages:
            text = p.get()
            if spamSite not in text:
                continue
            # Show the title of the page we're working on.
            # Highlight the title in purple.
            pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" %
                             p.title())
            lines = text.split('\n')
            newpage = []
            lastok = ""
            for line in lines:
                if spamSite in line:
                    if lastok:
                        pywikibot.output(lastok)
                    pywikibot.output('\03{lightred}%s\03{default}' % line)
                    lastok = None
                else:
                    newpage.append(line)
                    if line.strip():
                        if lastok is None:
                            pywikibot.output(line)
                        lastok = line
            if automatic:
                answer = "y"
            else:
                answer = pywikibot.inputChoice(u'\nDelete the red lines?',
                                               ['yes', 'no', 'edit'],
                                               ['y', 'N', 'e'], 'n')
            if answer == "n":
                continue
            elif answer == "e":
                editor = editarticle.TextEditor()
                newtext = editor.edit(text,
                                      highlight=spamSite,
                                      jumpIndex=text.find(spamSite))
            else:
                newtext = "\n".join(newpage)
            if newtext != text:
                p.put(newtext, pywikibot.translate(mysite, msg) % spamSite)
Example #18
         exclude(line,real_exclude=False)
         pl = wikipedia.Page(mysite,line)
         checked[pl] = pl
     f.close()
     excludefile = codecs.open(filename, 'a', encoding = mysite.encoding())
 except IOError:
     # File does not exist
     excludefile = codecs.open(filename, 'w', encoding = mysite.encoding())
 try:
     parentcats = workingcat.categories()
 except wikipedia.Error:
     parentcats = []
 # Do not include articles already in subcats; only checking direct subcats
 subcatlist = workingcat.subcategoriesList()
 if subcatlist:
     wikipedia.getall(mysite,subcatlist)
     for cat in subcatlist:
         list = cat.articlesList()
         for page in list:
             exclude(page.title(),real_exclude=False)
             checked[page] = page
 list = workingcat.articlesList()
 if list:
     for pl in list:
         checked[pl]=pl
     wikipedia.getall(mysite,list)
     for pl in list:
         include(pl)
 else:
     wikipedia.output(u"Category %s does not exist or is empty. Which page to start with?"%workingcatname)
     answer = wikipedia.input(u"(Default is [[%s]]):"%workingcatname)
Example #20
    else:
        start = "!"
    mysite = wikipedia.getSite()
    linktrail = mysite.linktrail()
    try:
        generator = pagegenerators.CategorizedPageGenerator(
            mysite.disambcategory(), start=start)
    except wikipedia.NoPage:
        print "The bot does not know the disambiguation category for your wiki."
        raise
    # only work on articles
    generator = pagegenerators.NamespaceFilterPageGenerator(generator, [0])
    generator = pagegenerators.PreloadingGenerator(generator)
    pagestodo = []
    pagestoload = []
    for page in generator:
        if page.isRedirectPage():
            continue
        linked = page.linkedPages()
        pagestodo.append((page,linked))
        pagestoload += linked
        if len(pagestoload) > 49:
            wikipedia.getall(mysite,pagestoload)
            for page, links in pagestodo:
                workon(page,links)
            pagestoload = []
            pagestodo = []

finally:
    wikipedia.stopme()

Example #21
         exclude(line,real_exclude=False)
         pl = pywikibot.Page(mysite,line)
         checked[pl] = pl
     f.close()
     excludefile = codecs.open(filename, 'a', encoding = mysite.encoding())
 except IOError:
     # File does not exist
     excludefile = codecs.open(filename, 'w', encoding = mysite.encoding())
 try:
     parentcats = workingcat.categories()
 except pywikibot.Error:
     parentcats = []
 # Do not include articles already in subcats; only checking direct subcats
 subcatlist = workingcat.subcategoriesList()
 if subcatlist:
     pywikibot.getall(mysite,subcatlist)
     for cat in subcatlist:
         list = cat.articlesList()
         for page in list:
             exclude(page.title(),real_exclude=False)
             checked[page] = page
 list = workingcat.articlesList()
 if list:
     for pl in list:
         checked[pl]=pl
     pywikibot.getall(mysite,list)
     for pl in list:
         include(pl)
 else:
     pywikibot.output(
         u"Category %s does not exist or is empty. Which page to start with?"
Example #22
         i += 1
         if i == 480:
             break
     start = todo[len(todo) - 1].title() + '_0'
 # todo is a list of pages to do, donow are the pages we will be doing in this run.
 if len(todo) > 60:
     # Take the first 60.
     donow = todo[0:60]
     todo = todo[60:]
 else:
     donow = todo
     # If there was more to do, the 'if len(todo)<61' part would have extended
     # todo beyond this size.
     cont = False
 try:
     wikipedia.getall(mysite, donow)
 except wikipedia.SaxError:
     # Ignore this error, and get the pages the traditional way.
     pass
 checked += len(donow)
 for pl in donow:
     R = re.compile(r'http://[^\s}<\]]+[^\s.,:;)\?!\]}<]')
     try:
         for url in R.findall(pl.get()):
             url = wikipedia.unicode2html(url, 'ascii')
             try:
                 error = URLerrorFinder().open(url)
             except IOError:
                 error = -1
             if error in allowederrorcodes:
                 working += 1
Example #23
         i += 1
         if i == 480:
             break
     start = todo[len(todo) - 1].title() + "_0"
 # todo is a list of pages to do, donow are the pages we will be doing in this run.
 if len(todo) > 60:
     # Take the first 60.
     donow = todo[0:60]
     todo = todo[60:]
 else:
     donow = todo
     # If there was more to do, the 'if len(todo)<61' part would have extended
     # todo beyond this size.
     cont = False
 try:
     wikipedia.getall(mysite, donow)
 except wikipedia.SaxError:
     # Ignore this error, and get the pages the traditional way.
     pass
 checked += len(donow)
 for pl in donow:
     R = re.compile(r"http://[^\s}<\]]+[^\s.,:;)\?!\]}<]")
     try:
         for url in R.findall(pl.get()):
             url = wikipedia.unicode2html(url, "ascii")
             try:
                 error = URLerrorFinder().open(url)
             except IOError:
                 error = -1
             if error in allowederrorcodes:
                 working += 1
Example #24
    def treat(self, page):
	"""
	Finds links, checks if they exist on wikibooks, then checks if they
	exist on Wikipedia, and if not removes the link entirely. Also removes
	all "citation needed" tags.
	"""
	
	text = self.load(page)
	newText = text	
	
	linksFoundInPage = []
	wikibooksPages = []
	wikipediaPages = []
	
	linksOnWikipedia = []
	redlinks = []
	
	def linkName(link):
		link = link.strip('[')
		link = link.strip(']')
		if link.find("|") != -1:
			return link[ link.find("|")+1:]
		else: return None
	def linkURL(link):
		link = link.strip('[')
		link = link.strip(']')
		if link.find("|") != -1:
			return link[ :link.find("|")]
		else: return link
	
	
	# Matches text between "[[" and "]]"
	linkRegex = re.compile("\[\[.*?]]")
	linksFoundInPage = linkRegex.findall(text)
	
	# Remove items that aren't links
	cleanLinks = []
	for link in linksFoundInPage:
		if link.find("#") != -1:
			continue
		elif link.find("Image:") != -1:
			continue
		elif link.find("File:") != -1:
			continue
		else:
			cleanLinks.append(link)

	linksFoundInPage = cleanLinks
	
	pregen = pagegenerators.PreloadingGenerator(self.generator)
	
	# Download wikibooksPages
	for link in linksFoundInPage:
		wikibooksPages.append( pywikibot.Page( page.site(), linkURL(link) ) )
	pywikibot.getall(page.site(), wikibooksPages)
	
	# Download wikipediaPages
	wikipediaSite = pywikibot.getSite(page.site().language(), 'wikipedia')
	for link in linksFoundInPage:
		wikipediaPages.append( pywikibot.Page( wikipediaSite, linkURL(link)) )
	pywikibot.getall(wikipediaSite, wikipediaPages)
	
	# sort links, sending to linksOnWikibooks, linksOnWikipedia, or redlinks
	i = 0
	for link in linksFoundInPage:
		if wikibooksPages[i].exists():
			print "Page \"" + wikibooksPages[i].title() + "\" exists on wikibooks."
			# no need to keep a list links on wikipedia
		else:
			#check on wikipedia
			if wikipediaPages[i].exists():
				print "Page \"" + wikipediaPages[i].title() + "\" exists on wikipedia."
				linksOnWikipedia.append( linksFoundInPage[i] )
			else:
				print "Could not find page \"" + wikibooksPages[i].title() + "\" removing."
				redlinks.append( linksFoundInPage[i] )
		i += 1
	
	#
	# remove redlinks, and change wikipedia links to use w:
	#
	
	for link in linksOnWikipedia:
		if linkName(link) == None:
			print linkURL(link)
			newLink = "[[w:" + linkURL(link) + "|" + linkURL(link) + "]]"
			newText = newText.replace(link, newLink)
		else:
			newText = newText.replace(link, "[[w:" + linkURL(link) + "|" + linkName(link) + "]]" )
			print "-" + linkName(link)

	for link in redlinks:
		if linkName(link) == None:
			newText = newText.replace(link, linkURL(link))
		else:
			newText = newText.replace(link, linkName(link) )	
	
	text = newText

	# Finished
	if not self.save(text, page, self.summary):
		pywikibot.output(u'Page %s not saved.' % page.title(asLink=True))