def main():
    """Tag Rijksmonument images on Commons with a location template.

    Pages come from command-line generator arguments, or by default from
    getRijksmonumentWithoutLocation().
    """
    wikipedia.setSite(wikipedia.getSite(u'commons', u'commons'))
    # Connect to the monuments database; both handles are used per page.
    # (The previous dead 'conn = None; cursor = None' assignments were
    # immediately overwritten and have been removed.)
    (conn, cursor) = connectDatabase()
    genFactory = pagegenerators.GeneratorFactory()
    for arg in wikipedia.handleArgs():
        genFactory.handleArg(arg)
    generator = genFactory.getCombinedGenerator()
    if not generator:
        generator = getRijksmonumentWithoutLocation()
    # Preload in batches, restricted to the File: namespace (6).
    pgenerator = pagegenerators.PreloadingGenerator(
        pagegenerators.NamespaceFilterPageGenerator(generator, [6]))
    for page in pgenerator:
        locationTemplate = locateImage(page, conn, cursor)
        if locationTemplate:
            addLocation(page, locationTemplate)
def main():
    """Build a page generator from command-line arguments and run FilmBannerBot."""
    genFactory = pagegenerators.GeneratorFactory()
    # The generator gives the pages that should be worked upon.
    gen = None
    # Non-generator arguments are collected and joined into one page title.
    pageTitleParts = []
    for arg in pywikibot.handleArgs():
        if arg.startswith("-reg"):
            # '-reg' is shorthand for every page transcluding the film infobox.
            arg = '-transcludes:Infobox film'
        if not genFactory.handleArg(arg):
            pageTitleParts.append(arg)
    # Idiom fix: test list truthiness instead of comparing to [].
    if pageTitleParts:
        # We will only work on a single page.
        page = pywikibot.Page(pywikibot.getSite(), ' '.join(pageTitleParts))
        gen = iter([page])
    if not gen:
        gen = genFactory.getCombinedGenerator()
    if gen:
        # The preloading generator downloads multiple pages from the wiki
        # simultaneously.
        bot = FilmBannerBot(pagegenerators.PreloadingGenerator(gen))
        bot.run()
    else:
        pywikibot.showHelp()
def main():
    """Run IpNetworkBot over Category:IP-Network, unless help is requested."""
    # If debug is True, don't make any real changes; only show what would
    # have been changed.
    debug = False
    wantHelp = False
    # Parse command line arguments; any argument other than -debug
    # triggers the help text.
    for arg in wikipedia.handleArgs():
        if arg.startswith("-debug"):
            debug = True
        else:
            wantHelp = True
    if wantHelp:
        wikipedia.showHelp()
        return
    # Fix: the category name was built with a pointless '%s' substitution
    # of a constant; use the literal directly.
    cat = catlib.Category(wikipedia.getSite(), 'Category:IP-Network')
    nets_gen = pagegenerators.CategorizedPageGenerator(cat, start=None,
                                                      recurse=False)
    # The preloading generator downloads multiple pages simultaneously.
    nets_gen = pagegenerators.PreloadingGenerator(nets_gen)
    bot = IpNetworkBot(nets_gen, debug)
    bot.run()
def main(args):
    """Grab a bunch of images and tag them if they are not categorized."""
    site = pywikibot.getSite(u'commons', u'commons')
    pywikibot.setSite(site)
    genFactory = pagegenerators.GeneratorFactory()
    generator = None
    for arg in pywikibot.handleArgs():
        if arg.startswith('-yesterday'):
            generator = uploadedYesterday(site)
        elif arg.startswith('-recentchanges'):
            generator = recentChanges(site=site, delay=120)
        else:
            genFactory.handleArg(arg)
    if not generator:
        generator = genFactory.getCombinedGenerator()
    if not generator:
        pywikibot.output(u'You have to specify the generator you want to use '
                         u'for the program!')
        return
    for page in pagegenerators.PreloadingGenerator(generator):
        # Only consider existing, non-redirect pages in the File:
        # namespace (6).
        if not page.exists() or page.namespace() != 6 \
                or page.isRedirectPage():
            continue
        if isUncat(page):
            addUncat(page)
def crawlerLink(pagename):
    """Apply modification() to every page linking to ``pagename``."""
    target = wikipedia.Page(site, pagename)
    referring = pagegenerators.ReferringPageGenerator(target)
    for linking_page in pagegenerators.PreloadingGenerator(referring, 100):
        modification(linking_page.title())
def UpdateRepoCats(*args):
    """Re-categorize every page of Category:All add-ons by the repos
    that actually contain the add-on."""
    site = pywikibot.getSite()
    # Download all repos as soup elements.
    soups = importAllAddonXML()
    # Get all pages in Category:All add-ons.
    cat = catlib.Category(site, u'Category:All add-ons')
    pages = cat.articlesList(False)
    allRepoCats = repoCatList(site)
    for page in pagegenerators.PreloadingGenerator(pages, 100):
        # Extract the add-on id from the |ID= infobox parameter.
        # Fix: use a raw string so '\|', '\.' and '\-' are not invalid
        # escape sequences; drop the dead 'addon_id = None' assignment.
        match = re.search(r"\|ID=([a-zA-Z0-9_\.\-]+)", page.get())
        if not match:
            pywikibot.output("Can't find addon_id for %s, skipping it..."
                             % page.title())
            continue
        addon_id = match.group(1)
        pywikibot.output("Identifying Repos for %s." % addon_id)
        # See if addon_id can be found in the repos.
        repos = checkInRepo(addon_id, soups)
        addRemoveRepoCats(page, repos, allRepoCats)
def refreshGenerator(self):
    """Rebuild the preloading generator from the saved progress point."""
    catGen = pagegenerators.CategorizedPageGenerator(
        self.csdCat, start=self.savedProgress)
    # Wrap with the talk-page generator so that we won't produce orphaned
    # talk pages.
    withTalk = pagegenerators.PageWithTalkPageGenerator(catGen)
    self.preloadingGen = pagegenerators.PreloadingGenerator(withTalk,
                                                            pageNumber=20)
def main():
    """Build a talk-page-based generator and run FilmAssessBot."""
    # This factory processes command line arguments that are shared with
    # other scripts and that determine which pages to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # The generator gives the pages that should be worked upon.
    gen = None
    # Non-generator arguments are collected and joined into one page title.
    pageTitleParts = []
    # Parse command line arguments.
    for arg in pywikibot.handleArgs():
        if arg.startswith("-reg"):
            # '-reg' is shorthand for the unassessed-articles category.
            arg = '-cat:Unassessed film articles'
        if not genFactory.handleArg(arg):
            pageTitleParts.append(arg)
    # Idiom fix: test list truthiness instead of comparing to [].
    if pageTitleParts:
        # We will only work on a single page.
        page = pywikibot.Page(pywikibot.getSite(), ' '.join(pageTitleParts))
        gen = iter([page])
    if not gen:
        gen = genFactory.getCombinedGenerator()
    if gen:
        # Preload the subject pages belonging to the selected talk pages.
        gen = pagegenerators.PreloadingGenerator(
            filmfunctions.PagesFromTalkPagesGenerator(gen))
        bot = FilmAssessBot(gen)
        bot.run()
    else:
        pywikibot.showHelp()
def crawlerCat(category, recursif, apres):
    """Apply modification() to the pages of ``category``.

    If ``apres`` is non-empty, skip pages until one titled ``apres`` has
    been seen. If ``recursif`` is true, also process all subcategories.
    """
    # Fix: use a real boolean instead of the strings u'False'/u'True'
    # (the flag was only ever compared against u'True', so behavior is
    # unchanged). Becomes True once the resume point 'apres' is passed.
    modifier = False
    cat = catlib.Category(site, category)
    pages = cat.articlesList(False)
    gen = pagegenerators.NamespaceFilterPageGenerator(pages, [ns])
    for page in pagegenerators.PreloadingGenerator(gen, 100):
        if not apres or modifier:
            modification(page.title())
        elif page.title() == apres:
            modifier = True
    if recursif:
        for subcategory in cat.subcategories(recurse=True):
            pages = subcategory.articlesList(False)
            for page in pagegenerators.PreloadingGenerator(pages, 100):
                modification(page.title())
def main():
    """Entry point for the fixing_redirects script."""
    featured = False
    gen = None
    # This factory processes command line arguments shared with other
    # scripts that determine which pages to work on.
    genFactory = pagegenerators.GeneratorFactory()
    for arg in pywikibot.handleArgs():
        if arg == '-featured':
            featured = True
        else:
            genFactory.handleArg(arg)
    mysite = pywikibot.getSite()
    if mysite.sitename() == 'wikipedia:nl':
        pywikibot.output(
            u'\03{lightred}There is consensus on the Dutch Wikipedia that bots should not be used to fix redirects.\03{default}'
        )
        sys.exit()
    if featured:
        featuredList = pywikibot.translate(mysite, featured_articles)
        ref = pywikibot.Page(pywikibot.getSite(), featuredList)
        # Only main-namespace pages referring to the featured list.
        gen = pagegenerators.NamespaceFilterPageGenerator(
            pagegenerators.ReferringPageGenerator(ref), [0])
    if not gen:
        gen = genFactory.getCombinedGenerator()
    if not gen:
        pywikibot.showHelp('fixing_redirects')
        return
    for page in pagegenerators.PreloadingGenerator(gen):
        workon(page)
def main():
    """Parse image-replacement arguments and run ImageRobot."""
    oldImage = None
    newImage = None
    summary = ''
    always = False
    loose = False
    # Read command line parameters; the first bare argument is the old
    # image name, the second one the new image name.
    for arg in pywikibot.handleArgs():
        if arg == '-always':
            always = True
        elif arg == '-loose':
            loose = True
        elif arg.startswith('-summary'):
            if len(arg) == len('-summary'):
                summary = pywikibot.input(u'Choose an edit summary: ')
            else:
                summary = arg[len('-summary:'):]
        elif oldImage:
            newImage = arg
        else:
            oldImage = arg
    if not oldImage:
        pywikibot.showHelp('image')
        return
    mysite = pywikibot.getSite()
    ns = mysite.image_namespace()
    oldImagePage = pywikibot.ImagePage(mysite, ns + ':' + oldImage)
    preloadingGen = pagegenerators.PreloadingGenerator(
        pagegenerators.FileLinksGenerator(oldImagePage))
    bot = ImageRobot(preloadingGen, oldImage, newImage, summary, always,
                     loose)
    bot.run()
def main():
    """Run SFRobot over every page of the wiki, preloaded in batches."""
    mysite = wikipedia.getSite()
    preloadingGen = pagegenerators.PreloadingGenerator(mysite.allpages())
    SFRobot(preloadingGen).run()
def crawlerAll(start):
    """Apply modification() to every main-namespace page from ``start`` on."""
    all_pages = pagegenerators.AllpagesPageGenerator(start, namespace=0,
                                                    includeredirects=False)
    for current in pagegenerators.PreloadingGenerator(all_pages, 100):
        modification(current.title())
def main():
    """Build the page generator from arguments and run InlineImagesRobot."""
    gen = None
    # Words of a single page title given directly on the command line;
    # they are joined with spaces to retrieve the full title.
    titleWords = []
    # This factory processes command line arguments shared with other
    # scripts that determine which pages to work on.
    genFactory = pagegenerators.GeneratorFactory()
    for arg in pywikibot.handleArgs():
        if not genFactory.handleArg(arg):
            titleWords.append(arg)
    if titleWords:
        # Work on a single page.
        gen = iter([pywikibot.Page(pywikibot.getSite(),
                                   ' '.join(titleWords))])
    if not gen:
        gen = genFactory.getCombinedGenerator()
    if not gen:
        pywikibot.showHelp('inline_images')
    else:
        InlineImagesRobot(pagegenerators.PreloadingGenerator(gen)).run()
def main():
    """Tag unused files and leave a note on each uploader's talk page."""
    always = False
    for arg in wikipedia.handleArgs():
        if arg == '-always':
            always = True
    mysite = wikipedia.getSite()
    # Resolve the localized templates and exception text once; reuse the
    # already-fetched site object instead of calling getSite() three more
    # times (hoisted invariant work).
    template_image = wikipedia.translate(mysite, template_to_the_image)
    template_user = wikipedia.translate(mysite, template_to_the_user)
    except_text_translated = wikipedia.translate(mysite, except_text)
    basicgenerator = pagegenerators.UnusedFilesGenerator()
    generator = pagegenerators.PreloadingGenerator(basicgenerator)
    for page in generator:
        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        # Skip files carrying the exception text or any external link.
        if except_text_translated not in page.getImagePageHtml(
        ) and 'http://' not in page.get():
            wikipedia.output(u'\n' + page.title())
            appendtext(page, template_image, always)
            # Notify the most recent uploader on their talk page.
            uploader = page.getFileVersionHistory().pop()[1]
            usertalkname = u'User Talk:%s' % uploader
            usertalkpage = wikipedia.Page(mysite, usertalkname)
            msg2uploader = template_user % page.title()
            appendtext(usertalkpage, msg2uploader, always)
def __init__(self, pageToUnlink, namespaces, always):
    """Set up the unlink bot: build the generator of referring pages and
    compile the wiki-link regex for this site's link trail."""
    self.pageToUnlink = pageToUnlink
    gen = pagegenerators.ReferringPageGenerator(pageToUnlink)
    if namespaces:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    self.generator = pagegenerators.PreloadingGenerator(gen)
    linktrail = pywikibot.getSite().linktrail()
    # Link-matching regex; results consist of four named groups:
    #   title     - the target page title (everything before | or ])
    #   section   - the page section, including the '#' to make life
    #               easier for us
    #   label     - the alternative link title (between | and ])
    #   linktrail - letters after ]] which are part of the word; note
    #               that the definition of 'letter' varies from language
    #               to language.
    self.linkR = re.compile(
        r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>%s)'
        % linktrail)
    self.always = always
    self.done = False
    self.comment = i18n.twtranslate(pywikibot.getSite(),
                                    'unlink-unlinking',
                                    self.pageToUnlink.title())
def Manual_main():
    """Drive main()/Open_json over pages given via -page or a generator,
    appending any non-empty result to the zz_most_miss files."""
    wikipedia.config.put_throttle = 0
    wikipedia.put_throttle.setDelay()
    gen = None
    word = u''
    PageTitles = []
    genFactory = pagegenerators.GeneratorFactory()
    for arg in wikipedia.handleArgs():
        if arg.startswith('-page'):
            # Only the first -page argument is honoured (the loop stops).
            PageTitles.append(arg[6:])
            break
        generator = genFactory.handleArg(arg)
        if generator:
            gen = generator
    if PageTitles:
        gen = iter([wikipedia.Page(wikipedia.getSite(), title)
                    for title in PageTitles])
    if not gen:
        wikipedia.stopme()
        sys.exit()
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
    for faTitle in preloadingGen:
        wikipedia.output(u'---' + faTitle.title() + u'-----')
        result = Open_json(main(faTitle.title(), word), faTitle.title())
        if result.strip():
            wikipedia.output(result)
            with codecs.open(BotAdress_main + u'zz_most_miss_result.txt',
                             mode='a', encoding='utf8') as f:
                f.write(result.strip() + u'\n')
            with codecs.open(BotAdress_main +
                             u'zz_most_miss_result_number.txt',
                             mode='a', encoding='utf8') as f:
                f.write(result.split(u'@')[0].strip() + u'\n')
def main(): global mysite, linktrail, page start = [] for arg in wikipedia.handleArgs(): start.append(arg) if start: start = " ".join(start) else: start = "!" mysite = wikipedia.getSite() linktrail = mysite.linktrail() try: generator = pagegenerators.CategorizedPageGenerator( mysite.disambcategory(), start=start) except wikipedia.NoPage: print "The bot does not know the disambiguation category for your wiki." raise # only work on articles generator = pagegenerators.NamespaceFilterPageGenerator(generator, [0]) generator = pagegenerators.PreloadingGenerator(generator) pagestodo = [] pagestoload = [] for page in generator: if page.isRedirectPage(): continue linked = page.linkedPages() pagestodo.append((page, linked)) pagestoload += linked if len(pagestoload) > 49: wikipedia.getall(mysite, pagestoload) for page, links in pagestodo: workon(page, links) pagestoload = [] pagestodo = []
def main(*args):
    """Touch the selected pages without altering their content."""
    # Disable cosmetic changes because we don't want to modify any page
    # content, so that we don't flood the histories with minor changes.
    config.cosmetic_changes = False
    genFactory = pagegenerators.GeneratorFactory()
    redirs = False
    # Words of a single page title given directly on the command line;
    # joined with spaces to retrieve the full title.
    titleWords = []
    for arg in pywikibot.handleArgs(*args):
        if genFactory.handleArg(arg):
            continue
        if arg == '-redir':
            redirs = True
        else:
            titleWords.append(arg)
    gen = genFactory.getCombinedGenerator()
    if not gen:
        if not titleWords:
            pywikibot.showHelp()
            return
        # Work on a single page.
        gen = iter([pywikibot.Page(pywikibot.getSite(),
                                   ' '.join(titleWords))])
    TouchBot(pagegenerators.PreloadingGenerator(gen), redirs).run()
def main():
    """Parse arguments and run NoReferencesBot over the selected pages."""
    # Page generator.
    gen = None
    # Words of a single page title given directly on the command line.
    pageTitle = []
    # Which namespaces should be processed? [] means all namespaces.
    namespaces = []
    # Never ask before changing a page.
    always = False
    # This factory processes command line arguments shared with other
    # scripts that determine which pages to work on.
    genFactory = pagegenerators.GeneratorFactory()
    for arg in pywikibot.handleArgs():
        if arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = i18n.input('pywikibot-enter-xml-filename')
            else:
                xmlFilename = arg[5:]
            gen = XmlDumpNoReferencesPageGenerator(xmlFilename)
        elif arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg == '-always':
            always = True
        else:
            if not genFactory.handleArg(arg):
                pageTitle.append(arg)
    if pageTitle:
        page = pywikibot.Page(pywikibot.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        gen = genFactory.getCombinedGenerator()
    if not gen:
        site = pywikibot.getSite()
        try:
            cat = maintenance_category[site.family.name][site.lang]
        except KeyError:
            # No maintenance category known for this wiki; fall through to
            # the help text. (Fix: was a bare 'except:' that could also
            # hide unrelated errors.)
            pass
        else:
            import catlib
            if not namespaces:
                namespaces = [0]
            cat = catlib.Category(site, "%s:%s" % (site.category_namespace(),
                                                   cat))
            gen = pagegenerators.CategorizedPageGenerator(cat)
    if not gen:
        pywikibot.showHelp('noreferences')
    else:
        if namespaces:
            gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = NoReferencesBot(preloadingGen, always)
        bot.run()
def main():
    """Tag unused files and notify each uploader on their talk page."""
    global always
    always = False
    for arg in pywikibot.handleArgs():
        if arg == '-always':
            always = True
    mysite = pywikibot.getSite()
    # If anything needs to be prepared, you can do it here.
    template_image = pywikibot.translate(pywikibot.getSite(),
                                         template_to_the_image)
    template_user = pywikibot.translate(pywikibot.getSite(),
                                        template_to_the_user)
    except_text_translated = pywikibot.translate(pywikibot.getSite(),
                                                 except_text)
    generator = pagegenerators.PreloadingGenerator(
        pagegenerators.UnusedFilesGenerator())
    for page in generator:
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        # Skip files carrying the exception text or any external link.
        if except_text_translated in page.getImagePageHtml() or \
           'http://' in page.get():
            continue
        pywikibot.output(u'\n' + page.title())
        if template_image in page.get():
            pywikibot.output(u"%s done already" % page.aslink())
            continue
        appendtext(page, u"\n\n" + template_image)
        # Notify the most recent uploader on their talk page.
        uploader = page.getFileVersionHistory().pop()[1]
        usertalkpage = pywikibot.Page(mysite, u'User Talk:%s' % uploader)
        appendtext(usertalkpage, template_user % {'title': page.title()})
def Main () : site = pywikibot.getSite() d =datetime.today() datestring =d.isoformat() zipfilename="archive%s.zip" % datestring z = zipfile.ZipFile(zipfilename, "w") for x in ('Candidates_for_speedy_deletion_as_hoaxes', 'Candidates_for_speedy_deletion_as_importance_or_significance_not_asserted', 'Candidates_for_speedy_deletion_for_unspecified_reason') : cat = catlib.Category(site, x) pages = cat.articlesList(False) gen = pagegenerators.PreloadingGenerator(pages,100) for Page in gen: outfile = "PAGES/%s.txt" % Page.urlname() text= Page.get() sutf8 = text.encode('UTF-8') print outfile z.writestr(outfile,sutf8) count=0 for strings in gen.data: for string in strings: for string2 in string: count = count +1 # sutf8 = string2.encode('UTF-8') z.writestr("RawFiles/%s%d.xml" % (x,count) ,string2) z.close() push_zip(zipfilename)
def main(args):
    """Run the self-published-work pipeline: fetch -> review -> upload."""
    pywikibot.output(u'WARNING: This is an experimental bot')
    pywikibot.output(
        u'WARNING: It will only work on self published work images')
    pywikibot.output(u'WARNING: This bot is still full of bugs')
    pywikibot.output(u'WARNING: Use at your own risk!')
    autonomous = False
    checkTemplate = True
    # Load a lot of default generators.
    genFactory = pagegenerators.GeneratorFactory()
    for arg in pywikibot.handleArgs():
        if arg == '-nochecktemplate':
            checkTemplate = False
        elif arg == '-autonomous':
            autonomous = True
        else:
            genFactory.handleArg(arg)
    if not supportedSite():
        pywikibot.output(u'Sorry, this site is not supported (yet).')
        return False
    generator = genFactory.getCombinedGenerator()
    if not generator:
        raise add_text.NoEnoughData(
            'You have to specify the generator you want to use for the script!'
        )
    pregenerator = pagegenerators.PreloadingGenerator(generator)
    # Bounded queues connect the three pipeline stages.
    prefetchQueue = Queue(maxsize=50)
    uploadQueue = Queue(maxsize=200)
    imageFetcherThread = imageFetcher(pregenerator, prefetchQueue)
    userInteractionThread = userInteraction(prefetchQueue, uploadQueue)
    uploaderThread = uploader(uploadQueue)
    # Non-daemon threads so the process waits for all stages to finish.
    imageFetcherThread.daemon = False
    userInteractionThread.daemon = False
    uploaderThread.daemon = False
    if autonomous:
        pywikibot.output(
            u'Bot is running in autonomous mode. There will be no user interaction.'
        )
        userInteractionThread.setAutonomous()
    if not checkTemplate:
        pywikibot.output(
            u'No check template will be added to the uploaded files.')
        uploaderThread.nochecktemplate()
    # Fix: Thread.start() returns None, so the previous fetchDone /
    # userDone / uploadDone assignments were meaningless and are dropped.
    imageFetcherThread.start()
    userInteractionThread.start()
    uploaderThread.start()
def main():
    """Parse selflink options and run SelflinkBot."""
    # Page generator.
    gen = None
    # Words of a single page title given directly on the command line.
    pageTitle = []
    # Which namespaces should be processed? [] means all namespaces.
    namespaces = []
    # This factory processes command line arguments shared with other
    # scripts that determine which pages to work on.
    genFactory = pagegenerators.GeneratorFactory()
    always = False
    for arg in pywikibot.handleArgs():
        if arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = pywikibot.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
            gen = XmlDumpSelflinkPageGenerator(xmlFilename)
        elif arg == '-sql':
            # NOT WORKING YET
            query = """
SELECT page_namespace, page_title
FROM page
JOIN pagelinks
JOIN text
ON (page_id = pl_from
AND page_id = old_id)
WHERE pl_title = page_title
AND pl_namespace = page_namespace
AND page_namespace = 0
AND (old_text LIKE concat('%[[', page_title, ']]%')
OR old_text LIKE concat('%[[', page_title, '|%'))
LIMIT 100"""
            gen = pagegenerators.MySQLPageGenerator(query)
        elif arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg == '-always':
            always = True
        else:
            if not genFactory.handleArg(arg):
                pageTitle.append(arg)
    if pageTitle:
        page = pywikibot.Page(pywikibot.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        gen = genFactory.getCombinedGenerator()
    if not gen:
        pywikibot.showHelp('selflink')
    else:
        # Idiom fix: test list truthiness instead of comparing to [].
        if namespaces:
            gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = SelflinkBot(preloadingGen, always)
        bot.run()
def crawlerCat(category):
    """Apply modification() to every page of ``category`` and of all of
    its subcategories (recursively)."""
    cat = catlib.Category(site, category)
    pages = cat.articlesList(False)
    for page in pagegenerators.PreloadingGenerator(pages, 100):
        modification(page.title())
    # Recurse through every subcategory. (A dead commented-out debug
    # block with raw_input() prompts has been removed.)
    for subcategory in cat.subcategories(recurse=True):
        pages = subcategory.articlesList(False)
        for page in pagegenerators.PreloadingGenerator(pages, 100):
            modification(page.title())
def main():
    """Parse command line arguments and run the bot over selected pages."""
    summary_commandline, gen = None, None
    # Bug fix: 'template' was initialised to None but later .append()ed,
    # which raised AttributeError; it must be a list.
    template = []
    exceptions, PageTitles, namespaces = [], [], []
    cat = ''
    autoText, autoTitle = False, False
    genFactory = pagegenerators.GeneratorFactory()
    # If you don't want to work with arguments leave it False; if you
    # want, change it to True.
    arg = False
    if arg == False:
        for arg in wikipedia.handleArgs():
            if arg == '-autotitle':
                autoTitle = True
            elif arg == '-autotext':
                autoText = True
            elif arg.startswith('-page:'):
                if len(arg) == 6:
                    PageTitles.append(
                        wikipedia.input(u'Which page do you want to chage?'))
                else:
                    PageTitles.append(arg[6:])
            elif arg.startswith('-cat:'):
                if len(arg) == 5:
                    cat = wikipedia.input(
                        u'Which Category do you want to chage?')
                else:
                    cat = 'Category:' + arg[5:]
            elif arg.startswith('-template:'):
                if len(arg) == 10:
                    template.append(
                        wikipedia.input(u'Which Template do you want to chage?'))
                else:
                    template.append('Template:' + arg[10:])
            elif arg.startswith('-except:'):
                exceptions.append(arg[8:])
            elif arg.startswith('-namespace:'):
                namespaces.append(int(arg[11:]))
            elif arg.startswith('-ns:'):
                namespaces.append(int(arg[4:]))
            elif arg.startswith('-summary:'):
                wikipedia.setAction(arg[9:])
                summary_commandline = True
            else:
                generator = genFactory.handleArg(arg)
                if generator:
                    gen = generator
    else:
        PageTitles = [raw_input(u'Page:> ').decode('utf-8')]
    if cat != '':
        facatfalist = facatlist(cat)
        if facatfalist != False:
            run(facatfalist)
    if PageTitles:
        pages = [wikipedia.Page(faSite, PageTitle)
                 for PageTitle in PageTitles]
        gen = iter(pages)
    if not gen:
        wikipedia.stopme()
        sys.exit()
    if namespaces != []:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    # Number of pages to load at the same time.
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
    run(preloadingGen)
def main():
    """Parse arguments and run BirthCatBot."""
    # HACK: This can be removed when pywikipedia bug 3315395 has been fixed
    safetyLock = 'birthcat-unlock.dat'
    if not os.path.exists(safetyLock):
        choice = pywikibot.inputChoice(
            u'Have you patched textlib.py in pywikipedia?',
            ['Yes', 'No'], ['y', 'N'], 'N')
        if choice == 'y':
            open(safetyLock, 'w').close()
        else:
            return False
    # END OF HACK
    # This factory processes command line arguments shared with other
    # scripts that determine which pages to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # The generator gives the pages that should be worked upon.
    gen = None
    # Words of a single page title given directly on the command line.
    pageTitleParts = []
    # If dry is True, don't do any real changes, only show what would
    # have been changed.
    dry = False
    # If auto is True, run in autonomous mode.
    auto = False
    # Parse command line arguments.
    for arg in pywikibot.handleArgs():
        if arg.startswith("-dry"):
            dry = True
        elif arg.startswith("-auto"):
            auto = True
        elif not genFactory.handleArg(arg):
            # Not a standard argument like -start:XYZ or -ref:Asdf.
            pageTitleParts.append(arg)
    # Idiom fix: test list truthiness instead of comparing to [].
    if pageTitleParts:
        # We will only work on a single page.
        page = pywikibot.Page(pywikibot.getSite(), ' '.join(pageTitleParts))
        gen = iter([page])
    if not gen:
        gen = genFactory.getCombinedGenerator()
    if gen:
        # The preloading generator downloads multiple pages simultaneously.
        bot = BirthCatBot(pagegenerators.PreloadingGenerator(gen), auto, dry)
        bot.run()
    else:
        pywikibot.showHelp()
def modification(PageHS):
    """Rewrite links to redirect page ``PageHS`` on all referring pages,
    pointing them at the redirect's target, then delete the redirect when
    its title contains '/' or ' - '.

    NOTE(review): reconstructed from a collapsed one-line source; the
    nesting of the exists()/redirect branches below is the most plausible
    reading — confirm against the original file.
    """
    page = Page(site, PageHS)
    if page.exists():
        # Only work on main-namespace pages (plus the bot's test page).
        if page.namespace() != 0 and page.title(
        ) != u'Utilisateur:JackBot/test':
            return
        else:
            try:
                PageEnd = page.getRedirectTarget()
            except wikipedia.NoPage:
                print "NoPage"
                return
    gen2 = pagegenerators.ReferringPageGenerator(page)
    for PageCourante in pagegenerators.PreloadingGenerator(gen2, 100):
        print(PageCourante.title().encode(config.console_encoding, 'replace'))
        try:
            PageBegin = PageCourante.get()
        except wikipedia.NoPage:
            print "NoPage"
            return
        except wikipedia.IsRedirectPage:
            print "Redirect page"
            return
        except wikipedia.LockedPage:
            print "Locked/protected page"
            return
        except wikipedia.ServerError:
            print "ServerError"
            return
        except wikipedia.NoSuchSite:
            print "NoSuchSite"
            return
        except wikipedia.InvalidTitle:
            print "InvalidTitle"
            return
        PageTemp = PageBegin
        # Rewrite every [[PageHS]] as [[Target|PageHS]]: the slice keeps
        # the opening '[[', inserts the target title and a pipe, then
        # keeps the closing ']]'.
        while PageTemp.find(u'[[' + PageHS + u']]') != -1:
            PageTemp = PageTemp[
                0:PageTemp.find(u'[[' + PageHS + u']]') +
                2] + PageEnd.title() + u'|' + PageHS + PageTemp[
                    PageTemp.find(u'[[' + PageHS + u']]') +
                    len(u'[[' + PageHS + u']]') - 2:len(PageTemp)]
        # Rewrite every [[PageHS|label]] as [[Target|label]]: replace the
        # title part up to (but keeping) the '|'.
        while PageTemp.find(u'[[' + PageHS + u'|') != -1:
            PageTemp = PageTemp[
                0:PageTemp.find(u'[[' + PageHS + u'|') + 2] + PageEnd.title(
                ) + PageTemp[PageTemp.find(u'[[' + PageHS + u'|') +
                             len(u'[[' + PageHS + u'|') - 1:len(PageTemp)]
        # Save only if something actually changed.
        if PageTemp != PageBegin:
            sauvegarde(PageCourante, PageTemp)
    # Delete the redirect page itself for sub-pages ('/') and dashed
    # titles (' - ') once the referring pages have been fixed.
    if PageHS.find(u'/') != -1 or PageHS.find(u' - ') != -1:
        page.delete(u'Suppression après gestion des pages liées', u'',
                    throttle=True)
def main():
    """Parse reflinks options and run ReferencesRobot."""
    genFactory = pagegenerators.GeneratorFactory()
    # Fix: removed the unused 'PageTitles' local.
    xmlFilename = None
    always = False
    ignorepdf = False
    limit = None
    namespaces = []
    generator = None
    for arg in pywikibot.handleArgs():
        if arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg.startswith('-summary:'):
            pywikibot.setAction(arg[9:])
        elif arg == '-always':
            always = True
        elif arg == '-ignorepdf':
            ignorepdf = True
        elif arg.startswith('-limit:'):
            limit = int(arg[7:])
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = pywikibot.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = pywikibot.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        else:
            genFactory.handleArg(arg)
    if xmlFilename:
        # xmlStart is only bound when -xmlstart was given.
        try:
            xmlStart
        except NameError:
            xmlStart = None
        generator = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
    if not generator:
        generator = genFactory.getCombinedGenerator()
    if not generator:
        # Syntax error: show help text from the top of this file.
        pywikibot.showHelp('reflinks')
        return
    generator = pagegenerators.PreloadingGenerator(generator, pageNumber=50)
    generator = pagegenerators.RedirectFilterPageGenerator(generator)
    bot = ReferencesRobot(generator, always, limit, ignorepdf)
    bot.run()
def crawlerCat(category, recursif, apres):
    """Apply modification() to the pages of ``category`` on Commons.

    If ``apres`` is non-empty, skip pages until one titled ``apres`` has
    been seen. If ``recursif`` is true, also walk the subcategories,
    skipping audio/spoken/Wikipedia/Wikinews categories.
    """
    # Becomes u'True' once the resume point 'apres' has been passed.
    modifier = u'False'
    cat = catlib.Category(site, category)
    pages = cat.articlesList(False)
    # Namespace filtering is not applicable on Commons here.
    for current in pagegenerators.PreloadingGenerator(pages, 100):
        if not apres or apres == u'' or modifier == u'True':
            modification(current.title())
        elif current.title() == apres:
            modifier = u'True'
    if recursif == True:
        for subcategory in cat.subcategories(recurse=True):
            title = subcategory.title()
            # Skip sound files and sister-project categories.
            excluded = (title.find(u'.ogg') != -1
                        or title.find(u'spoken') != -1
                        or title.find(u'Wikipedia') != -1
                        or title.find(u'Wikinews') != -1)
            if excluded:
                continue
            pages = subcategory.articlesList(False)
            for current in pagegenerators.PreloadingGenerator(pages, 100):
                modification(current.title())