def main():
    """Entry point: build the host/network category generators and run
    IpNetworkBot over them."""
    # With -debug no real changes are written; they are only displayed.
    debug = False
    want_help = False

    for arg in wikipedia.handleArgs():
        if arg.startswith("-debug"):
            debug = True
        else:
            # Any unrecognised argument triggers the help screen.
            want_help = True

    if want_help:
        wikipedia.showHelp()
        return

    # Preloading generators download multiple pages per request.
    hosts_cat = catlib.Category(wikipedia.getSite(),
                                'Category:%s' % 'IP-Host')
    hosts_gen = pagegenerators.PreloadingGenerator(
        pagegenerators.CategorizedPageGenerator(hosts_cat, start=None,
                                                recurse=False))

    nets_cat = catlib.Category(wikipedia.getSite(),
                               'Category:%s' % 'IP-Network')
    nets_gen = pagegenerators.PreloadingGenerator(
        pagegenerators.CategorizedPageGenerator(nets_cat, start=None,
                                                recurse=False))

    IpNetworkBot(nets_gen, hosts_gen, debug).run()
def facatlist(facat):
    """Collect the titles of all fa.wiki pages under category *facat*.

    The category is expanded recursively via categorydown(); every member
    page's title is extracted from its wiki-link representation.
    Returns a list of titles, or False when nothing was found.
    """
    # Disable write throttling for the duration of the crawl.
    wikipedia.config.put_throttle = 0
    wikipedia.put_throttle.setDelay()
    count = 0
    listenpageTitle = []
    PageTitle = facat.replace(u'[[', u'').replace(u']]', u'').strip()
    language = 'fa'
    PageTitles = [PageTitle]
    for PageTitle in PageTitles:
        cat = catlib.Category(wikipedia.getSite(language), PageTitle)
        listacategory = [cat]
        # Expand into the full subcategory tree.
        listacategory = categorydown(listacategory)
        for enpageTitle in listacategory:
            enpageTitle = str(enpageTitle).split(u'|')[0].split(
                u']]')[0].replace(u'[[', u'').strip()
            cat = catlib.Category(wikipedia.getSite(language), enpageTitle)
            gent = pagegenerators.CategorizedPageGenerator(cat)
            for pagework in gent:
                count += 1
                try:
                    link = str(pagework).split(u'|')[0].split(
                        u']]')[0].replace(u'[[', u'').strip()
                except Exception:
                    # str()/unicode mixing fails on non-ASCII titles in
                    # Python 2; decode the raw bytes explicitly instead.
                    # (Was a bare except:, which also swallowed
                    # KeyboardInterrupt.)
                    pagework = unicode(str(pagework), 'UTF-8')
                    link = pagework.split(u'|')[0].split(u']]')[0].replace(
                        u'[[', u'').strip()
                wikipedia.output(link)
                fapagetitle = link
                wikipedia.output(u'adding ' + fapagetitle +
                                 u' to fapage lists')
                listenpageTitle.append(fapagetitle)
    if not listenpageTitle:
        return False
    return listenpageTitle
def setUp(self):
    """Build the test fixture: an enwiki site, two sample categories on it,
    and additional de/fr site handles."""
    self.site = pywikibot.getSite('en', 'wikipedia')
    # Two category objects used as test data by the test methods.
    self.data = [catlib.Category(self.site, 'Category:Cat1'),
                 catlib.Category(self.site, 'Category:Cat2')]
    self.site_de = pywikibot.getSite('de', 'wikipedia')
    self.site_fr = pywikibot.getSite('fr', 'wikipedia')
def move_contents(self, oldCatTitle, newCatTitle, editSummary):
    """The worker function that moves pages out of oldCat into newCat.

    Returns a (found, moved) tuple of page counts, or (None, None) when an
    unexpected error aborted the run. Retries forever on server errors.
    """
    while True:
        try:
            oldCat = catlib.Category(self.site,
                                     self.catprefix + oldCatTitle)
            newCat = catlib.Category(self.site,
                                     self.catprefix + newCatTitle)
            # oldCatLink/newCatLink are referenced by editSummary through
            # the locals() interpolation below — they are not dead stores.
            oldCatLink = oldCat.title()
            newCatLink = newCat.title()
            comment = editSummary % locals()
            # Move articles
            found, moved = 0, 0
            for result in self.query_results(list="categorymembers",
                                             cmtitle=oldCat.title(),
                                             cmprop="title|sortkey",
                                             cmlimit="max"):
                found += len(result['categorymembers'])
                for item in result['categorymembers']:
                    article = pywikibot.Page(self.site, item['title'])
                    changed = self.change_category(article, oldCat, newCat,
                                                   comment=comment)
                    if changed:
                        moved += 1
            # pass 2: look for template doc pages (namespace 10 members,
            # with "/doc" appended to each title)
            for result in self.query_results(list="categorymembers",
                                             cmtitle=oldCat.title(),
                                             cmprop="title|sortkey",
                                             cmnamespace="10",
                                             cmlimit="max"):
                for item in result['categorymembers']:
                    doc = pywikibot.Page(self.site, item['title'] + "/doc")
                    try:
                        old_text = doc.get()
                    except pywikibot.Error:
                        # No /doc subpage — nothing to recategorize.
                        continue
                    changed = self.change_category(doc, oldCat, newCat,
                                                   comment=comment)
                    if changed:
                        moved += 1
            if found:
                pywikibot.output(u"%s: %s found, %s moved"
                                 % (oldCat.title(), found, moved))
            return (found, moved)
        except pywikibot.ServerError:
            pywikibot.output(u"Server error: retrying in 5 seconds...")
            time.sleep(5)
            continue
        except KeyboardInterrupt:
            raise
        except:
            # Deliberate catch-all boundary: report failure as (None, None)
            # rather than crashing the surrounding batch run.
            return (None, None)
def run(self):
    """Walk the page generator, treating each page against the museum
    subcategory list (minus a fixed set of ignored categories)."""
    self.count = {"target": [], "done": []}
    pywikibot.setAction(self.summary)

    musecat = catlib.Category(pywikibot.getSite(), u'Category:博物館')
    # These subcategories are administrative and must not be treated.
    ignored = [catlib.Category(pywikibot.getSite(), title)
               for title in (u'Category:登録博物館',
                             u'Category:博物館相当施設',
                             u'Category:全国博物館園職員録',
                             u'Category:全国博物館総覧')]
    catlist = set(musecat.subcategoriesList()) - set(ignored)

    for page in self.generator:
        self.treat(page, catlist)
def Main(): site = pywikibot.getSite() d = datetime.today() datestring = d.isoformat() zipfilename = "archive%s.zip" % datestring z = zipfile.ZipFile(zipfilename, "w") for x in ( 'Candidates_for_speedy_deletion_as_hoaxes', 'Candidates_for_speedy_deletion_as_importance_or_significance_not_asserted', 'Candidates_for_speedy_deletion_for_unspecified_reason'): cat = catlib.Category(site, x) pages = cat.articlesList(False) gen = pagegenerators.PreloadingGenerator(pages, 100) for Page in gen: outfile = "PAGES/%s.txt" % Page.urlname() text = Page.get() sutf8 = text.encode('UTF-8') print outfile z.writestr(outfile, sutf8) count = 0 for strings in gen.data: for string in strings: for string2 in string: count = count + 1 # sutf8 = string2.encode('UTF-8') z.writestr("RawFiles/%s%d.xml" % (x, count), string2) z.close() push_zip(zipfilename)
def main():
    """Entry point for the noreferences bot.

    Parses command-line arguments, builds a page generator (XML dump,
    explicit title, generator-factory args, or the per-wiki maintenance
    category as a fallback) and runs NoReferencesBot over it.
    """
    # page generator
    gen = None
    # This temporary array is used to read the page title if one single
    # page to work on is specified by the arguments.
    pageTitle = []
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    # Never ask before changing a page
    always = False
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    for arg in pywikibot.handleArgs():
        if arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = i18n.input('pywikibot-enter-xml-filename')
            else:
                xmlFilename = arg[5:]
            gen = XmlDumpNoReferencesPageGenerator(xmlFilename)
        elif arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                # Non-numeric namespace names are passed through as-is.
                namespaces.append(arg[11:])
        elif arg == '-always':
            always = True
        else:
            if not genFactory.handleArg(arg):
                pageTitle.append(arg)
    if pageTitle:
        page = pywikibot.Page(pywikibot.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        gen = genFactory.getCombinedGenerator()
    if not gen:
        site = pywikibot.getSite()
        try:
            cat = maintenance_category[site.family.name][site.lang]
        except KeyError:
            # Was a bare except: — only a missing family/lang entry is
            # expected here; anything else should propagate.
            pass
        else:
            import catlib
            if not namespaces:
                namespaces = [0]
            cat = catlib.Category(site, "%s:%s" % (site.category_namespace(),
                                                   cat))
            gen = pagegenerators.CategorizedPageGenerator(cat)
    if not gen:
        pywikibot.showHelp('noreferences')
    else:
        if namespaces:
            gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = NoReferencesBot(preloadingGen, always)
        bot.run()
def getCategoryLinks(text, site=None):
    """Return a list of category links found in text.

    List contains Category objects.
    Do not call this routine directly, use Page.categories() instead.
    """
    # The docstring must be the first statement: in the original the
    # import preceded it, silently demoting it to a no-op string literal.
    import catlib
    result = []
    if site is None:
        site = pywikibot.getSite()
    # Ignore category links within nowiki tags, pre tags, includeonly tags,
    # and HTML comments
    text = removeDisabledParts(text)
    catNamespace = '|'.join(site.category_namespaces())
    R = re.compile(
        r'\[\[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)'
        r'(?:\|(?P<sortKey>.+?))?\s*\]\]' % catNamespace, re.I)
    for match in R.finditer(text):
        cat = catlib.Category(
            site,
            '%s:%s' % (match.group('namespace'), match.group('catName')),
            sortKey=match.group('sortKey'))
        result.append(cat)
    return result
def main(args):
    """Main loop: parse options, pick a page generator and categorize
    all images it yields on Commons."""
    global search_wikis
    global hint_wiki

    only_filter = False
    only_uncat = False
    gen_factory = pagegenerators.GeneratorFactory()
    site = pywikibot.getSite(u'commons', u'commons')
    pywikibot.setSite(site)

    for arg in pywikibot.handleArgs():
        if arg == '-onlyfilter':
            only_filter = True
        elif arg == '-onlyuncat':
            only_uncat = True
        elif arg.startswith('-hint:'):
            hint_wiki = arg[len('-hint:'):]
        elif arg.startswith('-onlyhint'):
            search_wikis = arg[len('-onlyhint:'):]
        else:
            gen_factory.handleArg(arg)

    generator = gen_factory.getCombinedGenerator()
    if not generator:
        # Default work queue: everything still awaiting categorization.
        generator = pagegenerators.CategorizedPageGenerator(
            catlib.Category(site, u'Category:Media needing categories'),
            recurse=True)

    initLists()
    categorizeImages(generator, only_filter, only_uncat)
    pywikibot.output(u'All done')
def main():
    ''' Tutorial entry point: list and count the articles of one category. '''
    site = wikipedia.Site("pt", "wikipedia")  # the site is pt.wikipedia
    '''De seguida, definimos a categoria Ambiente e obtemos a listagem dos
    títulos dos artigos. Na demonstração o código está por extenso para mais
    fácil percepção, na prática, bastaria
    pages = catlib.Category(site, u"Ambiente").articles()
    para se obter a listagem '''
    # Define the "Ambiente" (Environment) category and fetch its articles.
    cat = catlib.Category(site, u"Ambiente")
    catList = cat.articlesList()
    '''Agora que temos uma listagem, e antes de contar os elementos, vamos
    ver os títulos que constam na catList. Esta abordagem serve bem para
    ilustrar este exemplo, caso fosse para interagir directamente com os
    artigos, como veremos noutro post, há abordagens mais eficientes.
    O primeiro print, ou seja, no caso o objecto page, é um objecto python,
    enquanto que o segundo print, o do page.title(), já tem o formato de
    unicode. '''
    # First print shows the raw Page object, second the unicode title.
    for page in catList:
        print u"página (objecto):", page
        print u"Título da página: ", page.title()
    ''' Por fim, fazemos a contagem dos artigos '''
    # Finally, count the articles.
    print u"\n Nº de artigos na categoria: ", len(catList)
def createlist(cat, wpproj, raw=False, cats=True): category = catlib.Category(site, cat) gen = pagegenerators.CategorizedPageGenerator(category, recurse=True) wikitext = '' wikitext2 = '' wikitext3 = '' if not cats: for page in gen: wikitext = wikitext + '\n*' + str(page) link = delink(str(page)) print link wikitext2 = wikitext2 + '\n' + link wikitext = unicodify(wikitext) if cats: subcats = category.subcategories(recurse=True) for subcat in subcats: newtext = retpages(subcat) wikitext3 += newtext wikitext3 = unicodify(wikitext3) page = wikipedia.Page(site, wpproj + '/Articles') if not cats: page.put(wikitext, 'Updating watchlist') if cats: page.put(wikitext3, 'Updating watchlist') wikitext2 = '<pre>\n' + wikitext2 + '\n</pre>' wikitext2 = unicodify(wikitext2) if raw == True: page = wikipedia.Page(site, wpproj + '/Articles/raw') page.put(wikitext2, 'Updating raw watchlist')
def main(): # The generator gives the pages that should be worked upon. gen = None # If debug is True, doesn't do any real changes, but only show # what would have been changed. debug = False wantHelp = False cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % 'Instance') gen = pagegenerators.CategorizedPageGenerator(cat, start=None, recurse=False) # Parse command line arguments for arg in wikipedia.handleArgs(): if arg.startswith("-debug"): debug = True else: print arg, "yielding wanthelp" wantHelp = True if not wantHelp: # The preloading generator is responsible for downloading multiple # pages from the wiki simultaneously. gen = pagegenerators.PreloadingGenerator(gen) bot = PuppetUnmanagedListBot(gen, debug) bot.run() else: wikipedia.showHelp()
def filterCountries(categories):
    '''
    Try to filter out ...by country categories.
    First make a list of any ...by country categories and try to find
    some countries. If a by country category has a subcategory containing
    one of the countries found, add it. The ...by country categories
    remain in the set and should be filtered out by filterParents.
    Returns a deduplicated list; the input list is left untouched.
    '''
    # Copy the input: the original aliased it ('result = categories') and
    # then appended, mutating the caller's list as a side effect.
    result = list(categories)
    listByCountry = []
    listCountries = []
    for cat in categories:
        if cat.endswith(u'by country'):
            # If cat contains 'by country' add it to the list
            listByCountry.append(cat)
        else:
            # If cat contains the name of a country add it to the list
            for country in countries:
                if country in cat:
                    listCountries.append(country)
    if len(listByCountry) > 0:
        for bc in listByCountry:
            category = catlib.Category(wikipedia.getSite(),
                                       u'Category:' + bc)
            for subcategory in category.subcategories():
                for country in listCountries:
                    if subcategory.titleWithoutNamespace().endswith(country):
                        result.append(subcategory.titleWithoutNamespace())
    return list(set(result))
def UpdateRepoCats(*args):
    """Sync repository categories on every page in Category:All add-ons."""
    site = pywikibot.getSite()
    # Download every repo's addon listing once, up front.
    soups = importAllAddonXML()
    # All pages in Category:All add-ons, preloaded in batches of 100.
    cat = catlib.Category(site, u'Category:All add-ons')
    pages = cat.articlesList(False)
    allRepoCats = repoCatList(site)
    for Page in pagegenerators.PreloadingGenerator(pages, 100):
        # The addon id is read from the |ID= template parameter.
        match = re.search("\|ID=([a-zA-Z0-9_\.\-]+)", Page.get())
        if not match:
            pywikibot.output("Can't find addon_id for %s, skipping it..."
                             % Page.title())
            continue
        addon_id = match.group(1)
        pywikibot.output("Identifying Repos for %s." % addon_id)
        # See which repos actually carry this addon, then fix categories.
        repos = checkInRepo(addon_id, soups)
        addRemoveRepoCats(Page, repos, allRepoCats)
def getSDTitles(site):
    """Return url-encoded titles of pages in the deletion categories that
    isNewTitle() has not seen before."""
    category_names = (
        'Declined_AfC_submissions',
        "Proposed_deletion",
        "Expired_proposed_deletions",
        'Candidates_for_speedy_deletion_as_hoaxes',
        'Candidates_for_speedy_deletion_as_importance_or_significance_not_asserted',
        'Candidates_for_speedy_deletion_as_lacking_context',
        'Candidates for speedy deletion as pages previously deleted via deletion discussion',
        'Contested candidates for speedy deletion',
        'Speedy_deletion_candidates_with_talk_pages',
        'Candidates_for_speedy_deletion_as_duplicating_an_existing_topic',
        'Candidates_for_speedy_deletion_for_unspecified_reason',
    )
    titles = []
    for name in category_names:
        cat = catlib.Category(site, name)
        for page in cat.articlesList(recurse=True):
            n = page.urlname()
            # isNewTitle() is keyed on the ASCII projection of the title.
            if isNewTitle(n.encode("ascii", "ignore")):
                titles.append(n)
    return titles
def test_titles_Category(self):
    """'titles' accepts a catlib.Category object and resolves it to the
    category page's title in the query result."""
    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': ['ids', 'timestamp', 'user'],
        'rvdir': 'newer',
        'rvlimit': 1,
        # A Category object (not a string) is passed as the title on purpose.
        'titles': [catlib.Category(self.site, u'Category:Categories')],
    }
    # First revision of [[Category:Categories]] on the fixture site.
    expectedresult = {
        u'pages': {
            u'794823': {
                u'ns': 14,
                u'pageid': 794823,
                u'revisions': [{
                    u'revid': 4494485,
                    u'user': u'SEWilco',
                    u'timestamp': u'2004-07-07T18:45:50Z',
                }],
                u'title': u'Category:Categories',
            },
        }
    }
    self.assertEqualQueryResult(params, expectedresult)
def run(self):
    """Move self.oldCat to self.newCatTitle: build the edit summary, copy
    the category page contents, and move the talk page along with it."""
    site = pywikibot.getSite()
    newCat = catlib.Category(site, self.newCatTitle)
    # set edit summary message
    if not self.editSummary:
        try:
            self.editSummary = pywikibot.translate(
                site, msg_change) % (self.oldCat.title(), newCat.title())
        except TypeError:
            # The localized message takes only one %s placeholder.
            self.editSummary = pywikibot.translate(
                site, msg_change) % self.oldCat.title()
    # Copy the category contents to the new category page
    copied = False
    oldMovedTalk = None
    if self.oldCat.exists() and self.moveCatPage:
        copied = self.oldCat.copyAndKeep(
            self.newCatTitle, pywikibot.translate(site, cfd_templates))
    # Also move the talk page
    if copied:
        reason = pywikibot.translate(site, deletion_reason_move) \
            % (self.newCatTitle, self.newCatTitle)
        oldTalk = self.oldCat.toggleTalkPage()
        if oldTalk.exists():
            newTalkTitle = newCat.toggleTalkPage().title()
            try:
                talkMoved = oldTalk.move(newTalkTitle, reason)
            except (pywikibot.NoPage, pywikibot.PageNotSaved), e:
                # in order :
                # Source talk does not exist, or
                # Target talk already exists
                pywikibot.output(e.message)
            else:
                if talkMoved:
                    # Remembered so the caller can clean up the redirect.
                    oldMovedTalk = oldTalk
def main():
    """Sort TOL categories on Commons for each category page produced by
    the command-line arguments."""
    wikipedia.output(u'Testing 1 2 3')
    generator = None
    genFactory = pagegenerators.GeneratorFactory()
    site = wikipedia.getSite(u'commons', u'commons')
    wikipedia.setSite(site)
    for arg in wikipedia.handleArgs():
        if arg.startswith('-page'):
            if len(arg) == 5:
                # Bare '-page': prompt interactively for the title.
                generator = [
                    wikipedia.Page(
                        site,
                        wikipedia.input(u'What page do you want to use?'))
                ]
            else:
                generator = [wikipedia.Page(site, arg[6:])]
        else:
            # NOTE(review): GeneratorFactory.handleArg() conventionally
            # returns a bool, not a generator — this assignment likely
            # leaves 'generator' as True/False; confirm against this
            # pywikipedia version before relying on the else branch.
            generator = genFactory.handleArg(arg)
    if generator:
        for page in generator:
            # Only category pages (namespace 14) are sorted.
            if (page.namespace() == 14):
                sort_TOL_Category(catlib.Category(site, page.title()))
    else:
        wikipedia.output(u'No categories to work on!')
def CategoryGenerator(generator):
    """Wrap another generator, re-yielding each page as a Category object.

    Makes sense only if it is ascertained that only categories are being
    retrieved by the wrapped generator.
    """
    for item in generator:
        yield catlib.Category(item.site(), item.title())
def __init__(self):
    """Configure the bot: target category, recursion, cutoff time and the
    category-tag text it searches pages for."""
    self.site = pywikibot.getSite()
    # Tracking category holding the reported usernames.
    self.cat = catlib.Category(
        self.site, 'Category:Wikipedia usernames with possible policy issues')
    self.recurse = False
    # Cutoff one week in the past (days=-7 subtracts seven days).
    self.run_time = datetime.datetime.now() + datetime.timedelta(days=-7)
    # Literal category tag whose presence marks a page as already tagged.
    self.sentinel_text = '[[Category:Wikipedia usernames with possible policy issues|{{PAGENAME}}]]'
def docat(tag, cat2): site = wikipedia.getSite() cat = catlib.Category(site, cat2) gen = pagegenerators.CategorizedPageGenerator(cat) dogen(gen, tag) wikipedia.output(u'\nFinished with Category:' + cat2 + '.\n') print 'Waiting 10 seconds' time.sleep(10)
def CAT(site, name, hide):
    """Yield the articles of category *name*, split around sortkey *hide*.

    Articles sorted up to *hide* come first; when *hide* is set, iteration
    resumes from the next sortkey character, so entries filed exactly
    under *hide* are skipped.
    """
    full_name = site.namespace(14) + ':' + name
    cat = catlib.Category(site, full_name)
    for article in cat.articles(endsort=hide):
        yield article
    if hide:
        resume_from = unichr(ord(hide) + 1)
        for article in cat.articles(startFrom=resume_from):
            yield article
def addRemoveRepoCats(article, repos, allRepoCats, comment=None): # Create list of repos to be removed notRepos = [] if not article.canBeEdited(): pywikibot.output("Can't edit %s, skipping it..." % article.aslink()) return False cats = article.categories(get_redirect=True) site = article.site() changesMade = False newCatList = [] newCatSet = set() repoCatList = [] #remove all repos for i in range(len(cats)): cat = cats[i] if cat in allRepoCats: changesMade = True continue if cat.title() not in newCatSet: newCatSet.add(cat.title()) newCatList.append(cat) #add relevant repos for i in range(len(repos)): repo = repos[i] newCatList.append(catlib.Category(site, 'Category:' + repoCats[repo])) changesMade = True if not changesMade: pywikibot.output(u'No changes necessary to %s!' % article.title()) else: text = article.get(get_redirect=True) try: text = pywikibot.replaceCategoryLinks(text, newCatList) except ValueError: # Make sure that the only way replaceCategoryLinks() can return # a ValueError is in the case of interwiki links to self. pywikibot.output(u'Skipping %s because of interwiki link to self' % article) try: article.put(text, comment='Addon-Bot repo category update', watchArticle=None, minorEdit=True) except pywikibot.EditConflict: pywikibot.output(u'Skipping %s because of edit conflict' % article.title()) except pywikibot.SpamfilterError, e: pywikibot.output(u'Skipping %s because of blacklist entry %s' % (article.title(), e.url)) except pywikibot.LockedPage: pywikibot.output(u'Skipping %s because page is locked' % article.title())
def category2testset(site, categoryName="Automated tests"):
    """Scan a category on the given pywikipedia site for valid extension
    tests, and generate a testset (a list of Test objects)."""
    cat = catlib.Category(site, categoryName)
    tests = []
    for page in cat.articles():
        tests.extend(article2testset(page))
    return tests
def categories(self):
    """For each generated category page, add a {{commonscat}} template if a
    matching Commons category exists and no commons/sisterlinks template
    is already present."""
    for page in self.generator:
        try:
            wikipedia.output(u'\n>>>> %s <<<<' % page.title())
            getCommons = wikipedia.getSite('commons', 'commons')
            commonsCategory = catlib.Category(getCommons,
                                              'Category:%s' % page.title())
            try:
                # Raises NoPage if the category is missing on Commons.
                getcommonscat = commonsCategory.get(get_redirect=True)
                commonsCategoryTitle = commonsCategory.title()
                categoryname = commonsCategoryTitle.split('Category:', 1)[1]
                # Only tag when local and Commons category names match.
                if page.title() == categoryname:
                    oldText = page.get()
                    text = oldText
                    # for commonscat template
                    findTemplate = re.compile(ur'\{\{[Cc]ommons')
                    s = findTemplate.search(text)
                    findTemplate2 = re.compile(ur'\{\{[Ss]isterlinks')
                    s2 = findTemplate2.search(text)
                    if s or s2:
                        wikipedia.output(u'** Already done.')
                    else:
                        # Append the template just before the category links.
                        text = wikipedia.replaceCategoryLinks(
                            text + u'{{commonscat|%s}}' % categoryname,
                            page.categories())
                        if oldText != text:
                            wikipedia.showDiff(oldText, text)
                            if not self.acceptall:
                                choice = wikipedia.inputChoice(
                                    u'Do you want to accept these changes?',
                                    ['Yes', 'No', 'All'],
                                    ['y', 'N', 'a'], 'N')
                                if choice == 'a':
                                    self.acceptall = True
                            # NOTE: 'choice' is only bound when acceptall was
                            # False; short-circuit evaluation keeps this safe.
                            if self.acceptall or choice == 'y':
                                try:
                                    msg = wikipedia.translate(
                                        wikipedia.getSite(), comment2)
                                    page.put(text, msg)
                                except wikipedia.EditConflict:
                                    wikipedia.output(
                                        u'Skipping %s because of edit conflict'
                                        % (page.title()))
            except wikipedia.NoPage:
                wikipedia.output(u'Category does not exist in Commons!')
        except wikipedia.NoPage:
            wikipedia.output(u'Page %s does not exist?!' % page.title())
        except wikipedia.IsRedirectPage:
            wikipedia.output(u'Page %s is a redirect; skipping.'
                             % page.title())
        except wikipedia.LockedPage:
            wikipedia.output(u'Page %s is locked?!' % page.title())
def pagesInCat(site, categoryname, recurse): # Return pages in this category as a list, or die. try: category = catlib.Category(site, categoryname) pages = category.articlesList(recurse=recurse) except Exception: traceback.print_exc() print "Error, stopping." exit() return pages
def filterCategory(page):
    """Loop over all subcategories of *page* and filter them.

    catlib.Category(page) doesn't work (FIXME upstream), so the Category
    is rebuilt from the page's site and title.
    """
    category = catlib.Category(page.site(), page.title())
    for subcat in category.subcategories():
        filterSubCategory(subcat, category)
def crawlerCat(category):
    """Run modification() on every page of *category* and of all its
    subcategories (recursively), preloading pages in batches of 100."""
    cat = catlib.Category(site, category)
    # Top-level articles first.
    articles = cat.articlesList(False)
    for page in pagegenerators.PreloadingGenerator(articles, 100):
        modification(page.title())
    # Then every article in the recursive subcategory tree.
    for subcategory in cat.subcategories(recurse=True):
        sub_articles = subcategory.articlesList(False)
        for page in pagegenerators.PreloadingGenerator(sub_articles, 100):
            modification(page.title())
def __init__(self, catTitle, listTitle, editSummary, overwrite=False,
             showImages=False, subCats=False, talkPages=False,
             recurse=False):
    """Set up a category-listing job.

    catTitle: category (without namespace prefix) whose members are listed.
    listTitle: title of the page that receives the listing.
    editSummary: summary used when saving the list page.
    overwrite: replace an existing list page instead of refusing.
    showImages: render entries as images rather than plain links.
    subCats: include subcategories in the listing.
    talkPages: also list the corresponding talk pages.
    recurse: descend into subcategories when collecting members.
    """
    self.editSummary = editSummary
    self.overwrite = overwrite
    self.showImages = showImages
    self.site = pywikibot.getSite()
    self.cat = catlib.Category(self.site, 'Category:' + catTitle)
    self.list = pywikibot.Page(self.site, listTitle)
    self.subCats = subCats
    self.talkPages = talkPages
    self.recurse = recurse
def __init__(self):
    """Initialize the bot on the default site.

    Arguments: none yet.
    """
    self.mySite = wikipedia.getSite()
    # Localized candidates-for-speedy-deletion category for this site.
    self.csdCat = catlib.Category(
        self.mySite, wikipedia.translate(self.mySite, self.csd_cat))
    # Progress bookmark and page preloader, filled in later by run().
    self.savedProgress = None
    self.preloadingGen = None