def isPeople(self, article):
    """Return True if *article* is categorized (directly, or via up to two
    levels of parent categories) under any category whose title contains
    "People".

    Follows a redirect once before inspecting categories, then walks the
    category graph breadth-first, bounded at depth 2.

    :param article: page title, as ``str`` or UTF-8 encoded ``bytes``.
    :return: ``bool``
    """
    # Accept both bytes and str titles; the original unconditionally
    # called .decode("utf8"), which breaks on str under Python 3.
    if isinstance(article, bytes):
        article = article.decode("utf8")
    site = Site("en")
    page = Page(site, article)
    if page.isRedirectPage():
        page = page.getRedirectTarget()
    # Sets replace the original {cat: 1, ...} dicts used only for membership.
    cats = set(page.categories())
    if any("People" in cat.title() for cat in cats):
        print(cats)
        return True
    currcats = set(cats)
    allcats = set()
    depth = 0
    # Breadth-first walk of parent categories, at most 2 levels up.
    while currcats and depth < 2:
        depth += 1
        newcats = set()
        for cat in currcats:
            if cat in allcats:
                continue
            allcats.add(cat)
            parentcats = set(cat.categories())
            if any("People" in parent.title() for parent in parentcats):
                print(parentcats)
                return True
            # Queue parents not seen yet; set union makes the
            # "not already in newcats" check implicit.
            newcats.update(parentcats - allcats)
        currcats = newcats
        print(len(currcats), currcats)
    return False
def task(self):
    """Sweep the lemma list: protect qualifying pages, collect two report
    lists (protected pages and "RE:Platzhalter" pages), and write both
    reports to user subpages.

    :return: ``True`` on completion (bot-framework convention).
    """
    list_platzhalter = []
    list_protected = []
    lemma_list = self.get_list()
    for idx, item in enumerate(lemma_list):
        lemma = Page(self.wiki, item["title"])
        # NOTE(review): protect() is applied to pages for which
        # is_protected() returns True — confirm the predicate's semantics
        # (it may mean "should be protected" rather than "already is").
        if self.is_protected(lemma):
            list_protected.append(lemma.title())
            lemma.protect(
                protections={
                    "edit": "autoconfirmed",
                    "move": "autoconfirmed",
                },
                reason="is now common",
            )
        # Loop variable renamed: the original comprehension reused ``item``
        # and shadowed the outer loop variable.
        categories = [cat.title() for cat in lemma.categories()]
        if "Kategorie:RE:Platzhalter" in categories:
            list_platzhalter.append(lemma.title())
        self.logger.info(
            f"{idx}/{len(lemma_list)} prot: {len(list_protected)}, "
            f"plat: {len(list_platzhalter)} {lemma.title()}"
        )
    # Publish both reports to the bot's user subpages.
    page_protected = Page(self.wiki, "Benutzer:THE IT/RE/Arthur Stein/protected")
    page_protected.text = self.join_lists(list_protected)
    page_protected.save()
    page_platzhalter = Page(self.wiki, "Benutzer:THE IT/RE/Arthur Stein/platzhalter")
    page_platzhalter.text = self.join_lists(list_platzhalter)
    page_platzhalter.save()
    return True
def getCategories(self, article):
    """Return the filtered list of visible category titles for *article*.

    Results are cached one file per article under
    ``articleCategoriesCache/``: on a cache hit the titles are read from
    disk; otherwise they are fetched from en.wikipedia (following one
    redirect), filtered, appended to the cache file, and returned.

    :param article: page title, as ``str`` or UTF-8 encoded ``bytes``.
    :return: sorted list of category title strings.
    """
    baseDir = "articleCategoriesCache/"
    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(baseDir, exist_ok=True)
    # Accept both bytes and str titles; the original mixed the two
    # (str for the cache path, bytes.decode for pywikibot).
    if isinstance(article, bytes):
        article = article.decode("utf8")
    fname = baseDir + article
    if os.path.isfile(fname):
        try:
            with codecs.open(fname, encoding="utf-8") as f:
                lines = [line.strip() for line in f.readlines()]
        except UnicodeDecodeError:
            # Older cache entries may predate the UTF-8 convention;
            # fall back to the platform default encoding.
            with codecs.open(fname) as f:
                lines = [line.strip() for line in f.readlines()]
        lines = self.filterCategories(lines)
        if lines:
            return lines
    site = Site("en")
    page = Page(site, article)
    if page.isRedirectPage():
        page = page.getRedirectTarget()
    cats = sorted(
        cat.title()
        for cat in page.categories()
        if not cat.isHiddenCategory()
    )
    cats = self.filterCategories(cats)
    # Single UTF-8 append replaces the original try/except pair whose
    # fallback wrote bytes to a text-mode stream (a Python 3 TypeError).
    with codecs.open(fname, "a+", encoding="utf-8") as f:
        f.write("".join(cat + "\n" for cat in cats))
    return cats