def main(): """Main function.""" fg_colors = [col for col in colors if col != 'default'] bg_colors = fg_colors[:] n_fg_colors = len(fg_colors) fg_colors.insert(3 * int(n_fg_colors / 4), 'default') fg_colors.insert(2 * int(n_fg_colors / 4), 'default') fg_colors.insert(int(n_fg_colors / 4), 'default') fg_colors.insert(0, 'default') # Max len of color names for padding. max_len_fg_colors = len(max(fg_colors, key=len)) max_len_bc_color = len(max(bg_colors, key=len)) for bg_col in bg_colors: # Three lines per each backgoung color. for fg_col_group in itergroup(fg_colors, n_fg_colors / 4 + 1): line = '' for fg_col in fg_col_group: line += ' ' line += color_format('{color}{0}{default}', fg_col.ljust(max_len_fg_colors), color='%s;%s' % (fg_col, bg_col)) line = '{0} {1}'.format(bg_col.ljust(max_len_bc_color), line) pywikibot.output(line) pywikibot.output('')
def apiquery(self, alllinks):
    output = {}
    for links in itergroup(alllinks, 50):
        query = api.Request(site=self.siteSource, action='query',
                            prop='langlinks', titles=links, redirects='',
                            lllang=self.siteDest.code, lllimit=500)
        results = query.submit()
        if 'query-continue' in results:
            raise Exception('should not get query-continue')
        if 'query' not in results:
            continue
        results = results['query']
        redirects = DefaultDict()
        normalized = DefaultDict()
        if 'pages' not in results:
            continue
        if 'redirects' in results:
            redirects = DefaultDict((item['to'], item['from'])
                                    for item in results['redirects'])
        if 'normalized' in results:
            normalized = DefaultDict((item['to'], item['from'])
                                     for item in results['normalized'])
        results = results['pages']
        for pageid in results:
            if int(pageid) < 0:
                continue
            pagedata = results[pageid]
            if 'langlinks' not in pagedata:
                continue
            output[normalized[redirects[pagedata['title']]]] = \
                pagedata['langlinks'][0]['*']
    return output
def PageRevIdGenerator(site, pagelist, step=50):
    """
    Generate page objects with their most recent revision ID.

    This generator is a modified version of `preloadpages` in pywikibot.site.

    :param site: site we're requesting page IDs from
    :param pagelist: an iterable that returns Page objects
    :param step: how many Pages to query at a time
    :type step: int
    """
    for sublist in itergroup(pagelist, step):
        pageids = [str(p._pageid) for p in sublist
                   if hasattr(p, "_pageid") and p._pageid > 0]
        cache = dict((p.title(withSection=False), p) for p in sublist)
        props = "revisions|info|categoryinfo"
        rvgen = api.PropertyGenerator(props, site=site)
        rvgen.set_maximum_items(-1)  # suppress use of "rvlimit" parameter
        if len(pageids) == len(sublist):
            # only use pageids if all pages have them
            rvgen.request["pageids"] = "|".join(pageids)
        else:
            rvgen.request["titles"] = "|".join(list(cache.keys()))
        rvgen.request[u"rvprop"] = u"ids|flags|timestamp|user|comment"
        logging.debug(u"Retrieving {n} pages from {s}.".format(n=len(cache),
                                                               s=site))
        for pagedata in rvgen:
            logging.debug(u"Preloading {0}".format(pagedata))
            try:
                if pagedata['title'] not in cache:
                    # API always returns a "normalized" title which is
                    # usually the same as the canonical form returned by
                    # page.title(), but sometimes not (e.g.,
                    # gender-specific localizations of "User" namespace).
                    # This checks to see if there is a normalized title in
                    # the response that corresponds to the canonical form
                    # used in the query.
                    for key in cache:
                        if site.sametitle(key, pagedata['title']):
                            cache[pagedata['title']] = cache[key]
                            break
                    else:
                        logging.warning(
                            u"preloadpages: Query returned unexpected title "
                            u"'%s'" % pagedata['title'])
                        continue
            except KeyError:
                logging.debug(u"No 'title' in %s" % pagedata)
                logging.debug(u"pageids=%s" % pageids)
                logging.debug(u"titles=%s" % list(cache.keys()))
                continue
            page = cache[pagedata['title']]
            api.update_page(page, pagedata)

        # Since we're not loading content and the pages are already in
        # memory, let's yield the pages in the same order as they were
        # received in case that's important.
        for page in sublist:
            yield page
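# A minimal usage sketch for PageRevIdGenerator above (not part of the original
# module). The site and page titles are assumptions for illustration; the
# generator itself relies on the module-level `api` and `logging` imports used above.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
pages = [pywikibot.Page(site, title)
         for title in ('Python (programming language)', 'MediaWiki')]
for page in PageRevIdGenerator(site, pages, step=50):
    # latestRevision() is filled in by the preloading done above.
    print(page.title(), page.latestRevision())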
def _pagesexist(self, allpages):
    out = []
    for pages in itergroup(allpages, 500):
        text = self.parse("\n".join(
            ["* {{PAGESIZE:%s|R}}" % page.title() for page in pages]))
        for i, ps in enumerate(lre.pats["site_APISite_li"].findall(text)):
            out.append((pages[i], int(ps) != 0))
    return out
def check_titles(site, report_page_name, replacements):
    """
    To avoid breaking links, add page titles that would be changed to the exception list.

    :param site: site where the bot will run
    :param report_page_name: name of the page where the excluded titles are reported
    :param replacements: dictionary of replacements
    """
    from pywikibot import textlib
    from pywikibot.tools import itergroup
    all_pages = site.allpages(namespace=0, filterredir=False, content=False)
    evaluation_progress = 0
    exceptions_dict = {}
    for titles_group in itergroup(all_pages, all_pages.query_limit):
        titles_group_t = [p.title(asLink=True) for p in titles_group]
        old_titles = titles_group_t
        evaluation_progress += len(titles_group_t)
        if evaluation_progress % 20000 == 0:
            print('\r%i page titles processed' % evaluation_progress)
        old_text = ' \n '.join(titles_group_t)
        for replacement_key, replacement in replacements.items():
            replacement_exceptions = replacement.exceptions or {}
            replacement_exceptions_inside = replacement_exceptions.get('inside', [])
            new_text = textlib.replaceExcept(
                old_text, replacement.old_regex, replacement.new,
                replacement_exceptions_inside, site=site)
            # The replacement changes a valid title.
            changed_titles = (
                (old_title, new_title)
                for old_title, new_title in zip(old_titles,
                                                new_text.split(' \n '))
                if old_title != new_title and
                old_title != '[[%s' % pywikibot.tools.first_upper(new_title[2:]))
            # The replacement also changes the bare title (no special
            # treatment for links), so applying it would break the link.
            changed_titles = (
                (old_title, new_title)
                for old_title, new_title in changed_titles
                if replacement.old_regex.sub(replacement.new,
                                             ' %s ' % old_title[2:-2]) !=
                ' %s ' % old_title[2:-2])
            # Keep valid titles that are not disambiguation pages.
            changed_titles = [
                old_title[2:-2] for old_title, new_title in changed_titles
                if not pywikibot.Page(site, old_title[2:-2]).isDisambig()]
            if len(changed_titles) > 0:
                replacement_exceptions['inside'] = replacement_exceptions_inside + \
                    [re.compile(re.escape(title), re.U) for title in changed_titles]
                replacement.exceptions = replacement_exceptions
                if replacement_key not in exceptions_dict:
                    exceptions_dict[replacement_key] = []
                exceptions_dict[replacement_key] += changed_titles

    exceptions_dict = OrderedDict(
        sorted((int(k), v) for k, v in exceptions_dict.items()))
    report_page = pywikibot.Page(site, report_page_name)
    exception_report = ''
    for replace_key, replaced_titles in exceptions_dict.items():
        exception_report += '\n* %i\n%s' % (
            replace_key,
            '\n'.join(['** [[%s]]' % t for t in replaced_titles]))
    report_page.put(exception_report, summary='עדכון')  # Hebrew for "update"
def main():
    namespaces = [x for x in range(1, 16, 2) if x not in [3, 5]]
    for ns in namespaces:
        gen = site.allpages(namespace=ns, filterredir=True)
        for i in gen:
            pywikibot.output("deleting " + i.title())
            # Edit summary is Thai for "Robot: unnecessary redirect page".
            i.delete(reason=u"โรบอต: หน้าเปลี่ยนทางไม่จำเป็น", prompt=False)
    for ns in namespaces:
        pywikibot.output("ns " + str(ns))
        gen = site.allpages(namespace=ns, content=True)
        for i, pages in enumerate(itergroup(gen, 5000)):
            pywikibot.output("processing bunch %d" % i)
            process(pages)
def preload_entities(self, pagelist, groupsize=50):
    """
    Yield subclasses of WikibasePage with content prefilled.

    Note that pages will be iterated in a different order
    than in the underlying pagelist.

    @param pagelist: an iterable that yields either WikibasePage objects,
        or Page objects linked to an ItemPage.
    @param groupsize: how many pages to query at a time
    @type groupsize: int
    """
    if not hasattr(self, '_entity_namespaces'):
        self._cache_entity_namespaces()
    for sublist in itergroup(pagelist, groupsize):
        req = {'ids': [], 'titles': [], 'sites': []}
        for p in sublist:
            if isinstance(p, pywikibot.page.WikibasePage):
                ident = p._defined_by()
                for key in ident:
                    req[key].append(ident[key])
            else:
                if p.site == self and p.namespace() in (
                        self._entity_namespaces.values()):
                    req['ids'].append(p.title(with_ns=False))
                else:
                    assert p.site.has_data_repository, \
                        'Site must have a data repository'
                    req['sites'].append(p.site.dbName())
                    req['titles'].append(p._link._text)

        req = self._simple_request(action='wbgetentities', **req)
        data = req.submit()
        for entity in data['entities']:
            if 'missing' in data['entities'][entity]:
                continue
            cls = self._type_to_class[data['entities'][entity]['type']]
            page = cls(self, entity)
            # No api call is made because item._content is given
            page._content = data['entities'][entity]
            with suppress(pywikibot.IsRedirectPage):
                page.get()  # cannot provide get_redirect=True (T145971)
            yield page
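# A hedged usage sketch for preload_entities above (not from the original
# source). It assumes a Wikidata-backed site and two example item IDs; on
# pywikibot's DataSite the method is reached via site.data_repository().
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
repo = site.data_repository()
items = [pywikibot.ItemPage(repo, qid) for qid in ('Q42', 'Q1')]
for item in repo.preload_entities(items, groupsize=50):
    # item._content is already filled, so this get() makes no extra API call.
    data = item.get()
    print(item.id, data['labels'].get('en'))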
def check_titles(site, report_page_name, replacements):
    """
    To avoid breaking links, add page titles that would be changed to the exception list.

    :param site: site where the bot will run
    :param report_page_name: name of the page where the excluded titles are reported
    :param replacements: dictionary of replacements
    """
    from pywikibot import textlib
    from pywikibot.tools import itergroup
    all_pages = site.allpages(namespace=0, filterredir=False, content=False)
    evaluation_progress = 0
    exceptions_dict = {}
    for titles_group in itergroup(all_pages, all_pages.query_limit):
        titles_group_t = [p.title(as_link=True, with_section=False)
                          for p in titles_group]
        old_titles = titles_group_t
        evaluation_progress += len(titles_group_t)
        if evaluation_progress % 20000 == 0:
            print('\r%i page titles processed' % evaluation_progress)
        old_text = ' \n '.join(titles_group_t)
        for replacement_key, replacement in replacements.items():
            replacement_exceptions = replacement.exceptions or {}
            replacement_exceptions_inside = replacement_exceptions.get('inside', [])
            new_text = textlib.replaceExcept(old_text, replacement.old_regex,
                                             replacement.new,
                                             replacement_exceptions_inside,
                                             site=site)
            # The replacement changes a valid title.
            changed_titles = (
                (old_title, new_title)
                for old_title, new_title in zip(old_titles,
                                                new_text.split(' \n '))
                if old_title != new_title and
                old_title != '[[%s' % pywikibot.tools.first_upper(new_title[2:]))
            # The replacement also changes the bare title (no special
            # treatment for links), so applying it would break the link.
            changed_titles = (
                (old_title, new_title)
                for old_title, new_title in changed_titles
                if replacement.old_regex.sub(replacement.new,
                                             ' %s ' % old_title[2:-2]) !=
                ' %s ' % old_title[2:-2])
            # Keep valid titles that are not disambiguation pages.
            changed_titles = [
                old_title[2:-2] for old_title, new_title in changed_titles
                if not pywikibot.Page(site, old_title[2:-2]).isDisambig()]
            if len(changed_titles) > 0:
                # changed_titles_exceptions = [re.compile(re.escape(title), re.U)
                #                              for title in changed_titles]
                changed_titles_exceptions = [
                    re.compile(r'\[\[%s\|.+?\]\]|%s'
                               % (re.escape(title), re.escape(title)), re.U)
                    for title in changed_titles]
                replacement_exceptions['inside'] = \
                    replacement_exceptions_inside + changed_titles_exceptions
                replacement.exceptions = replacement_exceptions
                if replacement_key not in exceptions_dict:
                    exceptions_dict[replacement_key] = []
                exceptions_dict[replacement_key] += changed_titles

    exceptions_dict = OrderedDict(
        sorted((int(k), v) for k, v in exceptions_dict.items()))
    report_page = pywikibot.Page(site, report_page_name)
    exception_report = ''
    for replace_key, replaced_titles in exceptions_dict.items():
        exception_report += '\n* %i\n%s' % (
            replace_key,
            '\n'.join(['** [[%s]]' % t for t in replaced_titles]))
    report_page.put(exception_report, summary='עדכון')  # Hebrew for "update"
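# A hedged usage sketch for check_titles above (not part of the original
# script). The Replacement class below is a stand-in assumption: the function
# only needs objects exposing old_regex, new and exceptions, and a replacements
# dict whose keys can be converted with int(). The report page name and the
# colour/color regex are example values only.
import re

import pywikibot


class Replacement(object):
    """Minimal stand-in for the replacement objects check_titles expects."""

    def __init__(self, old_regex, new, exceptions=None):
        self.old_regex = old_regex
        self.new = new
        self.exceptions = exceptions


site = pywikibot.Site('en', 'wikipedia')
replacements = {'1': Replacement(re.compile(r'\bcolour\b'), 'color')}
check_titles(site, 'User:ExampleBot/title exceptions', replacements)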
def PredictionGenerator(site, pages, step=50):
    '''
    Generate pages with quality predictions.

    :param site: site of the pages we are predicting for
    :type site: pywikibot.Site
    :param pages: List of pages we are predicting.
    :type pages: list of pywikibot.Page
    :param step: Number of pages to get predictions for at a time,
                 maximum is 50.
    :type step: int
    '''
    # looks like the best way to do this is to first make one
    # API request to update the pages with the current revision ID,
    # then make one ORES request to get the predictions.
    if step > 50:
        step = 50

    langcode = '{lang}wiki'.format(lang=site.lang)

    # example ORES URL predicting ratings for multiple revisions:
    # https://ores.wmflabs.org/v2/scores/enwiki/wp10/?revids=703654757%7C714153013%7C713916222%7C691301429%7C704638887%7C619467163
    # sub "%7C" with "|"

    # pywikibot.tools.itergroup splits up the list of pages
    for page_group in itergroup(pages, step):
        revid_page_map = {}  # rev id (str) -> page object
        # we use the generator to efficiently load most recent rev id
        for page in PageRevIdGenerator(site, page_group):
            revid_page_map[str(page.latestRevision())] = page

        # make a request to score the revisions
        url = '{ores_url}{langcode}/wp10/?revids={revids}'.format(
            ores_url=config.ORES_url,
            langcode=langcode,
            revids='|'.join([str(page.latestRevision())
                             for page in page_group]))
        logging.debug('Requesting predictions for {n} pages from ORES'.format(
            n=len(revid_page_map)))

        num_attempts = 0
        while num_attempts < config.max_url_attempts:
            r = requests.get(url,
                             headers={'User-Agent': config.http_user_agent,
                                      'From': config.http_from})
            num_attempts += 1
            if r.status_code == 200:
                try:
                    response = r.json()
                    revid_pred_map = response['scores'][langcode]['wp10']['scores']
                    # iterate over returned predictions and update
                    for revid, score_data in revid_pred_map.items():
                        revid_page_map[revid].set_prediction(
                            score_data['prediction'].lower())
                    break
                except ValueError:
                    logging.warning("Unable to decode ORES response as JSON")
                except KeyError:
                    logging.warning("ORES response keys not as expected")

            # something didn't go right, let's wait and try again
            sleep(500)
        for page in page_group:
            yield page
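# A hedged usage sketch for PredictionGenerator above (not part of the original
# module). The generator expects page objects that implement set_prediction()
# and a config module providing ORES_url, max_url_attempts, http_user_agent and
# http_from; the RatedPage subclass and the titles below are assumptions.
import pywikibot


class RatedPage(pywikibot.Page):
    """Minimal page subclass that can store an ORES prediction."""

    def set_prediction(self, prediction):
        self.prediction = prediction


site = pywikibot.Site('en', 'wikipedia')
pages = [RatedPage(site, title)
         for title in ('Python (programming language)', 'MediaWiki')]
for page in PredictionGenerator(site, pages, step=50):
    print(page.title(), getattr(page, 'prediction', None))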
def main():
    exlist = [exc.group(1)
              for exc in lre.pats["exc"].finditer(wp.Page(conf.pageConf).get())]
    pages1, pages2, pages3 = [], [], []
    if not args:
        pywikibot.output("quickscan mode")
        t = site.getcurrenttime()
        if t.day == 1:
            if t.month == 1:
                t = pywikibot.Timestamp(year=t.year - 1, month=12, day=31)
            else:
                t = pywikibot.Timestamp(year=t.year, month=t.month - 1, day=28)
        else:
            t = pywikibot.Timestamp(year=t.year, month=t.month, day=t.day - 1)
        gen1 = site.recentchanges(start=t, reverse=True, showRedirects=False,
                                  showBot=False, changetype=["new", "edit"],
                                  namespaces=conf.namespaces)
        pages1 = [page["title"] for page in gen1]
        gen2 = site.logevents(start=t, reverse=True, logtype="move")
        pages2 = [page.new_title().title() for page in gen2]
    elif args[0] == "-all":
        pywikibot.output("fullscan mode")
        gen3 = ()
        for i in conf.namespaces:
            gen3 = itertools.chain(gen3, site.allpages(filterredir=False,
                                                       start=u"ก", namespace=i))
        pages3 = [page.title() for page in gen3]
        pywikibot.output("load all!")
    else:
        # Fallback test page (a Thai category title).
        pages1 = [u"หมวดหมู่:ชาววิกิพีเดียรักองค์โสมฯ"]
        pywikibot.output("unknown argument")

    # Keep only titles that start with a Thai character.
    allpages = list(set(filter(lambda x: (ord(u"ก") <= ord(x[0]) <= ord(u"๛")),
                               pages1 + pages2 + pages3)))
    datasite = site.data_repository()
    cnti = 0
    pywikibot.output("processing %d pages" % len(allpages))
    for check in conf.checklist:
        if check["detectFromTitle"] is None:
            check["detectFromTitle"] = "[]"  # dummy string which is invalid as a title
        for checkClaim in check["claims"]:
            checkClaim["nameItem"] = pywikibot.ItemPage(datasite,
                                                        checkClaim["nameItem"])
            if checkClaim["refItem"] is not None:
                checkClaim["refItem"] = pywikibot.ItemPage(datasite,
                                                           checkClaim["refItem"])
    for pages in itergroup(allpages, 100):
        cnti += 1
        pywikibot.output("round %d" % cnti)
        dat = datasite.loadcontent({"sites": site.dbName(),
                                    "titles": "|".join(pages)})
        for i, qitem in enumerate(dat):
            pywikibot.output("item %d: %s" % (i, qitem))
            if not qitem.lower().startswith("q"):
                continue
            item = pywikibot.ItemPage(datasite, qitem)
            item._content = dat[qitem]
            super(pywikibot.ItemPage, item).get()  # For getting labels
            data = item.get()
            editdict = {}
            page = wp.Page(item.getSitelink(site))
            if page.title() in exlist:
                continue
            for check in conf.checklist:
                passCriteria = False
                description = None
                if check["detectFromTitle"] in page.title():
                    passCriteria = True
                if check["detectFromNamespace"] == page.namespace():
                    passCriteria = True
                passAllItem = True
                for claimCheck in check["claims"]:
                    passItem = False
                    if claimCheck["name"] in data["claims"]:
                        for claim in data["claims"][claimCheck["name"]]:
                            if claim.getTarget() == claimCheck["nameItem"]:
                                passItem = True
                                break
                    if not passItem:
                        passAllItem = False
                        if passCriteria:
                            claim = pywikibot.Claim(datasite, claimCheck["name"])
                            claim.setTarget(claimCheck["nameItem"])
                            item.addClaim(claim)
                            if claimCheck["ref"] is not None:
                                claim2 = pywikibot.Claim(datasite, claimCheck["ref"])
                                claim2.setTarget(claimCheck["refItem"])
                                claim.addSource(claim2)
                            pywikibot.output("added claim!")
                passCriteria = passCriteria or passAllItem
                if (description is None) and passCriteria:
                    description = check["description"]
                if passCriteria:
                    break
            oldlabels = None
            if "th" in data["labels"]:
                oldlabels = data["labels"]["th"]
            labels = lre.pats["rmdisam"].sub("", page.title())
            if not lre.pats["thai"].search(labels):
                continue
            if labels != oldlabels:
                pywikibot.output("old label: " + unicode(oldlabels))
                pywikibot.output("new label: " + unicode(labels))
                editdict["labels"] = labels
            if passCriteria and (
                    ("th" in data["descriptions"] and
                     data["descriptions"]["th"] != description) or
                    ("th" not in data["descriptions"])):
                editdict["descriptions"] = description
            out = transform(editdict)
            if not out:
                continue
            pywikibot.output("item: " + qitem)
            pywikibot.output("title: " + page.title())
            try:
                # raw_input("prompt: ...")
                item.editEntity(out)
            except:
                wp.error()