def filter(self, articles):
    out = odict()
    for article_key, article in articles.iteritems():
        if article_key in self.pages:
            out[article_key] = article
    log(" [+] Applying page link filter (%s): %d -> %d" % (','.join(self.pages), len(articles), len(out)))
    return out
def __init__(self, verbose, sites, articles):
    """
    Arguments:
        sites    : dict { 'no': <mwclient.client.Site>, ... }
        articles : list of article names
    """
    Filter.__init__(self, verbose)
    self.sites = sites
    self.articles = articles
    self.links = []
    log(" [+] Initializing backlink filter: " + ','.join(self.articles))
    for site_key, site in self.sites.iteritems():
        for aname in self.articles:
            aname2 = aname
            kv = aname.split(':', 1)
            # If the article name carries a two-letter language prefix
            # (e.g. "no:Page"), only follow it on the matching site
            if len(kv) == 2 and len(kv[0]) == 2:
                if kv[0] != site_key:
                    continue
                else:
                    aname2 = kv[1]
            p = site.pages[aname2]
            if p.exists:
                for link in p.links(redirects=True):
                    self.links.append(site_key + ':' + link.name)
                for link in p.iwlinks():
                    self.links.append(link[0] + ':' + link[1].replace('_', ' '))
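# The corresponding filter method is not part of this excerpt. A minimal
# sketch, assuming the article keys use the same 'site:Title' form as the
# entries collected in self.links above (an assumption, not the actual
# implementation):
def filter(self, articles):
    out = odict()
    for article_key, article in articles.iteritems():
        if article_key in self.links:
            out[article_key] = article
    log(" [+] Applying backlink filter: %d -> %d" % (len(articles), len(out)))
    return out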
def filter(self, articles):
    out = odict()
    for article_key, article in articles.iteritems():
        if article.bytes >= self.bytelimit:
            out[article_key] = article
    log(" [+] Applying byte filter (%d bytes): %d -> %d" % (self.bytelimit, len(articles), len(out)))
    return out
def filter(self, articles):
    log(" [+] Applying new page filter")
    out = odict()
    for a, aa in articles.iteritems():
        if aa.new and not aa.redirect:
            out[a] = aa
    return out
def filter(self, articles):
    out = odict()
    for aname, article in articles.iteritems():
        if not article.new:
            out[aname] = article
    log(" [+] Applying existing page filter: %d -> %d" % (len(articles), len(out)))
    return out
def filter(self, articles):
    out = odict()
    for a, aa in articles.iteritems():
        if not self.redirects and aa.new_non_redirect:
            out[a] = aa
        elif self.redirects and aa.new:
            out[a] = aa
    log(" [+] Applying new page filter: %d -> %d" % (len(articles), len(out)))
    return out
def __init__(self, verbose, sites, catnames, maxdepth=4, ignore=[]):
    """
    Arguments:
        sites    : dict { 'no': <mwclient.client.Site>, ... }
        catnames : list of category names
        maxdepth : number of subcategory levels to traverse
    """
    Filter.__init__(self, verbose)
    self.ignore = ignore
    self.sites = sites
    self.include = [c[c.rfind(':') + 1:] for c in catnames]
    self.maxdepth = int(maxdepth)
    if self.verbose:
        log(" CatFilter: %s" % (" OR ".join(self.include)))
def __init__(self, verbose, sites, catnames, maxdepth=5, ignore=[]):
    """
    Arguments:
        sites    : dict { 'no': <mwclient.client.Site>, ... }
        catnames : list of category names
        maxdepth : number of subcategory levels to traverse
        ignore   : list of categories to ignore
    """
    Filter.__init__(self, verbose)
    self.ignore = ignore
    self.sites = sites
    self.include = catnames
    self.maxdepth = int(maxdepth)
    if self.verbose:
        log(" CatFilter: %s, maxdepth=%d" % (" OR ".join(self.include), maxdepth))
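# A hypothetical usage sketch, run only when the module is executed
# directly. The site hosts, category names and ignore pattern are all
# illustrative assumptions, not taken from the bot's configuration:
if __name__ == '__main__':
    demo_sites = {'no': mwclient.Site('no.wikipedia.org'),
                  'nn': mwclient.Site('nn.wikipedia.org')}
    demo_filter = CatFilter(True, demo_sites, ['Fugler', 'Fuglar'],
                            maxdepth=3, ignore=['Wikipedia.*'])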
def filter(self, articles, debug=False):

    cats, parents = self.fetchcats(articles, debug=debug)

    out = odict()

    # Loop over the articles and see if any of their categories match
    for article_key, article_cats in cats.iteritems():
        article = articles[article_key]
        if debug:
            log(">>> %s" % article.name, newline=False)
            for l, ca in enumerate(article_cats):
                log('[%d] %s' % (l, ', '.join(ca)), newline=False)

        catname = self.check_article_cats(article_cats)
        if catname:

            # Add the category path to the article object, so we can
            # later see how the article matched
            article.cat_path = [catname]
            try:
                i = 0
                aname = article.site.key + ':' + article.name
                while not catname == aname:
                    if not parents[article_key][catname] == aname:
                        article.cat_path.append(parents[article_key][catname])
                    catname = parents[article_key][catname]
                    i += 1
                    if i > 50:
                        raise CategoryLoopError(article.cat_path)
            except CategoryLoopError as e:
                article.errors.append(_('Encountered an infinite category loop: ')
                                      + ' → '.join(['[[:%(catname)s|]]' % {'catname': c} for c in e.catpath]))

            out[article_key] = article

    return out
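# check_article_cats is called above but not shown in this excerpt. A
# minimal sketch, assuming it returns the first discovered category that
# matches one of the names in self.include (an assumption about its
# behavior, not the actual implementation):
def check_article_cats(self, article_cats):
    for level_cats in article_cats:
        for cat in level_cats:
            if cat in self.include:
                return cat
    return None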
def filter(self, articles):
    out = odict()
    for article_key, article in articles.iteritems():
        firstrevid = article.revisions.firstkey()
        firstrev = article.revisions[firstrevid]
        try:
            # Check if the first revision already contains a stub template
            t = self.has_template(firstrev.parenttext)
            if t:
                if self.verbose:
                    log(' Found template {{%s}} in [[%s]] @ %d' % (t, article_key, firstrevid))
                out[article_key] = article
        except DanmicholoParseError as e:
            log(" >> DanmicholoParser failed to parse " + article_key)
            args = {'article': article_key, 'prevrev': firstrev.parentid,
                    'rev': firstrevid, 'error': e.msg}
            article.site.errors.append(_('Could not analyze the article %(article)s because one of the revisions %(prevrev)d or %(rev)d could not be parsed: %(error)s') % args)
    log(" [+] Applying template filter: %d -> %d" % (len(articles), len(out)))
    return out
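# has_template is called above but not shown in this excerpt. A minimal
# regex-based sketch, assuming self.templates holds the template names
# to match; the real implementation presumably parses the wikitext with
# DanmicholoParser rather than a regex:
def has_template(self, text):
    for tpl in self.templates:
        if re.search(r'\{\{\s*' + re.escape(tpl), text, re.I):
            return tpl
    return None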
def test(self, rev):
    imgs0 = self.get_images(rev.parenttext)
    imgs1 = self.get_images(rev.text)
    imgs_added = set(imgs1).difference(set(imgs0))
    counters = {'ownwork': [], 'own': [], 'other': []}
    for filename in imgs_added:
        filename = urllib.unquote(filename)
        image = rev.article.site.images[filename]
        imageinfo = image.imageinfo
        if len(imageinfo) > 0:  # seems like image.exists only checks locally
            try:
                uploader = imageinfo['user']
            except KeyError:
                log("ERR: Could not locate user for file '%s' in rev. %s " % (filename, rev.revid))
                continue
            log("- File '%s' uploaded by '%s', revision made by '%s'" % (filename, uploader, rev.username))
            if uploader == rev.username:
                # Own image: check the Credit field to see whether it is marked as own work
                credit = ''
                extrainfo = rev.article.site.api('query', prop='imageinfo',
                                                 titles=u'File:{}'.format(filename),
                                                 iiprop='extmetadata')
                try:
                    credit = extrainfo['query']['pages']['-1']['imageinfo'][0]['extmetadata']['Credit']['value']
                except KeyError:
                    pass
                if re.search('int-own-work', credit, re.I):
                    counters['ownwork'].append(filename)
                else:
                    counters['own'].append(filename)
            else:
                counters['other'].append(filename)
        else:
            log("- File '%s' does not exist" % filename)

    # If maxinitialcount is 0, only the first image counts.
    # If a user adds both an own image and an image by someone else,
    # we should make sure to credit the own image, not the other.
    # We therefore process the own images first (files that could not
    # be resolved above are skipped).
    imgs_added = counters['ownwork'] + counters['own'] + counters['other']
    total_added = len(imgs_added)
    self.totalimages += total_added
    revpoints = 0
    for n, img in enumerate(imgs_added):
        if len(imgs0) + n <= self.maxinitialcount:
            if img in counters['ownwork']:
                revpoints += self.ownwork
            elif img in counters['own']:
                revpoints += self.own
            else:
                revpoints += self.points
    if revpoints > 0:
        self.add_points(rev, revpoints, 'image',
                        '%d %s' % (total_added, _('images') if total_added > 1 else _('image')),
                        self.maxpoints)
def test(self, rev):
    imgs0 = self.get_images(rev.parenttext)
    imgs1 = self.get_images(rev.text)
    imgs_added = set(imgs1).difference(set(imgs0))
    own_imgs_added = []
    others_imgs_added = []
    for filename in imgs_added:
        image = rev.article.site.images[filename]
        imageinfo = image.imageinfo
        if len(imageinfo) > 0:  # seems like image.exists only checks locally
            try:
                uploader = imageinfo['user']
            except KeyError:
                log("ERR: Could not locate user for file '%s' in rev. %s " % (filename, rev.revid))
                continue
            log("- File '%s' uploaded by '%s', revision made by '%s'" % (filename, uploader, rev.username))
            if uploader == rev.username:
                own_imgs_added.append(filename)
            else:
                others_imgs_added.append(filename)
        else:
            log("- File '%s' does not exist" % filename)

    # If maxinitialcount is 0, only the first image counts.
    # If a user adds both an own image and an image by someone else,
    # we should make sure to credit the own image, not the other.
    # We therefore process the own images first.
    imgs_added = own_imgs_added + others_imgs_added
    self.totalimages += len(imgs_added)
    revpoints = 0
    for n, img in enumerate(imgs_added):
        if len(imgs0) + n <= self.maxinitialcount:
            if img in own_imgs_added:
                revpoints += self.own
            else:
                revpoints += self.points
    if revpoints > 0:
        self.add_points(rev, revpoints, 'image',
                        '%d %s' % (len(imgs_added), _('images') if len(imgs_added) > 1 else _('image')),
                        self.maxpoints)
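# get_images is called by both variants above but not shown in this
# excerpt. A minimal regex-based sketch; the file namespace aliases are
# an assumption, and the real implementation likely covers more cases:
def get_images(self, txt):
    return [m.group(2).strip() for m in
            re.finditer(r'\[\[(File|Image|Fil|Bilde):([^|\]]+)', txt, re.I)]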
def fetchcats(self, articles, debug=False):
    """ Fetches categories and overcategories for a set of articles """

    # Make a list of the categories of a given article, with one list for each level
    #  > cats[article_key][level] = [cat1, cat2, ...]
    cats = {p: [[] for n in range(self.maxdepth)] for p in articles}

    # Also, for each article, keep a list of category parents, so we can build
    # a path along the category tree from any matched category to the article
    #  > parents[article_key][category] = parent_category
    #
    # Example:
    #                             /- cat2
    #                /- cat1 ----<
    #  no:giraffe --<             \- ...
    #                \- ...
    #
    #    parents['no:giraffe']['cat2'] = 'cat1'
    #    parents['no:giraffe']['cat1'] = 'giraffe'
    #
    # We could also build full category trees for each article from the available
    # information, but they can grow quite big and be slow to search.
    parents = {p: {} for p in articles}

    for site_key, site in self.sites.iteritems():

        # Bot accounts get higher API limits
        if 'bot' in site.rights:
            requestlimit = 500
            returnlimit = 5000
        else:
            requestlimit = 50
            returnlimit = 500

        # Titles of articles that belong to this site
        titles = [article.name for article in articles.itervalues() if article.site.key == site_key]
        log(' [' + site_key + ':' + str(len(titles)) + ']', newline=False)

        if len(titles) > 0:
            for level in range(self.maxdepth):

                titles0 = copy(titles)
                titles = []  # make a new list of titles to search
                nc = 0
                nnc = 0

                for s0 in range(0, len(titles0), requestlimit):
                    if debug:
                        print
                        print "[%d] > Getting %d to %d of %d" % (level, s0, s0 + requestlimit, len(titles0))
                    ids = '|'.join(titles0[s0:s0 + requestlimit])

                    cont = True
                    clcont = ''
                    while cont:
                        if clcont != '':
                            q = site.api('query', prop='categories', titles=ids, cllimit=returnlimit, clcontinue=clcont)
                        else:
                            q = site.api('query', prop='categories', titles=ids, cllimit=returnlimit)

                        if 'warnings' in q:
                            raise StandardError(q['warnings']['query']['*'])

                        for pageid, page in q['query']['pages'].iteritems():
                            fulltitle = page['title']
                            shorttitle = fulltitle.split(':', 1)[-1]
                            article_key = site_key + ':' + fulltitle
                            if 'categories' in page:
                                for cat in page['categories']:
                                    cat_title = cat['title']
                                    cat_short = cat_title.split(':', 1)[1]
                                    follow = True
                                    for d in self.ignore:
                                        if re.search(d, cat_short):
                                            if self.verbose:
                                                log(' - Ignore: "%s" matched "%s"' % (cat_title, d))
                                            follow = False
                                    if follow:
                                        nc += 1
                                        titles.append(cat_title)
                                        if level == 0:
                                            cats[article_key][level].append(cat_short)
                                            parents[article_key][cat_short] = fulltitle
                                        else:
                                            for article_key, ccc in cats.iteritems():
                                                if shorttitle in ccc[level - 1]:
                                                    ccc[level].append(cat_short)
                                                    parents[article_key][cat_short] = shorttitle
                                    else:
                                        nnc += 1

                        if 'query-continue' in q:
                            clcont = q['query-continue']['categories']['clcontinue']
                        else:
                            cont = False

                titles = list(set(titles))  # remove duplicates (not order preserving)

    return cats, parents
    return delta.total_seconds()


_ = init_localization(config['locale'])

runstart = server_tz.localize(datetime.now())

host = config['homesite']
homesite = mwclient.Site(host)

now = server_tz.localize(datetime.now())
if args.page is not None:
    ktitle = args.page.decode('utf-8')
else:
    log(' No page specified. Using default page')
    ktitle = config['pages']['default']
    # Subtract a few hours, so we close last week's contest right after midnight
    w = Week.withdate((now - timedelta(hours=3)).astimezone(wiki_tz).date())
    ktitle = ktitle % {'year': w.year, 'week': w.week}

# Is ktitle a redirect? If so, resolve it
log('@ ktitle is %s' % ktitle)
pp = homesite.api('query', prop='pageprops', titles=ktitle, redirects='1')
if 'redirects' in pp['query']:
    ktitle = pp['query']['redirects'][0]['to']
    log(' -> Redirected to: %s' % ktitle)
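# The redirect lookup above could be factored into a small helper. A
# hypothetical sketch (the name resolve_redirect is not from the bot):
def resolve_redirect(site, title):
    res = site.api('query', prop='pageprops', titles=title, redirects='1')
    if 'redirects' in res['query']:
        return res['query']['redirects'][0]['to']
    return title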