Esempio n. 1
0
 def filter(self, articles):
     """Keep only the articles whose key is listed in ``self.pages``.

     Returns a new ``odict`` holding the surviving entries in their original
     iteration order, and logs a one-line before/after summary.
     """
     kept = odict()
     for key, candidate in articles.iteritems():
         if key not in self.pages:
             continue
         kept[key] = candidate
     summary = (','.join(self.pages), len(articles), len(kept))
     log("  [+] Applying page link filter (%s): %d -> %d" % summary)
     return kept
Esempio n. 2
0
 def filter(self, articles):
     """Discard articles flagged as new, keeping only the others.

     Returns a new ``odict``; entries keep the order of ``articles``.
     """
     log("  [+] Applying existing page filter")
     existing = odict()
     for key, candidate in articles.iteritems():
         if candidate.new:
             continue
         existing[key] = candidate
     return existing
Esempio n. 3
0
    def __init__(self, verbose, sites, articles):
        """
        Collect the targets of every link found on the given pages.

        The result is stored in self.links as 'prefix:title' strings.

        Arguments:
            verbose   : forwarded to Filter.__init__
            sites     : dict { 'no': <mwclient.client.Site>, ... }
            articles  : list of article names; a name may carry a two-letter
                        site prefix ('no:Title'), in which case it is only
                        looked up on the site with that key
        """
        Filter.__init__(self, verbose)
        self.sites = sites
        self.articles = articles
        self.links = []
        log("  [+] Initializing backlink filter: " + ','.join(self.articles))

        for site_key, site in self.sites.iteritems():
            for full_name in self.articles:
                title = full_name
                parts = full_name.split(':', 1)
                has_site_prefix = len(parts) == 2 and len(parts[0]) == 2
                if has_site_prefix:
                    # A prefixed name belongs to exactly one site
                    if parts[0] != site_key:
                        continue
                    title = parts[1]

                page = site.pages[title]
                if not page.exists:
                    continue

                # Links within the same site
                self.links.extend(site_key + ':' + target.name
                                  for target in page.links(redirects=True))
                # Interwiki links: (prefix, title) pairs, underscores -> spaces
                self.links.extend(iw[0] + ':' + iw[1].replace('_', ' ')
                                  for iw in page.iwlinks())
Esempio n. 4
0
 def filter(self, articles):
     """Keep the articles whose ``bytes`` value is at least ``self.bytelimit``.

     Returns a new ``odict`` and logs a one-line before/after summary.
     """
     kept = odict()
     for key, candidate in articles.iteritems():
         if candidate.bytes < self.bytelimit:
             continue
         kept[key] = candidate
     summary = (self.bytelimit, len(articles), len(kept))
     log("  [+] Applying byte filter (%d bytes): %d -> %d" % summary)
     return kept
Esempio n. 5
0
 def filter(self, articles):
     """Keep the articles that are flagged as new and are not redirects.

     Returns a new ``odict``; entries keep the order of ``articles``.
     """
     log("  [+] Applying new page filter")
     fresh = odict()
     for key, candidate in articles.iteritems():
         if not candidate.new or candidate.redirect:
             continue
         fresh[key] = candidate
     return fresh
Esempio n. 6
0
 def filter(self, articles):
     """Discard articles flagged as new, keeping only the others.

     Logs once before filtering and once afterwards with the before/after
     counts. Returns a new ``odict``.
     """
     log("  [+] Applying existing page filter")
     existing = odict()
     for key, candidate in articles.iteritems():
         if candidate.new:
             continue
         existing[key] = candidate
     counts = (len(articles), len(existing))
     log("  [+] Applying existing page filter: %d -> %d" % counts)
     return existing
Esempio n. 7
0
 def filter(self, articles):
     """Keep newly created articles.

     When ``self.redirects`` is set, an article qualifies if its ``new``
     flag is set; otherwise its ``new_non_redirect`` flag decides.
     Returns a new ``odict`` and logs the before/after counts.
     """
     log("  [+] Applying new page filter")
     fresh = odict()
     for key, candidate in articles.iteritems():
         if self.redirects:
             qualifies = candidate.new
         else:
             qualifies = candidate.new_non_redirect
         if qualifies:
             fresh[key] = candidate
     counts = (len(articles), len(fresh))
     log("  [+] Applying new page filter: %d -> %d" % counts)
     return fresh
Esempio n. 8
0
    def __init__(self, verbose, sites, catnames, maxdepth=4, ignore=None):
        """
        Arguments:
            verbose   : forwarded to Filter.__init__
            sites     : dict { 'no': <mwclient.client.Site>, ... }
            catnames  : list of category names; everything up to and
                        including the last ':' is stripped from each name
            maxdepth  : number of subcategory levels to traverse
            ignore    : optional list stored as self.ignore; a fresh empty
                        list is used when omitted
        """
        Filter.__init__(self, verbose)

        # A literal [] default would be one list object shared by every
        # instance created without `ignore`; build a new one per instance.
        self.ignore = [] if ignore is None else ignore
        self.sites = sites
        self.include = [c[c.rfind(':')+1:] for c in catnames]
        self.maxdepth = int(maxdepth)
        if self.verbose:
            log("  CatFilter: %s" % (" OR ".join(self.include)))
Esempio n. 9
0
    def __init__(self, verbose, sites, catnames, maxdepth=5, ignore=None):
        """
        Arguments:
            verbose   : forwarded to Filter.__init__
            sites     : dict { 'no': <mwclient.client.Site>, ... }
            catnames  : list of category names
            maxdepth  : number of subcategory levels to traverse
                        (anything accepted by int())
            ignore    : list of categories to ignore; a fresh empty list is
                        used when omitted
        """
        Filter.__init__(self, verbose)

        # A literal [] default would be one list object shared by every
        # instance created without `ignore`; build a new one per instance.
        self.ignore = [] if ignore is None else ignore
        self.sites = sites
        self.include = catnames
        self.maxdepth = int(maxdepth)
        if self.verbose:
            # Format the converted value: the raw argument may be a string,
            # which '%d' would reject with a TypeError.
            log("  CatFilter: %s, maxdepth=%d" % (" OR ".join(self.include), self.maxdepth))
Esempio n. 10
0
    def filter(self, articles, debug=False):
        """
        Keep the articles for which check_article_cats() reports a match.

        Every kept article gets a `cat_path` attribute: the matched category
        followed by the chain of parents leading back towards the article,
        as recorded by fetchcats(). If the chain is still unresolved after
        50 steps, a message is appended to `article.errors` and the article
        is kept anyway.

        Arguments:
            articles  : mapping of article key -> article object
            debug     : forwarded to fetchcats(); also logs the categories
                        found per level for each article

        Returns a new odict with the matching articles.
        """
        log("  [+] Applying category filter", newline=False)

        cats, parents = self.fetchcats(articles, debug=debug)

        matched = odict()

        for article_key, article_cats in cats.iteritems():
            article = articles[article_key]
            if debug:
                log(">>> %s" % article.name, newline=False)
                for level, level_cats in enumerate(article_cats):
                    log('[%d] %s' % (level, ', '.join(level_cats)), newline=False)

            catname = self.check_article_cats(article_cats)
            if not catname:
                continue

            # Record the category path, so we can check how the article matched
            article.cat_path = [catname]
            try:
                hops = 0
                target = article.site.key + ':' + article.name
                while catname != target:
                    parent = parents[article_key][catname]
                    if parent != target:
                        article.cat_path.append(parent)
                    catname = parent
                    hops += 1
                    if hops > 50:
                        raise CategoryLoopError(article.cat_path)
            except CategoryLoopError as e:
                wikilinks = ['[[:%(catname)s|]]' % {'catname': c} for c in e.catpath]
                article.errors.append(_('Encountered an infinite category loop: ')
                                      + ' → '.join(wikilinks))

            matched[article_key] = article

        log(": %d -> %d" % (len(articles), len(matched)))
        return matched
Esempio n. 11
0
    def filter(self, articles):
        """
        Keep the articles whose first revision's parent text contains one of
        the templates recognised by self.has_template().

        If the text cannot be parsed, the article is left out and a message
        is appended to `article.site.errors`.

        Arguments:
            articles  : mapping of article key -> article object

        Returns a new odict with the matching articles.
        """
        out = odict()
        for article_key, article in articles.iteritems():

            firstrevid = article.revisions.firstkey()
            firstrev = article.revisions[firstrevid]

            try:
                # Look at the text as it was just before the first revision
                t = self.has_template(firstrev.parenttext)
            except DanmicholoParseError as e:
                log(" >> DanmicholoParser failed to parse " + article_key)
                # Only the first revision is inspected here, so report its own
                # id next to its parent's (the handler used to reference an
                # undefined `lastrev`, turning a parse error into a NameError).
                args = {'article': article_key, 'prevrev': firstrev.parentid, 'rev': firstrevid, 'error': e.msg}
                article.site.errors.append(_('Could not analyze the article %(article)s because one of the revisions %(prevrev)d or %(rev)d could not be parsed: %(error)s') % args)
            else:
                if t:
                    if self.verbose:
                        log('      Found template {{%s}} in [[%s]] @ %d' % (t, article_key, firstrevid))
                    out[article_key] = article

        log("  [+] Applying template filter: %d -> %d" % (len(articles), len(out)))

        return out
Esempio n. 12
0
    def test(self, rev):
        """
        Award points for images that revision `rev` added.

        Images present in `rev.text` but not in `rev.parenttext` are looked
        up on the article's site and sorted into three groups:

            ownwork : uploaded by the revision's author, with a Credit
                      metadata value matching 'int-own-work'
            own     : uploaded by the revision's author, otherwise
            other   : uploaded by someone else

        Files without image info, or without a recorded uploader, are logged
        and skipped. self.totalimages is increased by the number of grouped
        images, and points are registered through self.add_points() when the
        revision earned any.
        """
        imgs0 = self.get_images(rev.parenttext)
        imgs1 = self.get_images(rev.text)
        imgs_added = set(imgs1).difference(set(imgs0))

        counters = {'ownwork': [], 'own': [], 'other': []}
        for filename in imgs_added:
            filename = urllib.unquote(filename)
            image = rev.article.site.images[filename]
            imageinfo = image.imageinfo
            if len(imageinfo) == 0:   # seems like image.exists only checks locally
                log("- File '%s' does not exist" % (filename))
                continue

            try:
                uploader = imageinfo['user']
            except KeyError:
                log("ERR: Could not locate user for file '%s' in rev. %s " % (filename, rev.revid))
                continue

            log("- File '%s' uploaded by '%s', revision made by '%s'" % (filename, uploader, rev.username))
            if uploader != rev.username:
                counters['other'].append(filename)
                continue

            credit = ''
            extrainfo = rev.article.site.api('query', prop='imageinfo', titles=u'File:{}'.format(filename), iiprop='extmetadata')
            try:
                credit = extrainfo['query']['pages']['-1']['imageinfo'][0]['extmetadata']['Credit']['value']
            except (KeyError, IndexError):
                # Best effort: without credit metadata the image simply
                # counts as 'own' rather than 'ownwork'.
                pass

            if re.search('int-own-work', credit, re.I):
                counters['ownwork'].append(filename)
            else:
                counters['own'].append(filename)

        # If maxinitialcount is 0, only the first image counts.
        # If an user adds both an own image and an image by someone else,
        # we should make sure to credit the own image, not the other.
        # We therefore process the own images first. Rebuilding the list from
        # the counters also guarantees that the names compared below are the
        # unquoted ones the counters were filled with.
        imgs_added = counters['ownwork'] + counters['own'] + counters['other']
        self.totalimages += len(imgs_added)
        revpoints = 0
        for n, img in enumerate(imgs_added):
            if len(imgs0) + n <= self.maxinitialcount:
                if img in counters['ownwork']:
                    revpoints += self.ownwork
                elif img in counters['own']:
                    revpoints += self.own
                else:
                    revpoints += self.points

        if revpoints > 0:
            self.add_points(rev, revpoints, 'image', '%d %s' % (len(imgs_added), _('images') if len(imgs_added) > 1 else _('image')), self.maxpoints)
Esempio n. 13
0
    def test(self, rev):
        """
        Award points for images that revision `rev` added.

        Images present in `rev.text` but not in `rev.parenttext` are looked
        up on the article's site and split into those uploaded by the
        revision's author and those uploaded by others. Files without image
        info, or without a recorded uploader, are logged and skipped.
        self.totalimages is increased by the number of images found, and
        points are registered through self.add_points() when the revision
        earned any.
        """
        imgs0 = self.get_images(rev.parenttext)
        imgs1 = self.get_images(rev.text)
        imgs_added = set(imgs1).difference(set(imgs0))

        own_imgs_added = []
        others_imgs_added = []
        for filename in imgs_added:
            image = rev.article.site.Images[filename]
            imageinfo = image.imageinfo
            if len(imageinfo) == 0:   # seems like image.exists only checks locally
                log("- File '%s' does not exist" % (filename))
                continue

            try:
                uploader = imageinfo['user']
            except KeyError:
                log("ERR: Could not locate user for file '%s' in rev. %s " % (filename, rev.revid))
                # Without an uploader the file cannot be attributed; falling
                # through would use an unbound (or stale) `uploader`.
                continue

            log("- File '%s' uploaded by '%s', revision made by '%s'" % (filename, uploader, rev.username))
            if uploader == rev.username:
                own_imgs_added.append(filename)
            else:
                others_imgs_added.append(filename)

        # If maxinitialcount is 0, only the first image counts.
        # If an user adds both an own image and an image by someone else,
        # we should make sure to credit the own image, not the other.
        # We therefore process the own images first.
        imgs_added = own_imgs_added + others_imgs_added
        self.totalimages += len(imgs_added)
        revpoints = 0
        for n, img in enumerate(imgs_added):
            if len(imgs0) + n <= self.maxinitialcount:
                if img in own_imgs_added:
                    revpoints += self.own
                else:
                    revpoints += self.points

        if revpoints > 0:
            self.add_points(rev, revpoints, 'image', '%d %s' % (len(imgs_added), _('images') if len(imgs_added) > 1 else _('image')), self.maxpoints)
Esempio n. 14
0

_ = init_localization(config['locale'])

# Moment this run started, localized with server_tz
runstart = server_tz.localize(datetime.now())

host = config['homesite']
homesite = mwclient.Site(host)

now = server_tz.localize(datetime.now())

if args.page is None:
    log('  No page specified. Using default page')
    ktitle = config['pages']['default']
    # Look three hours back before picking the week, so that a run right
    # after midnight still closes last week's contest
    w = Week.withdate((now - timedelta(hours=3)).astimezone(wiki_tz).date())
    ktitle %= {'year': w.year, 'week': w.week}
else:
    ktitle = args.page.decode('utf-8')

# If ktitle is a redirect, continue with the title it points to

log('@ ktitle is %s' % ktitle)
pp = homesite.api('query', prop='pageprops', titles=ktitle, redirects='1')
if 'redirects' in pp['query']:
    ktitle = pp['query']['redirects'][0]['to']
    log('  -> Redirected to:  %s' % ktitle)
Esempio n. 15
0
    def fetchcats(self, articles, debug=False):
        """
        Fetches categories and parent categories for a set of articles.

        For every site in self.sites, the categories of that site's articles
        are queried through the API, then the categories of those categories,
        and so on for self.maxdepth levels. Categories whose short name
        (the part after the first ':') matches any regex in self.ignore are
        not followed.

        Arguments:
            articles  : mapping of article key -> article object; each
                        article must expose `name` and `site.key`
            debug     : print progress for every batch of titles requested

        Returns:
            (cats, parents) where
              cats[article_key][level]     = [short category name, ...]
              parents[article_key][cat]    = the title `cat` was reached from

        Raises StandardError if the API response contains warnings.
        """

        # Make a list of the categories of a given article, with one list for each level
        # > cats[article_key][level] = [cat1, cat2, ...]

        cats = {p: [[] for n in range(self.maxdepth)] for p in articles}

        # Also, for each article, keep a list of category parents, so we can build
        # a path along the category tree from any matched category to the article
        # > parents[article_key][category] = parent_category
        #
        # Example:
        #                   /- cat 2
        #             /- cat1 -|
        # no:giraffe -|        \-
        #             \-
        #
        # parents['no:giraffe']['cat2'] = 'cat1'
        # parents['no:giraffe']['cat1'] = 'giraffe'
        #
        # We could also build full category trees for each article from the available
        # information, but they can grow quite big and slow to search

        parents = {p: {} for p in articles}

        #ctree = Tree()
        #for p in pages:
        #    ctree.add_child( name = p.encode('utf-8') )

        for site_key, site in self.sites.iteritems():

            # requestlimit: number of titles sent per API request
            # returnlimit:  value passed as cllimit
            # Both are ten times higher when the account has the 'bot' right.
            if 'bot' in site.rights:
                requestlimit = 500
                returnlimit = 5000
            else:
                requestlimit = 50
                returnlimit = 500

            # Titles of articles that belong to this site
            titles = [article.name for article in articles.itervalues() if article.site.key == site_key]

            log(' ['+site_key+':'+str(len(titles))+']', newline=False)
            #.flush()
            if len(titles) > 0:

                for level in range(self.maxdepth):

                    # The titles collected at the previous level are the ones
                    # queried now; `titles` is refilled with what they yield.
                    titles0 = copy(titles)
                    titles = []  # make a new list of titles to search
                    # nc / nnc count followed / ignored categories; they are
                    # only referenced by the commented-out summary below.
                    nc = 0
                    nnc = 0

                    # Query the titles in batches of `requestlimit`
                    for s0 in range(0, len(titles0), requestlimit):
                        if debug:
                            print
                            print "[%d] > Getting %d to %d of %d" % (level, s0, s0+requestlimit, len(titles0))
                        ids = '|'.join(titles0[s0:s0+requestlimit])

                        # Repeat the request for as long as the response
                        # carries a 'query-continue' token
                        cont = True
                        clcont = ''
                        while cont:
                            #print clcont
                            if clcont != '':
                                q = site.api('query', prop='categories', titles=ids, cllimit=returnlimit, clcontinue=clcont)
                            else:
                                q = site.api('query', prop='categories', titles=ids, cllimit=returnlimit)

                            if 'warnings' in q:
                                raise StandardError(q['warnings']['query']['*'])

                            for pageid, page in q['query']['pages'].iteritems():
                                fulltitle = page['title']
                                # Title without its namespace prefix (unchanged
                                # when the title contains no ':')
                                shorttitle = fulltitle.split(':', 1)[-1]
                                article_key = site_key + ':' + fulltitle
                                if 'categories' in page:
                                    for cat in page['categories']:
                                        cat_title = cat['title']
                                        cat_short = cat_title.split(':', 1)[1]
                                        # Skip the category if any ignore pattern matches
                                        follow = True
                                        for d in self.ignore:
                                            if re.search(d, cat_short):
                                                if self.verbose:
                                                    log(' - Ignore: "%s" matched "%s"' % (cat_title, d))
                                                follow = False
                                        if follow:
                                            nc += 1
                                            titles.append(cat_title)
                                            if level == 0:
                                                # The queried page is the article itself
                                                cats[article_key][level].append(cat_short)
                                                parents[article_key][cat_short] = fulltitle
                                                #print cat_short
                                                # use iter_search_nodes instead?
                                                #ctree.search_nodes( name = fulltitle.encode('utf-8') )[0].add_child( name = cat_short.encode('utf-8') )
                                            else:
                                                # The queried page is a category: attach the new
                                                # category to every article that had this page at
                                                # the previous level.
                                                # NOTE: this loop rebinds `article_key`.
                                                for article_key, ccc in cats.iteritems():
                                                    if shorttitle in ccc[level-1]:
                                                        ccc[level].append(cat_short)
                                                        parents[article_key][cat_short] = shorttitle

                                                        #for node in ctree.search_nodes( name = shorttitle.encode('utf-8') ):
                                                        #    if not cat_short.encode('utf-8') in [i.name for i in node.get_children()]:
                                                        #        node.add_child(name = cat_short.encode('utf-8'))
                                        else:
                                            nnc += 1
                            if 'query-continue' in q:
                                clcont = q['query-continue']['categories']['clcontinue']
                            else:
                                cont = False
                    titles = list(set(titles))  # to remove duplicates (not order preserving)
                    #if level == 0:
                    #    cattree = [p for p in titles]
                    #if self.verbose:
                    log(' %d' % (len(titles)), newline=False)
                    #.stdout.flush()
                    #print "Found %d unique categories (%d total) at level %d (skipped %d categories)" % (len(titles), nc, level, nnc)

        return cats, parents
Esempio n. 16
0
    return delta.total_seconds()

_ = init_localization(config['locale'])

# Moment this run started, localized with server_tz
runstart = server_tz.localize(datetime.now())

host = config['homesite']
homesite = mwclient.Site(host)

now = server_tz.localize(datetime.now())

if args.page is None:
    log('  No page specified. Using default page')
    ktitle = config['pages']['default']
    # Look three hours back before picking the week, so that a run right
    # after midnight still closes last week's contest
    w = Week.withdate((now - timedelta(hours=3)).astimezone(wiki_tz).date())
    ktitle %= {'year': w.year, 'week': w.week}
else:
    ktitle = args.page.decode('utf-8')

# If ktitle is a redirect, continue with the title it points to

log('@ ktitle is %s' % ktitle)
pp = homesite.api('query', prop='pageprops', titles=ktitle, redirects='1')
if 'redirects' in pp['query']:
    ktitle = pp['query']['redirects'][0]['to']
    log('  -> Redirected to:  %s' % ktitle)